| Field | Value | Date |
|---|---|---|
| author | dim <dim@FreeBSD.org> | 2015-12-30 13:13:10 +0000 |
| committer | dim <dim@FreeBSD.org> | 2015-12-30 13:13:10 +0000 |
| commit | 9b5bf5c4f53d65d6a48722d7410ed7cb15f5ba3a (patch) | |
| tree | b466a4817f79516eb1df8eae92bccf62ecc84003 /contrib/llvm/lib/Target/AMDGPU | |
| parent | f09a28d1de99fda4f5517fb12670fc36552f4927 (diff) | |
| parent | e194cd6d03d91631334d9d5e55b506036f423cc8 (diff) | |
| download | FreeBSD-src-9b5bf5c4f53d65d6a48722d7410ed7cb15f5ba3a.zip, FreeBSD-src-9b5bf5c4f53d65d6a48722d7410ed7cb15f5ba3a.tar.gz | |
Update llvm to trunk r256633.
Diffstat (limited to 'contrib/llvm/lib/Target/AMDGPU')
89 files changed, 6128 insertions, 2660 deletions
```diff
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPU.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPU.h
index 0a05d25..8c3cb56 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -44,15 +44,21 @@ FunctionPass *createSIShrinkInstructionsPass();
 FunctionPass *createSILoadStoreOptimizerPass(TargetMachine &tm);
 FunctionPass *createSILowerControlFlowPass(TargetMachine &tm);
 FunctionPass *createSIFixControlFlowLiveIntervalsPass();
-FunctionPass *createSIFixSGPRCopiesPass(TargetMachine &tm);
+FunctionPass *createSIFixSGPRCopiesPass();
 FunctionPass *createSIFixSGPRLiveRangesPass();
 FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS);
 FunctionPass *createSIInsertWaits(TargetMachine &tm);
-FunctionPass *createSIPrepareScratchRegs();
+
+ModulePass *createAMDGPUAnnotateKernelFeaturesPass();
+void initializeAMDGPUAnnotateKernelFeaturesPass(PassRegistry &);
+extern char &AMDGPUAnnotateKernelFeaturesID;
 
 void initializeSIFoldOperandsPass(PassRegistry &);
 extern char &SIFoldOperandsID;
 
+void initializeSIFixSGPRCopiesPass(PassRegistry &);
+extern char &SIFixSGPRCopiesID;
+
 void initializeSILowerI1CopiesPass(PassRegistry &);
 extern char &SILowerI1CopiesID;
 
@@ -64,6 +70,8 @@ FunctionPass *createAMDGPUPromoteAlloca(const AMDGPUSubtarget &ST);
 Pass *createAMDGPUStructurizeCFGPass();
 FunctionPass *createAMDGPUISelDag(TargetMachine &tm);
 ModulePass *createAMDGPUAlwaysInlinePass();
+ModulePass *createAMDGPUOpenCLImageTypeLoweringPass();
+FunctionPass *createAMDGPUAnnotateUniformValues();
 
 void initializeSIFixControlFlowLiveIntervalsPass(PassRegistry&);
 extern char &SIFixControlFlowLiveIntervalsID;
@@ -71,6 +79,8 @@ extern char &SIFixControlFlowLiveIntervalsID;
 void initializeSIFixSGPRLiveRangesPass(PassRegistry&);
 extern char &SIFixSGPRLiveRangesID;
 
+void initializeAMDGPUAnnotateUniformValuesPass(PassRegistry&);
+extern char &AMDGPUAnnotateUniformValuesPassID;
+
 extern Target TheAMDGPUTarget;
 extern Target TheGCNTarget;
 
@@ -85,8 +95,6 @@ enum TargetIndex {
 };
 }
 
-#define END_OF_TEXT_LABEL_NAME "EndOfTextLabel"
-
 } // End namespace llvm
 
 namespace ShaderType {
```

```diff
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPU.td b/contrib/llvm/lib/Target/AMDGPU/AMDGPU.td
index 68b5050..d4af8d2 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -108,6 +108,11 @@ def FeatureEnableUnsafeDSOffsetFolding : SubtargetFeature <"unsafe-ds-offset-fol
   "true",
   "Force using DS instruction immediate offsets on SI">;
 
+def FeatureFlatForGlobal : SubtargetFeature<"flat-for-global",
+  "FlatForGlobal",
+  "true",
+  "Force to generate flat instruction for global">;
+
 def FeatureFlatAddressSpace : SubtargetFeature<"flat-address-space",
   "FlatAddressSpace",
   "true",
@@ -272,9 +277,14 @@ def isSICI : Predicate<
   "Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS"
 >, AssemblerPredicate<"FeatureGCN1Encoding">;
 
+def isVI : Predicate <
+  "Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS">,
+  AssemblerPredicate<"FeatureGCN3Encoding">;
+
 class PredicateControl {
   Predicate SubtargetPredicate;
   Predicate SIAssemblerPredicate = isSICI;
+  Predicate VIAssemblerPredicate = isVI;
   list<Predicate> AssemblerPredicates = [];
   Predicate AssemblerPredicate = TruePredicate;
   list<Predicate> OtherPredicates = [];
```
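For orientation, the new `flat-for-global` subtarget feature surfaces in C++ as a query on the subtarget; a minimal sketch of how selection code consults it (the accessor name matches its use in `SelectMUBUF` later in this commit, the wrapper function is illustrative only):

```cpp
#include "AMDGPUSubtarget.h"

// Hedged sketch: a SubtargetFeature with field FlatForGlobal becomes a bool
// member on the subtarget; when it is set, the MUBUF addressing path simply
// declines to match so that FLAT instructions are used for global accesses.
static bool shouldUseMUBUFForGlobal(const llvm::AMDGPUSubtarget *Subtarget) {
  return !Subtarget->useFlatForGlobal();
}
```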
```diff
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
new file mode 100644
index 0000000..3781839
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
@@ -0,0 +1,126 @@
+//===-- AMDGPUAnnotateKernelFeaturesPass.cpp ------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This pass adds target attributes to functions which use intrinsics
+/// which will impact calling convention lowering.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+
+#define DEBUG_TYPE "amdgpu-annotate-kernel-features"
+
+using namespace llvm;
+
+namespace {
+
+class AMDGPUAnnotateKernelFeatures : public ModulePass {
+private:
+  void addAttrToCallers(Function *Intrin, StringRef AttrName);
+  bool addAttrsForIntrinsics(Module &M, ArrayRef<StringRef[2]>);
+
+public:
+  static char ID;
+
+  AMDGPUAnnotateKernelFeatures() : ModulePass(ID) { }
+  bool runOnModule(Module &M) override;
+  const char *getPassName() const override {
+    return "AMDGPU Annotate Kernel Features";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesAll();
+    ModulePass::getAnalysisUsage(AU);
+  }
+};
+
+}
+
+char AMDGPUAnnotateKernelFeatures::ID = 0;
+
+char &llvm::AMDGPUAnnotateKernelFeaturesID = AMDGPUAnnotateKernelFeatures::ID;
+
+
+INITIALIZE_PASS_BEGIN(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE,
+                      "Add AMDGPU function attributes", false, false)
+INITIALIZE_PASS_END(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE,
+                    "Add AMDGPU function attributes", false, false)
+
+
+void AMDGPUAnnotateKernelFeatures::addAttrToCallers(Function *Intrin,
+                                                    StringRef AttrName) {
+  SmallPtrSet<Function *, 4> SeenFuncs;
+
+  for (User *U : Intrin->users()) {
+    // CallInst is the only valid user for an intrinsic.
+    CallInst *CI = cast<CallInst>(U);
+
+    Function *CallingFunction = CI->getParent()->getParent();
+    if (SeenFuncs.insert(CallingFunction).second)
+      CallingFunction->addFnAttr(AttrName);
+  }
+}
+
+bool AMDGPUAnnotateKernelFeatures::addAttrsForIntrinsics(
+  Module &M,
+  ArrayRef<StringRef[2]> IntrinsicToAttr) {
+  bool Changed = false;
+
+  for (const StringRef *Arr : IntrinsicToAttr) {
+    if (Function *Fn = M.getFunction(Arr[0])) {
+      addAttrToCallers(Fn, Arr[1]);
+      Changed = true;
+    }
+  }
+
+  return Changed;
+}
+
+bool AMDGPUAnnotateKernelFeatures::runOnModule(Module &M) {
+  Triple TT(M.getTargetTriple());
+
+  static const StringRef IntrinsicToAttr[][2] = {
+    // .x omitted
+    { "llvm.r600.read.tgid.y", "amdgpu-work-group-id-y" },
+    { "llvm.r600.read.tgid.z", "amdgpu-work-group-id-z" },
+
+    // .x omitted
+    { "llvm.r600.read.tidig.y", "amdgpu-work-item-id-y" },
+    { "llvm.r600.read.tidig.z", "amdgpu-work-item-id-z" }
+
+  };
+
+  static const StringRef HSAIntrinsicToAttr[][2] = {
+    { "llvm.r600.read.local.size.x", "amdgpu-dispatch-ptr" },
+    { "llvm.r600.read.local.size.y", "amdgpu-dispatch-ptr" },
+    { "llvm.r600.read.local.size.z", "amdgpu-dispatch-ptr" },
+
+    { "llvm.r600.read.global.size.x", "amdgpu-dispatch-ptr" },
+    { "llvm.r600.read.global.size.y", "amdgpu-dispatch-ptr" },
+    { "llvm.r600.read.global.size.z", "amdgpu-dispatch-ptr" },
+    { "llvm.amdgcn.dispatch.ptr", "amdgpu-dispatch-ptr" }
+  };
+
+  // TODO: Intrinsics that require queue ptr.
+
+  // We do not need to note the x workitem or workgroup id because they are
+  // always initialized.
+
+  bool Changed = addAttrsForIntrinsics(M, IntrinsicToAttr);
+  if (TT.getOS() == Triple::AMDHSA)
+    Changed |= addAttrsForIntrinsics(M, HSAIntrinsicToAttr);
+
+  return Changed;
+}
+
+ModulePass *llvm::createAMDGPUAnnotateKernelFeaturesPass() {
+  return new AMDGPUAnnotateKernelFeatures();
+}
```
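A hedged sketch of how the new module pass slots into a legacy pass pipeline and what it leaves behind (the driver function and the kernel name are hypothetical; the create function and attribute strings come from the patch itself):

```cpp
#include "AMDGPU.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"

using namespace llvm;

// Minimal driver sketch: run the annotation pass over a module, then
// observe the string attribute it adds to callers of the .y/.z intrinsics.
static void annotateKernelFeatures(Module &M) {
  legacy::PassManager PM;
  PM.add(createAMDGPUAnnotateKernelFeaturesPass());
  PM.run(M);

  // "my_kernel" is a hypothetical name; any function calling
  // llvm.r600.read.tgid.y now carries the attribute that calling
  // convention lowering consumes.
  if (Function *F = M.getFunction("my_kernel"))
    (void)F->hasFnAttribute("amdgpu-work-group-id-y"); // true after the pass
}
```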
```diff
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
new file mode 100644
index 0000000..dfddc34
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
@@ -0,0 +1,84 @@
+//===-- AMDGPUAnnotateUniformValues.cpp - ---------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass adds amdgpu.uniform metadata to IR values so this information
+/// can be used during instruction selection.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUIntrinsicInfo.h"
+#include "llvm/Analysis/DivergenceAnalysis.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+#define DEBUG_TYPE "amdgpu-annotate-uniform"
+
+using namespace llvm;
+
+namespace {
+
+class AMDGPUAnnotateUniformValues : public FunctionPass,
+                       public InstVisitor<AMDGPUAnnotateUniformValues> {
+  DivergenceAnalysis *DA;
+
+public:
+  static char ID;
+  AMDGPUAnnotateUniformValues() :
+    FunctionPass(ID) { }
+  bool doInitialization(Module &M) override;
+  bool runOnFunction(Function &F) override;
+  const char *getPassName() const override { return "AMDGPU Annotate Uniform Values"; }
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<DivergenceAnalysis>();
+    AU.setPreservesAll();
+  }
+
+  void visitLoadInst(LoadInst &I);
+
+};
+
+} // End anonymous namespace
+
+INITIALIZE_PASS_BEGIN(AMDGPUAnnotateUniformValues, DEBUG_TYPE,
+                      "Add AMDGPU uniform metadata", false, false)
+INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
+INITIALIZE_PASS_END(AMDGPUAnnotateUniformValues, DEBUG_TYPE,
+                    "Add AMDGPU uniform metadata", false, false)
+
+char AMDGPUAnnotateUniformValues::ID = 0;
+
+void AMDGPUAnnotateUniformValues::visitLoadInst(LoadInst &I) {
+  Value *Ptr = I.getPointerOperand();
+  if (!DA->isUniform(Ptr))
+    return;
+
+  if (Instruction *PtrI = dyn_cast<Instruction>(Ptr))
+    PtrI->setMetadata("amdgpu.uniform", MDNode::get(I.getContext(), {}));
+
+}
+
+bool AMDGPUAnnotateUniformValues::doInitialization(Module &M) {
+  return false;
+}
+
+bool AMDGPUAnnotateUniformValues::runOnFunction(Function &F) {
+  DA = &getAnalysis<DivergenceAnalysis>();
+  visit(F);
+
+  return true;
+}
+
+FunctionPass *
+llvm::createAMDGPUAnnotateUniformValues() {
+  return new AMDGPUAnnotateUniformValues();
+}
```
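For context, the pass attaches an empty `MDNode`, so presence is the entire signal. A hedged sketch of how a later consumer (for example, instruction selection) can test for the annotation; the helper is an assumption about a consumer, not part of this patch:

```cpp
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Metadata.h"

using namespace llvm;

// The metadata node carries no operands; a null check on the pointer
// operand's defining instruction is all a consumer needs.
static bool isAnnotatedUniform(const LoadInst &I) {
  const Instruction *PtrI = dyn_cast<Instruction>(I.getPointerOperand());
  return PtrI && PtrI->getMetadata("amdgpu.uniform") != nullptr;
}
```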
```diff
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 0a5309b..ba71dc0 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -100,14 +100,63 @@ void AMDGPUAsmPrinter::EmitFunctionBodyStart() {
   }
 }
 
-void AMDGPUAsmPrinter::EmitEndOfAsmFile(Module &M) {
-
-  // This label is used to mark the end of the .text section.
-  const TargetLoweringObjectFile &TLOF = getObjFileLowering();
-  OutStreamer->SwitchSection(TLOF.getTextSection());
-  MCSymbol *EndOfTextLabel =
-      OutContext.getOrCreateSymbol(StringRef(END_OF_TEXT_LABEL_NAME));
-  OutStreamer->EmitLabel(EndOfTextLabel);
+void AMDGPUAsmPrinter::EmitFunctionEntryLabel() {
+  const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
+  const AMDGPUSubtarget &STM = MF->getSubtarget<AMDGPUSubtarget>();
+  if (MFI->isKernel() && STM.isAmdHsaOS()) {
+    AMDGPUTargetStreamer *TS =
+        static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer());
+    TS->EmitAMDGPUSymbolType(CurrentFnSym->getName(),
+                             ELF::STT_AMDGPU_HSA_KERNEL);
+  }
+
+  AsmPrinter::EmitFunctionEntryLabel();
+}
+
+static bool isModuleLinkage(const GlobalValue *GV) {
+  switch (GV->getLinkage()) {
+  case GlobalValue::InternalLinkage:
+  case GlobalValue::CommonLinkage:
+   return true;
+  case GlobalValue::ExternalLinkage:
+   return false;
+  default: llvm_unreachable("unknown linkage type");
+  }
+}
+
+void AMDGPUAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
+
+  if (TM.getTargetTriple().getOS() != Triple::AMDHSA) {
+    AsmPrinter::EmitGlobalVariable(GV);
+    return;
+  }
+
+  if (GV->isDeclaration() || GV->getLinkage() == GlobalValue::PrivateLinkage) {
+    AsmPrinter::EmitGlobalVariable(GV);
+    return;
+  }
+
+  // Group segment variables aren't emitted in HSA.
+  if (AMDGPU::isGroupSegment(GV))
+    return;
+
+  AMDGPUTargetStreamer *TS =
+      static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer());
+  if (isModuleLinkage(GV)) {
+    TS->EmitAMDGPUHsaModuleScopeGlobal(GV->getName());
+  } else {
+    TS->EmitAMDGPUHsaProgramScopeGlobal(GV->getName());
+  }
+
+  const DataLayout &DL = getDataLayout();
+  OutStreamer->PushSection();
+  OutStreamer->SwitchSection(
+      getObjFileLowering().SectionForGlobal(GV, *Mang, TM));
+  MCSymbol *GVSym = getSymbol(GV);
+  const Constant *C = GV->getInitializer();
+  OutStreamer->EmitLabel(GVSym);
+  EmitGlobalConstant(DL, C);
+  OutStreamer->PopSection();
 }
 
 bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
@@ -125,8 +174,8 @@
   const AMDGPUSubtarget &STM = MF.getSubtarget<AMDGPUSubtarget>();
   SIProgramInfo KernelInfo;
   if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
+    getSIProgramInfo(KernelInfo, MF);
     if (!STM.isAmdHsaOS()) {
-      getSIProgramInfo(KernelInfo, MF);
       EmitProgramInfoSI(MF, KernelInfo);
     }
     // Emit directives
@@ -165,6 +214,23 @@
                                   false);
       OutStreamer->emitRawComment(" ScratchSize: " + Twine(KernelInfo.ScratchSize),
                                   false);
+
+      OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:USER_SGPR: " +
+                                  Twine(G_00B84C_USER_SGPR(KernelInfo.ComputePGMRSrc2)),
+                                  false);
+      OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_X_EN: " +
+                                  Twine(G_00B84C_TGID_X_EN(KernelInfo.ComputePGMRSrc2)),
+                                  false);
+      OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Y_EN: " +
+                                  Twine(G_00B84C_TGID_Y_EN(KernelInfo.ComputePGMRSrc2)),
+                                  false);
+      OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Z_EN: " +
+                                  Twine(G_00B84C_TGID_Z_EN(KernelInfo.ComputePGMRSrc2)),
+                                  false);
+      OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " +
+                                  Twine(G_00B84C_TIDIG_COMP_CNT(KernelInfo.ComputePGMRSrc2)),
+                                  false);
+
     } else {
       R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
       OutStreamer->emitRawComment(
@@ -278,27 +344,30 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
         unsigned width = 0;
         bool isSGPR = false;
 
-        if (!MO.isReg()) {
+        if (!MO.isReg())
           continue;
-        }
+
         unsigned reg = MO.getReg();
-        if (reg == AMDGPU::VCC || reg == AMDGPU::VCC_LO ||
-            reg == AMDGPU::VCC_HI) {
+        switch (reg) {
+        case AMDGPU::EXEC:
+        case AMDGPU::SCC:
+        case AMDGPU::M0:
+          continue;
+
+        case AMDGPU::VCC:
+        case AMDGPU::VCC_LO:
+        case AMDGPU::VCC_HI:
           VCCUsed = true;
           continue;
-        } else if (reg == AMDGPU::FLAT_SCR ||
-                   reg == AMDGPU::FLAT_SCR_LO ||
-                   reg == AMDGPU::FLAT_SCR_HI) {
+
+        case AMDGPU::FLAT_SCR:
+        case AMDGPU::FLAT_SCR_LO:
+        case AMDGPU::FLAT_SCR_HI:
           FlatUsed = true;
           continue;
-        }
 
-        switch (reg) {
-        default: break;
-        case AMDGPU::SCC:
-        case AMDGPU::EXEC:
-        case AMDGPU::M0:
-          continue;
+        default:
+          break;
         }
 
         if (AMDGPU::SReg_32RegClass.contains(reg)) {
@@ -348,11 +417,15 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
     }
   }
 
-  if (VCCUsed)
+  if (VCCUsed || FlatUsed)
     MaxSGPR += 2;
 
-  if (FlatUsed)
+  if (FlatUsed) {
     MaxSGPR += 2;
+    // 2 additional for VI+.
+    if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
+      MaxSGPR += 2;
+  }
 
   // We found the maximum register index. They start at 0, so add one to get the
   // number of registers.
@@ -368,6 +441,11 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
     ProgInfo.NumSGPR = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG;
   }
 
+  if (MFI->NumUserSGPRs > STM.getMaxNumUserSGPRs()) {
+    LLVMContext &Ctx = MF.getFunction()->getContext();
+    Ctx.emitError("too many user SGPRs used");
+  }
+
   ProgInfo.VGPRBlocks = (ProgInfo.NumVGPR - 1) / 4;
   ProgInfo.SGPRBlocks = (ProgInfo.NumSGPR - 1) / 8;
   // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode
@@ -419,18 +497,27 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
       S_00B848_FLOAT_MODE(ProgInfo.FloatMode) |
       S_00B848_PRIV(ProgInfo.Priv) |
       S_00B848_DX10_CLAMP(ProgInfo.DX10Clamp) |
-      S_00B848_IEEE_MODE(ProgInfo.DebugMode) |
+      S_00B848_DEBUG_MODE(ProgInfo.DebugMode) |
       S_00B848_IEEE_MODE(ProgInfo.IEEEMode);
 
+  // 0 = X, 1 = XY, 2 = XYZ
+  unsigned TIDIGCompCnt = 0;
+  if (MFI->hasWorkItemIDZ())
+    TIDIGCompCnt = 2;
+  else if (MFI->hasWorkItemIDY())
+    TIDIGCompCnt = 1;
+
   ProgInfo.ComputePGMRSrc2 =
       S_00B84C_SCRATCH_EN(ProgInfo.ScratchBlocks > 0) |
-      S_00B84C_USER_SGPR(MFI->NumUserSGPRs) |
-      S_00B84C_TGID_X_EN(1) |
-      S_00B84C_TGID_Y_EN(1) |
-      S_00B84C_TGID_Z_EN(1) |
-      S_00B84C_TG_SIZE_EN(1) |
-      S_00B84C_TIDIG_COMP_CNT(2) |
-      S_00B84C_LDS_SIZE(ProgInfo.LDSBlocks);
+      S_00B84C_USER_SGPR(MFI->getNumUserSGPRs()) |
+      S_00B84C_TGID_X_EN(MFI->hasWorkGroupIDX()) |
+      S_00B84C_TGID_Y_EN(MFI->hasWorkGroupIDY()) |
+      S_00B84C_TGID_Z_EN(MFI->hasWorkGroupIDZ()) |
+      S_00B84C_TG_SIZE_EN(MFI->hasWorkGroupInfo()) |
+      S_00B84C_TIDIG_COMP_CNT(TIDIGCompCnt) |
+      S_00B84C_EXCP_EN_MSB(0) |
+      S_00B84C_LDS_SIZE(ProgInfo.LDSBlocks) |
+      S_00B84C_EXCP_EN(0);
 }
 
 static unsigned getRsrcReg(unsigned ShaderType) {
@@ -491,14 +578,53 @@ void AMDGPUAsmPrinter::EmitAmdKernelCodeT(const MachineFunction &MF,
   header.compute_pgm_resource_registers =
       KernelInfo.ComputePGMRSrc1 |
       (KernelInfo.ComputePGMRSrc2 << 32);
-  header.code_properties =
-      AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR |
-      AMD_CODE_PROPERTY_IS_PTR64;
+  header.code_properties = AMD_CODE_PROPERTY_IS_PTR64;
+
+  if (MFI->hasPrivateSegmentBuffer()) {
+    header.code_properties |=
+      AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
+  }
+
+  if (MFI->hasDispatchPtr())
+    header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
+
+  if (MFI->hasQueuePtr())
+    header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;
+
+  if (MFI->hasKernargSegmentPtr())
+    header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;
+
+  if (MFI->hasDispatchID())
+    header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;
+
+  if (MFI->hasFlatScratchInit())
+    header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
+
+  // TODO: Private segment size
+
+  if (MFI->hasGridWorkgroupCountX()) {
+    header.code_properties |=
+      AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X;
+  }
+
+  if (MFI->hasGridWorkgroupCountY()) {
+    header.code_properties |=
+      AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y;
+  }
+
+  if (MFI->hasGridWorkgroupCountZ()) {
+    header.code_properties |=
+      AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z;
+  }
+
+  if (MFI->hasDispatchPtr())
+    header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
 
   header.kernarg_segment_byte_size = MFI->ABIArgOffset;
   header.wavefront_sgpr_count = KernelInfo.NumSGPR;
   header.workitem_vgpr_count = KernelInfo.NumVGPR;
-
+  header.workitem_private_segment_byte_size = KernelInfo.ScratchSize;
+  header.workgroup_group_segment_byte_size = KernelInfo.LDSSize;
 
   AMDGPUTargetStreamer *TS =
       static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer());
```
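A worked example of the reserved-register accounting introduced above (all values are assumed, not from the patch): on a VI target where the highest explicitly used SGPR index is 10 and both VCC and flat_scratch are used:

```cpp
// Mirrors the adjustment logic in getSIProgramInfo; illustrative only.
static unsigned countSGPRs() {
  unsigned MaxSGPR = 10;        // highest SGPR index seen in the scan
  bool VCCUsed = true, FlatUsed = true, IsVIPlus = true;

  if (VCCUsed || FlatUsed)
    MaxSGPR += 2;               // reserve the VCC register pair
  if (FlatUsed) {
    MaxSGPR += 2;               // reserve the flat_scratch pair
    if (IsVIPlus)
      MaxSGPR += 2;             // "2 additional for VI+"
  }
  return MaxSGPR + 1;           // indices are 0-based -> 17 registers here
}
```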
```diff
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
index 345af9b..817cbfc 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
@@ -99,7 +99,9 @@ public:
 
   void EmitFunctionBodyStart() override;
 
-  void EmitEndOfAsmFile(Module &M) override;
+  void EmitFunctionEntryLabel() override;
+
+  void EmitGlobalVariable(const GlobalVariable *GV) override;
 
   bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
                        unsigned AsmVariant, const char *ExtraCode,
```

```diff
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUDiagnosticInfoUnsupported.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUDiagnosticInfoUnsupported.cpp
new file mode 100644
index 0000000..2f6b302
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUDiagnosticInfoUnsupported.cpp
@@ -0,0 +1,26 @@
+//===-- AMDGPUDiagnosticInfoUnsupported.cpp -------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUDiagnosticInfoUnsupported.h"
+
+using namespace llvm;
+
+DiagnosticInfoUnsupported::DiagnosticInfoUnsupported(
+  const Function &Fn,
+  const Twine &Desc,
+  DiagnosticSeverity Severity)
+  : DiagnosticInfo(getKindID(), Severity),
+    Description(Desc),
+    Fn(Fn) { }
+
+int DiagnosticInfoUnsupported::KindID = 0;
+
+void DiagnosticInfoUnsupported::print(DiagnosticPrinter &DP) const {
+  DP << "unsupported " << getDescription() << " in " << Fn.getName();
+}
```

```diff
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUDiagnosticInfoUnsupported.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUDiagnosticInfoUnsupported.h
new file mode 100644
index 0000000..0fd37e1
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUDiagnosticInfoUnsupported.h
@@ -0,0 +1,48 @@
+//===-- AMDGPUDiagnosticInfoUnsupported.h - Error reporting -----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUDIAGNOSTICINFOUNSUPPORTED_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUDIAGNOSTICINFOUNSUPPORTED_H
+
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/DiagnosticPrinter.h"
+
+namespace llvm {
+
+/// Diagnostic information for unimplemented or unsupported feature reporting.
+class DiagnosticInfoUnsupported : public DiagnosticInfo {
+private:
+  const Twine &Description;
+  const Function &Fn;
+
+  static int KindID;
+
+  static int getKindID() {
+    if (KindID == 0)
+      KindID = llvm::getNextAvailablePluginDiagnosticKind();
+    return KindID;
+  }
+
+public:
+  DiagnosticInfoUnsupported(const Function &Fn, const Twine &Desc,
+                            DiagnosticSeverity Severity = DS_Error);
+
+  const Function &getFunction() const { return Fn; }
+  const Twine &getDescription() const { return Description; }
+
+  void print(DiagnosticPrinter &DP) const override;
+
+  static bool classof(const DiagnosticInfo *DI) {
+    return DI->getKind() == getKindID();
+  }
+};
+
+}
+
+#endif
```
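A hedged usage sketch for the now-public diagnostic class: the lazily allocated KindID from getNextAvailablePluginDiagnosticKind() is what lets LLVM's cast machinery recognize it, both when emitting and when filtering in a diagnostic handler (the handler wiring shown is illustrative, only the emit side appears in this patch):

```cpp
#include "AMDGPUDiagnosticInfoUnsupported.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

// Emitting: lowering code with a Function and a context reports an
// unsupported construct (this mirrors the addrspacecast use in this commit).
static void reportUnsupported(LLVMContext &Ctx, const Function &Fn) {
  DiagnosticInfoUnsupported D(Fn, "addrspacecast not implemented");
  Ctx.diagnose(D); // "unsupported addrspacecast not implemented in <fn>"
}

// Receiving: classof() keys on the custom KindID, so dyn_cast works on the
// opaque DiagnosticInfo stream.
static void onDiagnostic(const DiagnosticInfo &DI) {
  if (const auto *DU = dyn_cast<DiagnosticInfoUnsupported>(&DI))
    errs() << "unsupported construct in " << DU->getFunction().getName() << '\n';
}
```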
```diff
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp
index 8175786..4d84d28 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp
@@ -71,9 +71,15 @@ unsigned AMDGPUFrameLowering::getStackWidth(const MachineFunction &MF) const {
 }
 
 /// \returns The number of registers allocated for \p FI.
-int AMDGPUFrameLowering::getFrameIndexOffset(const MachineFunction &MF,
-                                             int FI) const {
+int AMDGPUFrameLowering::getFrameIndexReference(const MachineFunction &MF,
+                                                int FI,
+                                                unsigned &FrameReg) const {
   const MachineFrameInfo *MFI = MF.getFrameInfo();
+  const TargetRegisterInfo *RI = MF.getSubtarget().getRegisterInfo();
+
+  // Fill in FrameReg output argument.
+  FrameReg = RI->getFrameRegister(MF);
+
   // Start the offset at 2 so we don't overwrite work group information.
   // XXX: We should only do this when the shader actually uses this
   // information.
```

```diff
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.h
index 9f31be1..257a3da 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.h
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.h
@@ -8,14 +8,12 @@
 //===----------------------------------------------------------------------===//
 //
 /// \file
-/// \brief Interface to describe a layout of a stack frame on a AMDIL target
-/// machine.
+/// \brief Interface to describe a layout of a stack frame on an AMDGPU target.
 //
 //===----------------------------------------------------------------------===//
-#ifndef LLVM_LIB_TARGET_R600_AMDGPUFRAMELOWERING_H
-#define LLVM_LIB_TARGET_R600_AMDGPUFRAMELOWERING_H
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUFRAMELOWERING_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUFRAMELOWERING_H
 
-#include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/Target/TargetFrameLowering.h"
 
 namespace llvm {
@@ -34,7 +32,8 @@ public:
   /// \returns The number of 32-bit sub-registers that are used when storing
   /// values to the stack.
   unsigned getStackWidth(const MachineFunction &MF) const;
-  int getFrameIndexOffset(const MachineFunction &MF, int FI) const override;
+  int getFrameIndexReference(const MachineFunction &MF, int FI,
+                             unsigned &FrameReg) const override;
   const SpillSlot *
     getCalleeSavedSpillSlots(unsigned &NumEntries) const override;
   void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
```
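The signature change ripples to callers: instead of returning only an offset, the hook now also reports which register serves as the frame base. A minimal caller-side sketch (a reduction of the LowerFrameIndex update later in this commit; the wrapper function is illustrative):

```cpp
#include "AMDGPUFrameLowering.h"
#include "llvm/CodeGen/MachineFunction.h"

using namespace llvm;

// Callers that only want the offset pass a scratch variable, exactly as
// LowerFrameIndex does with IgnoredFrameReg in AMDGPUISelLowering.cpp below.
static int frameOffsetOnly(const AMDGPUFrameLowering &TFL,
                           const MachineFunction &MF, int FI) {
  unsigned IgnoredFrameReg;
  return TFL.getFrameIndexReference(MF, FI, IgnoredFrameReg);
}
```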
```diff
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 64c54cc..b33040b 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -11,6 +11,8 @@
 /// \brief Defines an instruction selector for the AMDGPU target.
 //
 //===----------------------------------------------------------------------===//
+
+#include "AMDGPUDiagnosticInfoUnsupported.h"
 #include "AMDGPUInstrInfo.h"
 #include "AMDGPUISelLowering.h" // For AMDGPUISD
 #include "AMDGPURegisterInfo.h"
@@ -20,9 +22,9 @@
 #include "SIISelLowering.h"
 #include "SIMachineFunctionInfo.h"
 #include "llvm/CodeGen/FunctionLoweringInfo.h"
-#include "llvm/CodeGen/PseudoSourceValue.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
 #include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/CodeGen/SelectionDAGISel.h"
 #include "llvm/IR/Function.h"
@@ -40,12 +42,14 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
   // Subtarget - Keep a pointer to the AMDGPU Subtarget around so that we can
   // make the right decision when generating code for different targets.
   const AMDGPUSubtarget *Subtarget;
+
 public:
   AMDGPUDAGToDAGISel(TargetMachine &TM);
   virtual ~AMDGPUDAGToDAGISel();
   bool runOnMachineFunction(MachineFunction &MF) override;
   SDNode *Select(SDNode *N) override;
   const char *getPassName() const override;
+  void PreprocessISelDAG() override;
   void PostprocessISelDAG() override;
 
 private:
@@ -91,7 +95,7 @@ private:
   bool SelectDS1Addr1Offset(SDValue Ptr, SDValue &Base, SDValue &Offset) const;
   bool SelectDS64Bit4ByteAligned(SDValue Ptr, SDValue &Base, SDValue &Offset0,
                                  SDValue &Offset1) const;
-  void SelectMUBUF(SDValue Addr, SDValue &SRsrc, SDValue &VAddr,
+  bool SelectMUBUF(SDValue Addr, SDValue &SRsrc, SDValue &VAddr,
                    SDValue &SOffset, SDValue &Offset, SDValue &Offen,
                    SDValue &Idxen, SDValue &Addr64, SDValue &GLC, SDValue &SLC,
                    SDValue &TFE) const;
@@ -108,6 +112,16 @@ private:
                          SDValue &TFE) const;
   bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset,
                          SDValue &Offset, SDValue &GLC) const;
+  bool SelectSMRDOffset(SDValue ByteOffsetNode, SDValue &Offset,
+                        bool &Imm) const;
+  bool SelectSMRD(SDValue Addr, SDValue &SBase, SDValue &Offset,
+                  bool &Imm) const;
+  bool SelectSMRDImm(SDValue Addr, SDValue &SBase, SDValue &Offset) const;
+  bool SelectSMRDImm32(SDValue Addr, SDValue &SBase, SDValue &Offset) const;
+  bool SelectSMRDSgpr(SDValue Addr, SDValue &SBase, SDValue &Offset) const;
+  bool SelectSMRDBufferImm(SDValue Addr, SDValue &Offset) const;
+  bool SelectSMRDBufferImm32(SDValue Addr, SDValue &Offset) const;
+  bool SelectSMRDBufferSgpr(SDValue Addr, SDValue &Offset) const;
   SDNode *SelectAddrSpaceCast(SDNode *N);
   bool SelectVOP3Mods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
   bool SelectVOP3NoMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
@@ -273,6 +287,23 @@ SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N) const {
   return N;
 }
 
+static unsigned selectSGPRVectorRegClassID(unsigned NumVectorElts) {
+  switch (NumVectorElts) {
+  case 1:
+    return AMDGPU::SReg_32RegClassID;
+  case 2:
+    return AMDGPU::SReg_64RegClassID;
+  case 4:
+    return AMDGPU::SReg_128RegClassID;
+  case 8:
+    return AMDGPU::SReg_256RegClassID;
+  case 16:
+    return AMDGPU::SReg_512RegClassID;
+  }
+
+  llvm_unreachable("invalid vector size");
+}
+
 SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
   unsigned int Opc = N->getOpcode();
   if (N->isMachineOpcode()) {
@@ -306,38 +337,7 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
     EVT EltVT = VT.getVectorElementType();
     assert(EltVT.bitsEq(MVT::i32));
     if (Subtarget->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
-      bool UseVReg = true;
-      for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
-                                                    U != E; ++U) {
-        if (!U->isMachineOpcode()) {
-          continue;
-        }
-        const TargetRegisterClass *RC = getOperandRegClass(*U, U.getOperandNo());
-        if (!RC) {
-          continue;
-        }
-        if (static_cast<const SIRegisterInfo *>(TRI)->isSGPRClass(RC)) {
-          UseVReg = false;
-        }
-      }
-      switch(NumVectorElts) {
-      case 1: RegClassID = UseVReg ? AMDGPU::VGPR_32RegClassID :
-                                     AMDGPU::SReg_32RegClassID;
-        break;
-      case 2: RegClassID = UseVReg ? AMDGPU::VReg_64RegClassID :
-                                     AMDGPU::SReg_64RegClassID;
-        break;
-      case 4: RegClassID = UseVReg ? AMDGPU::VReg_128RegClassID :
-                                     AMDGPU::SReg_128RegClassID;
-        break;
-      case 8: RegClassID = UseVReg ? AMDGPU::VReg_256RegClassID :
-                                     AMDGPU::SReg_256RegClassID;
-        break;
-      case 16: RegClassID = UseVReg ? AMDGPU::VReg_512RegClassID :
-                                      AMDGPU::SReg_512RegClassID;
-        break;
-      default: llvm_unreachable("Do not know how to lower this BUILD_VECTOR");
-      }
+      RegClassID = selectSGPRVectorRegClassID(NumVectorElts);
     } else {
       // BUILD_VECTOR was lowered into an IMPLICIT_DEF + 4 INSERT_SUBREG
       // that adds a 128 bits reg copy when going through TwoAddressInstructions
@@ -455,98 +455,12 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
     return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL,
                                   N->getValueType(0), Ops);
   }
-
-  case ISD::LOAD: {
-    LoadSDNode *LD = cast<LoadSDNode>(N);
-    SDLoc SL(N);
-    EVT VT = N->getValueType(0);
-
-    if (VT != MVT::i64 || LD->getExtensionType() != ISD::NON_EXTLOAD) {
-      N = glueCopyToM0(N);
-      break;
-    }
-
-    // To simplify the TableGen patters, we replace all i64 loads with
-    // v2i32 loads.  Alternatively, we could promote i64 loads to v2i32
-    // during DAG legalization, however, so places (ExpandUnalignedLoad)
-    // in the DAG legalizer assume that if i64 is legal, so doing this
-    // promotion early can cause problems.
-
-    SDValue NewLoad = CurDAG->getLoad(MVT::v2i32, SDLoc(N), LD->getChain(),
-                                      LD->getBasePtr(), LD->getMemOperand());
-    SDValue BitCast = CurDAG->getNode(ISD::BITCAST, SL,
-                                      MVT::i64, NewLoad);
-    CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 1), NewLoad.getValue(1));
-    CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), BitCast);
-    SDNode *Load = glueCopyToM0(NewLoad.getNode());
-    SelectCode(Load);
-    N = BitCast.getNode();
-    break;
-  }
-
+  case ISD::LOAD:
   case ISD::STORE: {
-    // Handle i64 stores here for the same reason mentioned above for loads.
-    StoreSDNode *ST = cast<StoreSDNode>(N);
-    SDValue Value = ST->getValue();
-    if (Value.getValueType() == MVT::i64 && !ST->isTruncatingStore()) {
-
-      SDValue NewValue = CurDAG->getNode(ISD::BITCAST, SDLoc(N),
-                                        MVT::v2i32, Value);
-      SDValue NewStore = CurDAG->getStore(ST->getChain(), SDLoc(N), NewValue,
-                                          ST->getBasePtr(), ST->getMemOperand());
-
-      CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewStore);
-
-      if (NewValue.getOpcode() == ISD::BITCAST) {
-        Select(NewStore.getNode());
-        return SelectCode(NewValue.getNode());
-      }
-
-      // getNode() may fold the bitcast if its input was another bitcast.  If that
-      // happens we should only select the new store.
-      N = NewStore.getNode();
-    }
-
     N = glueCopyToM0(N);
     break;
   }
 
-  case AMDGPUISD::REGISTER_LOAD: {
-    if (Subtarget->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS)
-      break;
-    SDValue Addr, Offset;
-
-    SDLoc DL(N);
-    SelectADDRIndirect(N->getOperand(1), Addr, Offset);
-    const SDValue Ops[] = {
-      Addr,
-      Offset,
-      CurDAG->getTargetConstant(0, DL, MVT::i32),
-      N->getOperand(0),
-    };
-    return CurDAG->getMachineNode(AMDGPU::SI_RegisterLoad, DL,
-                                  CurDAG->getVTList(MVT::i32, MVT::i64,
-                                                    MVT::Other),
-                                  Ops);
-  }
-  case AMDGPUISD::REGISTER_STORE: {
-    if (Subtarget->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS)
-      break;
-    SDValue Addr, Offset;
-    SelectADDRIndirect(N->getOperand(2), Addr, Offset);
-    SDLoc DL(N);
-    const SDValue Ops[] = {
-      N->getOperand(1),
-      Addr,
-      Offset,
-      CurDAG->getTargetConstant(0, DL, MVT::i32),
-      N->getOperand(0),
-    };
-    return CurDAG->getMachineNode(AMDGPU::SI_RegisterStorePseudo, DL,
-                                  CurDAG->getVTList(MVT::Other),
-                                  Ops);
-  }
-
   case AMDGPUISD::BFE_I32:
   case AMDGPUISD::BFE_U32: {
     if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS)
@@ -575,7 +489,6 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
 
     return getS_BFE(Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32, SDLoc(N),
                     N->getOperand(0), OffsetVal, WidthVal);
-
   }
   case AMDGPUISD::DIV_SCALE: {
     return SelectDIV_SCALE(N);
@@ -601,7 +514,6 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
   return SelectCode(N);
 }
 
-
 bool AMDGPUDAGToDAGISel::checkType(const Value *Ptr, unsigned AS) {
   assert(AS != 0 && "Use checkPrivateAddress instead.");
   if (!Ptr)
@@ -681,7 +593,7 @@ bool AMDGPUDAGToDAGISel::isCPLoad(const LoadSDNode *N) const {
   if (checkPrivateAddress(N->getMemOperand())) {
     if (MMO) {
       const PseudoSourceValue *PSV = MMO->getPseudoValue();
-      if (PSV && PSV == PseudoSourceValue::getConstantPool()) {
+      if (PSV && PSV->isConstantPool()) {
         return true;
       }
     }
@@ -847,7 +759,8 @@ SDNode *AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
   unsigned Opc
     = (VT == MVT::f64) ? AMDGPU::V_DIV_SCALE_F64 : AMDGPU::V_DIV_SCALE_F32;
 
-  // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp, omod
+  // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp,
+  // omod
   SDValue Ops[8];
 
   SelectVOP3Mods0(N->getOperand(0), Ops[1], Ops[0], Ops[6], Ops[7]);
@@ -883,15 +796,39 @@ bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
       Offset = N1;
       return true;
     }
-  }
+  } else if (Addr.getOpcode() == ISD::SUB) {
+    // sub C, x -> add (sub 0, x), C
+    if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
+      int64_t ByteOffset = C->getSExtValue();
+      if (isUInt<16>(ByteOffset)) {
+        SDLoc DL(Addr);
+        SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
+
+        // XXX - This is kind of hacky. Create a dummy sub node so we can check
+        // the known bits in isDSOffsetLegal. We need to emit the selected node
+        // here, so this is thrown away.
+        SDValue Sub = CurDAG->getNode(ISD::SUB, DL, MVT::i32,
+                                      Zero, Addr.getOperand(1));
+
+        if (isDSOffsetLegal(Sub, ByteOffset, 16)) {
+          MachineSDNode *MachineSub
+            = CurDAG->getMachineNode(AMDGPU::V_SUB_I32_e32, DL, MVT::i32,
+                                     Zero, Addr.getOperand(1));
+
+          Base = SDValue(MachineSub, 0);
+          Offset = Addr.getOperand(0);
+          return true;
+        }
+      }
+    }
+  } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
+    // If we have a constant address, prefer to put the constant into the
+    // offset. This can save moves to load the constant address since multiple
+    // operations can share the zero base address register, and enables merging
+    // into read2 / write2 instructions.
 
-  SDLoc DL(Addr);
+    SDLoc DL(Addr);
 
-  // If we have a constant address, prefer to put the constant into the
-  // offset. This can save moves to load the constant address since multiple
-  // operations can share the zero base address register, and enables merging
-  // into read2 / write2 instructions.
-  if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
     if (isUInt<16>(CAddr->getZExtValue())) {
       SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
       MachineSDNode *MovZero = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
@@ -904,10 +841,11 @@ bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
 
   // default case
   Base = Addr;
-  Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
+  Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i16);
   return true;
 }
 
+// TODO: If offset is too big, put low 16-bit into offset.
 bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base,
                                                    SDValue &Offset0,
                                                    SDValue &Offset1) const {
@@ -926,9 +864,35 @@ bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base,
       Offset1 = CurDAG->getTargetConstant(DWordOffset1, DL, MVT::i8);
       return true;
     }
-  }
-
-  if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
+  } else if (Addr.getOpcode() == ISD::SUB) {
+    // sub C, x -> add (sub 0, x), C
+    if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
+      unsigned DWordOffset0 = C->getZExtValue() / 4;
+      unsigned DWordOffset1 = DWordOffset0 + 1;
+
+      if (isUInt<8>(DWordOffset0)) {
+        SDLoc DL(Addr);
+        SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
+
+        // XXX - This is kind of hacky. Create a dummy sub node so we can check
+        // the known bits in isDSOffsetLegal. We need to emit the selected node
+        // here, so this is thrown away.
+        SDValue Sub = CurDAG->getNode(ISD::SUB, DL, MVT::i32,
+                                      Zero, Addr.getOperand(1));
+
+        if (isDSOffsetLegal(Sub, DWordOffset1, 8)) {
+          MachineSDNode *MachineSub
+            = CurDAG->getMachineNode(AMDGPU::V_SUB_I32_e32, DL, MVT::i32,
+                                     Zero, Addr.getOperand(1));
+
+          Base = SDValue(MachineSub, 0);
+          Offset0 = CurDAG->getTargetConstant(DWordOffset0, DL, MVT::i8);
+          Offset1 = CurDAG->getTargetConstant(DWordOffset1, DL, MVT::i8);
+          return true;
+        }
+      }
+    }
+  } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
     unsigned DWordOffset0 = CAddr->getZExtValue() / 4;
     unsigned DWordOffset1 = DWordOffset0 + 1;
     assert(4 * DWordOffset0 == CAddr->getZExtValue());
@@ -956,12 +920,16 @@ static bool isLegalMUBUFImmOffset(const ConstantSDNode *Imm) {
   return isUInt<12>(Imm->getZExtValue());
 }
 
-void AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr,
+bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr,
                                      SDValue &VAddr, SDValue &SOffset,
                                      SDValue &Offset, SDValue &Offen,
                                      SDValue &Idxen, SDValue &Addr64,
                                      SDValue &GLC, SDValue &SLC,
                                      SDValue &TFE) const {
+  // Subtarget prefers to use flat instruction
+  if (Subtarget->useFlatForGlobal())
+    return false;
+
   SDLoc DL(Addr);
 
   GLC = CurDAG->getTargetConstant(0, DL, MVT::i1);
@@ -994,14 +962,14 @@ void AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr,
 
       if (isLegalMUBUFImmOffset(C1)) {
         Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
-        return;
+        return true;
       } else if (isUInt<32>(C1->getZExtValue())) {
         // Illegal offset, store it in soffset.
         Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
         SOffset = SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
                    CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32)),
                           0);
-        return;
+        return true;
       }
     }
 
@@ -1013,7 +981,7 @@ void AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr,
       Ptr = N0;
       VAddr = N1;
       Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
-      return;
+      return true;
     }
 
   // default case -> offset
@@ -1021,6 +989,7 @@ void AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr,
   Ptr = Addr;
   Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
 
+  return true;
 }
 
 bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
@@ -1033,8 +1002,9 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
   if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
     return false;
 
-  SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64,
-              GLC, SLC, TFE);
+  if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64,
+                   GLC, SLC, TFE))
+    return false;
 
   ConstantSDNode *C = cast<ConstantSDNode>(Addr64);
   if (C->getSExtValue()) {
@@ -1052,8 +1022,8 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
 
 bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
                                            SDValue &VAddr, SDValue &SOffset,
-                                          SDValue &Offset,
-                                          SDValue &SLC) const {
+                                           SDValue &Offset,
+                                           SDValue &SLC) const {
   SLC = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i1);
 
   SDValue GLC, TFE;
@@ -1066,36 +1036,10 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc,
 
   SDLoc DL(Addr);
   MachineFunction &MF = CurDAG->getMachineFunction();
-  const SIRegisterInfo *TRI =
-      static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
-  MachineRegisterInfo &MRI = MF.getRegInfo();
-  const SITargetLowering& Lowering =
-    *static_cast<const SITargetLowering*>(getTargetLowering());
-
-  unsigned ScratchOffsetReg =
-      TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_WAVE_OFFSET);
-  Lowering.CreateLiveInRegister(*CurDAG, &AMDGPU::SReg_32RegClass,
-                                ScratchOffsetReg, MVT::i32);
-  SDValue Sym0 = CurDAG->getExternalSymbol("SCRATCH_RSRC_DWORD0", MVT::i32);
-  SDValue ScratchRsrcDword0 =
-      SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, Sym0), 0);
+  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
 
-  SDValue Sym1 = CurDAG->getExternalSymbol("SCRATCH_RSRC_DWORD1", MVT::i32);
-  SDValue ScratchRsrcDword1 =
-      SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, Sym1), 0);
-
-  const SDValue RsrcOps[] = {
-      CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
-      ScratchRsrcDword0,
-      CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
-      ScratchRsrcDword1,
-      CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
-  };
-  SDValue ScratchPtr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
-                                              MVT::v2i32, RsrcOps), 0);
-  Rsrc = SDValue(Lowering.buildScratchRSRC(*CurDAG, DL, ScratchPtr), 0);
-  SOffset = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL,
-      MRI.getLiveInVirtReg(ScratchOffsetReg), MVT::i32);
+  Rsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
+  SOffset = CurDAG->getRegister(Info->getScratchWaveOffsetReg(), MVT::i32);
 
   // (add n0, c1)
   if (CurDAG->isBaseWithConstantOffset(Addr)) {
@@ -1126,8 +1070,9 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
   const SIInstrInfo *TII =
     static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
 
-  SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64,
-              GLC, SLC, TFE);
+  if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64,
+                   GLC, SLC, TFE))
+    return false;
 
   if (!cast<ConstantSDNode>(Offen)->getSExtValue() &&
       !cast<ConstantSDNode>(Idxen)->getSExtValue() &&
@@ -1153,18 +1098,134 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
   return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE);
 }
 
+///
+/// \param EncodedOffset This is the immediate value that will be encoded
+///        directly into the instruction. On SI/CI the \p EncodedOffset
+///        will be in units of dwords and on VI+ it will be units of bytes.
+static bool isLegalSMRDImmOffset(const AMDGPUSubtarget *ST,
+                                 int64_t EncodedOffset) {
+  return ST->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS ?
+    isUInt<8>(EncodedOffset) : isUInt<20>(EncodedOffset);
+}
+
+bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
+                                          SDValue &Offset, bool &Imm) const {
+
+  // FIXME: Handle non-constant offsets.
+  ConstantSDNode *C = dyn_cast<ConstantSDNode>(ByteOffsetNode);
+  if (!C)
+    return false;
+
+  SDLoc SL(ByteOffsetNode);
+  AMDGPUSubtarget::Generation Gen = Subtarget->getGeneration();
+  int64_t ByteOffset = C->getSExtValue();
+  int64_t EncodedOffset = Gen < AMDGPUSubtarget::VOLCANIC_ISLANDS ?
+    ByteOffset >> 2 : ByteOffset;
+
+  if (isLegalSMRDImmOffset(Subtarget, EncodedOffset)) {
+    Offset = CurDAG->getTargetConstant(EncodedOffset, SL, MVT::i32);
+    Imm = true;
+    return true;
+  }
+
+  if (!isUInt<32>(EncodedOffset) || !isUInt<32>(ByteOffset))
+    return false;
+
+  if (Gen == AMDGPUSubtarget::SEA_ISLANDS && isUInt<32>(EncodedOffset)) {
+    // 32-bit Immediates are supported on Sea Islands.
+    Offset = CurDAG->getTargetConstant(EncodedOffset, SL, MVT::i32);
+  } else {
+    SDValue C32Bit = CurDAG->getTargetConstant(ByteOffset, SL, MVT::i32);
+    Offset = SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32,
+                                            C32Bit), 0);
+  }
+  Imm = false;
+  return true;
+}
+
+bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase,
+                                    SDValue &Offset, bool &Imm) const {
+
+  SDLoc SL(Addr);
+  if (CurDAG->isBaseWithConstantOffset(Addr)) {
+    SDValue N0 = Addr.getOperand(0);
+    SDValue N1 = Addr.getOperand(1);
+
+    if (SelectSMRDOffset(N1, Offset, Imm)) {
+      SBase = N0;
+      return true;
+    }
+  }
+  SBase = Addr;
+  Offset = CurDAG->getTargetConstant(0, SL, MVT::i32);
+  Imm = true;
+  return true;
+}
+
+bool AMDGPUDAGToDAGISel::SelectSMRDImm(SDValue Addr, SDValue &SBase,
+                                       SDValue &Offset) const {
+  bool Imm;
+  return SelectSMRD(Addr, SBase, Offset, Imm) && Imm;
+}
+
+bool AMDGPUDAGToDAGISel::SelectSMRDImm32(SDValue Addr, SDValue &SBase,
+                                         SDValue &Offset) const {
+
+  if (Subtarget->getGeneration() != AMDGPUSubtarget::SEA_ISLANDS)
+    return false;
+
+  bool Imm;
+  if (!SelectSMRD(Addr, SBase, Offset, Imm))
+    return false;
+
+  return !Imm && isa<ConstantSDNode>(Offset);
+}
+
+bool AMDGPUDAGToDAGISel::SelectSMRDSgpr(SDValue Addr, SDValue &SBase,
+                                        SDValue &Offset) const {
+  bool Imm;
+  return SelectSMRD(Addr, SBase, Offset, Imm) && !Imm &&
+         !isa<ConstantSDNode>(Offset);
+}
+
+bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm(SDValue Addr,
+                                             SDValue &Offset) const {
+  bool Imm;
+  return SelectSMRDOffset(Addr, Offset, Imm) && Imm;
+}
+
+bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm32(SDValue Addr,
+                                               SDValue &Offset) const {
+  if (Subtarget->getGeneration() != AMDGPUSubtarget::SEA_ISLANDS)
+    return false;
+
+  bool Imm;
+  if (!SelectSMRDOffset(Addr, Offset, Imm))
+    return false;
+
+  return !Imm && isa<ConstantSDNode>(Offset);
+}
+
+bool AMDGPUDAGToDAGISel::SelectSMRDBufferSgpr(SDValue Addr,
+                                              SDValue &Offset) const {
+  bool Imm;
+  return SelectSMRDOffset(Addr, Offset, Imm) && !Imm &&
+         !isa<ConstantSDNode>(Offset);
+}
+
 // FIXME: This is incorrect and only enough to be able to compile.
 SDNode *AMDGPUDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
   AddrSpaceCastSDNode *ASC = cast<AddrSpaceCastSDNode>(N);
   SDLoc DL(N);
 
+  const MachineFunction &MF = CurDAG->getMachineFunction();
+  DiagnosticInfoUnsupported NotImplemented(*MF.getFunction(),
+                                           "addrspacecast not implemented");
+  CurDAG->getContext()->diagnose(NotImplemented);
+
   assert(Subtarget->hasFlatAddressSpace() &&
          "addrspacecast only supported with flat address space!");
 
-  assert((ASC->getSrcAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS &&
-          ASC->getDestAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS) &&
-         "Cannot cast address space to / from constant address!");
-
   assert((ASC->getSrcAddressSpace() == AMDGPUAS::FLAT_ADDRESS ||
           ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS) &&
          "Can only cast to / from flat address space!");
@@ -1190,7 +1251,6 @@ SDNode *AMDGPUDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
                    CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32));
   }
 
-
   if (DestSize > SrcSize) {
     assert(SrcSize == 32 && DestSize == 64);
 
@@ -1371,6 +1431,65 @@ bool AMDGPUDAGToDAGISel::SelectVOP3Mods0Clamp0OMod(SDValue In, SDValue &Src,
   return SelectVOP3Mods(In, Src, SrcMods);
 }
 
+void AMDGPUDAGToDAGISel::PreprocessISelDAG() {
+  bool Modified = false;
+
+  // XXX - Other targets seem to be able to do this without a worklist.
+  SmallVector<LoadSDNode *, 8> LoadsToReplace;
+  SmallVector<StoreSDNode *, 8> StoresToReplace;
+
+  for (SDNode &Node : CurDAG->allnodes()) {
+    if (LoadSDNode *LD = dyn_cast<LoadSDNode>(&Node)) {
+      EVT VT = LD->getValueType(0);
+      if (VT != MVT::i64 || LD->getExtensionType() != ISD::NON_EXTLOAD)
+        continue;
+
+      // To simplify the TableGen patters, we replace all i64 loads with v2i32
+      // loads.  Alternatively, we could promote i64 loads to v2i32 during DAG
+      // legalization, however, so places (ExpandUnalignedLoad) in the DAG
+      // legalizer assume that if i64 is legal, so doing this promotion early
+      // can cause problems.
+      LoadsToReplace.push_back(LD);
+    } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(&Node)) {
+      // Handle i64 stores here for the same reason mentioned above for loads.
+      SDValue Value = ST->getValue();
+      if (Value.getValueType() != MVT::i64 || ST->isTruncatingStore())
+        continue;
+      StoresToReplace.push_back(ST);
+    }
+  }
+
+  for (LoadSDNode *LD : LoadsToReplace) {
+    SDLoc SL(LD);
+
+    SDValue NewLoad = CurDAG->getLoad(MVT::v2i32, SL, LD->getChain(),
+                                      LD->getBasePtr(), LD->getMemOperand());
+    SDValue BitCast = CurDAG->getNode(ISD::BITCAST, SL,
+                                      MVT::i64, NewLoad);
+    CurDAG->ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLoad.getValue(1));
+    CurDAG->ReplaceAllUsesOfValueWith(SDValue(LD, 0), BitCast);
+    Modified = true;
+  }
+
+  for (StoreSDNode *ST : StoresToReplace) {
+    SDValue NewValue = CurDAG->getNode(ISD::BITCAST, SDLoc(ST),
+                                       MVT::v2i32, ST->getValue());
+    const SDValue StoreOps[] = {
+      ST->getChain(),
+      NewValue,
+      ST->getBasePtr(),
+      ST->getOffset()
+    };
+
+    CurDAG->UpdateNodeOperands(ST, StoreOps);
+    Modified = true;
+  }
+
+  // XXX - Is this necessary?
+  if (Modified)
+    CurDAG->RemoveDeadNodes();
+}
+
 void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
   const AMDGPUTargetLowering& Lowering =
       *static_cast<const AMDGPUTargetLowering*>(getTargetLowering());
```
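A standalone illustration of the offset-encoding rule documented on isLegalSMRDImmOffset above (the helper function is illustrative, not in the patch): SI/CI encode the SMRD immediate in units of dwords with an 8-bit field, while VI+ encodes it in bytes with a 20-bit field, so the same byte offset encodes and legalizes differently per generation.

```cpp
#include <cstdint>

// Mirrors SelectSMRDOffset's encoding step, illustrative only.
// SI/CI: units of dwords (ByteOffset >> 2), legal if it fits in 8 bits.
// VI+:   units of bytes, legal if it fits in 20 bits.
static int64_t encodeSMRDOffset(bool IsVIPlus, int64_t ByteOffset) {
  return IsVIPlus ? ByteOffset : ByteOffset >> 2;
}

// e.g. ByteOffset = 512 encodes as 128 on SI/CI (fits in 8 bits) and as 512
// on VI (fits in 20 bits). ByteOffset = 2048 encodes as 512 on SI/CI, which
// no longer fits in 8 bits, so CI falls back to its 32-bit immediate form
// and SI materializes the offset with an S_MOV_B32 instead.
```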
+ SmallVector<LoadSDNode *, 8> LoadsToReplace; + SmallVector<StoreSDNode *, 8> StoresToReplace; + + for (SDNode &Node : CurDAG->allnodes()) { + if (LoadSDNode *LD = dyn_cast<LoadSDNode>(&Node)) { + EVT VT = LD->getValueType(0); + if (VT != MVT::i64 || LD->getExtensionType() != ISD::NON_EXTLOAD) + continue; + + // To simplify the TableGen patters, we replace all i64 loads with v2i32 + // loads. Alternatively, we could promote i64 loads to v2i32 during DAG + // legalization, however, so places (ExpandUnalignedLoad) in the DAG + // legalizer assume that if i64 is legal, so doing this promotion early + // can cause problems. + LoadsToReplace.push_back(LD); + } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(&Node)) { + // Handle i64 stores here for the same reason mentioned above for loads. + SDValue Value = ST->getValue(); + if (Value.getValueType() != MVT::i64 || ST->isTruncatingStore()) + continue; + StoresToReplace.push_back(ST); + } + } + + for (LoadSDNode *LD : LoadsToReplace) { + SDLoc SL(LD); + + SDValue NewLoad = CurDAG->getLoad(MVT::v2i32, SL, LD->getChain(), + LD->getBasePtr(), LD->getMemOperand()); + SDValue BitCast = CurDAG->getNode(ISD::BITCAST, SL, + MVT::i64, NewLoad); + CurDAG->ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLoad.getValue(1)); + CurDAG->ReplaceAllUsesOfValueWith(SDValue(LD, 0), BitCast); + Modified = true; + } + + for (StoreSDNode *ST : StoresToReplace) { + SDValue NewValue = CurDAG->getNode(ISD::BITCAST, SDLoc(ST), + MVT::v2i32, ST->getValue()); + const SDValue StoreOps[] = { + ST->getChain(), + NewValue, + ST->getBasePtr(), + ST->getOffset() + }; + + CurDAG->UpdateNodeOperands(ST, StoreOps); + Modified = true; + } + + // XXX - Is this necessary? + if (Modified) + CurDAG->RemoveDeadNodes(); +} + void AMDGPUDAGToDAGISel::PostprocessISelDAG() { const AMDGPUTargetLowering& Lowering = *static_cast<const AMDGPUTargetLowering*>(getTargetLowering()); diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 3a65f3b..222f631 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -15,6 +15,7 @@ #include "AMDGPUISelLowering.h" #include "AMDGPU.h" +#include "AMDGPUDiagnosticInfoUnsupported.h" #include "AMDGPUFrameLowering.h" #include "AMDGPUIntrinsicInfo.h" #include "AMDGPURegisterInfo.h" @@ -27,50 +28,9 @@ #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" #include "llvm/IR/DataLayout.h" -#include "llvm/IR/DiagnosticInfo.h" -#include "llvm/IR/DiagnosticPrinter.h" using namespace llvm; -namespace { - -/// Diagnostic information for unimplemented or unsupported feature reporting. 
-class DiagnosticInfoUnsupported : public DiagnosticInfo { -private: - const Twine &Description; - const Function &Fn; - - static int KindID; - - static int getKindID() { - if (KindID == 0) - KindID = llvm::getNextAvailablePluginDiagnosticKind(); - return KindID; - } - -public: - DiagnosticInfoUnsupported(const Function &Fn, const Twine &Desc, - DiagnosticSeverity Severity = DS_Error) - : DiagnosticInfo(getKindID(), Severity), - Description(Desc), - Fn(Fn) { } - - const Function &getFunction() const { return Fn; } - const Twine &getDescription() const { return Description; } - - void print(DiagnosticPrinter &DP) const override { - DP << "unsupported " << getDescription() << " in " << Fn.getName(); - } - - static bool classof(const DiagnosticInfo *DI) { - return DI->getKind() == getKindID(); - } -}; - -int DiagnosticInfoUnsupported::KindID = 0; -} - - static bool allocateStack(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State) { @@ -113,6 +73,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM, setOperationAction(ISD::BR_JT, MVT::Other, Expand); setOperationAction(ISD::BRIND, MVT::Other, Expand); + // This is totally unsupported, just custom lower to produce an error. + setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom); + // We need to custom lower some of the intrinsics setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); @@ -352,7 +315,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM, setOperationAction(ISD::SMUL_LOHI, VT, Expand); setOperationAction(ISD::UMUL_LOHI, VT, Expand); setOperationAction(ISD::SDIVREM, VT, Custom); - setOperationAction(ISD::UDIVREM, VT, Custom); + setOperationAction(ISD::UDIVREM, VT, Expand); setOperationAction(ISD::ADDC, VT, Expand); setOperationAction(ISD::SUBC, VT, Expand); setOperationAction(ISD::ADDE, VT, Expand); @@ -429,12 +392,18 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM, setSelectIsExpensive(false); PredictableSelectIsExpensive = false; - // There are no integer divide instructions, and these expand to a pretty - // large sequence of instructions. - setIntDivIsCheap(false); - setPow2SDivIsCheap(false); setFsqrtIsCheap(true); + // We want to find all load dependencies for long chains of stores to enable + // merging into very wide vectors. The problem is with vectors with > 4 + // elements. MergeConsecutiveStores will attempt to merge these because x8/x16 + // vectors are a legal type, even though we have to split the loads + // usually. When we can more precisely specify load legality per address + // space, we should be able to make FindBetterChain/MergeConsecutiveStores + // smarter so that they can figure out what to do in 2 iterations without all + // N > 4 stores on the same chain. + GatherAllAliasesMaxDepth = 16; + // FIXME: Need to really handle these. MaxStoresPerMemcpy = 4096; MaxStoresPerMemmove = 4096; @@ -534,6 +503,18 @@ bool AMDGPUTargetLowering:: storeOfVectorConstantIsCheap(EVT MemVT, return true; } +bool AMDGPUTargetLowering::aggressivelyPreferBuildVectorSources(EVT VecVT) const { + // There are few operations which truly have vector input operands. Any vector + // operation is going to involve operations on each component, and a + // build_vector will be a copy per element, so it always makes sense to use a + // build_vector input in place of the extracted element to avoid a copy into a + // super register. 
+ // + // We should probably only do this if all users are extracts only, but this + // should be the common case. + return true; +} + bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const { // Truncate is just accessing a subregister. return Dest.bitsLT(Source) && (Dest.getSizeInBits() % 32 == 0); @@ -617,6 +598,15 @@ SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI, return SDValue(); } +SDValue AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, + SelectionDAG &DAG) const { + const Function &Fn = *DAG.getMachineFunction().getFunction(); + + DiagnosticInfoUnsupported NoDynamicAlloca(Fn, "dynamic alloca"); + DAG.getContext()->diagnose(NoDynamicAlloca); + return SDValue(); +} + SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { @@ -643,6 +633,7 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG); case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG); + case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); } return Op; } @@ -892,7 +883,9 @@ SDValue AMDGPUTargetLowering::LowerFrameIndex(SDValue Op, FrameIndexSDNode *FIN = cast<FrameIndexSDNode>(Op); unsigned FrameIndex = FIN->getIndex(); - unsigned Offset = TFL->getFrameIndexOffset(MF, FrameIndex); + unsigned IgnoredFrameReg; + unsigned Offset = + TFL->getFrameIndexReference(MF, FrameIndex, IgnoredFrameReg); return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF), SDLoc(Op), Op.getValueType()); } @@ -1043,9 +1036,6 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, Op.getOperand(1), Op.getOperand(2)); - case AMDGPUIntrinsic::AMDGPU_brev: - return DAG.getNode(AMDGPUISD::BREV, DL, VT, Op.getOperand(1)); - case Intrinsic::AMDGPU_class: return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT, Op.getOperand(1), Op.getOperand(2)); @@ -1057,6 +1047,8 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getNode(ISD::FRINT, DL, VT, Op.getOperand(1)); case AMDGPUIntrinsic::AMDGPU_trunc: // Legacy name. return DAG.getNode(ISD::FTRUNC, DL, VT, Op.getOperand(1)); + case AMDGPUIntrinsic::AMDGPU_brev: // Legacy name + return DAG.getNode(ISD::BITREVERSE, DL, VT, Op.getOperand(1)); } } @@ -1077,6 +1069,7 @@ SDValue AMDGPUTargetLowering::LowerIntrinsicLRP(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); EVT VT = Op.getValueType(); + // TODO: Should this propagate fast-math-flags? SDValue OneSubA = DAG.getNode(ISD::FSUB, DL, VT, DAG.getConstantFP(1.0f, DL, MVT::f32), Op.getOperand(1)); @@ -1167,45 +1160,6 @@ SDValue AMDGPUTargetLowering::CombineFMinMaxLegacy(SDLoc DL, return SDValue(); } -// FIXME: Remove this when combines added to DAGCombiner. -SDValue AMDGPUTargetLowering::CombineIMinMax(SDLoc DL, - EVT VT, - SDValue LHS, - SDValue RHS, - SDValue True, - SDValue False, - SDValue CC, - SelectionDAG &DAG) const { - if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True)) - return SDValue(); - - ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get(); - switch (CCOpcode) { - case ISD::SETULE: - case ISD::SETULT: { - unsigned Opc = (LHS == True) ? ISD::UMIN : ISD::UMAX; - return DAG.getNode(Opc, DL, VT, LHS, RHS); - } - case ISD::SETLE: - case ISD::SETLT: { - unsigned Opc = (LHS == True) ? ISD::SMIN : ISD::SMAX; - return DAG.getNode(Opc, DL, VT, LHS, RHS); - } - case ISD::SETGT: - case ISD::SETGE: { - unsigned Opc = (LHS == True) ? 
ISD::SMAX : ISD::SMIN; - return DAG.getNode(Opc, DL, VT, LHS, RHS); - } - case ISD::SETUGE: - case ISD::SETUGT: { - unsigned Opc = (LHS == True) ? ISD::UMAX : ISD::UMIN; - return DAG.getNode(Opc, DL, VT, LHS, RHS); - } - default: - return SDValue(); - } -} - SDValue AMDGPUTargetLowering::ScalarizeVectorLoad(const SDValue Op, SelectionDAG &DAG) const { LoadSDNode *Load = cast<LoadSDNode>(Op); @@ -1260,7 +1214,8 @@ SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op, EVT PtrVT = BasePtr.getValueType(); EVT MemVT = Load->getMemoryVT(); SDLoc SL(Op); - MachinePointerInfo SrcValue(Load->getMemOperand()->getValue()); + + const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo(); EVT LoVT, HiVT; EVT LoMemVT, HiMemVT; @@ -1269,23 +1224,27 @@ SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op, std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT); std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemVT); std::tie(Lo, Hi) = DAG.SplitVector(Op, SL, LoVT, HiVT); + + unsigned Size = LoMemVT.getStoreSize(); + unsigned BaseAlign = Load->getAlignment(); + unsigned HiAlign = MinAlign(BaseAlign, Size); + SDValue LoLoad = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT, Load->getChain(), BasePtr, SrcValue, LoMemVT, Load->isVolatile(), Load->isNonTemporal(), - Load->isInvariant(), Load->getAlignment()); + Load->isInvariant(), BaseAlign); SDValue HiPtr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr, - DAG.getConstant(LoMemVT.getStoreSize(), SL, - PtrVT)); + DAG.getConstant(Size, SL, PtrVT)); SDValue HiLoad = DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, Load->getChain(), HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()), HiMemVT, Load->isVolatile(), Load->isNonTemporal(), - Load->isInvariant(), Load->getAlignment()); + Load->isInvariant(), HiAlign); SDValue Ops[] = { DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad), @@ -1415,7 +1374,11 @@ SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op, DAG.getConstant(LoMemVT.getStoreSize(), SL, PtrVT)); - MachinePointerInfo SrcValue(Store->getMemOperand()->getValue()); + const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo(); + unsigned BaseAlign = Store->getAlignment(); + unsigned Size = LoMemVT.getStoreSize(); + unsigned HiAlign = MinAlign(BaseAlign, Size); + SDValue LoStore = DAG.getTruncStore(Chain, SL, Lo, BasePtr, @@ -1423,15 +1386,15 @@ SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op, LoMemVT, Store->isNonTemporal(), Store->isVolatile(), - Store->getAlignment()); + BaseAlign); SDValue HiStore = DAG.getTruncStore(Chain, SL, Hi, HiPtr, - SrcValue.getWithOffset(LoMemVT.getStoreSize()), + SrcValue.getWithOffset(Size), HiMemVT, Store->isNonTemporal(), Store->isVolatile(), - Store->getAlignment()); + HiAlign); return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore); } @@ -1529,7 +1492,7 @@ SDValue AMDGPUTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { if ((Store->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS || Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) && Store->getValue().getValueType().isVector()) { - return ScalarizeVectorStore(Op, DAG); + return SplitVectorStore(Op, DAG); } EVT MemVT = Store->getMemoryVT(); @@ -1630,6 +1593,7 @@ SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool // float fb = (float)ib; SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib); + // TODO: Should this propagate fast-math-flags? 
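// [Editorial sketch; not part of the patch.] A host-side scalar model of the
// reciprocal-based division that the surrounding comments walk through,
// assuming b != 0, both operands fit in 24 bits, and the reciprocal estimate
// leaves the initial quotient off by at most one:
#include <cstdint>
static void udivrem24(uint32_t a, uint32_t b, uint32_t &q, uint32_t &r) {
  // fq = fa * rcp(fb); 24-bit integers are exactly representable in float.
  uint32_t iq = (uint32_t)((float)a * (1.0f / (float)b));
  int64_t rem = (int64_t)a - (int64_t)iq * b;
  if (rem < 0)                { --iq; rem += b; } // estimate was one too high
  else if (rem >= (int64_t)b) { ++iq; rem -= b; } // estimate was one too low
  q = iq; r = (uint32_t)rem;
}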
// float fq = native_divide(fa, fb); SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT, fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb)); @@ -1940,6 +1904,8 @@ SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const { SDValue X = Op.getOperand(0); SDValue Y = Op.getOperand(1); + // TODO: Should this propagate fast-math-flags? + SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y); SDValue Floor = DAG.getNode(ISD::FTRUNC, SL, VT, Div); SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, Floor, Y); @@ -1968,6 +1934,7 @@ SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const { SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc); SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero); + // TODO: Should this propagate fast-math-flags? return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add); } @@ -2045,6 +2012,8 @@ SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const { SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64); SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src); + // TODO: Should this propagate fast-math-flags? + SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign); SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign); @@ -2074,6 +2043,8 @@ SDValue AMDGPUTargetLowering::LowerFROUND32(SDValue Op, SelectionDAG &DAG) const SDValue T = DAG.getNode(ISD::FTRUNC, SL, MVT::f32, X); + // TODO: Should this propagate fast-math-flags? + SDValue Diff = DAG.getNode(ISD::FSUB, SL, MVT::f32, X, T); SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, MVT::f32, Diff); @@ -2184,6 +2155,7 @@ SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const { SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc); SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero); + // TODO: Should this propagate fast-math-flags? return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add); } @@ -2206,7 +2178,7 @@ SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, SDValue LdExp = DAG.getNode(AMDGPUISD::LDEXP, SL, MVT::f64, CvtHi, DAG.getConstant(32, SL, MVT::i32)); - + // TODO: Should this propagate fast-math-flags? return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo); } @@ -2231,6 +2203,7 @@ SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op, SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, S0, DAG.getConstant(1, DL, MVT::i32)); SDValue FloatHi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, Hi); + // TODO: Should this propagate fast-math-flags? FloatHi = DAG.getNode(ISD::FMUL, DL, MVT::f32, FloatHi, DAG.getConstantFP(4294967296.0f, DL, MVT::f32)); // 2^32 return DAG.getNode(ISD::FADD, DL, MVT::f32, FloatLo, FloatHi); @@ -2257,7 +2230,7 @@ SDValue AMDGPUTargetLowering::LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG, MVT::f64); SDValue K1 = DAG.getConstantFP(BitsToDouble(UINT64_C(0xc1f0000000000000)), SL, MVT::f64); - + // TODO: Should this propagate fast-math-flags? SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, Trunc, K0); SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, MVT::f64, Mul); @@ -2511,12 +2484,6 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, if (VT == MVT::f32) return CombineFMinMaxLegacy(DL, VT, LHS, RHS, True, False, CC, DCI); - - // TODO: Implement min / max Evergreen instructions. 
- if (VT == MVT::i32 && - Subtarget->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) { - return CombineIMinMax(DL, VT, LHS, RHS, True, False, CC, DAG); - } } break; @@ -2652,20 +2619,14 @@ bool AMDGPUTargetLowering::isHWTrueValue(SDValue Op) const { if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) { return CFP->isExactlyValue(1.0); } - if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { - return C->isAllOnesValue(); - } - return false; + return isAllOnesConstant(Op); } bool AMDGPUTargetLowering::isHWFalseValue(SDValue Op) const { if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) { return CFP->getValueAPF().isZero(); } - if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { - return C->isNullValue(); - } - return false; + return isNullConstant(Op); } SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG, @@ -2738,7 +2699,6 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(BFE_I32) NODE_NAME_CASE(BFI) NODE_NAME_CASE(BFM) - NODE_NAME_CASE(BREV) NODE_NAME_CASE(MUL_U24) NODE_NAME_CASE(MUL_I24) NODE_NAME_CASE(MAD_U24) @@ -2893,8 +2853,7 @@ unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode( return 1; unsigned SignBits = 32 - Width->getZExtValue() + 1; - ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(Op.getOperand(1)); - if (!Offset || !Offset->isNullValue()) + if (!isNullConstant(Op.getOperand(1))) return SignBits; // TODO: Could probably figure something out with non-0 offsets. diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h index 478b203..7314cc0 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -138,6 +138,7 @@ public: bool storeOfVectorConstantIsCheap(EVT MemVT, unsigned NumElem, unsigned AS) const override; + bool aggressivelyPreferBuildVectorSources(EVT VecVT) const override; bool isCheapToSpeculateCttz() const override; bool isCheapToSpeculateCtlz() const override; @@ -149,6 +150,9 @@ public: SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl<SDValue> &InVals) const override; + SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, + SelectionDAG &DAG) const; + SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; void ReplaceNodeResults(SDNode * N, @@ -165,14 +169,6 @@ public: SDValue False, SDValue CC, DAGCombinerInfo &DCI) const; - SDValue CombineIMinMax(SDLoc DL, - EVT VT, - SDValue LHS, - SDValue RHS, - SDValue True, - SDValue False, - SDValue CC, - SelectionDAG &DAG) const; const char* getTargetNodeName(unsigned Opcode) const override; @@ -216,7 +212,7 @@ public: /// \brief Helper function that returns the byte offset of the given /// type of implicit parameter. - unsigned getImplicitParameterOffset(const AMDGPUMachineFunction *MFI, + uint32_t getImplicitParameterOffset(const AMDGPUMachineFunction *MFI, const ImplicitParameter Param) const; }; @@ -267,7 +263,6 @@ enum NodeType : unsigned { BFE_I32, // Extract range of bits with sign extension to 32-bits. BFI, // (src0 & src1) | (~src0 & src2) BFM, // Insert a range of bits into a 32-bit word. - BREV, // Reverse bits. 
MUL_U24, MUL_I24, MAD_U24, diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp index 15a3d54..a266e71 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp @@ -164,11 +164,6 @@ MachineInstr *AMDGPUInstrInfo::foldMemoryOperandImpl( // TODO: Implement this function return nullptr; } -bool AMDGPUInstrInfo::canFoldMemoryOperand(const MachineInstr *MI, - ArrayRef<unsigned> Ops) const { - // TODO: Implement this function - return false; -} bool AMDGPUInstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI, unsigned Reg, bool UnfoldLoad, @@ -312,7 +307,9 @@ int AMDGPUInstrInfo::getIndirectIndexEnd(const MachineFunction &MF) const { return -1; } - Offset = MF.getSubtarget().getFrameLowering()->getFrameIndexOffset(MF, -1); + unsigned IgnoredFrameReg; + Offset = MF.getSubtarget().getFrameLowering()->getFrameIndexReference( + MF, -1, IgnoredFrameReg); return getIndirectIndexBegin(MF) + Offset; } @@ -367,3 +364,14 @@ int AMDGPUInstrInfo::pseudoToMCOpcode(int Opcode) const { return MCOp; } + +ArrayRef<std::pair<int, const char *>> +AMDGPUInstrInfo::getSerializableTargetIndices() const { + static const std::pair<int, const char *> TargetIndices[] = { + {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"}, + {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"}, + {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"}, + {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"}, + {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}}; + return makeArrayRef(TargetIndices); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h index 86d3962..53e8b23 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h @@ -103,8 +103,6 @@ public: /// read or write or -1 if indirect addressing is not used by this program. int getIndirectIndexEnd(const MachineFunction &MF) const; - bool canFoldMemoryOperand(const MachineInstr *MI, - ArrayRef<unsigned> Ops) const override; bool unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI, unsigned Reg, bool UnfoldLoad, bool UnfoldStore, SmallVectorImpl<MachineInstr *> &NewMIs) const override; @@ -147,6 +145,9 @@ public: return get(pseudoToMCOpcode(Opcode)); } + ArrayRef<std::pair<int, const char *>> + getSerializableTargetIndices() const override; + //===---------------------------------------------------------------------===// // Pure virtual functions to be implemented by sub-classes. //===---------------------------------------------------------------------===// @@ -195,6 +196,7 @@ public: }; namespace AMDGPU { + LLVM_READONLY int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIndex); } // End namespace AMDGPU diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td index b413897..70e589c 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td @@ -191,8 +191,6 @@ def AMDGPUbfe_i32 : SDNode<"AMDGPUISD::BFE_I32", AMDGPUDTIntTernaryOp>; def AMDGPUbfi : SDNode<"AMDGPUISD::BFI", AMDGPUDTIntTernaryOp>; def AMDGPUbfm : SDNode<"AMDGPUISD::BFM", SDTIntBinOp>; -def AMDGPUbrev : SDNode<"AMDGPUISD::BREV", SDTIntUnaryOp>; - // Signed and unsigned 24-bit multiply. The highest 8 bits are ignored when // performing the multiply. The result is a 32-bit value.
def AMDGPUmul_u24 : SDNode<"AMDGPUISD::MUL_U24", SDTIntBinOp, diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td index 72cab39..11f6139 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -514,7 +514,7 @@ class POW_Common <AMDGPUInst log_ieee, AMDGPUInst exp_ieee, AMDGPUInst mul> class Extract_Element <ValueType sub_type, ValueType vec_type, int sub_idx, SubRegIndex sub_reg> : Pat< - (sub_type (vector_extract vec_type:$src, sub_idx)), + (sub_type (extractelt vec_type:$src, sub_idx)), (EXTRACT_SUBREG $src, sub_reg) >; @@ -522,7 +522,7 @@ class Extract_Element <ValueType sub_type, ValueType vec_type, int sub_idx, class Insert_Element <ValueType elem_type, ValueType vec_type, int sub_idx, SubRegIndex sub_reg> : Pat < - (vector_insert vec_type:$vec, elem_type:$elem, sub_idx), + (insertelt vec_type:$vec, elem_type:$elem, sub_idx), (INSERT_SUBREG $vec, $elem, sub_reg) >; diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsics.td b/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsics.td index ab489cd..1de3546 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsics.td +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsics.td @@ -69,8 +69,8 @@ let TargetPrefix = "AMDGPU", isTarget = 1 in { def int_AMDGPU_bfm : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_AMDGPU_brev : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; def int_AMDGPU_flbit_i32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_barrier_local : Intrinsic<[], [], []>; - def int_AMDGPU_barrier_global : Intrinsic<[], [], []>; + def int_AMDGPU_barrier_local : Intrinsic<[], [], [IntrConvergent]>; + def int_AMDGPU_barrier_global : Intrinsic<[], [], [IntrConvergent]>; } // Legacy names for compatibility. 
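The barrier intrinsics above are now marked IntrConvergent. For context (an editorial sketch, not part of the patch): a convergent call must not be sunk into, hoisted out of, or duplicated across divergent control flow, and IR transforms can query the attribute before moving a call. Assuming the attribute-query API of this LLVM revision, a minimal legality check looks like:

#include "llvm/IR/Attributes.h"
#include "llvm/IR/Instructions.h"

// Barriers such as llvm.AMDGPU.barrier.local now report the convergent
// attribute, so a sinking or unswitching transform must leave them in place.
static bool mayMoveCall(const llvm::CallInst *CI) {
  return !CI->hasFnAttr(llvm::Attribute::Convergent);
}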
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp index 2083146..dfc652f 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp @@ -61,7 +61,7 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const { MCOp = MCOperand::createImm(MO.getImm()); break; case MachineOperand::MO_Register: - MCOp = MCOperand::createReg(MO.getReg()); + MCOp = MCOperand::createReg(AMDGPU::getMCReg(MO.getReg(), ST)); break; case MachineOperand::MO_MachineBasicBlock: MCOp = MCOperand::createExpr(MCSymbolRefExpr::create( @@ -73,13 +73,6 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const { MCOp = MCOperand::createExpr(MCSymbolRefExpr::create(Sym, Ctx)); break; } - case MachineOperand::MO_TargetIndex: { - assert(MO.getIndex() == AMDGPU::TI_CONSTDATA_START); - MCSymbol *Sym = Ctx.getOrCreateSymbol(StringRef(END_OF_TEXT_LABEL_NAME)); - const MCSymbolRefExpr *Expr = MCSymbolRefExpr::create(Sym, Ctx); - MCOp = MCOperand::createExpr(Expr); - break; - } case MachineOperand::MO_ExternalSymbol: { MCSymbol *Sym = Ctx.getOrCreateSymbol(StringRef(MO.getSymbolName())); const MCSymbolRefExpr *Expr = MCSymbolRefExpr::create(Sym, Ctx); @@ -104,10 +97,9 @@ void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) { #endif if (MI->isBundle()) { const MachineBasicBlock *MBB = MI->getParent(); - MachineBasicBlock::const_instr_iterator I = MI; - ++I; - while (I != MBB->end() && I->isInsideBundle()) { - EmitInstruction(I); + MachineBasicBlock::const_instr_iterator I = ++MI->getIterator(); + while (I != MBB->instr_end() && I->isInsideBundle()) { + EmitInstruction(&*I); ++I; } } else { @@ -136,8 +128,6 @@ void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) { MCCodeEmitter &InstEmitter = ObjStreamer.getAssembler().getEmitter(); InstEmitter.encodeInstruction(TmpInst, CodeStream, Fixups, MF->getSubtarget<MCSubtargetInfo>()); - CodeStream.flush(); - HexLines.resize(HexLines.size() + 1); std::string &HexLine = HexLines.back(); raw_string_ostream HexStream(HexLine); diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp index 21c7da6..5413717 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp @@ -1,11 +1,10 @@ #include "AMDGPUMachineFunction.h" #include "AMDGPU.h" +#include "Utils/AMDGPUBaseInfo.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/Function.h" using namespace llvm; -static const char *const ShaderTypeAttribute = "ShaderType"; - // Pin the vtable to this file. 
void AMDGPUMachineFunction::anchor() {} @@ -13,13 +12,9 @@ AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) : MachineFunctionInfo(), ShaderType(ShaderType::COMPUTE), LDSSize(0), + ABIArgOffset(0), ScratchSize(0), IsKernel(true) { - Attribute A = MF.getFunction()->getFnAttribute(ShaderTypeAttribute); - if (A.isStringAttribute()) { - StringRef Str = A.getValueAsString(); - if (Str.getAsInteger(0, ShaderType)) - llvm_unreachable("Can't parse shader type!"); - } + ShaderType = AMDGPU::getShaderType(*MF.getFunction()); } diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h index f5e4694..46fcee8 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h @@ -37,6 +37,11 @@ public: return ShaderType; } + bool isKernel() const { + // FIXME: Assume everything is a kernel until function calls are supported. + return true; + } + unsigned ScratchSize; bool IsKernel; }; diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUOpenCLImageTypeLoweringPass.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUOpenCLImageTypeLoweringPass.cpp new file mode 100644 index 0000000..554bf1d --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUOpenCLImageTypeLoweringPass.cpp @@ -0,0 +1,373 @@ +//===-- AMDGPUOpenCLImageTypeLoweringPass.cpp -----------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// This pass resolves calls to OpenCL image attribute, image resource ID and +/// sampler resource ID getter functions. +/// +/// Image attributes (size and format) are expected to be passed to the kernel +/// as kernel arguments immediately following the image argument itself, +/// therefore this pass adds image size and format arguments to the kernel +/// functions in the module. The kernel functions with image arguments are +/// re-created using the new signature. The new arguments are added to the +/// kernel metadata with kernel_arg_type set to "image_size" or "image_format". +/// Note: this pass may invalidate pointers to functions. +/// +/// Resource IDs of read-only images, write-only images and samplers are +/// defined to be their index among the kernel arguments of the same +/// type and access qualifier. 
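/// [Editorial illustration; not part of the patch.] Concretely, under the
/// conventions above a kernel declared as
///   kernel void k(read_only image2d_t img, sampler_t smp);
/// is re-created by this pass with two extra arguments immediately after each
/// image argument, roughly
///   kernel void k(read_only image2d_t img, __size_img, __format_img, smp);
/// where __size_img is a [3 x i32] and __format_img a [2 x i32] value, and
/// calls to llvm.OpenCL.image.get.resource.id(img) and
/// llvm.OpenCL.sampler.get.resource.id(smp) fold to the constant 0, since
/// each is the first argument of its type and access qualifier.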
+//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/Passes.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" +#include "llvm/Transforms/Utils/Cloning.h" + +using namespace llvm; + +namespace { + +StringRef GetImageSizeFunc = "llvm.OpenCL.image.get.size"; +StringRef GetImageFormatFunc = "llvm.OpenCL.image.get.format"; +StringRef GetImageResourceIDFunc = "llvm.OpenCL.image.get.resource.id"; +StringRef GetSamplerResourceIDFunc = "llvm.OpenCL.sampler.get.resource.id"; + +StringRef ImageSizeArgMDType = "__llvm_image_size"; +StringRef ImageFormatArgMDType = "__llvm_image_format"; + +StringRef KernelsMDNodeName = "opencl.kernels"; +StringRef KernelArgMDNodeNames[] = { + "kernel_arg_addr_space", + "kernel_arg_access_qual", + "kernel_arg_type", + "kernel_arg_base_type", + "kernel_arg_type_qual"}; +const unsigned NumKernelArgMDNodes = 5; + +typedef SmallVector<Metadata *, 8> MDVector; +struct KernelArgMD { + MDVector ArgVector[NumKernelArgMDNodes]; +}; + +} // end anonymous namespace + +static inline bool +IsImageType(StringRef TypeString) { + return TypeString == "image2d_t" || TypeString == "image3d_t"; +} + +static inline bool +IsSamplerType(StringRef TypeString) { + return TypeString == "sampler_t"; +} + +static Function * +GetFunctionFromMDNode(MDNode *Node) { + if (!Node) + return nullptr; + + size_t NumOps = Node->getNumOperands(); + if (NumOps != NumKernelArgMDNodes + 1) + return nullptr; + + auto F = mdconst::dyn_extract<Function>(Node->getOperand(0)); + if (!F) + return nullptr; + + // Sanity checks. + size_t ExpectNumArgNodeOps = F->arg_size() + 1; + for (size_t i = 0; i < NumKernelArgMDNodes; ++i) { + MDNode *ArgNode = dyn_cast_or_null<MDNode>(Node->getOperand(i + 1)); + if (ArgNode->getNumOperands() != ExpectNumArgNodeOps) + return nullptr; + if (!ArgNode->getOperand(0)) + return nullptr; + + // FIXME: It should be possible to do image lowering when some metadata + // args are missing or not in the expected order.
+ MDString *StringNode = dyn_cast<MDString>(ArgNode->getOperand(0)); + if (!StringNode || StringNode->getString() != KernelArgMDNodeNames[i]) + return nullptr; + } + + return F; +} + +static StringRef +AccessQualFromMD(MDNode *KernelMDNode, unsigned ArgIdx) { + MDNode *ArgAQNode = cast<MDNode>(KernelMDNode->getOperand(2)); + return cast<MDString>(ArgAQNode->getOperand(ArgIdx + 1))->getString(); +} + +static StringRef +ArgTypeFromMD(MDNode *KernelMDNode, unsigned ArgIdx) { + MDNode *ArgTypeNode = cast<MDNode>(KernelMDNode->getOperand(3)); + return cast<MDString>(ArgTypeNode->getOperand(ArgIdx + 1))->getString(); +} + +static MDVector +GetArgMD(MDNode *KernelMDNode, unsigned OpIdx) { + MDVector Res; + for (unsigned i = 0; i < NumKernelArgMDNodes; ++i) { + MDNode *Node = cast<MDNode>(KernelMDNode->getOperand(i + 1)); + Res.push_back(Node->getOperand(OpIdx)); + } + return Res; +} + +static void +PushArgMD(KernelArgMD &MD, const MDVector &V) { + assert(V.size() == NumKernelArgMDNodes); + for (unsigned i = 0; i < NumKernelArgMDNodes; ++i) { + MD.ArgVector[i].push_back(V[i]); + } +} + +namespace { + +class AMDGPUOpenCLImageTypeLoweringPass : public ModulePass { + static char ID; + + LLVMContext *Context; + Type *Int32Type; + Type *ImageSizeType; + Type *ImageFormatType; + SmallVector<Instruction *, 4> InstsToErase; + + bool replaceImageUses(Argument &ImageArg, uint32_t ResourceID, + Argument &ImageSizeArg, + Argument &ImageFormatArg) { + bool Modified = false; + + for (auto &Use : ImageArg.uses()) { + auto Inst = dyn_cast<CallInst>(Use.getUser()); + if (!Inst) { + continue; + } + + Function *F = Inst->getCalledFunction(); + if (!F) + continue; + + Value *Replacement = nullptr; + StringRef Name = F->getName(); + if (Name.startswith(GetImageResourceIDFunc)) { + Replacement = ConstantInt::get(Int32Type, ResourceID); + } else if (Name.startswith(GetImageSizeFunc)) { + Replacement = &ImageSizeArg; + } else if (Name.startswith(GetImageFormatFunc)) { + Replacement = &ImageFormatArg; + } else { + continue; + } + + Inst->replaceAllUsesWith(Replacement); + InstsToErase.push_back(Inst); + Modified = true; + } + + return Modified; + } + + bool replaceSamplerUses(Argument &SamplerArg, uint32_t ResourceID) { + bool Modified = false; + + for (const auto &Use : SamplerArg.uses()) { + auto Inst = dyn_cast<CallInst>(Use.getUser()); + if (!Inst) { + continue; + } + + Function *F = Inst->getCalledFunction(); + if (!F) + continue; + + Value *Replacement = nullptr; + StringRef Name = F->getName(); + if (Name == GetSamplerResourceIDFunc) { + Replacement = ConstantInt::get(Int32Type, ResourceID); + } else { + continue; + } + + Inst->replaceAllUsesWith(Replacement); + InstsToErase.push_back(Inst); + Modified = true; + } + + return Modified; + } + + bool replaceImageAndSamplerUses(Function *F, MDNode *KernelMDNode) { + uint32_t NumReadOnlyImageArgs = 0; + uint32_t NumWriteOnlyImageArgs = 0; + uint32_t NumSamplerArgs = 0; + + bool Modified = false; + InstsToErase.clear(); + for (auto ArgI = F->arg_begin(); ArgI != F->arg_end(); ++ArgI) { + Argument &Arg = *ArgI; + StringRef Type = ArgTypeFromMD(KernelMDNode, Arg.getArgNo()); + + // Handle image types. 
+ if (IsImageType(Type)) { + StringRef AccessQual = AccessQualFromMD(KernelMDNode, Arg.getArgNo()); + uint32_t ResourceID; + if (AccessQual == "read_only") { + ResourceID = NumReadOnlyImageArgs++; + } else if (AccessQual == "write_only") { + ResourceID = NumWriteOnlyImageArgs++; + } else { + llvm_unreachable("Wrong image access qualifier."); + } + + Argument &SizeArg = *(++ArgI); + Argument &FormatArg = *(++ArgI); + Modified |= replaceImageUses(Arg, ResourceID, SizeArg, FormatArg); + + // Handle sampler type. + } else if (IsSamplerType(Type)) { + uint32_t ResourceID = NumSamplerArgs++; + Modified |= replaceSamplerUses(Arg, ResourceID); + } + } + for (unsigned i = 0; i < InstsToErase.size(); ++i) { + InstsToErase[i]->eraseFromParent(); + } + + return Modified; + } + + std::tuple<Function *, MDNode *> + addImplicitArgs(Function *F, MDNode *KernelMDNode) { + bool Modified = false; + + FunctionType *FT = F->getFunctionType(); + SmallVector<Type *, 8> ArgTypes; + + // Metadata operands for new MDNode. + KernelArgMD NewArgMDs; + PushArgMD(NewArgMDs, GetArgMD(KernelMDNode, 0)); + + // Add implicit arguments to the signature. + for (unsigned i = 0; i < FT->getNumParams(); ++i) { + ArgTypes.push_back(FT->getParamType(i)); + MDVector ArgMD = GetArgMD(KernelMDNode, i + 1); + PushArgMD(NewArgMDs, ArgMD); + + if (!IsImageType(ArgTypeFromMD(KernelMDNode, i))) + continue; + + // Add size implicit argument. + ArgTypes.push_back(ImageSizeType); + ArgMD[2] = ArgMD[3] = MDString::get(*Context, ImageSizeArgMDType); + PushArgMD(NewArgMDs, ArgMD); + + // Add format implicit argument. + ArgTypes.push_back(ImageFormatType); + ArgMD[2] = ArgMD[3] = MDString::get(*Context, ImageFormatArgMDType); + PushArgMD(NewArgMDs, ArgMD); + + Modified = true; + } + if (!Modified) { + return std::make_tuple(nullptr, nullptr); + } + + // Create function with new signature and clone the old body into it. + auto NewFT = FunctionType::get(FT->getReturnType(), ArgTypes, false); + auto NewF = Function::Create(NewFT, F->getLinkage(), F->getName()); + ValueToValueMapTy VMap; + auto NewFArgIt = NewF->arg_begin(); + for (auto &Arg: F->args()) { + auto ArgName = Arg.getName(); + NewFArgIt->setName(ArgName); + VMap[&Arg] = &(*NewFArgIt++); + if (IsImageType(ArgTypeFromMD(KernelMDNode, Arg.getArgNo()))) { + (NewFArgIt++)->setName(Twine("__size_") + ArgName); + (NewFArgIt++)->setName(Twine("__format_") + ArgName); + } + } + SmallVector<ReturnInst*, 8> Returns; + CloneFunctionInto(NewF, F, VMap, /*ModuleLevelChanges=*/false, Returns); + + // Build new MDNode. + SmallVector<llvm::Metadata *, 6> KernelMDArgs; + KernelMDArgs.push_back(ConstantAsMetadata::get(NewF)); + for (unsigned i = 0; i < NumKernelArgMDNodes; ++i) + KernelMDArgs.push_back(MDNode::get(*Context, NewArgMDs.ArgVector[i])); + MDNode *NewMDNode = MDNode::get(*Context, KernelMDArgs); + + return std::make_tuple(NewF, NewMDNode); + } + + bool transformKernels(Module &M) { + NamedMDNode *KernelsMDNode = M.getNamedMetadata(KernelsMDNodeName); + if (!KernelsMDNode) + return false; + + bool Modified = false; + for (unsigned i = 0; i < KernelsMDNode->getNumOperands(); ++i) { + MDNode *KernelMDNode = KernelsMDNode->getOperand(i); + Function *F = GetFunctionFromMDNode(KernelMDNode); + if (!F) + continue; + + Function *NewF; + MDNode *NewMDNode; + std::tie(NewF, NewMDNode) = addImplicitArgs(F, KernelMDNode); + if (NewF) { + // Replace old function and metadata with new ones. 
+ F->eraseFromParent(); + M.getFunctionList().push_back(NewF); + M.getOrInsertFunction(NewF->getName(), NewF->getFunctionType(), + NewF->getAttributes()); + KernelsMDNode->setOperand(i, NewMDNode); + + F = NewF; + KernelMDNode = NewMDNode; + Modified = true; + } + + Modified |= replaceImageAndSamplerUses(F, KernelMDNode); + } + + return Modified; + } + + public: + AMDGPUOpenCLImageTypeLoweringPass() : ModulePass(ID) {} + + bool runOnModule(Module &M) override { + Context = &M.getContext(); + Int32Type = Type::getInt32Ty(M.getContext()); + ImageSizeType = ArrayType::get(Int32Type, 3); + ImageFormatType = ArrayType::get(Int32Type, 2); + + return transformKernels(M); + } + + const char *getPassName() const override { + return "AMDGPU OpenCL Image Type Pass"; + } +}; + +char AMDGPUOpenCLImageTypeLoweringPass::ID = 0; + +} // end anonymous namespace + +ModulePass *llvm::createAMDGPUOpenCLImageTypeLoweringPass() { + return new AMDGPUOpenCLImageTypeLoweringPass(); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index 57b7a73..87d50d5 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -54,7 +54,7 @@ bool AMDGPUPromoteAlloca::doInitialization(Module &M) { bool AMDGPUPromoteAlloca::runOnFunction(Function &F) { - const FunctionType *FTy = F.getFunctionType(); + FunctionType *FTy = F.getFunctionType(); LocalMemAvailable = ST.getLocalMemorySize(); @@ -63,7 +63,7 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) { // possible these arguments require the entire local memory space, so // we cannot use local memory in the pass. for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i) { - const Type *ParamTy = FTy->getParamType(i); + Type *ParamTy = FTy->getParamType(i); if (ParamTy->isPointerTy() && ParamTy->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { LocalMemAvailable = 0; @@ -77,7 +77,7 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) { // Check how much local memory is being used by global objects for (Module::global_iterator I = Mod->global_begin(), E = Mod->global_end(); I != E; ++I) { - GlobalVariable *GV = I; + GlobalVariable *GV = &*I; PointerType *GVTy = GV->getType(); if (GVTy->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) continue; @@ -101,7 +101,7 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) { return false; } -static VectorType *arrayTypeToVecType(const Type *ArrayTy) { +static VectorType *arrayTypeToVecType(Type *ArrayTy) { return VectorType::get(ArrayTy->getArrayElementType(), ArrayTy->getArrayNumElements()); } @@ -276,6 +276,9 @@ static bool collectUsesWithPtrTypes(Value *Val, std::vector<Value*> &WorkList) { } void AMDGPUPromoteAlloca::visitAlloca(AllocaInst &I) { + if (!I.isStaticAlloca()) + return; + IRBuilder<> Builder(&I); // First try to replace the alloca with a vector diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.h index cfd800b..0344834 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.h +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.h @@ -37,10 +37,6 @@ struct AMDGPURegisterInfo : public AMDGPUGenRegisterInfo { assert(!"Unimplemented"); return BitVector(); } - virtual const TargetRegisterClass* getCFGStructurizerRegClass(MVT VT) const { - assert(!"Unimplemented"); return nullptr; - } - virtual unsigned getHWRegIndex(unsigned Reg) const { assert(!"Unimplemented"); return 0; } diff --git 
a/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index 5f32a65..44e0c47 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -16,6 +16,7 @@ #include "R600ISelLowering.h" #include "R600InstrInfo.h" #include "R600MachineScheduler.h" +#include "SIFrameLowering.h" #include "SIISelLowering.h" #include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" @@ -44,6 +45,8 @@ AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT, // disable it. SmallString<256> FullFS("+promote-alloca,+fp64-denormals,"); + if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA. + FullFS += "+flat-for-global,"; FullFS += FS; if (GPU == "" && TT.getArch() == Triple::amdgcn) @@ -67,26 +70,36 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS, DumpCode(false), R600ALUInst(false), HasVertexCache(false), TexVTXClauseSize(0), Gen(AMDGPUSubtarget::R600), FP64(false), FP64Denormals(false), FP32Denormals(false), FastFMAF32(false), - CaymanISA(false), FlatAddressSpace(false), EnableIRStructurizer(true), - EnablePromoteAlloca(false), EnableIfCvt(true), EnableLoadStoreOpt(false), - EnableUnsafeDSOffsetFolding(false), + CaymanISA(false), FlatAddressSpace(false), FlatForGlobal(false), + EnableIRStructurizer(true), EnablePromoteAlloca(false), EnableIfCvt(true), + EnableLoadStoreOpt(false), EnableUnsafeDSOffsetFolding(false), WavefrontSize(0), CFALUBug(false), LocalMemorySize(0), EnableVGPRSpilling(false), SGPRInitBug(false), IsGCN(false), GCN1Encoding(false), GCN3Encoding(false), CIInsts(false), LDSBankCount(0), IsaVersion(ISAVersion0_0_0), EnableHugeScratchBuffer(false), - FrameLowering(TargetFrameLowering::StackGrowsUp, - 64 * 16, // Maximum stack alignment (long16) - 0), + FrameLowering(nullptr), InstrItins(getInstrItineraryForCPU(GPU)), TargetTriple(TT) { initializeSubtargetDependencies(TT, GPU, FS); + const unsigned MaxStackAlign = 64 * 16; // Maximum stack alignment (long16) + if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) { InstrInfo.reset(new R600InstrInfo(*this)); TLInfo.reset(new R600TargetLowering(TM, *this)); + + // FIXME: Should have R600 specific FrameLowering + FrameLowering.reset(new AMDGPUFrameLowering( + TargetFrameLowering::StackGrowsUp, + MaxStackAlign, + 0)); } else { InstrInfo.reset(new SIInstrInfo(*this)); TLInfo.reset(new SITargetLowering(TM, *this)); + FrameLowering.reset(new SIFrameLowering( + TargetFrameLowering::StackGrowsUp, + MaxStackAlign, + 0)); } } diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h index 735f01d..9c7bb88 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -1,4 +1,4 @@ -//=====-- AMDGPUSubtarget.h - Define Subtarget for the AMDIL ---*- C++ -*-====// +//=====-- AMDGPUSubtarget.h - Define Subtarget for AMDGPU ------*- C++ -*-====// // // The LLVM Compiler Infrastructure // @@ -12,17 +12,15 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_LIB_TARGET_R600_AMDGPUSUBTARGET_H -#define LLVM_LIB_TARGET_R600_AMDGPUSUBTARGET_H +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H + #include "AMDGPU.h" #include "AMDGPUFrameLowering.h" #include "AMDGPUInstrInfo.h" -#include "AMDGPUIntrinsicInfo.h" +#include "AMDGPUISelLowering.h" #include "AMDGPUSubtarget.h" -#include "R600ISelLowering.h" -#include 
"AMDKernelCodeT.h" #include "Utils/AMDGPUBaseInfo.h" -#include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/Target/TargetSubtargetInfo.h" @@ -72,6 +70,7 @@ private: bool FastFMAF32; bool CaymanISA; bool FlatAddressSpace; + bool FlatForGlobal; bool EnableIRStructurizer; bool EnablePromoteAlloca; bool EnableIfCvt; @@ -88,10 +87,10 @@ private: bool CIInsts; bool FeatureDisable; int LDSBankCount; - unsigned IsaVersion; + unsigned IsaVersion; bool EnableHugeScratchBuffer; - AMDGPUFrameLowering FrameLowering; + std::unique_ptr<AMDGPUFrameLowering> FrameLowering; std::unique_ptr<AMDGPUTargetLowering> TLInfo; std::unique_ptr<AMDGPUInstrInfo> InstrInfo; InstrItineraryData InstrItins; @@ -104,7 +103,7 @@ public: StringRef GPU, StringRef FS); const AMDGPUFrameLowering *getFrameLowering() const override { - return &FrameLowering; + return FrameLowering.get(); } const AMDGPUInstrInfo *getInstrInfo() const override { return InstrInfo.get(); @@ -161,6 +160,10 @@ public: return FlatAddressSpace; } + bool useFlatForGlobal() const { + return FlatForGlobal; + } + bool hasBFE() const { return (getGeneration() >= EVERGREEN); } @@ -305,6 +308,9 @@ public: return isAmdHsaOS() ? 0 : 36; } + unsigned getMaxNumUserSGPRs() const { + return 16; + } }; } // End namespace llvm diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 2297b52..22f85b3 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -14,6 +14,7 @@ //===----------------------------------------------------------------------===// #include "AMDGPUTargetMachine.h" +#include "AMDGPUTargetObjectFile.h" #include "AMDGPU.h" #include "AMDGPUTargetTransformInfo.h" #include "R600ISelLowering.h" @@ -41,6 +42,23 @@ extern "C" void LLVMInitializeAMDGPUTarget() { // Register the target RegisterTargetMachine<R600TargetMachine> X(TheAMDGPUTarget); RegisterTargetMachine<GCNTargetMachine> Y(TheGCNTarget); + + PassRegistry *PR = PassRegistry::getPassRegistry(); + initializeSILowerI1CopiesPass(*PR); + initializeSIFixSGPRCopiesPass(*PR); + initializeSIFoldOperandsPass(*PR); + initializeSIFixSGPRLiveRangesPass(*PR); + initializeSIFixControlFlowLiveIntervalsPass(*PR); + initializeSILoadStoreOptimizerPass(*PR); + initializeAMDGPUAnnotateKernelFeaturesPass(*PR); + initializeAMDGPUAnnotateUniformValuesPass(*PR); +} + +static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { + if (TT.getOS() == Triple::AMDHSA) + return make_unique<AMDGPUHSATargetObjectFile>(); + + return make_unique<AMDGPUTargetObjectFile>(); } static ScheduleDAGInstrs *createR600MachineScheduler(MachineSchedContext *C) { @@ -72,15 +90,13 @@ AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT, CodeGenOpt::Level OptLevel) : LLVMTargetMachine(T, computeDataLayout(TT), TT, CPU, FS, Options, RM, CM, OptLevel), - TLOF(new TargetLoweringObjectFileELF()), Subtarget(TT, CPU, FS, *this), + TLOF(createTLOF(getTargetTriple())), Subtarget(TT, CPU, FS, *this), IntrinsicInfo() { setRequiresStructuredCFG(true); initAsmInfo(); } -AMDGPUTargetMachine::~AMDGPUTargetMachine() { - delete TLOF; -} +AMDGPUTargetMachine::~AMDGPUTargetMachine() { } //===----------------------------------------------------------------------===// // R600 Target Machine (R600 -> Cayman) @@ -110,7 +126,13 @@ namespace { class AMDGPUPassConfig : public TargetPassConfig { public: AMDGPUPassConfig(TargetMachine *TM, PassManagerBase &PM) 
- : TargetPassConfig(TM, PM) {} + : TargetPassConfig(TM, PM) { + + // Exceptions and StackMaps are not supported, so these passes will never do + // anything. + disablePass(&StackMapLivenessID); + disablePass(&FuncletLayoutID); + } AMDGPUTargetMachine &getAMDGPUTargetMachine() const { return getTM<AMDGPUTargetMachine>(); @@ -126,8 +148,9 @@ public: void addIRPasses() override; void addCodeGenPrepare() override; - virtual bool addPreISel() override; - virtual bool addInstSelector() override; + bool addPreISel() override; + bool addInstSelector() override; + bool addGCPasses() override; }; class R600PassConfig : public AMDGPUPassConfig { @@ -147,6 +170,8 @@ public: : AMDGPUPassConfig(TM, PM) { } bool addPreISel() override; bool addInstSelector() override; + void addFastRegAlloc(FunctionPass *RegAllocPass) override; + void addOptimizedRegAlloc(FunctionPass *RegAllocPass) override; void addPreRegAlloc() override; void addPostRegAlloc() override; void addPreSched2() override; @@ -156,7 +181,7 @@ public: } // End of anonymous namespace TargetIRAnalysis AMDGPUTargetMachine::getTargetIRAnalysis() { - return TargetIRAnalysis([this](Function &F) { + return TargetIRAnalysis([this](const Function &F) { return TargetTransformInfo( AMDGPUTTIImpl(this, F.getParent()->getDataLayout())); }); @@ -172,6 +197,10 @@ void AMDGPUPassConfig::addIRPasses() { // functions, then we will generate code for the first function // without ever running any passes on the second. addPass(createBarrierNoopPass()); + + // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments. + addPass(createAMDGPUOpenCLImageTypeLoweringPass()); + TargetPassConfig::addIRPasses(); } @@ -198,6 +227,11 @@ bool AMDGPUPassConfig::addInstSelector() { return false; } +bool AMDGPUPassConfig::addGCPasses() { + // Do nothing. GC is not supported. + return false; +} + //===----------------------------------------------------------------------===// // R600 Pass Setup //===----------------------------------------------------------------------===// @@ -238,16 +272,23 @@ TargetPassConfig *R600TargetMachine::createPassConfig(PassManagerBase &PM) { bool GCNPassConfig::addPreISel() { AMDGPUPassConfig::addPreISel(); + + // FIXME: We need to run a pass to propagate the attributes when calls are + // supported. + addPass(&AMDGPUAnnotateKernelFeaturesID); + addPass(createSinkingPass()); addPass(createSITypeRewriter()); addPass(createSIAnnotateControlFlowPass()); + addPass(createAMDGPUAnnotateUniformValues()); + return false; } bool GCNPassConfig::addInstSelector() { AMDGPUPassConfig::addInstSelector(); addPass(createSILowerI1CopiesPass()); - addPass(createSIFixSGPRCopiesPass(*TM)); + addPass(&SIFixSGPRCopiesID); addPass(createSIFoldOperandsPass()); return false; } @@ -259,7 +300,6 @@ void GCNPassConfig::addPreRegAlloc() { // earlier passes might recompute live intervals. // TODO: handle CodeGenOpt::None; fast RA ignores spill weights set by the pass if (getOptLevel() > CodeGenOpt::None) { - initializeSIFixControlFlowLiveIntervalsPass(*PassRegistry::getPassRegistry()); insertPass(&MachineSchedulerID, &SIFixControlFlowLiveIntervalsID); } @@ -269,16 +309,27 @@ void GCNPassConfig::addPreRegAlloc() { // This should be run after scheduling, but before register allocation. It // also needs extra copies to the address operand to be eliminated.
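// [Editorial note; not part of the patch.] TargetPassConfig::insertPass(A, B)
// schedules pass B to run right after pass A, which is how the retained code
// below pins these machine passes against the scheduler:
//
//   insertPass(&MachineSchedulerID, &SILoadStoreOptimizerID); // merge LD/ST
//   insertPass(&MachineSchedulerID, &RegisterCoalescerID);    // clean up the
//                                                             // extra copies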
- initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry()); insertPass(&MachineSchedulerID, &SILoadStoreOptimizerID); insertPass(&MachineSchedulerID, &RegisterCoalescerID); } addPass(createSIShrinkInstructionsPass(), false); - addPass(createSIFixSGPRLiveRangesPass(), false); +} + +void GCNPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) { + addPass(&SIFixSGPRLiveRangesID); + TargetPassConfig::addFastRegAlloc(RegAllocPass); +} + +void GCNPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) { + // We want to run this after LiveVariables is computed to avoid computing them + // twice. + // FIXME: We shouldn't disable the verifier here. r249087 introduced a failure + // that needs to be fixed. + insertPass(&LiveVariablesID, &SIFixSGPRLiveRangesID, /*VerifyAfter=*/false); + TargetPassConfig::addOptimizedRegAlloc(RegAllocPass); } void GCNPassConfig::addPostRegAlloc() { - addPass(createSIPrepareScratchRegs(), false); addPass(createSIShrinkInstructionsPass(), false); } diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h index 14792e3..236e3f8 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h @@ -32,7 +32,7 @@ class AMDGPUTargetMachine : public LLVMTargetMachine { private: protected: - TargetLoweringObjectFile *TLOF; + std::unique_ptr<TargetLoweringObjectFile> TLOF; AMDGPUSubtarget Subtarget; AMDGPUIntrinsicInfo IntrinsicInfo; @@ -52,7 +52,7 @@ public: TargetIRAnalysis getTargetIRAnalysis() override; TargetLoweringObjectFile *getObjFileLowering() const override { - return TLOF; + return TLOF.get(); } }; diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp new file mode 100644 index 0000000..e050f21 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp @@ -0,0 +1,87 @@ +//===-- AMDGPUHSATargetObjectFile.cpp - AMDGPU Object Files ---------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPUTargetObjectFile.h" +#include "AMDGPU.h" +#include "Utils/AMDGPUBaseInfo.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCSectionELF.h" +#include "llvm/Support/ELF.h" + +using namespace llvm; + +//===----------------------------------------------------------------------===// +// Generic Object File +//===----------------------------------------------------------------------===// + +MCSection *AMDGPUTargetObjectFile::SelectSectionForGlobal(const GlobalValue *GV, + SectionKind Kind, + Mangler &Mang, + const TargetMachine &TM) const { + if (Kind.isReadOnly() && AMDGPU::isReadOnlySegment(GV)) + return TextSection; + + return TargetLoweringObjectFileELF::SelectSectionForGlobal(GV, Kind, Mang, TM); +} + +//===----------------------------------------------------------------------===// +// HSA Object File +//===----------------------------------------------------------------------===// + + +void AMDGPUHSATargetObjectFile::Initialize(MCContext &Ctx, + const TargetMachine &TM){ + TargetLoweringObjectFileELF::Initialize(Ctx, TM); + InitializeELF(TM.Options.UseInitArray); + + TextSection = AMDGPU::getHSATextSection(Ctx); + + DataGlobalAgentSection = AMDGPU::getHSADataGlobalAgentSection(Ctx); + DataGlobalProgramSection = AMDGPU::getHSADataGlobalProgramSection(Ctx); + + RodataReadonlyAgentSection = AMDGPU::getHSARodataReadonlyAgentSection(Ctx); +} + +bool AMDGPUHSATargetObjectFile::isAgentAllocationSection( + const char *SectionName) const { + return cast<MCSectionELF>(DataGlobalAgentSection) + ->getSectionName() + .equals(SectionName); +} + +bool AMDGPUHSATargetObjectFile::isAgentAllocation(const GlobalValue *GV) const { + // Read-only segments can only have agent allocation. + return AMDGPU::isReadOnlySegment(GV) || + (AMDGPU::isGlobalSegment(GV) && GV->hasSection() && + isAgentAllocationSection(GV->getSection())); +} + +bool AMDGPUHSATargetObjectFile::isProgramAllocation( + const GlobalValue *GV) const { + // The default for global segments is program allocation. + return AMDGPU::isGlobalSegment(GV) && !isAgentAllocation(GV); +} + +MCSection *AMDGPUHSATargetObjectFile::SelectSectionForGlobal( + const GlobalValue *GV, SectionKind Kind, + Mangler &Mang, + const TargetMachine &TM) const { + if (Kind.isText() && !GV->hasComdat()) + return getTextSection(); + + if (AMDGPU::isGlobalSegment(GV)) { + if (isAgentAllocation(GV)) + return DataGlobalAgentSection; + + if (isProgramAllocation(GV)) + return DataGlobalProgramSection; + } + + return AMDGPUTargetObjectFile::SelectSectionForGlobal(GV, Kind, Mang, TM); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h new file mode 100644 index 0000000..921341e --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h @@ -0,0 +1,51 @@ +//===-- AMDGPUTargetObjectFile.h - AMDGPU Object Info ----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief This file declares the AMDGPU-specific subclass of +/// TargetLoweringObjectFile. 
+/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETOBJECTFILE_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETOBJECTFILE_H + +#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" +#include "llvm/Target/TargetMachine.h" + +namespace llvm { + +class AMDGPUTargetObjectFile : public TargetLoweringObjectFileELF { + public: + MCSection *SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind, + Mangler &Mang, + const TargetMachine &TM) const override; +}; + +class AMDGPUHSATargetObjectFile final : public AMDGPUTargetObjectFile { +private: + MCSection *DataGlobalAgentSection; + MCSection *DataGlobalProgramSection; + MCSection *RodataReadonlyAgentSection; + + bool isAgentAllocationSection(const char *SectionName) const; + bool isAgentAllocation(const GlobalValue *GV) const; + bool isProgramAllocation(const GlobalValue *GV) const; + +public: + void Initialize(MCContext &Ctx, const TargetMachine &TM) override; + + MCSection *SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind, + Mangler &Mang, + const TargetMachine &TM) const override; +}; + +} // end namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index 6dacc74..54a003d 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -74,9 +74,109 @@ unsigned AMDGPUTTIImpl::getNumberOfRegisters(bool Vec) { return 4 * 128; // XXX - 4 channels. Should these count as vector instead? } -unsigned AMDGPUTTIImpl::getRegisterBitWidth(bool) { return 32; } +unsigned AMDGPUTTIImpl::getRegisterBitWidth(bool Vector) { + return Vector ? 0 : 32; +} unsigned AMDGPUTTIImpl::getMaxInterleaveFactor(unsigned VF) { // Semi-arbitrary large amount. return 64; } + +unsigned AMDGPUTTIImpl::getCFInstrCost(unsigned Opcode) { + // XXX - For some reason this isn't called for switch. + switch (Opcode) { + case Instruction::Br: + case Instruction::Ret: + return 10; + default: + return BaseT::getCFInstrCost(Opcode); + } +} + +int AMDGPUTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy, + unsigned Index) { + switch (Opcode) { + case Instruction::ExtractElement: + // Dynamic indexing isn't free and is best avoided. + return Index == ~0u ? 2 : 0; + default: + return BaseT::getVectorInstrCost(Opcode, ValTy, Index); + } +} + +static bool isIntrinsicSourceOfDivergence(const TargetIntrinsicInfo *TII, + const IntrinsicInst *I) { + switch (I->getIntrinsicID()) { + default: + return false; + case Intrinsic::not_intrinsic: + // This means we have an intrinsic that isn't defined in + // IntrinsicsAMDGPU.td + break; + + case Intrinsic::amdgcn_interp_p1: + case Intrinsic::amdgcn_interp_p2: + case Intrinsic::amdgcn_mbcnt_hi: + case Intrinsic::amdgcn_mbcnt_lo: + case Intrinsic::r600_read_tidig_x: + case Intrinsic::r600_read_tidig_y: + case Intrinsic::r600_read_tidig_z: + return true; + } + + StringRef Name = I->getCalledFunction()->getName(); + switch (TII->lookupName((const char *)Name.bytes_begin(), Name.size())) { + default: + return false; + case AMDGPUIntrinsic::SI_tid: + case AMDGPUIntrinsic::SI_fs_interp: + return true; + } +} + +static bool isArgPassedInSGPR(const Argument *A) { + const Function *F = A->getParent(); + unsigned ShaderType = AMDGPU::getShaderType(*F); + + // Arguments to compute shaders are never a source of divergence. 
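// [Editorial note; not part of the patch.] Intuition for the SGPR/VGPR split
// used here and by isSourceOfDivergence() below: a value is uniform when every
// lane of a wavefront computes the same result. For example:
//
//   __kernel void f(__global int *p, int n) { // p, n: SGPRs, uniform
//     int i = get_global_id(0);               // per-lane id: divergent
//     p[i] = n;                               // divergent address, uniform data
//   }
//
// Kernel arguments are broadcast through scalar registers, while workitem-id
// intrinsics and private-memory loads can differ per lane.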
+ if (ShaderType == ShaderType::COMPUTE) + return true; + + // For non-compute shaders, SGPR inputs are marked with either inreg or byval. + if (F->getAttributes().hasAttribute(A->getArgNo() + 1, Attribute::InReg) || + F->getAttributes().hasAttribute(A->getArgNo() + 1, Attribute::ByVal)) + return true; + + // Everything else is in VGPRs. + return false; +} + +/// +/// \returns true if the result of the value could potentially be +/// different across workitems in a wavefront. +bool AMDGPUTTIImpl::isSourceOfDivergence(const Value *V) const { + + if (const Argument *A = dyn_cast<Argument>(V)) + return !isArgPassedInSGPR(A); + + // Loads from the private address space are divergent, because threads + // can execute the load instruction with the same inputs and get different + // results. + // + // All other loads are not divergent, because if threads issue loads with the + // same arguments, they will always get the same result. + if (const LoadInst *Load = dyn_cast<LoadInst>(V)) + return Load->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS; + + if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) { + const TargetMachine &TM = getTLI()->getTargetMachine(); + return isIntrinsicSourceOfDivergence(TM.getIntrinsicInfo(), Intrinsic); + } + + // Assume all function calls are a source of divergence. + if (isa<CallInst>(V) || isa<InvokeInst>(V)) + return true; + + return false; +} diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h index dee0a69..976afb0 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -60,6 +60,11 @@ public: unsigned getNumberOfRegisters(bool Vector); unsigned getRegisterBitWidth(bool Vector); unsigned getMaxInterleaveFactor(unsigned VF); + + unsigned getCFInstrCost(unsigned Opcode); + + int getVectorInstrCost(unsigned Opcode, Type *ValTy, unsigned Index); + bool isSourceOfDivergence(const Value *V) const; }; } // end namespace llvm diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp index d918ac3..917efd1 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp @@ -185,7 +185,7 @@ protected: MachinePostDominatorTree *PDT; MachineLoopInfo *MLI; const R600InstrInfo *TII; - const AMDGPURegisterInfo *TRI; + const R600RegisterInfo *TRI; // PRINT FUNCTIONS /// Print the ordered Blocks. @@ -881,7 +881,7 @@ bool AMDGPUCFGStructurizer::run() { } //while, "one iteration" over the function. MachineBasicBlock *EntryMBB = - GraphTraits<MachineFunction *>::nodes_begin(FuncRep); + &*GraphTraits<MachineFunction *>::nodes_begin(FuncRep); if (EntryMBB->succ_size() == 0) { Finish = true; DEBUG( @@ -904,7 +904,7 @@ bool AMDGPUCFGStructurizer::run() { } while (!Finish && MakeProgress); // Misc wrap up to maintain the consistency of the Function representation. - wrapup(GraphTraits<MachineFunction *>::nodes_begin(FuncRep)); + wrapup(&*GraphTraits<MachineFunction *>::nodes_begin(FuncRep)); // Detach retired Block, release memory. 
for (MBBInfoMap::iterator It = BlockInfoMap.begin(), E = BlockInfoMap.end(); @@ -1164,7 +1164,7 @@ int AMDGPUCFGStructurizer::loopcontPatternMatch(MachineLoop *LoopRep, for (SmallVectorImpl<MachineBasicBlock *>::iterator It = ContMBB.begin(), E = ContMBB.end(); It != E; ++It) { - (*It)->removeSuccessor(LoopHeader); + (*It)->removeSuccessor(LoopHeader, true); } numLoopcontPatternMatch += NumCont; @@ -1353,7 +1353,7 @@ int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB, // If MigrateTrue is true, then TrueBB is the block being "branched into" // and if MigrateFalse is true, then FalseBB is the block being // "branched into" - // + // // Here is the pseudo code for how I think the optimization should work: // 1. Insert MOV GPR0, 0 before the branch instruction in diamond_head. // 2. Insert MOV GPR0, 1 before the branch instruction in branch_from. @@ -1372,7 +1372,7 @@ int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB, // the late machine optimization passes, however if we implement // bool TargetRegisterInfo::requiresRegisterScavenging( // const MachineFunction &MF) - // and have it return true, liveness will be tracked correctly + // and have it return true, liveness will be tracked correctly // by generic optimization passes. We will also need to make sure that // all of our target-specific passes that run after regalloc and before // the CFGStructurizer track liveness and we will need to modify this pass @@ -1487,7 +1487,7 @@ void AMDGPUCFGStructurizer::mergeSerialBlock(MachineBasicBlock *DstMBB, ); DstMBB->splice(DstMBB->end(), SrcMBB, SrcMBB->begin(), SrcMBB->end()); - DstMBB->removeSuccessor(SrcMBB); + DstMBB->removeSuccessor(SrcMBB, true); cloneSuccessorList(DstMBB, SrcMBB); removeSuccessor(SrcMBB); @@ -1537,9 +1537,9 @@ void AMDGPUCFGStructurizer::mergeIfthenelseBlock(MachineInstr *BranchMI, if (TrueMBB) { MBB->splice(I, TrueMBB, TrueMBB->begin(), TrueMBB->end()); - MBB->removeSuccessor(TrueMBB); + MBB->removeSuccessor(TrueMBB, true); if (LandMBB && TrueMBB->succ_size()!=0) - TrueMBB->removeSuccessor(LandMBB); + TrueMBB->removeSuccessor(LandMBB, true); retireBlock(TrueMBB); MLI->removeBlock(TrueMBB); } @@ -1548,9 +1548,9 @@ void AMDGPUCFGStructurizer::mergeIfthenelseBlock(MachineInstr *BranchMI, insertInstrBefore(I, AMDGPU::ELSE); MBB->splice(I, FalseMBB, FalseMBB->begin(), FalseMBB->end()); - MBB->removeSuccessor(FalseMBB); + MBB->removeSuccessor(FalseMBB, true); if (LandMBB && FalseMBB->succ_size() != 0) - FalseMBB->removeSuccessor(LandMBB); + FalseMBB->removeSuccessor(LandMBB, true); retireBlock(FalseMBB); MLI->removeBlock(FalseMBB); } @@ -1570,8 +1570,7 @@ void AMDGPUCFGStructurizer::mergeLooplandBlock(MachineBasicBlock *DstBlk, insertInstrBefore(DstBlk, AMDGPU::WHILELOOP, DebugLoc()); insertInstrEnd(DstBlk, AMDGPU::ENDLOOP, DebugLoc()); - DstBlk->addSuccessor(LandMBB); - DstBlk->removeSuccessor(DstBlk); + DstBlk->replaceSuccessor(DstBlk, LandMBB); } @@ -1592,7 +1591,7 @@ void AMDGPUCFGStructurizer::mergeLoopbreakBlock(MachineBasicBlock *ExitingMBB, // now branchInst can be erased safely BranchMI->eraseFromParent(); // now take care of successors, retire blocks - ExitingMBB->removeSuccessor(LandMBB); + ExitingMBB->removeSuccessor(LandMBB, true); } void AMDGPUCFGStructurizer::settleLoopcontBlock(MachineBasicBlock *ContingMBB, @@ -1666,8 +1665,7 @@ AMDGPUCFGStructurizer::cloneBlockForPredecessor(MachineBasicBlock *MBB, replaceInstrUseOfBlockWith(PredMBB, MBB, CloneMBB); //srcBlk, oldBlk, newBlk - PredMBB->removeSuccessor(MBB); -
PredMBB->addSuccessor(CloneMBB); + PredMBB->replaceSuccessor(MBB, CloneMBB); // add all successor to cloneBlk cloneSuccessorList(CloneMBB, MBB); @@ -1695,10 +1693,7 @@ void AMDGPUCFGStructurizer::migrateInstruction(MachineBasicBlock *SrcMBB, ); SpliceEnd = SrcMBB->end(); } else { - DEBUG( - dbgs() << "migrateInstruction see branch instr\n" ; - BranchMI->dump(); - ); + DEBUG(dbgs() << "migrateInstruction see branch instr: " << *BranchMI); SpliceEnd = BranchMI; } DEBUG( @@ -1711,7 +1706,7 @@ void AMDGPUCFGStructurizer::migrateInstruction(MachineBasicBlock *SrcMBB, DEBUG( dbgs() << "migrateInstruction after splice dstSize = " << DstMBB->size() - << "srcSize = " << SrcMBB->size() << "\n"; + << "srcSize = " << SrcMBB->size() << '\n'; ); } @@ -1743,7 +1738,7 @@ void AMDGPUCFGStructurizer::removeUnconditionalBranch(MachineBasicBlock *MBB) { // test_fc_do_while_or.c need to fix the upstream on this to remove the loop. while ((BranchMI = getLoopendBlockBranchInstr(MBB)) && isUncondBranch(BranchMI)) { - DEBUG(dbgs() << "Removing uncond branch instr"; BranchMI->dump();); + DEBUG(dbgs() << "Removing uncond branch instr: " << *BranchMI); BranchMI->eraseFromParent(); } } @@ -1759,10 +1754,10 @@ void AMDGPUCFGStructurizer::removeRedundantConditionalBranch( MachineInstr *BranchMI = getNormalBlockBranchInstr(MBB); assert(BranchMI && isCondBranch(BranchMI)); - DEBUG(dbgs() << "Removing unneeded cond branch instr"; BranchMI->dump();); + DEBUG(dbgs() << "Removing unneeded cond branch instr: " << *BranchMI); BranchMI->eraseFromParent(); SHOWNEWBLK(MBB1, "Removing redundant successor"); - MBB->removeSuccessor(MBB1); + MBB->removeSuccessor(MBB1, true); } void AMDGPUCFGStructurizer::addDummyExitBlock( diff --git a/contrib/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/contrib/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 2018983..d9f753f 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -28,7 +28,9 @@ #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/MCSymbolELF.h" #include "llvm/MC/MCTargetAsmParser.h" +#include "llvm/Support/ELF.h" #include "llvm/Support/SourceMgr.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Support/raw_ostream.h" @@ -83,6 +85,7 @@ public: unsigned RegNo; int Modifiers; const MCRegisterInfo *TRI; + const MCSubtargetInfo *STI; bool IsForcedVOP3; }; @@ -102,7 +105,7 @@ public: } void addRegOperands(MCInst &Inst, unsigned N) const { - Inst.addOperand(MCOperand::createReg(getReg())); + Inst.addOperand(MCOperand::createReg(AMDGPU::getMCReg(getReg(), *Reg.STI))); } void addRegOrImmOperands(MCInst &Inst, unsigned N) const { @@ -215,6 +218,10 @@ public: (isReg() && isRegClass(AMDGPU::SReg_64RegClassID)); } + bool isSCSrc64() const { + return (isReg() && isRegClass(AMDGPU::SReg_64RegClassID)) || isInlineImm(); + } + bool isVCSrc32() const { return isInlineImm() || (isReg() && isRegClass(AMDGPU::VS_32RegClassID)); } @@ -251,7 +258,22 @@ public: return EndLoc; } - void print(raw_ostream &OS) const override { } + void print(raw_ostream &OS) const override { + switch (Kind) { + case Register: + OS << "<register " << getReg() << " mods: " << Reg.Modifiers << '>'; + break; + case Immediate: + OS << getImm(); + break; + case Token: + OS << '\'' << getToken() << '\''; + break; + case Expression: + OS << "<expr " << *Expr << '>'; + break; + } + } static std::unique_ptr<AMDGPUOperand> CreateImm(int64_t Val, SMLoc 
Loc, enum ImmTy Type = ImmTyNone, @@ -278,10 +300,12 @@ public: static std::unique_ptr<AMDGPUOperand> CreateReg(unsigned RegNo, SMLoc S, SMLoc E, const MCRegisterInfo *TRI, + const MCSubtargetInfo *STI, bool ForceVOP3) { auto Op = llvm::make_unique<AMDGPUOperand>(Register); Op->Reg.RegNo = RegNo; Op->Reg.TRI = TRI; + Op->Reg.STI = STI; Op->Reg.Modifiers = -1; Op->Reg.IsForcedVOP3 = ForceVOP3; Op->StartLoc = S; @@ -301,14 +325,32 @@ public: bool isDSOffset01() const; bool isSWaitCnt() const; bool isMubufOffset() const; + bool isSMRDOffset() const; + bool isSMRDLiteralOffset() const; }; class AMDGPUAsmParser : public MCTargetAsmParser { - MCSubtargetInfo &STI; const MCInstrInfo &MII; MCAsmParser &Parser; unsigned ForcedEncodingSize; + + bool isSI() const { + return AMDGPU::isSI(getSTI()); + } + + bool isCI() const { + return AMDGPU::isCI(getSTI()); + } + + bool isVI() const { + return AMDGPU::isVI(getSTI()); + } + + bool hasSGPR102_SGPR103() const { + return !isVI(); + } + /// @name Auto-generated Match Functions /// { @@ -323,20 +365,34 @@ private: bool ParseDirectiveHSACodeObjectISA(); bool ParseAMDKernelCodeTValue(StringRef ID, amd_kernel_code_t &Header); bool ParseDirectiveAMDKernelCodeT(); + bool ParseSectionDirectiveHSAText(); + bool subtargetHasRegister(const MCRegisterInfo &MRI, unsigned RegNo) const; + bool ParseDirectiveAMDGPUHsaKernel(); + bool ParseDirectiveAMDGPUHsaModuleGlobal(); + bool ParseDirectiveAMDGPUHsaProgramGlobal(); + bool ParseSectionDirectiveHSADataGlobalAgent(); + bool ParseSectionDirectiveHSADataGlobalProgram(); + bool ParseSectionDirectiveHSARodataReadonlyAgent(); public: - AMDGPUAsmParser(MCSubtargetInfo &STI, MCAsmParser &_Parser, +public: + enum AMDGPUMatchResultTy { + Match_PreferE32 = FIRST_TARGET_MATCH_RESULT_TY + }; + + AMDGPUAsmParser(const MCSubtargetInfo &STI, MCAsmParser &_Parser, const MCInstrInfo &MII, const MCTargetOptions &Options) - : MCTargetAsmParser(), STI(STI), MII(MII), Parser(_Parser), - ForcedEncodingSize(0){ + : MCTargetAsmParser(Options, STI), MII(MII), Parser(_Parser), + ForcedEncodingSize(0) { + MCAsmParserExtension::Initialize(Parser); - if (STI.getFeatureBits().none()) { + if (getSTI().getFeatureBits().none()) { // Set default features. 
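// (Editor's note, with an assumed invocation for illustration:) this path is
// taken when no CPU was requested, e.g. a bare `llvm-mc -triple amdgcn--` run
// leaves the feature bits empty, and the toggle below selects
// SOUTHERN_ISLANDS so the generated match tables are computed against a
// concrete ISA.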
- STI.ToggleFeature("SOUTHERN_ISLANDS"); + copySTI().ToggleFeature("SOUTHERN_ISLANDS"); } - setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits())); + setAvailableFeatures(ComputeAvailableFeatures(getSTI().getFeatureBits())); } AMDGPUTargetStreamer &getTargetStreamer() { @@ -420,10 +476,10 @@ struct OptionalOperand { } -static unsigned getRegClass(bool IsVgpr, unsigned RegWidth) { +static int getRegClass(bool IsVgpr, unsigned RegWidth) { if (IsVgpr) { switch (RegWidth) { - default: llvm_unreachable("Unknown register width"); + default: return -1; case 1: return AMDGPU::VGPR_32RegClassID; case 2: return AMDGPU::VReg_64RegClassID; case 3: return AMDGPU::VReg_96RegClassID; @@ -434,7 +490,7 @@ static unsigned getRegClass(bool IsVgpr, unsigned RegWidth) { } switch (RegWidth) { - default: llvm_unreachable("Unknown register width"); + default: return -1; case 1: return AMDGPU::SGPR_32RegClassID; case 2: return AMDGPU::SGPR_64RegClassID; case 4: return AMDGPU::SReg_128RegClassID; @@ -443,16 +499,16 @@ static unsigned getRegClass(bool IsVgpr, unsigned RegWidth) { } } -static unsigned getRegForName(const StringRef &RegName) { +static unsigned getRegForName(StringRef RegName) { return StringSwitch<unsigned>(RegName) .Case("exec", AMDGPU::EXEC) .Case("vcc", AMDGPU::VCC) - .Case("flat_scr", AMDGPU::FLAT_SCR) + .Case("flat_scratch", AMDGPU::FLAT_SCR) .Case("m0", AMDGPU::M0) .Case("scc", AMDGPU::SCC) - .Case("flat_scr_lo", AMDGPU::FLAT_SCR_LO) - .Case("flat_scr_hi", AMDGPU::FLAT_SCR_HI) + .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO) + .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI) .Case("vcc_lo", AMDGPU::VCC_LO) .Case("vcc_hi", AMDGPU::VCC_HI) .Case("exec_lo", AMDGPU::EXEC_LO) @@ -464,12 +520,14 @@ bool AMDGPUAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &End const AsmToken Tok = Parser.getTok(); StartLoc = Tok.getLoc(); EndLoc = Tok.getEndLoc(); - const StringRef &RegName = Tok.getString(); + const MCRegisterInfo *TRI = getContext().getRegisterInfo(); + + StringRef RegName = Tok.getString(); RegNo = getRegForName(RegName); if (RegNo) { Parser.Lex(); - return false; + return !subtargetHasRegister(*TRI, RegNo); } // Match vgprs and sgprs @@ -514,16 +572,24 @@ bool AMDGPUAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &End RegIndexInClass = RegLo; } else { // SGPR registers are aligned. Max alignment is 4 dwords. 
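// (Editor's worked example:) for `s[4:7]`, RegWidth = 4, so
// Size = min(4, 4) = 4 and RegLo = 4 is a multiple of Size, giving
// RegIndexInClass = 4 / 4 = 1 in SGPR_128. A misaligned range such as
// `s[3:6]` now fails the new `RegLo % Size` check below, where the old
// integer division would have silently truncated it to s[0:3].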
- RegIndexInClass = RegLo / std::min(RegWidth, 4u); + unsigned Size = std::min(RegWidth, 4u); + if (RegLo % Size != 0) + return true; + + RegIndexInClass = RegLo / Size; } } - const MCRegisterInfo *TRC = getContext().getRegisterInfo(); - unsigned RC = getRegClass(IsVgpr, RegWidth); - if (RegIndexInClass > TRC->getRegClass(RC).getNumRegs()) + int RCID = getRegClass(IsVgpr, RegWidth); + if (RCID == -1) return true; - RegNo = TRC->getRegClass(RC).getRegister(RegIndexInClass); - return false; + + const MCRegisterClass RC = TRI->getRegClass(RCID); + if (RegIndexInClass >= RC.getNumRegs()) + return true; + + RegNo = RC.getRegister(RegIndexInClass); + return !subtargetHasRegister(*TRI, RegNo); } unsigned AMDGPUAsmParser::checkTargetMatchPredicate(MCInst &Inst) { @@ -534,6 +600,11 @@ unsigned AMDGPUAsmParser::checkTargetMatchPredicate(MCInst &Inst) { (getForcedEncodingSize() == 64 && !(TSFlags & SIInstrFlags::VOP3))) return Match_InvalidOperand; + if ((TSFlags & SIInstrFlags::VOP3) && + (TSFlags & SIInstrFlags::VOPAsmPrefer32Bit) && + getForcedEncodingSize() != 64) + return Match_PreferE32; + return Match_Success; } @@ -549,7 +620,7 @@ bool AMDGPUAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, default: break; case Match_Success: Inst.setLoc(IDLoc); - Out.EmitInstruction(Inst, STI); + Out.EmitInstruction(Inst, getSTI()); return false; case Match_MissingFeature: return Error(IDLoc, "instruction not supported on this GPU"); @@ -592,6 +663,9 @@ bool AMDGPUAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, } return Error(ErrorLoc, "invalid operand for instruction"); } + case Match_PreferE32: + return Error(IDLoc, "internal error: instruction without _e64 suffix " + "should be encoded as e32"); } llvm_unreachable("Implement any new match types added!"); } @@ -640,7 +714,7 @@ bool AMDGPUAsmParser::ParseDirectiveHSACodeObjectISA() { // If this directive has no arguments, then use the ISA version for the // targeted GPU. 
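// (Editor's sketch:) e.g. when assembling for fiji (ISA 8.0.1 per the
// Processors.td hunk later in this change), a bare `.hsa_code_object_isa`
// directive expands to the equivalent of:
//   .hsa_code_object_isa 8,0,1,"AMD","AMDGPU"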
if (getLexer().is(AsmToken::EndOfStatement)) { - AMDGPU::IsaVersion Isa = AMDGPU::getIsaVersion(STI.getFeatureBits()); + AMDGPU::IsaVersion Isa = AMDGPU::getIsaVersion(getSTI().getFeatureBits()); getTargetStreamer().EmitDirectiveHSACodeObjectISA(Isa.Major, Isa.Minor, Isa.Stepping, "AMD", "AMDGPU"); @@ -852,7 +926,7 @@ bool AMDGPUAsmParser::ParseAMDKernelCodeTValue(StringRef ID, bool AMDGPUAsmParser::ParseDirectiveAMDKernelCodeT() { amd_kernel_code_t Header; - AMDGPU::initDefaultAMDKernelCodeT(Header, STI.getFeatureBits()); + AMDGPU::initDefaultAMDKernelCodeT(Header, getSTI().getFeatureBits()); while (true) { @@ -882,6 +956,64 @@ bool AMDGPUAsmParser::ParseDirectiveAMDKernelCodeT() { return false; } +bool AMDGPUAsmParser::ParseSectionDirectiveHSAText() { + getParser().getStreamer().SwitchSection( + AMDGPU::getHSATextSection(getContext())); + return false; +} + +bool AMDGPUAsmParser::ParseDirectiveAMDGPUHsaKernel() { + if (getLexer().isNot(AsmToken::Identifier)) + return TokError("expected symbol name"); + + StringRef KernelName = Parser.getTok().getString(); + + getTargetStreamer().EmitAMDGPUSymbolType(KernelName, + ELF::STT_AMDGPU_HSA_KERNEL); + Lex(); + return false; +} + +bool AMDGPUAsmParser::ParseDirectiveAMDGPUHsaModuleGlobal() { + if (getLexer().isNot(AsmToken::Identifier)) + return TokError("expected symbol name"); + + StringRef GlobalName = Parser.getTok().getIdentifier(); + + getTargetStreamer().EmitAMDGPUHsaModuleScopeGlobal(GlobalName); + Lex(); + return false; +} + +bool AMDGPUAsmParser::ParseDirectiveAMDGPUHsaProgramGlobal() { + if (getLexer().isNot(AsmToken::Identifier)) + return TokError("expected symbol name"); + + StringRef GlobalName = Parser.getTok().getIdentifier(); + + getTargetStreamer().EmitAMDGPUHsaProgramScopeGlobal(GlobalName); + Lex(); + return false; +} + +bool AMDGPUAsmParser::ParseSectionDirectiveHSADataGlobalAgent() { + getParser().getStreamer().SwitchSection( + AMDGPU::getHSADataGlobalAgentSection(getContext())); + return false; +} + +bool AMDGPUAsmParser::ParseSectionDirectiveHSADataGlobalProgram() { + getParser().getStreamer().SwitchSection( + AMDGPU::getHSADataGlobalProgramSection(getContext())); + return false; +} + +bool AMDGPUAsmParser::ParseSectionDirectiveHSARodataReadonlyAgent() { + getParser().getStreamer().SwitchSection( + AMDGPU::getHSARodataReadonlyAgentSection(getContext())); + return false; +} + bool AMDGPUAsmParser::ParseDirective(AsmToken DirectiveID) { StringRef IDVal = DirectiveID.getString(); @@ -894,6 +1026,55 @@ bool AMDGPUAsmParser::ParseDirective(AsmToken DirectiveID) { if (IDVal == ".amd_kernel_code_t") return ParseDirectiveAMDKernelCodeT(); + if (IDVal == ".hsatext" || IDVal == ".text") + return ParseSectionDirectiveHSAText(); + + if (IDVal == ".amdgpu_hsa_kernel") + return ParseDirectiveAMDGPUHsaKernel(); + + if (IDVal == ".amdgpu_hsa_module_global") + return ParseDirectiveAMDGPUHsaModuleGlobal(); + + if (IDVal == ".amdgpu_hsa_program_global") + return ParseDirectiveAMDGPUHsaProgramGlobal(); + + if (IDVal == ".hsadata_global_agent") + return ParseSectionDirectiveHSADataGlobalAgent(); + + if (IDVal == ".hsadata_global_program") + return ParseSectionDirectiveHSADataGlobalProgram(); + + if (IDVal == ".hsarodata_readonly_agent") + return ParseSectionDirectiveHSARodataReadonlyAgent(); + + return true; +} + +bool AMDGPUAsmParser::subtargetHasRegister(const MCRegisterInfo &MRI, + unsigned RegNo) const { + if (isCI()) + return true; + + if (isSI()) { + // No flat_scr + switch (RegNo) { + case AMDGPU::FLAT_SCR: + case AMDGPU::FLAT_SCR_LO: + 
case AMDGPU::FLAT_SCR_HI:
+      return false;
+    default:
+      return true;
+    }
+  }
+
+  // VI only has 102 SGPRs, so make sure we aren't trying to use the 2 more that
+  // SI/CI have.
+  for (MCRegAliasIterator R(AMDGPU::SGPR102_SGPR103, &MRI, true);
+       R.isValid(); ++R) {
+    if (*R == RegNo)
+      return false;
+  }
+
   return true;
 }

@@ -943,13 +1124,11 @@ AMDGPUAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) {
       int64_t IntVal;
       if (getParser().parseAbsoluteExpression(IntVal))
         return MatchOperand_ParseFail;
-      APInt IntVal32(32, IntVal);
-      if (IntVal32.getSExtValue() != IntVal) {
+      if (!isInt<32>(IntVal) && !isUInt<32>(IntVal)) {
         Error(S, "invalid immediate: only 32-bit values are legal");
         return MatchOperand_ParseFail;
       }

-      IntVal = IntVal32.getSExtValue();
       if (Negate)
         IntVal *= -1;
       Operands.push_back(AMDGPUOperand::CreateImm(IntVal, S));
@@ -1002,7 +1181,7 @@ AMDGPUAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) {

       Operands.push_back(AMDGPUOperand::CreateReg(
-          RegNo, S, E, getContext().getRegisterInfo(),
+          RegNo, S, E, getContext().getRegisterInfo(), &getSTI(),
           isForcedVOP3()));

       if (HasModifiers || Modifiers) {
@@ -1571,6 +1750,23 @@ AMDGPUAsmParser::parseR128(OperandVector &Operands) {
 }

 //===----------------------------------------------------------------------===//
+// smrd
+//===----------------------------------------------------------------------===//
+
+bool AMDGPUOperand::isSMRDOffset() const {
+
+  // FIXME: Support 20-bit offsets on VI. We need to pass subtarget
+  // information here.
+  return isImm() && isUInt<8>(getImm());
+}
+
+bool AMDGPUOperand::isSMRDLiteralOffset() const {
+  // 32-bit literals are only supported on CI and we only want to use them
+  // when the offset is > 8 bits.
+  return isImm() && !isUInt<8>(getImm()) && isUInt<32>(getImm());
+}
+
+//===----------------------------------------------------------------------===//
 // vop3
 //===----------------------------------------------------------------------===//

@@ -1653,8 +1849,12 @@ AMDGPUAsmParser::parseVOP3OptionalOps(OperandVector &Operands) {
 }

 void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands) {
-  ((AMDGPUOperand &)*Operands[1]).addRegOperands(Inst, 1);
-  unsigned i = 2;
+
+  unsigned i = 1;
+  const MCInstrDesc &Desc = MII.get(Inst.getOpcode());
+  if (Desc.getNumDefs() > 0) {
+    ((AMDGPUOperand &)*Operands[i++]).addRegOperands(Inst, 1);
+  }

   std::map<enum AMDGPUOperand::ImmTy, unsigned> OptionalIdx;

diff --git a/contrib/llvm/lib/Target/AMDGPU/CIInstructions.td b/contrib/llvm/lib/Target/AMDGPU/CIInstructions.td
index 2f5fdbe..88a090d 100644
--- a/contrib/llvm/lib/Target/AMDGPU/CIInstructions.td
+++ b/contrib/llvm/lib/Target/AMDGPU/CIInstructions.td
@@ -8,6 +8,22 @@
 //===----------------------------------------------------------------------===//
 // Instruction definitions for CI and newer.
//===----------------------------------------------------------------------===// +// Remaining instructions: +// S_CBRANCH_CDBGUSER +// S_CBRANCH_CDBGSYS +// S_CBRANCH_CDBGSYS_OR_USER +// S_CBRANCH_CDBGSYS_AND_USER +// DS_NOP +// DS_GWS_SEMA_RELEASE_ALL +// DS_WRAP_RTN_B32 +// DS_CNDXCHG32_RTN_B64 +// DS_WRITE_B96 +// DS_WRITE_B128 +// DS_CONDXCHG32_RTN_B128 +// DS_READ_B96 +// DS_READ_B128 +// BUFFER_LOAD_DWORDX3 +// BUFFER_STORE_DWORDX3 def isCIVI : Predicate < @@ -23,6 +39,7 @@ def HasFlatAddressSpace : Predicate<"Subtarget->hasFlatAddressSpace()">; let SubtargetPredicate = isCIVI in { +let SchedRW = [WriteDoubleAdd] in { defm V_TRUNC_F64 : VOP1Inst <vop1<0x17>, "v_trunc_f64", VOP_F64_F64, ftrunc >; @@ -35,82 +52,218 @@ defm V_FLOOR_F64 : VOP1Inst <vop1<0x1A>, "v_floor_f64", defm V_RNDNE_F64 : VOP1Inst <vop1<0x19>, "v_rndne_f64", VOP_F64_F64, frint >; +} // End SchedRW = [WriteDoubleAdd] + +let SchedRW = [WriteQuarterRate32] in { defm V_LOG_LEGACY_F32 : VOP1Inst <vop1<0x45, 0x4c>, "v_log_legacy_f32", VOP_F32_F32 >; defm V_EXP_LEGACY_F32 : VOP1Inst <vop1<0x46, 0x4b>, "v_exp_legacy_f32", VOP_F32_F32 >; +} // End SchedRW = [WriteQuarterRate32] + +//===----------------------------------------------------------------------===// +// VOP3 Instructions +//===----------------------------------------------------------------------===// + +defm V_QSAD_PK_U16_U8 : VOP3Inst <vop3<0x173>, "v_qsad_pk_u16_u8", + VOP_I32_I32_I32 +>; +defm V_MQSAD_U16_U8 : VOP3Inst <vop3<0x172>, "v_mqsad_u16_u8", + VOP_I32_I32_I32 +>; +defm V_MQSAD_U32_U8 : VOP3Inst <vop3<0x175>, "v_mqsad_u32_u8", + VOP_I32_I32_I32 +>; + +let isCommutable = 1 in { +defm V_MAD_U64_U32 : VOP3Inst <vop3<0x176>, "v_mad_u64_u32", + VOP_I64_I32_I32_I64 +>; + +// XXX - Does this set VCC? +defm V_MAD_I64_I32 : VOP3Inst <vop3<0x177>, "v_mad_i64_i32", + VOP_I64_I32_I32_I64 +>; +} // End isCommutable = 1 + + +//===----------------------------------------------------------------------===// +// DS Instructions +//===----------------------------------------------------------------------===// +defm DS_WRAP_RTN_F32 : DS_1A1D_RET <0x34, "ds_wrap_rtn_f32", VGPR_32, "ds_wrap_f32">; + +// DS_CONDXCHG32_RTN_B64 +// DS_CONDXCHG32_RTN_B128 + +//===----------------------------------------------------------------------===// +// SMRD Instructions +//===----------------------------------------------------------------------===// + +defm S_DCACHE_INV_VOL : SMRD_Inval <smrd<0x1d, 0x22>, + "s_dcache_inv_vol", int_amdgcn_s_dcache_inv_vol>; + +//===----------------------------------------------------------------------===// +// MUBUF Instructions +//===----------------------------------------------------------------------===// + +defm BUFFER_WBINVL1_VOL : MUBUF_Invalidate <mubuf<0x70, 0x3f>, + "buffer_wbinvl1_vol", int_amdgcn_buffer_wbinvl1_vol +>; //===----------------------------------------------------------------------===// // Flat Instructions //===----------------------------------------------------------------------===// -def FLAT_LOAD_UBYTE : FLAT_Load_Helper <0x8, "flat_load_ubyte", VGPR_32>; -def FLAT_LOAD_SBYTE : FLAT_Load_Helper <0x9, "flat_load_sbyte", VGPR_32>; -def FLAT_LOAD_USHORT : FLAT_Load_Helper <0xa, "flat_load_ushort", VGPR_32>; -def FLAT_LOAD_SSHORT : FLAT_Load_Helper <0xb, "flat_load_sshort", VGPR_32>; -def FLAT_LOAD_DWORD : FLAT_Load_Helper <0xc, "flat_load_dword", VGPR_32>; -def FLAT_LOAD_DWORDX2 : FLAT_Load_Helper <0xd, "flat_load_dwordx2", VReg_64>; -def FLAT_LOAD_DWORDX4 : FLAT_Load_Helper <0xe, "flat_load_dwordx4", VReg_128>; -def 
FLAT_LOAD_DWORDX3 : FLAT_Load_Helper <0xf, "flat_load_dwordx3", VReg_96>; -def FLAT_STORE_BYTE : FLAT_Store_Helper <0x18, "flat_store_byte", VGPR_32>; -def FLAT_STORE_SHORT : FLAT_Store_Helper <0x1a, "flat_store_short", VGPR_32>; -def FLAT_STORE_DWORD : FLAT_Store_Helper <0x1c, "flat_store_dword", VGPR_32>; -def FLAT_STORE_DWORDX2 : FLAT_Store_Helper < - 0x1d, "flat_store_dwordx2", VReg_64 +defm FLAT_LOAD_UBYTE : FLAT_Load_Helper < + flat<0x8, 0x10>, "flat_load_ubyte", VGPR_32 +>; +defm FLAT_LOAD_SBYTE : FLAT_Load_Helper < + flat<0x9, 0x11>, "flat_load_sbyte", VGPR_32 +>; +defm FLAT_LOAD_USHORT : FLAT_Load_Helper < + flat<0xa, 0x12>, "flat_load_ushort", VGPR_32 +>; +defm FLAT_LOAD_SSHORT : FLAT_Load_Helper < + flat<0xb, 0x13>, "flat_load_sshort", VGPR_32> +; +defm FLAT_LOAD_DWORD : FLAT_Load_Helper < + flat<0xc, 0x14>, "flat_load_dword", VGPR_32 +>; +defm FLAT_LOAD_DWORDX2 : FLAT_Load_Helper < + flat<0xd, 0x15>, "flat_load_dwordx2", VReg_64 +>; +defm FLAT_LOAD_DWORDX4 : FLAT_Load_Helper < + flat<0xe, 0x17>, "flat_load_dwordx4", VReg_128 +>; +defm FLAT_LOAD_DWORDX3 : FLAT_Load_Helper < + flat<0xf, 0x16>, "flat_load_dwordx3", VReg_96 +>; +defm FLAT_STORE_BYTE : FLAT_Store_Helper < + flat<0x18>, "flat_store_byte", VGPR_32 +>; +defm FLAT_STORE_SHORT : FLAT_Store_Helper < + flat <0x1a>, "flat_store_short", VGPR_32 +>; +defm FLAT_STORE_DWORD : FLAT_Store_Helper < + flat<0x1c>, "flat_store_dword", VGPR_32 +>; +defm FLAT_STORE_DWORDX2 : FLAT_Store_Helper < + flat<0x1d>, "flat_store_dwordx2", VReg_64 +>; +defm FLAT_STORE_DWORDX4 : FLAT_Store_Helper < + flat<0x1e, 0x1f>, "flat_store_dwordx4", VReg_128 >; -def FLAT_STORE_DWORDX4 : FLAT_Store_Helper < - 0x1e, "flat_store_dwordx4", VReg_128 +defm FLAT_STORE_DWORDX3 : FLAT_Store_Helper < + flat<0x1f, 0x1e>, "flat_store_dwordx3", VReg_96 >; -def FLAT_STORE_DWORDX3 : FLAT_Store_Helper < - 0x1f, "flat_store_dwordx3", VReg_96 +defm FLAT_ATOMIC_SWAP : FLAT_ATOMIC < + flat<0x30, 0x40>, "flat_atomic_swap", VGPR_32 >; -defm FLAT_ATOMIC_SWAP : FLAT_ATOMIC <0x30, "flat_atomic_swap", VGPR_32>; defm FLAT_ATOMIC_CMPSWAP : FLAT_ATOMIC < - 0x31, "flat_atomic_cmpswap", VGPR_32, VReg_64 ->; -defm FLAT_ATOMIC_ADD : FLAT_ATOMIC <0x32, "flat_atomic_add", VGPR_32>; -defm FLAT_ATOMIC_SUB : FLAT_ATOMIC <0x33, "flat_atomic_sub", VGPR_32>; -defm FLAT_ATOMIC_RSUB : FLAT_ATOMIC <0x34, "flat_atomic_rsub", VGPR_32>; -defm FLAT_ATOMIC_SMIN : FLAT_ATOMIC <0x35, "flat_atomic_smin", VGPR_32>; -defm FLAT_ATOMIC_UMIN : FLAT_ATOMIC <0x36, "flat_atomic_umin", VGPR_32>; -defm FLAT_ATOMIC_SMAX : FLAT_ATOMIC <0x37, "flat_atomic_smax", VGPR_32>; -defm FLAT_ATOMIC_UMAX : FLAT_ATOMIC <0x38, "flat_atomic_umax", VGPR_32>; -defm FLAT_ATOMIC_AND : FLAT_ATOMIC <0x39, "flat_atomic_and", VGPR_32>; -defm FLAT_ATOMIC_OR : FLAT_ATOMIC <0x3a, "flat_atomic_or", VGPR_32>; -defm FLAT_ATOMIC_XOR : FLAT_ATOMIC <0x3b, "flat_atomic_xor", VGPR_32>; -defm FLAT_ATOMIC_INC : FLAT_ATOMIC <0x3c, "flat_atomic_inc", VGPR_32>; -defm FLAT_ATOMIC_DEC : FLAT_ATOMIC <0x3d, "flat_atomic_dec", VGPR_32>; -defm FLAT_ATOMIC_FCMPSWAP : FLAT_ATOMIC < - 0x3e, "flat_atomic_fcmpswap", VGPR_32, VReg_64 + flat<0x31, 0x41>, "flat_atomic_cmpswap", VGPR_32, VReg_64 +>; +defm FLAT_ATOMIC_ADD : FLAT_ATOMIC < + flat<0x32, 0x42>, "flat_atomic_add", VGPR_32 +>; +defm FLAT_ATOMIC_SUB : FLAT_ATOMIC < + flat<0x33, 0x43>, "flat_atomic_sub", VGPR_32 +>; +defm FLAT_ATOMIC_SMIN : FLAT_ATOMIC < + flat<0x35, 0x44>, "flat_atomic_smin", VGPR_32 +>; +defm FLAT_ATOMIC_UMIN : FLAT_ATOMIC < + flat<0x36, 0x45>, "flat_atomic_umin", VGPR_32 +>; +defm 
FLAT_ATOMIC_SMAX : FLAT_ATOMIC < + flat<0x37, 0x46>, "flat_atomic_smax", VGPR_32 +>; +defm FLAT_ATOMIC_UMAX : FLAT_ATOMIC < + flat<0x38, 0x47>, "flat_atomic_umax", VGPR_32 +>; +defm FLAT_ATOMIC_AND : FLAT_ATOMIC < + flat<0x39, 0x48>, "flat_atomic_and", VGPR_32 +>; +defm FLAT_ATOMIC_OR : FLAT_ATOMIC < + flat<0x3a, 0x49>, "flat_atomic_or", VGPR_32 +>; +defm FLAT_ATOMIC_XOR : FLAT_ATOMIC < + flat<0x3b, 0x4a>, "flat_atomic_xor", VGPR_32 +>; +defm FLAT_ATOMIC_INC : FLAT_ATOMIC < + flat<0x3c, 0x4b>, "flat_atomic_inc", VGPR_32 +>; +defm FLAT_ATOMIC_DEC : FLAT_ATOMIC < + flat<0x3d, 0x4c>, "flat_atomic_dec", VGPR_32 +>; +defm FLAT_ATOMIC_SWAP_X2 : FLAT_ATOMIC < + flat<0x50, 0x60>, "flat_atomic_swap_x2", VReg_64 >; -defm FLAT_ATOMIC_FMIN : FLAT_ATOMIC <0x3f, "flat_atomic_fmin", VGPR_32>; -defm FLAT_ATOMIC_FMAX : FLAT_ATOMIC <0x40, "flat_atomic_fmax", VGPR_32>; -defm FLAT_ATOMIC_SWAP_X2 : FLAT_ATOMIC <0x50, "flat_atomic_swap_x2", VReg_64>; defm FLAT_ATOMIC_CMPSWAP_X2 : FLAT_ATOMIC < - 0x51, "flat_atomic_cmpswap_x2", VReg_64, VReg_128 ->; -defm FLAT_ATOMIC_ADD_X2 : FLAT_ATOMIC <0x52, "flat_atomic_add_x2", VReg_64>; -defm FLAT_ATOMIC_SUB_X2 : FLAT_ATOMIC <0x53, "flat_atomic_sub_x2", VReg_64>; -defm FLAT_ATOMIC_RSUB_X2 : FLAT_ATOMIC <0x54, "flat_atomic_rsub_x2", VReg_64>; -defm FLAT_ATOMIC_SMIN_X2 : FLAT_ATOMIC <0x55, "flat_atomic_smin_x2", VReg_64>; -defm FLAT_ATOMIC_UMIN_X2 : FLAT_ATOMIC <0x56, "flat_atomic_umin_x2", VReg_64>; -defm FLAT_ATOMIC_SMAX_X2 : FLAT_ATOMIC <0x57, "flat_atomic_smax_x2", VReg_64>; -defm FLAT_ATOMIC_UMAX_X2 : FLAT_ATOMIC <0x58, "flat_atomic_umax_x2", VReg_64>; -defm FLAT_ATOMIC_AND_X2 : FLAT_ATOMIC <0x59, "flat_atomic_and_x2", VReg_64>; -defm FLAT_ATOMIC_OR_X2 : FLAT_ATOMIC <0x5a, "flat_atomic_or_x2", VReg_64>; -defm FLAT_ATOMIC_XOR_X2 : FLAT_ATOMIC <0x5b, "flat_atomic_xor_x2", VReg_64>; -defm FLAT_ATOMIC_INC_X2 : FLAT_ATOMIC <0x5c, "flat_atomic_inc_x2", VReg_64>; -defm FLAT_ATOMIC_DEC_X2 : FLAT_ATOMIC <0x5d, "flat_atomic_dec_x2", VReg_64>; -defm FLAT_ATOMIC_FCMPSWAP_X2 : FLAT_ATOMIC < - 0x5e, "flat_atomic_fcmpswap_x2", VReg_64, VReg_128 + flat<0x51, 0x61>, "flat_atomic_cmpswap_x2", VReg_64, VReg_128 +>; +defm FLAT_ATOMIC_ADD_X2 : FLAT_ATOMIC < + flat<0x52, 0x62>, "flat_atomic_add_x2", VReg_64 +>; +defm FLAT_ATOMIC_SUB_X2 : FLAT_ATOMIC < + flat<0x53, 0x63>, "flat_atomic_sub_x2", VReg_64 +>; +defm FLAT_ATOMIC_SMIN_X2 : FLAT_ATOMIC < + flat<0x55, 0x64>, "flat_atomic_smin_x2", VReg_64 +>; +defm FLAT_ATOMIC_UMIN_X2 : FLAT_ATOMIC < + flat<0x56, 0x65>, "flat_atomic_umin_x2", VReg_64 +>; +defm FLAT_ATOMIC_SMAX_X2 : FLAT_ATOMIC < + flat<0x57, 0x66>, "flat_atomic_smax_x2", VReg_64 +>; +defm FLAT_ATOMIC_UMAX_X2 : FLAT_ATOMIC < + flat<0x58, 0x67>, "flat_atomic_umax_x2", VReg_64 +>; +defm FLAT_ATOMIC_AND_X2 : FLAT_ATOMIC < + flat<0x59, 0x68>, "flat_atomic_and_x2", VReg_64 +>; +defm FLAT_ATOMIC_OR_X2 : FLAT_ATOMIC < + flat<0x5a, 0x69>, "flat_atomic_or_x2", VReg_64 +>; +defm FLAT_ATOMIC_XOR_X2 : FLAT_ATOMIC < + flat<0x5b, 0x6a>, "flat_atomic_xor_x2", VReg_64 +>; +defm FLAT_ATOMIC_INC_X2 : FLAT_ATOMIC < + flat<0x5c, 0x6b>, "flat_atomic_inc_x2", VReg_64 +>; +defm FLAT_ATOMIC_DEC_X2 : FLAT_ATOMIC < + flat<0x5d, 0x6c>, "flat_atomic_dec_x2", VReg_64 >; -defm FLAT_ATOMIC_FMIN_X2 : FLAT_ATOMIC <0x5f, "flat_atomic_fmin_x2", VReg_64>; -defm FLAT_ATOMIC_FMAX_X2 : FLAT_ATOMIC <0x60, "flat_atomic_fmax_x2", VReg_64>; } // End SubtargetPredicate = isCIVI +// CI Only flat instructions + +let SubtargetPredicate = isCI, VIAssemblerPredicate = DisableInst in { + +defm FLAT_ATOMIC_FCMPSWAP : FLAT_ATOMIC < + 
flat<0x3e>, "flat_atomic_fcmpswap", VGPR_32, VReg_64 +>; +defm FLAT_ATOMIC_FMIN : FLAT_ATOMIC < + flat<0x3f>, "flat_atomic_fmin", VGPR_32 +>; +defm FLAT_ATOMIC_FMAX : FLAT_ATOMIC < + flat<0x40>, "flat_atomic_fmax", VGPR_32 +>; +defm FLAT_ATOMIC_FCMPSWAP_X2 : FLAT_ATOMIC < + flat<0x5e>, "flat_atomic_fcmpswap_x2", VReg_64, VReg_128 +>; +defm FLAT_ATOMIC_FMIN_X2 : FLAT_ATOMIC < + flat<0x5f>, "flat_atomic_fmin_x2", VReg_64 +>; +defm FLAT_ATOMIC_FMAX_X2 : FLAT_ATOMIC < + flat<0x60>, "flat_atomic_fmax_x2", VReg_64 +>; + +} // End let SubtargetPredicate = isCI, VIAssemblerPredicate = DisableInst + //===----------------------------------------------------------------------===// // Flat Patterns //===----------------------------------------------------------------------===// @@ -147,3 +300,80 @@ def : FLATStore_Pattern <FLAT_STORE_DWORDX4, v4i32, flat_store>; } // End HasFlatAddressSpace predicate +let Predicates = [isCI] in { + +// Convert (x - floor(x)) to fract(x) +def : Pat < + (f32 (fsub (f32 (VOP3Mods f32:$x, i32:$mods)), + (f32 (ffloor (f32 (VOP3Mods f32:$x, i32:$mods)))))), + (V_FRACT_F32_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE) +>; + +// Convert (x + (-floor(x))) to fract(x) +def : Pat < + (f64 (fadd (f64 (VOP3Mods f64:$x, i32:$mods)), + (f64 (fneg (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))))))), + (V_FRACT_F64_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE) +>; + +} // End Predicates = [isCI] + + +//===----------------------------------------------------------------------===// +// Patterns to generate flat for global +//===----------------------------------------------------------------------===// + +def useFlatForGlobal : Predicate < + "Subtarget->useFlatForGlobal() || " + "Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS">; + +let Predicates = [useFlatForGlobal] in { + +// 1. 
Offset as 20bit DWORD immediate +def : Pat < + (SIload_constant v4i32:$sbase, IMM20bit:$offset), + (S_BUFFER_LOAD_DWORD_IMM $sbase, (as_i32imm $offset)) +>; + +// Patterns for global loads with no offset +class FlatLoadPat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat < + (vt (node i64:$addr)), + (inst $addr, 0, 0, 0) +>; + +def : FlatLoadPat <FLAT_LOAD_UBYTE, az_extloadi8_global, i32>; +def : FlatLoadPat <FLAT_LOAD_SBYTE, sextloadi8_global, i32>; +def : FlatLoadPat <FLAT_LOAD_USHORT, az_extloadi16_global, i32>; +def : FlatLoadPat <FLAT_LOAD_SSHORT, sextloadi16_global, i32>; +def : FlatLoadPat <FLAT_LOAD_DWORD, global_load, i32>; +def : FlatLoadPat <FLAT_LOAD_DWORDX2, global_load, v2i32>; +def : FlatLoadPat <FLAT_LOAD_DWORDX4, global_load, v4i32>; + +class FlatStorePat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat < + (node vt:$data, i64:$addr), + (inst $data, $addr, 0, 0, 0) +>; + +def : FlatStorePat <FLAT_STORE_BYTE, truncstorei8_global, i32>; +def : FlatStorePat <FLAT_STORE_SHORT, truncstorei16_global, i32>; +def : FlatStorePat <FLAT_STORE_DWORD, global_store, i32>; +def : FlatStorePat <FLAT_STORE_DWORDX2, global_store, v2i32>; +def : FlatStorePat <FLAT_STORE_DWORDX4, global_store, v4i32>; + +class FlatAtomicPat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat < + (vt (node i64:$addr, vt:$data)), + (inst $addr, $data, 0, 0) +>; + +def : FlatAtomicPat <FLAT_ATOMIC_ADD_RTN, atomic_add_global, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_AND_RTN, atomic_and_global, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_SUB_RTN, atomic_sub_global, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_SMAX_RTN, atomic_max_global, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_UMAX_RTN, atomic_umax_global, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_SMIN_RTN, atomic_min_global, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_UMIN_RTN, atomic_umin_global, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_OR_RTN, atomic_or_global, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_SWAP_RTN, atomic_swap_global, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_XOR_RTN, atomic_xor_global, i32>; + +} // End Predicates = [useFlatForGlobal] diff --git a/contrib/llvm/lib/Target/AMDGPU/CaymanInstructions.td b/contrib/llvm/lib/Target/AMDGPU/CaymanInstructions.td index ba4df82..a6c3785 100644 --- a/contrib/llvm/lib/Target/AMDGPU/CaymanInstructions.td +++ b/contrib/llvm/lib/Target/AMDGPU/CaymanInstructions.td @@ -82,6 +82,10 @@ def RAT_STORE_DWORD32 : RAT_STORE_DWORD <R600_TReg32_X, i32, 0x1>; def RAT_STORE_DWORD64 : RAT_STORE_DWORD <R600_Reg64, v2i32, 0x3>; def RAT_STORE_DWORD128 : RAT_STORE_DWORD <R600_Reg128, v4i32, 0xf>; +def RAT_STORE_TYPED_cm: CF_MEM_RAT_STORE_TYPED<0> { + let eop = 0; // This bit is not used on Cayman. 
+} + class VTX_READ_cm <string name, bits<8> buffer_id, dag outs, list<dag> pattern> : VTX_WORD0_cm, VTX_READ<name, buffer_id, outs, pattern> { diff --git a/contrib/llvm/lib/Target/AMDGPU/EvergreenInstructions.td b/contrib/llvm/lib/Target/AMDGPU/EvergreenInstructions.td index 7adcd46..779a14e 100644 --- a/contrib/llvm/lib/Target/AMDGPU/EvergreenInstructions.td +++ b/contrib/llvm/lib/Target/AMDGPU/EvergreenInstructions.td @@ -40,6 +40,15 @@ class CF_MEM_RAT <bits<6> rat_inst, bits<4> rat_id, dag ins, string name, : EG_CF_RAT <0x56, rat_inst, rat_id, 0xf /* mask */, (outs), ins, "MEM_RAT "#name, pattern>; +class CF_MEM_RAT_STORE_TYPED<bits<1> has_eop> + : CF_MEM_RAT <0x1, ?, (ins R600_Reg128:$rw_gpr, R600_Reg128:$index_gpr, + i32imm:$rat_id, InstFlag:$eop), + "STORE_TYPED RAT($rat_id) $rw_gpr, $index_gpr" + #!if(has_eop, ", $eop", ""), + [(int_r600_rat_store_typed R600_Reg128:$rw_gpr, + R600_Reg128:$index_gpr, + (i32 imm:$rat_id))]>; + def RAT_MSKOR : CF_MEM_RAT <0x11, 0, (ins R600_Reg128:$rw_gpr, R600_TReg32_X:$index_gpr), "MSKOR $rw_gpr.XW, $index_gpr", @@ -105,6 +114,8 @@ def RAT_WRITE_CACHELESS_128_eg : CF_MEM_RAT_CACHELESS <0x2, 0, 0xf, [(global_store v4i32:$rw_gpr, i32:$index_gpr)] >; +def RAT_STORE_TYPED_eg: CF_MEM_RAT_STORE_TYPED<1>; + } // End usesCustomInserter = 1 class VTX_READ_eg <string name, bits<8> buffer_id, dag outs, list<dag> pattern> diff --git a/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp b/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp index e811d5c..a187de8 100644 --- a/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp @@ -16,6 +16,7 @@ #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/Support/MathExtras.h" +#include "llvm/Support/raw_ostream.h" using namespace llvm; @@ -283,8 +284,13 @@ void AMDGPUInstPrinter::printImmediate64(uint64_t Imm, raw_ostream &O) { O << "4.0"; else if (Imm == DoubleToBits(-4.0)) O << "-4.0"; - else - llvm_unreachable("64-bit literal constants not supported"); + else { + assert(isUInt<32>(Imm)); + + // In rare situations, we will have a 32-bit literal in a 64-bit + // operand. This is technically allowed for the encoding of s_mov_b64. 
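// (Editor's illustration, not in the commit:) e.g. `s_mov_b64 s[0:1], 0x1234beef`
// carries a single 32-bit literal in the encoding of the 64-bit operand,
// which is why the assert above only requires isUInt<32> rather than
// rejecting the value outright.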
+ O << formatHex(static_cast<uint64_t>(Imm)); + } } void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, @@ -592,11 +598,11 @@ void AMDGPUInstPrinter::printSendMsg(const MCInst *MI, unsigned OpNo, } else { unsigned Stream = (SImm16 >> 8) & 0x3; if (Op == 1) - O << "cut"; + O << "cut"; else if (Op == 2) - O << "emit"; + O << "emit"; else if (Op == 3) - O << "emit-cut"; + O << "emit-cut"; O << " stream " << Stream; } O << "), [m0] "; diff --git a/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h b/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h index 14fb511..90541d8 100644 --- a/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h +++ b/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h @@ -13,9 +13,7 @@ #ifndef LLVM_LIB_TARGET_R600_INSTPRINTER_AMDGPUINSTPRINTER_H #define LLVM_LIB_TARGET_R600_INSTPRINTER_AMDGPUINSTPRINTER_H -#include "llvm/ADT/StringRef.h" #include "llvm/MC/MCInstPrinter.h" -#include "llvm/Support/raw_ostream.h" namespace llvm { diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp index 4434d9b..60e8c8f 100644 --- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp @@ -99,14 +99,22 @@ void AMDGPUAsmBackend::applyFixup(const MCFixup &Fixup, char *Data, case AMDGPU::fixup_si_rodata: { uint32_t *Dst = (uint32_t*)(Data + Fixup.getOffset()); - *Dst = Value; - break; - } - - case AMDGPU::fixup_si_end_of_text: { - uint32_t *Dst = (uint32_t*)(Data + Fixup.getOffset()); - // The value points to the last instruction in the text section, so we - // need to add 4 bytes to get to the start of the constants. + // We emit constant data at the end of the text section and generate its + // address using the following code sequence: + // s_getpc_b64 s[0:1] + // s_add_u32 s0, s0, $symbol + // s_addc_u32 s1, s1, 0 + // + // s_getpc_b64 returns the address of the s_add_u32 instruction and then + // the fixup replaces $symbol with a literal constant, which is a + // pc-relative offset from the encoding of the $symbol operand to the + // constant data. + // + // What we want here is an offset from the start of the s_add_u32 + // instruction to the constant data, but since the encoding of $symbol + // starts 4 bytes after the start of the add instruction, we end up + // with an offset that is 4 bytes too small. This requires us to + // add 4 to the fixup value before applying it. *Dst = Value + 4; break; } @@ -136,8 +144,7 @@ const MCFixupKindInfo &AMDGPUAsmBackend::getFixupKindInfo( const static MCFixupKindInfo Infos[AMDGPU::NumTargetFixupKinds] = { // name offset bits flags { "fixup_si_sopp_br", 0, 16, MCFixupKindInfo::FKF_IsPCRel }, - { "fixup_si_rodata", 0, 32, 0 }, - { "fixup_si_end_of_text", 0, 32, MCFixupKindInfo::FKF_IsPCRel } + { "fixup_si_rodata", 0, 32, MCFixupKindInfo::FKF_IsPCRel } }; if (Kind < FirstTargetFixupKind) diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp new file mode 100644 index 0000000..9ff9fe7 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp @@ -0,0 +1,26 @@ +//===-------- AMDGPUELFStreamer.cpp - ELF Object Output -------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+//
+//===----------------------------------------------------------------------===//

+#include "AMDGPUELFStreamer.h"
+#include "Utils/AMDGPUBaseInfo.h"
+
+using namespace llvm;
+
+void AMDGPUELFStreamer::InitSections(bool NoExecStack) {
+  // Start with the .hsatext section by default.
+  SwitchSection(AMDGPU::getHSATextSection(getContext()));
+}
+
+MCELFStreamer *llvm::createAMDGPUELFStreamer(MCContext &Context,
+                                             MCAsmBackend &MAB,
+                                             raw_pwrite_stream &OS,
+                                             MCCodeEmitter *Emitter,
+                                             bool RelaxAll) {
+  return new AMDGPUELFStreamer(Context, MAB, OS, Emitter);
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h
new file mode 100644
index 0000000..488d7e7
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h
@@ -0,0 +1,40 @@
+//===-------- AMDGPUELFStreamer.h - ELF Object Output ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is a custom MCELFStreamer which allows us to insert some hooks before
+// emitting data into an actual object file.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUELFSTREAMER_H
+#define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUELFSTREAMER_H
+
+#include "llvm/MC/MCELFStreamer.h"
+
+namespace llvm {
+class MCAsmBackend;
+class MCCodeEmitter;
+class MCContext;
+class MCSubtargetInfo;
+
+class AMDGPUELFStreamer : public MCELFStreamer {
+public:
+  AMDGPUELFStreamer(MCContext &Context, MCAsmBackend &MAB, raw_pwrite_stream &OS,
+                    MCCodeEmitter *Emitter)
+    : MCELFStreamer(Context, MAB, OS, Emitter) { }
+
+  virtual void InitSections(bool NoExecStack) override;
+};
+
+MCELFStreamer *createAMDGPUELFStreamer(MCContext &Context, MCAsmBackend &MAB,
+                                       raw_pwrite_stream &OS,
+                                       MCCodeEmitter *Emitter, bool RelaxAll);
+} // namespace llvm.
+ +#endif diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUFixupKinds.h b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUFixupKinds.h index 01021d6..59a9178 100644 --- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUFixupKinds.h +++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUFixupKinds.h @@ -21,9 +21,6 @@ enum Fixups { /// fixup for global addresses with constant initializers fixup_si_rodata, - /// fixup for offset from instruction to end of text section - fixup_si_end_of_text, - // Marker LastTargetFixupKind, NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp index 028a86d..68b1d1a 100644 --- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp @@ -22,13 +22,6 @@ AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(const Triple &TT) : MCAsmInfoELF() { InlineAsmEnd = ";#ASMEND"; //===--- Data Emission Directives -------------------------------------===// - ZeroDirective = ".zero"; - AsciiDirective = ".ascii\t"; - AscizDirective = ".asciz\t"; - Data8bitsDirective = ".byte\t"; - Data16bitsDirective = ".short\t"; - Data32bitsDirective = ".long\t"; - Data64bitsDirective = ".quad\t"; SunStyleELFSectionSwitchSyntax = true; UsesELFSectionDirectiveForBSS = true; @@ -41,3 +34,10 @@ AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(const Triple &TT) : MCAsmInfoELF() { //===--- Dwarf Emission Directives -----------------------------------===// SupportsDebugInformation = true; } + +bool AMDGPUMCAsmInfo::shouldOmitSectionDirective(StringRef SectionName) const { + return SectionName == ".hsatext" || SectionName == ".hsadata_global_agent" || + SectionName == ".hsadata_global_program" || + SectionName == ".hsarodata_readonly_agent" || + MCAsmInfo::shouldOmitSectionDirective(SectionName); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.h b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.h index a5bac51..a546961 100644 --- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.h +++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.h @@ -21,12 +21,13 @@ class Triple; // If you need to create another MCAsmInfo class, which inherits from MCAsmInfo, // you will need to make sure your new class sets PrivateGlobalPrefix to -// a prefix that won't appeary in a fuction name. The default value +// a prefix that won't appear in a function name. The default value // for PrivateGlobalPrefix is 'L', so it will consider any function starting // with 'L' as a local symbol. 
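// (Editor's example, hypothetical name:) with the default 'L' prefix, a
// function named `LerpKernel` would be mistaken for an assembler-local
// temporary label and omitted from the symbol table, hence the warning
// above for anyone subclassing this MCAsmInfo.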
class AMDGPUMCAsmInfo : public MCAsmInfoELF { public: explicit AMDGPUMCAsmInfo(const Triple &TT); + bool shouldOmitSectionDirective(StringRef SectionName) const override; }; } // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp index c709741..f704094 100644 --- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp @@ -13,6 +13,7 @@ //===----------------------------------------------------------------------===// #include "AMDGPUMCTargetDesc.h" +#include "AMDGPUELFStreamer.h" #include "AMDGPUMCAsmInfo.h" #include "AMDGPUTargetStreamer.h" #include "InstPrinter/AMDGPUInstPrinter.h" @@ -85,6 +86,15 @@ static MCTargetStreamer * createAMDGPUObjectTargetStreamer( return new AMDGPUTargetELFStreamer(S); } +static MCStreamer *createMCStreamer(const Triple &T, MCContext &Context, + MCAsmBackend &MAB, raw_pwrite_stream &OS, + MCCodeEmitter *Emitter, bool RelaxAll) { + if (T.getOS() == Triple::AMDHSA) + return createAMDGPUELFStreamer(Context, MAB, OS, Emitter, RelaxAll); + + return createELFStreamer(Context, MAB, OS, Emitter, RelaxAll); +} + extern "C" void LLVMInitializeAMDGPUTargetMC() { for (Target *T : {&TheAMDGPUTarget, &TheGCNTarget}) { RegisterMCAsmInfo<AMDGPUMCAsmInfo> X(*T); @@ -95,6 +105,7 @@ extern "C" void LLVMInitializeAMDGPUTargetMC() { TargetRegistry::RegisterMCSubtargetInfo(*T, createAMDGPUMCSubtargetInfo); TargetRegistry::RegisterMCInstPrinter(*T, createAMDGPUMCInstPrinter); TargetRegistry::RegisterMCAsmBackend(*T, createAMDGPUAsmBackend); + TargetRegistry::RegisterELFStreamer(*T, createMCStreamer); } // R600 specific registration diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp index 09e6cb1..b91134d 100644 --- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp @@ -13,6 +13,7 @@ #include "AMDGPUTargetStreamer.h" #include "SIDefines.h" +#include "Utils/AMDGPUBaseInfo.h" #include "llvm/ADT/Twine.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCELFStreamer.h" @@ -220,6 +221,26 @@ AMDGPUTargetAsmStreamer::EmitAMDKernelCodeT(const amd_kernel_code_t &Header) { } +void AMDGPUTargetAsmStreamer::EmitAMDGPUSymbolType(StringRef SymbolName, + unsigned Type) { + switch (Type) { + default: llvm_unreachable("Invalid AMDGPU symbol type"); + case ELF::STT_AMDGPU_HSA_KERNEL: + OS << "\t.amdgpu_hsa_kernel " << SymbolName << '\n' ; + break; + } +} + +void AMDGPUTargetAsmStreamer::EmitAMDGPUHsaModuleScopeGlobal( + StringRef GlobalName) { + OS << "\t.amdgpu_hsa_module_global " << GlobalName << '\n'; +} + +void AMDGPUTargetAsmStreamer::EmitAMDGPUHsaProgramScopeGlobal( + StringRef GlobalName) { + OS << "\t.amdgpu_hsa_program_global " << GlobalName << '\n'; +} + //===----------------------------------------------------------------------===// // AMDGPUTargetELFStreamer //===----------------------------------------------------------------------===// @@ -291,7 +312,35 @@ AMDGPUTargetELFStreamer::EmitAMDKernelCodeT(const amd_kernel_code_t &Header) { MCStreamer &OS = getStreamer(); OS.PushSection(); - OS.SwitchSection(OS.getContext().getObjectFileInfo()->getTextSection()); + // The MCObjectFileInfo that is available to the assembler is a generic + // implementation and not AMDGPUHSATargetObjectFile, so we 
can't use + // MCObjectFileInfo::getTextSection() here for fetching the HSATextSection. + OS.SwitchSection(AMDGPU::getHSATextSection(OS.getContext())); OS.EmitBytes(StringRef((const char*)&Header, sizeof(Header))); OS.PopSection(); } + +void AMDGPUTargetELFStreamer::EmitAMDGPUSymbolType(StringRef SymbolName, + unsigned Type) { + MCSymbolELF *Symbol = cast<MCSymbolELF>( + getStreamer().getContext().getOrCreateSymbol(SymbolName)); + Symbol->setType(ELF::STT_AMDGPU_HSA_KERNEL); +} + +void AMDGPUTargetELFStreamer::EmitAMDGPUHsaModuleScopeGlobal( + StringRef GlobalName) { + + MCSymbolELF *Symbol = cast<MCSymbolELF>( + getStreamer().getContext().getOrCreateSymbol(GlobalName)); + Symbol->setType(ELF::STT_OBJECT); + Symbol->setBinding(ELF::STB_LOCAL); +} + +void AMDGPUTargetELFStreamer::EmitAMDGPUHsaProgramScopeGlobal( + StringRef GlobalName) { + + MCSymbolELF *Symbol = cast<MCSymbolELF>( + getStreamer().getContext().getOrCreateSymbol(GlobalName)); + Symbol->setType(ELF::STT_OBJECT); + Symbol->setBinding(ELF::STB_GLOBAL); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h index d37677c..83bb728 100644 --- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h +++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h @@ -7,6 +7,9 @@ // //===----------------------------------------------------------------------===// +#ifndef LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUTARGETSTREAMER_H +#define LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUTARGETSTREAMER_H + #include "AMDKernelCodeT.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbol.h" @@ -27,6 +30,12 @@ public: StringRef ArchName) = 0; virtual void EmitAMDKernelCodeT(const amd_kernel_code_t &Header) = 0; + + virtual void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type) = 0; + + virtual void EmitAMDGPUHsaModuleScopeGlobal(StringRef GlobalName) = 0; + + virtual void EmitAMDGPUHsaProgramScopeGlobal(StringRef GlobalName) = 0; }; class AMDGPUTargetAsmStreamer : public AMDGPUTargetStreamer { @@ -41,6 +50,12 @@ public: StringRef ArchName) override; void EmitAMDKernelCodeT(const amd_kernel_code_t &Header) override; + + void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type) override; + + void EmitAMDGPUHsaModuleScopeGlobal(StringRef GlobalName) override; + + void EmitAMDGPUHsaProgramScopeGlobal(StringRef GlobalName) override; }; class AMDGPUTargetELFStreamer : public AMDGPUTargetStreamer { @@ -72,6 +87,12 @@ public: void EmitAMDKernelCodeT(const amd_kernel_code_t &Header) override; + void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type) override; + + void EmitAMDGPUHsaModuleScopeGlobal(StringRef GlobalName) override; + + void EmitAMDGPUHsaProgramScopeGlobal(StringRef GlobalName) override; }; } +#endif diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp index e683498..3c1142d 100644 --- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp @@ -37,7 +37,6 @@ class R600MCCodeEmitter : public AMDGPUMCCodeEmitter { const MCRegisterInfo &MRI; public: - R600MCCodeEmitter(const MCInstrInfo &mcii, const MCRegisterInfo &mri) : MCII(mcii), MRI(mri) { } @@ -50,8 +49,8 @@ public: uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const override; -private: 
+private: void EmitByte(unsigned int byte, raw_ostream &OS) const; void Emit(uint32_t value, raw_ostream &OS) const; @@ -59,7 +58,6 @@ private: unsigned getHWRegChan(unsigned reg) const; unsigned getHWReg(unsigned regNo) const; - }; } // End anonymous namespace @@ -83,7 +81,7 @@ enum FCInstr { MCCodeEmitter *llvm::createR600MCCodeEmitter(const MCInstrInfo &MCII, const MCRegisterInfo &MRI, - MCContext &Ctx) { + MCContext &Ctx) { return new R600MCCodeEmitter(MCII, MRI); } diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp index 65a0eeb..9eb3dad 100644 --- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp @@ -36,7 +36,6 @@ class SIMCCodeEmitter : public AMDGPUMCCodeEmitter { void operator=(const SIMCCodeEmitter &) = delete; const MCInstrInfo &MCII; const MCRegisterInfo &MRI; - MCContext &Ctx; /// \brief Can this operand also contain immediate values? bool isSrcOperand(const MCInstrDesc &Desc, unsigned OpNo) const; @@ -47,7 +46,7 @@ class SIMCCodeEmitter : public AMDGPUMCCodeEmitter { public: SIMCCodeEmitter(const MCInstrInfo &mcii, const MCRegisterInfo &mri, MCContext &ctx) - : MCII(mcii), MRI(mri), Ctx(ctx) { } + : MCII(mcii), MRI(mri) { } ~SIMCCodeEmitter() override {} @@ -250,17 +249,7 @@ uint64_t SIMCCodeEmitter::getMachineOpValue(const MCInst &MI, if (MO.isExpr()) { const MCSymbolRefExpr *Expr = cast<MCSymbolRefExpr>(MO.getExpr()); - MCFixupKind Kind; - const MCSymbol *Sym = - Ctx.getOrCreateSymbol(StringRef(END_OF_TEXT_LABEL_NAME)); - - if (&Expr->getSymbol() == Sym) { - // Add the offset to the beginning of the constant values. - Kind = (MCFixupKind)AMDGPU::fixup_si_end_of_text; - } else { - // This is used for constant data stored in .rodata. 
- Kind = (MCFixupKind)AMDGPU::fixup_si_rodata; - } + MCFixupKind Kind = (MCFixupKind)AMDGPU::fixup_si_rodata; Fixups.push_back(MCFixup::create(4, Expr, Kind, MI.getLoc())); } diff --git a/contrib/llvm/lib/Target/AMDGPU/Processors.td b/contrib/llvm/lib/Target/AMDGPU/Processors.td index d9a0723..a1584a2 100644 --- a/contrib/llvm/lib/Target/AMDGPU/Processors.td +++ b/contrib/llvm/lib/Target/AMDGPU/Processors.td @@ -142,3 +142,7 @@ def : ProcessorModel<"carrizo", SIQuarterSpeedModel, def : ProcessorModel<"fiji", SIQuarterSpeedModel, [FeatureVolcanicIslands, FeatureISAVersion8_0_1] >; + +def : ProcessorModel<"stoney", SIQuarterSpeedModel, + [FeatureVolcanicIslands, FeatureISAVersion8_0_1] +>; diff --git a/contrib/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp b/contrib/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp index c8f37f6..bd80bb2 100644 --- a/contrib/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp @@ -405,8 +405,8 @@ private: if (MO.isReg() && MO.isInternalRead()) MO.setIsInternalRead(false); } - getLiteral(BI, Literals); - ClauseContent.push_back(BI); + getLiteral(&*BI, Literals); + ClauseContent.push_back(&*BI); } I = BI; DeleteMI->eraseFromParent(); diff --git a/contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp index 4e4d554..124a9c6 100644 --- a/contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp @@ -190,6 +190,10 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM, setSchedulingPreference(Sched::Source); } +static inline bool isEOP(MachineBasicBlock::iterator I) { + return std::next(I)->getOpcode() == AMDGPU::RETURN; +} + MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter( MachineInstr * MI, MachineBasicBlock * BB) const { MachineFunction * MF = BB->getParent(); @@ -276,12 +280,18 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter( case AMDGPU::RAT_WRITE_CACHELESS_32_eg: case AMDGPU::RAT_WRITE_CACHELESS_64_eg: case AMDGPU::RAT_WRITE_CACHELESS_128_eg: { - unsigned EOP = (std::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0; - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode())) .addOperand(MI->getOperand(0)) .addOperand(MI->getOperand(1)) - .addImm(EOP); // Set End of program bit + .addImm(isEOP(I)); // Set End of program bit + break; + } + case AMDGPU::RAT_STORE_TYPED_eg: { + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode())) + .addOperand(MI->getOperand(0)) + .addOperand(MI->getOperand(1)) + .addOperand(MI->getOperand(2)) + .addImm(isEOP(I)); // Set End of program bit break; } @@ -539,7 +549,7 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter( } } } - bool EOP = (std::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0; + bool EOP = isEOP(I); if (!EOP && !isLastInstructionOfItsType) return BB; unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz)? 84 : 40; @@ -946,6 +956,8 @@ SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); SDValue Arg = Op.getOperand(0); SDLoc DL(Op); + + // TODO: Should this propagate fast-math-flags? 
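// (Editor's note:) the DAG built below performs range reduction before the
// hardware trig op: the argument is scaled (by 1/(2*pi) in the full source)
// and AMDGPUISD::FRACT folds it into a single period, since the R600 SIN/COS
// units only accept a bounded input. Whether that new node should inherit
// fast-math flags is exactly the open TODO above.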
SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, DL, VT, DAG.getNode(ISD::FADD, DL, VT, DAG.getNode(ISD::FMUL, DL, VT, Arg, @@ -1936,6 +1948,7 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N, Arg->getOperand(0).getOperand(Element)); } } + break; } case ISD::SELECT_CC: { diff --git a/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp index 855fa9f..8b6eea1 100644 --- a/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp @@ -922,7 +922,7 @@ bool R600InstrInfo::isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCyles, unsigned ExtraPredCycles, - const BranchProbability &Probability) const{ + BranchProbability Probability) const{ return true; } @@ -933,14 +933,14 @@ R600InstrInfo::isProfitableToIfCvt(MachineBasicBlock &TMBB, MachineBasicBlock &FMBB, unsigned NumFCycles, unsigned ExtraFCycles, - const BranchProbability &Probability) const { + BranchProbability Probability) const { return true; } bool R600InstrInfo::isProfitableToDupForIfCvt(MachineBasicBlock &MBB, unsigned NumCyles, - const BranchProbability &Probability) + BranchProbability Probability) const { return true; } diff --git a/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.h b/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.h index dee4c2b..e7251c3 100644 --- a/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.h +++ b/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.h @@ -174,18 +174,18 @@ namespace llvm { bool isProfitableToDupForIfCvt(MachineBasicBlock &MBB, unsigned NumCyles, - const BranchProbability &Probability) const override; + BranchProbability Probability) const override; bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCyles, unsigned ExtraPredCycles, - const BranchProbability &Probability) const override ; + BranchProbability Probability) const override ; bool isProfitableToIfCvt(MachineBasicBlock &TMBB, unsigned NumTCycles, unsigned ExtraTCycles, MachineBasicBlock &FMBB, unsigned NumFCycles, unsigned ExtraFCycles, - const BranchProbability &Probability) const override; + BranchProbability Probability) const override; bool DefinesPredicate(MachineInstr *MI, std::vector<MachineOperand> &Pred) const override; diff --git a/contrib/llvm/lib/Target/AMDGPU/R600Instructions.td b/contrib/llvm/lib/Target/AMDGPU/R600Instructions.td index 7beed09..33ef6a4 100644 --- a/contrib/llvm/lib/Target/AMDGPU/R600Instructions.td +++ b/contrib/llvm/lib/Target/AMDGPU/R600Instructions.td @@ -1655,7 +1655,7 @@ def : InsertVerticalPat <R600_INSERT_ELT_V4, v4f32, f32>; // ISel Patterns //===----------------------------------------------------------------------===// -// CND*_INT Pattterns for f32 True / False values +// CND*_INT Patterns for f32 True / False values class CND_INT_f32 <InstR600 cnd, CondCode cc> : Pat < (selectcc i32:$src0, 0, f32:$src1, f32:$src2, cc), diff --git a/contrib/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp b/contrib/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp index 0c06ccc..5efb3b9 100644 --- a/contrib/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp @@ -318,7 +318,7 @@ bool R600VectorRegMerger::runOnMachineFunction(MachineFunction &Fn) { MRI = &(Fn.getRegInfo()); for (MachineFunction::iterator MBB = Fn.begin(), MBBe = Fn.end(); MBB != MBBe; ++MBB) { - MachineBasicBlock *MB = MBB; + MachineBasicBlock *MB = &*MBB; PreviousRegSeq.clear(); PreviousRegSeqByReg.clear(); PreviousRegSeqByUndefCount.clear(); diff --git 
a/contrib/llvm/lib/Target/AMDGPU/R600Packetizer.cpp b/contrib/llvm/lib/Target/AMDGPU/R600Packetizer.cpp index deee5bc..2126961 100644 --- a/contrib/llvm/lib/Target/AMDGPU/R600Packetizer.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/R600Packetizer.cpp @@ -81,11 +81,11 @@ private: int LastDstChan = -1; do { bool isTrans = false; - int BISlot = getSlot(BI); + int BISlot = getSlot(&*BI); if (LastDstChan >= BISlot) isTrans = true; LastDstChan = BISlot; - if (TII->isPredicated(BI)) + if (TII->isPredicated(&*BI)) continue; int OperandIdx = TII->getOperandIdx(BI->getOpcode(), AMDGPU::OpName::write); if (OperandIdx > -1 && BI->getOperand(OperandIdx).getImm() == 0) @@ -95,7 +95,7 @@ private: continue; } unsigned Dst = BI->getOperand(DstIdx).getReg(); - if (isTrans || TII->isTransOnly(BI)) { + if (isTrans || TII->isTransOnly(&*BI)) { Result[Dst] = AMDGPU::PS; continue; } @@ -149,7 +149,7 @@ private: public: // Ctor. R600PacketizerList(MachineFunction &MF, MachineLoopInfo &MLI) - : VLIWPacketizerList(MF, MLI, true), + : VLIWPacketizerList(MF, MLI, nullptr), TII(static_cast<const R600InstrInfo *>( MF.getSubtarget().getInstrInfo())), TRI(TII->getRegisterInfo()) { @@ -162,14 +162,14 @@ public: } // ignorePseudoInstruction - Ignore bundling of pseudo instructions. - bool ignorePseudoInstruction(MachineInstr *MI, - MachineBasicBlock *MBB) override { + bool ignorePseudoInstruction(const MachineInstr *MI, + const MachineBasicBlock *MBB) override { return false; } // isSoloInstruction - return true if instruction MI can not be packetized // with any other instruction, which means that MI itself is a packet. - bool isSoloInstruction(MachineInstr *MI) override { + bool isSoloInstruction(const MachineInstr *MI) override { if (TII->isVector(*MI)) return true; if (!TII->isALUInstr(MI->getOpcode())) @@ -375,7 +375,7 @@ bool R600Packetizer::runOnMachineFunction(MachineFunction &Fn) { // instruction stream until we find the nearest boundary. 
MachineBasicBlock::iterator I = RegionEnd; for(;I != MBB->begin(); --I, --RemainingCount) { - if (TII->isSchedulingBoundary(std::prev(I), MBB, Fn)) + if (TII->isSchedulingBoundary(&*std::prev(I), &*MBB, Fn)) break; } I = MBB->begin(); @@ -392,7 +392,7 @@ bool R600Packetizer::runOnMachineFunction(MachineFunction &Fn) { continue; } - Packetizer.PacketizeMIs(MBB, I, RegionEnd); + Packetizer.PacketizeMIs(&*MBB, &*I, RegionEnd); RegionEnd = I; } } diff --git a/contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.h b/contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.h index 9713e60..4f8a129 100644 --- a/contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.h +++ b/contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.h @@ -35,7 +35,7 @@ struct R600RegisterInfo : public AMDGPURegisterInfo { /// \brief get the register class of the specified type to use in the /// CFGStructurizer - const TargetRegisterClass * getCFGStructurizerRegClass(MVT VT) const override; + const TargetRegisterClass *getCFGStructurizerRegClass(MVT VT) const; const RegClassWeight & getRegClassWeight(const TargetRegisterClass *RC) const override; diff --git a/contrib/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp b/contrib/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp index ccfbf1b..fa4d24a 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp @@ -312,11 +312,10 @@ void SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) { if (std::find(Latches.begin(), Latches.end(), *PI) == Latches.end()) Preds.push_back(*PI); } - BB = llvm::SplitBlockPredecessors(BB, Preds, "endcf.split", nullptr, DT, - LI, false); + BB = llvm::SplitBlockPredecessors(BB, Preds, "endcf.split", DT, LI, false); } - CallInst::Create(EndCf, popSaved(), "", BB->getFirstInsertionPt()); + CallInst::Create(EndCf, popSaved(), "", &*BB->getFirstInsertionPt()); } /// \brief Annotate the control flow with intrinsics so the backend can diff --git a/contrib/llvm/lib/Target/AMDGPU/SIDefines.h b/contrib/llvm/lib/Target/AMDGPU/SIDefines.h index 4c32639..7f79dd3 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIDefines.h +++ b/contrib/llvm/lib/Target/AMDGPU/SIDefines.h @@ -37,7 +37,8 @@ enum { MIMG = 1 << 18, FLAT = 1 << 19, WQM = 1 << 20, - VGPRSpill = 1 << 21 + VGPRSpill = 1 << 21, + VOPAsmPrefer32Bit = 1 << 22 }; } diff --git a/contrib/llvm/lib/Target/AMDGPU/SIFixControlFlowLiveIntervals.cpp b/contrib/llvm/lib/Target/AMDGPU/SIFixControlFlowLiveIntervals.cpp index 5fe8d19..636750d 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIFixControlFlowLiveIntervals.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIFixControlFlowLiveIntervals.cpp @@ -16,15 +16,9 @@ #include "AMDGPU.h" #include "SIInstrInfo.h" -#include "SIRegisterInfo.h" #include "llvm/CodeGen/LiveIntervalAnalysis.h" #include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachinePostDominators.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetMachine.h" using namespace llvm; diff --git a/contrib/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/contrib/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp index 23502b4..96e37c5 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -82,22 +82,10 @@ using namespace llvm; namespace { class SIFixSGPRCopies : public MachineFunctionPass { - -private: +public: static char ID; - const TargetRegisterClass 
*inferRegClassFromUses(const SIRegisterInfo *TRI, - const MachineRegisterInfo &MRI, - unsigned Reg, - unsigned SubReg) const; - const TargetRegisterClass *inferRegClassFromDef(const SIRegisterInfo *TRI, - const MachineRegisterInfo &MRI, - unsigned Reg, - unsigned SubReg) const; - bool isVGPRToSGPRCopy(const MachineInstr &Copy, const SIRegisterInfo *TRI, - const MachineRegisterInfo &MRI) const; -public: - SIFixSGPRCopies(TargetMachine &tm) : MachineFunctionPass(ID) { } + SIFixSGPRCopies() : MachineFunctionPass(ID) { } bool runOnMachineFunction(MachineFunction &MF) override; @@ -105,14 +93,23 @@ public: return "SI Fix SGPR copies"; } + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } }; } // End anonymous namespace +INITIALIZE_PASS(SIFixSGPRCopies, DEBUG_TYPE, + "SI Fix SGPR copies", false, false) + char SIFixSGPRCopies::ID = 0; -FunctionPass *llvm::createSIFixSGPRCopiesPass(TargetMachine &tm) { - return new SIFixSGPRCopies(tm); +char &llvm::SIFixSGPRCopiesID = SIFixSGPRCopies::ID; + +FunctionPass *llvm::createSIFixSGPRCopiesPass() { + return new SIFixSGPRCopies(); } static bool hasVGPROperands(const MachineInstr &MI, const SIRegisterInfo *TRI) { @@ -128,77 +125,115 @@ static bool hasVGPROperands(const MachineInstr &MI, const SIRegisterInfo *TRI) { return false; } -/// This functions walks the use list of Reg until it finds an Instruction -/// that isn't a COPY returns the register class of that instruction. -/// \return The register defined by the first non-COPY instruction. -const TargetRegisterClass *SIFixSGPRCopies::inferRegClassFromUses( - const SIRegisterInfo *TRI, - const MachineRegisterInfo &MRI, - unsigned Reg, - unsigned SubReg) const { - - const TargetRegisterClass *RC - = TargetRegisterInfo::isVirtualRegister(Reg) ? - MRI.getRegClass(Reg) : - TRI->getPhysRegClass(Reg); - - RC = TRI->getSubRegClass(RC, SubReg); - for (MachineRegisterInfo::use_instr_iterator - I = MRI.use_instr_begin(Reg), E = MRI.use_instr_end(); I != E; ++I) { - switch (I->getOpcode()) { - case AMDGPU::COPY: - RC = TRI->getCommonSubClass(RC, inferRegClassFromUses(TRI, MRI, - I->getOperand(0).getReg(), - I->getOperand(0).getSubReg())); - break; - } - } +static std::pair<const TargetRegisterClass *, const TargetRegisterClass *> +getCopyRegClasses(const MachineInstr &Copy, + const SIRegisterInfo &TRI, + const MachineRegisterInfo &MRI) { + unsigned DstReg = Copy.getOperand(0).getReg(); + unsigned SrcReg = Copy.getOperand(1).getReg(); + + const TargetRegisterClass *SrcRC = + TargetRegisterInfo::isVirtualRegister(SrcReg) ? + MRI.getRegClass(SrcReg) : + TRI.getPhysRegClass(SrcReg); - return RC; + // We don't really care about the subregister here. + // SrcRC = TRI.getSubRegClass(SrcRC, Copy.getOperand(1).getSubReg()); + + const TargetRegisterClass *DstRC = + TargetRegisterInfo::isVirtualRegister(DstReg) ? 
+ MRI.getRegClass(DstReg) : + TRI.getPhysRegClass(DstReg); + + return std::make_pair(SrcRC, DstRC); } -const TargetRegisterClass *SIFixSGPRCopies::inferRegClassFromDef( - const SIRegisterInfo *TRI, - const MachineRegisterInfo &MRI, - unsigned Reg, - unsigned SubReg) const { - if (!TargetRegisterInfo::isVirtualRegister(Reg)) { - const TargetRegisterClass *RC = TRI->getPhysRegClass(Reg); - return TRI->getSubRegClass(RC, SubReg); - } - MachineInstr *Def = MRI.getVRegDef(Reg); - if (Def->getOpcode() != AMDGPU::COPY) { - return TRI->getSubRegClass(MRI.getRegClass(Reg), SubReg); - } +static bool isVGPRToSGPRCopy(const TargetRegisterClass *SrcRC, + const TargetRegisterClass *DstRC, + const SIRegisterInfo &TRI) { + return TRI.isSGPRClass(DstRC) && TRI.hasVGPRs(SrcRC); +} - return inferRegClassFromDef(TRI, MRI, Def->getOperand(1).getReg(), - Def->getOperand(1).getSubReg()); +static bool isSGPRToVGPRCopy(const TargetRegisterClass *SrcRC, + const TargetRegisterClass *DstRC, + const SIRegisterInfo &TRI) { + return TRI.isSGPRClass(SrcRC) && TRI.hasVGPRs(DstRC); } -bool SIFixSGPRCopies::isVGPRToSGPRCopy(const MachineInstr &Copy, - const SIRegisterInfo *TRI, - const MachineRegisterInfo &MRI) const { +// Distribute an SGPR->VGPR copy of a REG_SEQUENCE into a VGPR REG_SEQUENCE. +// +// SGPRx = ... +// SGPRy = REG_SEQUENCE SGPRx, sub0 ... +// VGPRz = COPY SGPRy +// +// ==> +// +// VGPRx = COPY SGPRx +// VGPRz = REG_SEQUENCE VGPRx, sub0 +// +// This exposes immediate folding opportunities when materializing 64-bit +// immediates. +static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI, + const SIRegisterInfo *TRI, + const SIInstrInfo *TII, + MachineRegisterInfo &MRI) { + assert(MI.isRegSequence()); + + unsigned DstReg = MI.getOperand(0).getReg(); + if (!TRI->isSGPRClass(MRI.getRegClass(DstReg))) + return false; - unsigned DstReg = Copy.getOperand(0).getReg(); - unsigned SrcReg = Copy.getOperand(1).getReg(); - unsigned SrcSubReg = Copy.getOperand(1).getSubReg(); + if (!MRI.hasOneUse(DstReg)) + return false; - if (!TargetRegisterInfo::isVirtualRegister(DstReg)) { - // If the destination register is a physical register there isn't really - // much we can do to fix this. + MachineInstr &CopyUse = *MRI.use_instr_begin(DstReg); + if (!CopyUse.isCopy()) return false; - } - const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg); + const TargetRegisterClass *SrcRC, *DstRC; + std::tie(SrcRC, DstRC) = getCopyRegClasses(CopyUse, *TRI, MRI); - const TargetRegisterClass *SrcRC; + if (!isSGPRToVGPRCopy(SrcRC, DstRC, *TRI)) + return false; - if (!TargetRegisterInfo::isVirtualRegister(SrcReg) || - MRI.getRegClass(SrcReg) == &AMDGPU::VReg_1RegClass) + // TODO: Could have multiple extracts? + unsigned SubReg = CopyUse.getOperand(1).getSubReg(); + if (SubReg != AMDGPU::NoSubRegister) return false; - SrcRC = TRI->getSubRegClass(MRI.getRegClass(SrcReg), SrcSubReg); - return TRI->isSGPRClass(DstRC) && TRI->hasVGPRs(SrcRC); + MRI.setRegClass(DstReg, DstRC); + + // SGPRx = ... + // SGPRy = REG_SEQUENCE SGPRx, sub0 ... 
+ // VGPRz = COPY SGPRy + + // => + // VGPRx = COPY SGPRx + // VGPRz = REG_SEQUENCE VGPRx, sub0 + + MI.getOperand(0).setReg(CopyUse.getOperand(0).getReg()); + + for (unsigned I = 1, N = MI.getNumOperands(); I != N; I += 2) { + unsigned SrcReg = MI.getOperand(I).getReg(); + unsigned SrcSubReg = MI.getOperand(I).getSubReg(); + + const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg); + assert(TRI->isSGPRClass(SrcRC) && + "Expected SGPR REG_SEQUENCE to only have SGPR inputs"); + + SrcRC = TRI->getSubRegClass(SrcRC, SrcSubReg); + const TargetRegisterClass *NewSrcRC = TRI->getEquivalentVGPRClass(SrcRC); + + unsigned TmpReg = MRI.createVirtualRegister(NewSrcRC); + + BuildMI(*MI.getParent(), &MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY), TmpReg) + .addOperand(MI.getOperand(I)); + + MI.getOperand(I).setReg(TmpReg); + } + + CopyUse.eraseFromParent(); + return true; } bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { @@ -207,40 +242,38 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { static_cast<const SIRegisterInfo *>(MF.getSubtarget().getRegisterInfo()); const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo()); + + SmallVector<MachineInstr *, 16> Worklist; + for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE; ++BI) { MachineBasicBlock &MBB = *BI; for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); - I != E; ++I) { + I != E; ++I) { MachineInstr &MI = *I; - if (MI.getOpcode() == AMDGPU::COPY && isVGPRToSGPRCopy(MI, TRI, MRI)) { - DEBUG(dbgs() << "Fixing VGPR -> SGPR copy:\n"); - DEBUG(MI.print(dbgs())); - TII->moveToVALU(MI); - - } switch (MI.getOpcode()) { - default: continue; - case AMDGPU::PHI: { - DEBUG(dbgs() << "Fixing PHI: " << MI); - - for (unsigned i = 1; i < MI.getNumOperands(); i += 2) { - const MachineOperand &Op = MI.getOperand(i); - unsigned Reg = Op.getReg(); - const TargetRegisterClass *RC - = inferRegClassFromDef(TRI, MRI, Reg, Op.getSubReg()); + default: + continue; + case AMDGPU::COPY: { + // If the destination register is a physical register there isn't really + // much we can do to fix this.
+ if (!TargetRegisterInfo::isVirtualRegister(MI.getOperand(0).getReg())) + continue; - MRI.constrainRegClass(Op.getReg(), RC); - } - unsigned Reg = MI.getOperand(0).getReg(); - const TargetRegisterClass *RC = inferRegClassFromUses(TRI, MRI, Reg, - MI.getOperand(0).getSubReg()); - if (TRI->getCommonSubClass(RC, &AMDGPU::VGPR_32RegClass)) { - MRI.constrainRegClass(Reg, &AMDGPU::VGPR_32RegClass); + const TargetRegisterClass *SrcRC, *DstRC; + std::tie(SrcRC, DstRC) = getCopyRegClasses(MI, *TRI, MRI); + if (isVGPRToSGPRCopy(SrcRC, DstRC, *TRI)) { + DEBUG(dbgs() << "Fixing VGPR -> SGPR copy: " << MI); + TII->moveToVALU(MI); } + break; + } + case AMDGPU::PHI: { + DEBUG(dbgs() << "Fixing PHI: " << MI); + unsigned Reg = MI.getOperand(0).getReg(); if (!TRI->isSGPRClass(MRI.getRegClass(Reg))) break; @@ -310,8 +343,10 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { } case AMDGPU::REG_SEQUENCE: { if (TRI->hasVGPRs(TII->getOpRegClass(MI, 0)) || - !hasVGPROperands(MI, TRI)) + !hasVGPROperands(MI, TRI)) { + foldVGPRCopyIntoRegSequence(MI, TRI, TII, MRI); continue; + } DEBUG(dbgs() << "Fixing REG_SEQUENCE: " << MI); diff --git a/contrib/llvm/lib/Target/AMDGPU/SIFixSGPRLiveRanges.cpp b/contrib/llvm/lib/Target/AMDGPU/SIFixSGPRLiveRanges.cpp index 0c54446..8bda283 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIFixSGPRLiveRanges.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIFixSGPRLiveRanges.cpp @@ -7,9 +7,8 @@ // //===----------------------------------------------------------------------===// // -/// \file -/// SALU instructions ignore control flow, so we need to modify the live ranges -/// of the registers they define in some cases. +/// \file SALU instructions ignore the execution mask, so we need to modify the +/// live ranges of the registers they define in some cases. /// /// The main case we need to handle is when a def is used in one side of a /// branch and not another. For example: @@ -42,13 +41,15 @@ /// ENDIF /// %use /// -/// Adding this use will make the def live thoughout the IF branch, which is +/// Adding this use will make the def live throughout the IF branch, which is /// what we want. 
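Editorial aside: a condensed sketch of the strategy the rewritten SIFixSGPRLiveRanges pass below implements, not a verbatim excerpt. SuccA, SuccB, and NCD name the two successors of a structured branch and their nearest common post-dominator, as in the code further down; the API calls (LiveVariables::isLiveIn, HandleVirtRegUse, BuildMI) are the same ones the diff uses.

static void pinSGPRAcrossBranch(unsigned Reg, LiveVariables &LV,
                                const SIInstrInfo &TII,
                                MachineBasicBlock &SuccA,
                                MachineBasicBlock &SuccB,
                                MachineBasicBlock &NCD) {
  // Live into both successors or into neither: the range already agrees
  // on both sides of the branch, so nothing needs fixing.
  if (LV.isLiveIn(Reg, SuccA) == LV.isLiveIn(Reg, SuccB))
    return;
  // Live into exactly one side: pin the SGPR with an artificial implicit
  // use at the post-dominator, so it stays live across the other side too.
  MachineInstr *Use = BuildMI(NCD, NCD.getFirstNonPHI(), DebugLoc(),
                              TII.get(AMDGPU::SGPR_USE))
                          .addReg(Reg, RegState::Implicit);
  // LiveVariables is updated by hand since this runs after its analysis.
  LV.HandleVirtRegUse(Reg, &NCD, Use);
}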
#include "AMDGPU.h" #include "SIInstrInfo.h" #include "SIRegisterInfo.h" +#include "llvm/ADT/DepthFirstIterator.h" #include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/CodeGen/LiveVariables.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachinePostDominators.h" @@ -79,9 +80,13 @@ public: } void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<LiveIntervals>(); + AU.addRequired<LiveVariables>(); + AU.addPreserved<LiveVariables>(); + AU.addRequired<MachinePostDominatorTree>(); + AU.addPreserved<MachinePostDominatorTree>(); AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); } }; @@ -90,7 +95,7 @@ public: INITIALIZE_PASS_BEGIN(SIFixSGPRLiveRanges, DEBUG_TYPE, "SI Fix SGPR Live Ranges", false, false) -INITIALIZE_PASS_DEPENDENCY(LiveIntervals) +INITIALIZE_PASS_DEPENDENCY(LiveVariables) INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) INITIALIZE_PASS_END(SIFixSGPRLiveRanges, DEBUG_TYPE, "SI Fix SGPR Live Ranges", false, false) @@ -108,40 +113,48 @@ bool SIFixSGPRLiveRanges::runOnMachineFunction(MachineFunction &MF) { const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>( MF.getSubtarget().getRegisterInfo()); - LiveIntervals *LIS = &getAnalysis<LiveIntervals>(); - MachinePostDominatorTree *PDT = &getAnalysis<MachinePostDominatorTree>(); - std::vector<std::pair<unsigned, LiveRange *>> SGPRLiveRanges; + bool MadeChange = false; + + MachinePostDominatorTree *PDT = &getAnalysis<MachinePostDominatorTree>(); + SmallVector<unsigned, 16> SGPRLiveRanges; + + LiveVariables *LV = &getAnalysis<LiveVariables>(); + MachineBasicBlock *Entry = &MF.front(); - // First pass, collect all live intervals for SGPRs - for (const MachineBasicBlock &MBB : MF) { - for (const MachineInstr &MI : MBB) { + // Use a depth first order so that in SSA, we encounter all defs before + // uses. Once the defs of the block have been found, attempt to insert + // SGPR_USE instructions in successor blocks if required. + for (MachineBasicBlock *MBB : depth_first(Entry)) { + for (const MachineInstr &MI : *MBB) { for (const MachineOperand &MO : MI.defs()) { - if (MO.isImplicit()) - continue; + // We should never see a live out def of a physical register, so we also + // do not need to worry about implicit_defs(). unsigned Def = MO.getReg(); if (TargetRegisterInfo::isVirtualRegister(Def)) { - if (TRI->isSGPRClass(MRI.getRegClass(Def))) - SGPRLiveRanges.push_back( - std::make_pair(Def, &LIS->getInterval(Def))); - } else if (TRI->isSGPRClass(TRI->getPhysRegClass(Def))) { - SGPRLiveRanges.push_back( - std::make_pair(Def, &LIS->getRegUnit(Def))); + if (TRI->isSGPRClass(MRI.getRegClass(Def))) { + // Only consider defs that are live outs. We don't care about def / + // use within the same block. + + // LiveVariables does not consider registers that are only used in a + // phi in a sucessor block as live out, unlike LiveIntervals. + // + // This is OK because SIFixSGPRCopies replaced any SGPR phis with + // VGPRs. + if (LV->isLiveOut(Def, *MBB)) + SGPRLiveRanges.push_back(Def); + } } } } - } - // Second pass fix the intervals - for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); - BI != BE; ++BI) { - MachineBasicBlock &MBB = *BI; - if (MBB.succ_size() < 2) + if (MBB->succ_size() < 2) continue; - // We have structured control flow, so number of succesors should be two. 
- assert(MBB.succ_size() == 2); - MachineBasicBlock *SuccA = *MBB.succ_begin(); - MachineBasicBlock *SuccB = *(++MBB.succ_begin()); + // We have structured control flow, so the number of successors should be + // two. + assert(MBB->succ_size() == 2); + MachineBasicBlock *SuccA = *MBB->succ_begin(); + MachineBasicBlock *SuccB = *(++MBB->succ_begin()); MachineBasicBlock *NCD = PDT->findNearestCommonDominator(SuccA, SuccB); if (!NCD) @@ -156,37 +169,51 @@ bool SIFixSGPRLiveRanges::runOnMachineFunction(MachineFunction &MF) { NCD = PDT->findNearestCommonDominator(*NCD->succ_begin(), *(++NCD->succ_begin())); } - assert(SuccA && SuccB); - for (std::pair<unsigned, LiveRange*> RegLR : SGPRLiveRanges) { - unsigned Reg = RegLR.first; - LiveRange *LR = RegLR.second; - - // FIXME: We could be smarter here. If the register is Live-In to - // one block, but the other doesn't have any SGPR defs, then there - // won't be a conflict. Also, if the branch decision is based on - // a value in an SGPR, then there will be no conflict. - bool LiveInToA = LIS->isLiveInToMBB(*LR, SuccA); - bool LiveInToB = LIS->isLiveInToMBB(*LR, SuccB); - - if ((!LiveInToA && !LiveInToB) || - (LiveInToA && LiveInToB)) + + for (unsigned Reg : SGPRLiveRanges) { + // FIXME: We could be smarter here. If the register is Live-In to one + // block, but the other doesn't have any SGPR defs, then there won't be a + // conflict. Also, if the branch condition is uniform then there will be + // no conflict. + bool LiveInToA = LV->isLiveIn(Reg, *SuccA); + bool LiveInToB = LV->isLiveIn(Reg, *SuccB); + + if (!LiveInToA && !LiveInToB) { + DEBUG(dbgs() << PrintReg(Reg, TRI, 0) + << " is live into neither successor\n"); continue; + } + + if (LiveInToA && LiveInToB) { + DEBUG(dbgs() << PrintReg(Reg, TRI, 0) + << " is live into both successors\n"); + continue; + } // This interval is live in to one successor, but not the other, so // we need to update its range so it is live in to both. - DEBUG(dbgs() << "Possible SGPR conflict detected " << " in " << *LR << - " BB#" << SuccA->getNumber() << ", BB#" << - SuccB->getNumber() << - " with NCD = " << NCD->getNumber() << '\n'); + DEBUG(dbgs() << "Possible SGPR conflict detected for " + << PrintReg(Reg, TRI, 0) + << " BB#" << SuccA->getNumber() + << ", BB#" << SuccB->getNumber() + << " with NCD = BB#" << NCD->getNumber() << '\n'); + + assert(TargetRegisterInfo::isVirtualRegister(Reg) && + "Not expecting to extend live range of physreg"); // FIXME: Need to figure out how to update LiveRange here so this pass // will be able to preserve LiveInterval analysis. 
- BuildMI(*NCD, NCD->getFirstNonPHI(), DebugLoc(), - TII->get(AMDGPU::SGPR_USE)) - .addReg(Reg, RegState::Implicit); - DEBUG(NCD->getFirstNonPHI()->dump()); + MachineInstr *NCDSGPRUse = + BuildMI(*NCD, NCD->getFirstNonPHI(), DebugLoc(), + TII->get(AMDGPU::SGPR_USE)) + .addReg(Reg, RegState::Implicit); + + MadeChange = true; + LV->HandleVirtRegUse(Reg, NCD, NCDSGPRUse); + + DEBUG(NCDSGPRUse->dump()); } } - return false; + return MadeChange; } diff --git a/contrib/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/contrib/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index c288725..02a3930 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -45,6 +45,7 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<MachineDominatorTree>(); + AU.addPreserved<MachineDominatorTree>(); AU.setPreservesCFG(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -164,8 +165,8 @@ static bool tryAddToFoldList(std::vector<FoldCandidate> &FoldList, // Operand is not legal, so try to commute the instruction to // see if this makes it possible to fold. - unsigned CommuteIdx0; - unsigned CommuteIdx1; + unsigned CommuteIdx0 = TargetInstrInfo::CommuteAnyOperandIndex; + unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex; bool CanCommute = TII->findCommutedOpIndices(MI, CommuteIdx0, CommuteIdx1); if (CanCommute) { @@ -175,7 +176,16 @@ static bool tryAddToFoldList(std::vector<FoldCandidate> &FoldList, OpNo = CommuteIdx0; } - if (!CanCommute || !TII->commuteInstruction(MI)) + // One of the operands might be an Imm operand, and OpNo may refer to it after + // the call of commuteInstruction() below. Such situations are avoided + // here explicitly as OpNo must be a register operand to be a candidate + // for memory folding. + if (CanCommute && (!MI->getOperand(CommuteIdx0).isReg() || + !MI->getOperand(CommuteIdx1).isReg())) + return false; + + if (!CanCommute || + !TII->commuteInstruction(MI, false, CommuteIdx0, CommuteIdx1)) return false; if (!TII->isOperandLegal(MI, OpNo, OpToFold)) @@ -186,6 +196,110 @@ static bool tryAddToFoldList(std::vector<FoldCandidate> &FoldList, return true; } +static void foldOperand(MachineOperand &OpToFold, MachineInstr *UseMI, + unsigned UseOpIdx, + std::vector<FoldCandidate> &FoldList, + SmallVectorImpl<MachineInstr *> &CopiesToReplace, + const SIInstrInfo *TII, const SIRegisterInfo &TRI, + MachineRegisterInfo &MRI) { + const MachineOperand &UseOp = UseMI->getOperand(UseOpIdx); + + // FIXME: Fold operands with subregs. + if (UseOp.isReg() && ((UseOp.getSubReg() && OpToFold.isReg()) || + UseOp.isImplicit())) { + return; + } + + bool FoldingImm = OpToFold.isImm(); + APInt Imm; + + if (FoldingImm) { + unsigned UseReg = UseOp.getReg(); + const TargetRegisterClass *UseRC + = TargetRegisterInfo::isVirtualRegister(UseReg) ? + MRI.getRegClass(UseReg) : + TRI.getPhysRegClass(UseReg); + + Imm = APInt(64, OpToFold.getImm()); + + const MCInstrDesc &FoldDesc = TII->get(OpToFold.getParent()->getOpcode()); + const TargetRegisterClass *FoldRC = + TRI.getRegClass(FoldDesc.OpInfo[0].RegClass); + + // Split 64-bit constants into 32-bits for folding. + if (FoldRC->getSize() == 8 && UseOp.getSubReg()) { + if (UseRC->getSize() != 8) + return; + + if (UseOp.getSubReg() == AMDGPU::sub0) { + Imm = Imm.getLoBits(32); + } else { + assert(UseOp.getSubReg() == AMDGPU::sub1); + Imm = Imm.getHiBits(32); + } + } + + // In order to fold immediates into copies, we need to change the + // copy to a MOV.
+ if (UseMI->getOpcode() == AMDGPU::COPY) { + unsigned DestReg = UseMI->getOperand(0).getReg(); + const TargetRegisterClass *DestRC + = TargetRegisterInfo::isVirtualRegister(DestReg) ? + MRI.getRegClass(DestReg) : + TRI.getPhysRegClass(DestReg); + + unsigned MovOp = TII->getMovOpcode(DestRC); + if (MovOp == AMDGPU::COPY) + return; + + UseMI->setDesc(TII->get(MovOp)); + CopiesToReplace.push_back(UseMI); + } + } + + // Special case for REG_SEQUENCE: We can't fold literals into + // REG_SEQUENCE instructions, so we have to fold them into the + // uses of REG_SEQUENCE. + if (UseMI->getOpcode() == AMDGPU::REG_SEQUENCE) { + unsigned RegSeqDstReg = UseMI->getOperand(0).getReg(); + unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm(); + + for (MachineRegisterInfo::use_iterator + RSUse = MRI.use_begin(RegSeqDstReg), + RSE = MRI.use_end(); RSUse != RSE; ++RSUse) { + + MachineInstr *RSUseMI = RSUse->getParent(); + if (RSUse->getSubReg() != RegSeqDstSubReg) + continue; + + foldOperand(OpToFold, RSUseMI, RSUse.getOperandNo(), FoldList, + CopiesToReplace, TII, TRI, MRI); + } + return; + } + + const MCInstrDesc &UseDesc = UseMI->getDesc(); + + // Don't fold into target independent nodes. Target independent opcodes + // don't have defined register classes. + if (UseDesc.isVariadic() || + UseDesc.OpInfo[UseOpIdx].RegClass == -1) + return; + + if (FoldingImm) { + MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue()); + tryAddToFoldList(FoldList, UseMI, UseOpIdx, &ImmOp, TII); + return; + } + + tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold, TII); + + // FIXME: We could try to change the instruction from 64-bit to 32-bit + // to enable more folding opportunities. The shrink operands pass + // already does this. + return; +} + bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { MachineRegisterInfo &MRI = MF.getRegInfo(); const SIInstrInfo *TII = @@ -226,88 +340,36 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { OpToFold.getSubReg())) continue; + + // We need to mutate the operands of new mov instructions to add implicit + // uses of EXEC, but adding them invalidates the use_iterator, so defer + // this. + SmallVector<MachineInstr *, 4> CopiesToReplace; + std::vector<FoldCandidate> FoldList; for (MachineRegisterInfo::use_iterator Use = MRI.use_begin(MI.getOperand(0).getReg()), E = MRI.use_end(); Use != E; ++Use) { MachineInstr *UseMI = Use->getParent(); - const MachineOperand &UseOp = UseMI->getOperand(Use.getOperandNo()); - // FIXME: Fold operands with subregs. - if (UseOp.isReg() && ((UseOp.getSubReg() && OpToFold.isReg()) || - UseOp.isImplicit())) { - continue; - } - - APInt Imm; - - if (FoldingImm) { - unsigned UseReg = UseOp.getReg(); - const TargetRegisterClass *UseRC - = TargetRegisterInfo::isVirtualRegister(UseReg) ? - MRI.getRegClass(UseReg) : - TRI.getPhysRegClass(UseReg); - - Imm = APInt(64, OpToFold.getImm()); - - // Split 64-bit constants into 32-bits for folding. - if (UseOp.getSubReg()) { - if (UseRC->getSize() != 8) - continue; - - if (UseOp.getSubReg() == AMDGPU::sub0) { - Imm = Imm.getLoBits(32); - } else { - assert(UseOp.getSubReg() == AMDGPU::sub1); - Imm = Imm.getHiBits(32); - } - } - - // In order to fold immediates into copies, we need to change the - // copy to a MOV. - if (UseMI->getOpcode() == AMDGPU::COPY) { - unsigned DestReg = UseMI->getOperand(0).getReg(); - const TargetRegisterClass *DestRC - = TargetRegisterInfo::isVirtualRegister(DestReg) ?
- MRI.getRegClass(DestReg) : - TRI.getPhysRegClass(DestReg); - - unsigned MovOp = TII->getMovOpcode(DestRC); - if (MovOp == AMDGPU::COPY) - continue; - - UseMI->setDesc(TII->get(MovOp)); - } - } - - const MCInstrDesc &UseDesc = UseMI->getDesc(); - - // Don't fold into target independent nodes. Target independent opcodes - // don't have defined register classes. - if (UseDesc.isVariadic() || - UseDesc.OpInfo[Use.getOperandNo()].RegClass == -1) - continue; - - if (FoldingImm) { - MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue()); - tryAddToFoldList(FoldList, UseMI, Use.getOperandNo(), &ImmOp, TII); - continue; - } - - tryAddToFoldList(FoldList, UseMI, Use.getOperandNo(), &OpToFold, TII); - - // FIXME: We could try to change the instruction from 64-bit to 32-bit - // to enable more folding opportunites. The shrink operands pass - // already does this. + foldOperand(OpToFold, UseMI, Use.getOperandNo(), FoldList, + CopiesToReplace, TII, TRI, MRI); } + // Make sure we add EXEC uses to any new v_mov instructions created. + for (MachineInstr *Copy : CopiesToReplace) + Copy->addImplicitDefUseOperands(MF); + for (FoldCandidate &Fold : FoldList) { if (updateOperand(Fold, TRI)) { // Clear kill flags. if (!Fold.isImm()) { assert(Fold.OpToFold && Fold.OpToFold->isReg()); - Fold.OpToFold->setIsKill(false); + // FIXME: Probably shouldn't bother trying to fold if not an + // SGPR. PeepholeOptimizer can eliminate redundant VGPR->VGPR + // copies. + MRI.clearKillFlags(Fold.OpToFold->getReg()); } DEBUG(dbgs() << "Folded source from " << MI << " into OpNo " << Fold.UseOpNo << " of " << *Fold.UseMI << '\n'); diff --git a/contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp new file mode 100644 index 0000000..6b3c81c --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -0,0 +1,243 @@ +//===----------------------- SIFrameLowering.cpp --------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//==-----------------------------------------------------------------------===// + +#include "SIFrameLowering.h" +#include "SIInstrInfo.h" +#include "SIMachineFunctionInfo.h" +#include "SIRegisterInfo.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/RegisterScavenging.h" + +using namespace llvm; + + +static bool hasOnlySGPRSpills(const SIMachineFunctionInfo *FuncInfo, + const MachineFrameInfo *FrameInfo) { + if (!FuncInfo->hasSpilledSGPRs()) + return false; + + if (FuncInfo->hasSpilledVGPRs()) + return false; + + for (int I = FrameInfo->getObjectIndexBegin(), + E = FrameInfo->getObjectIndexEnd(); I != E; ++I) { + if (!FrameInfo->isSpillSlotObjectIndex(I)) + return false; + } + + return true; +} + +static ArrayRef<MCPhysReg> getAllSGPR128() { + return makeArrayRef(AMDGPU::SReg_128RegClass.begin(), + AMDGPU::SReg_128RegClass.getNumRegs()); +} + +static ArrayRef<MCPhysReg> getAllSGPRs() { + return makeArrayRef(AMDGPU::SGPR_32RegClass.begin(), + AMDGPU::SGPR_32RegClass.getNumRegs()); +} + +void SIFrameLowering::emitPrologue(MachineFunction &MF, + MachineBasicBlock &MBB) const { + if (!MF.getFrameInfo()->hasStackObjects()) + return; + + assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported"); + + SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + + // If we only have SGPR spills, we won't actually be using scratch memory + // since these spill to VGPRs. + // + // FIXME: We should be cleaning up these unused SGPR spill frame indices + // somewhere. + if (hasOnlySGPRSpills(MFI, MF.getFrameInfo())) + return; + + const SIInstrInfo *TII = + static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo()); + const SIRegisterInfo *TRI = &TII->getRegisterInfo(); + const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>(); + + // We need to insert initialization of the scratch resource descriptor. + unsigned ScratchRsrcReg = MFI->getScratchRSrcReg(); + assert(ScratchRsrcReg != AMDGPU::NoRegister); + + unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg(); + assert(ScratchWaveOffsetReg != AMDGPU::NoRegister); + + unsigned PreloadedScratchWaveOffsetReg = TRI->getPreloadedValue( + MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); + + unsigned PreloadedPrivateBufferReg = AMDGPU::NoRegister; + if (ST.isAmdHsaOS()) { + PreloadedPrivateBufferReg = TRI->getPreloadedValue( + MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER); + } + + // If we reserved the original input registers, we don't need to copy to the + // reserved registers. + if (ScratchRsrcReg == PreloadedPrivateBufferReg) { + // We should always reserve these 5 registers at the same time. + assert(ScratchWaveOffsetReg == PreloadedScratchWaveOffsetReg && + "scratch wave offset and private segment buffer inconsistent"); + return; + } + + + // We added live-ins during argument lowering, but since they were not used + // they were deleted. We're adding the uses now, so add them back. + MachineRegisterInfo &MRI = MF.getRegInfo(); + MRI.addLiveIn(PreloadedScratchWaveOffsetReg); + MBB.addLiveIn(PreloadedScratchWaveOffsetReg); + + if (ST.isAmdHsaOS()) { + MRI.addLiveIn(PreloadedPrivateBufferReg); + MBB.addLiveIn(PreloadedPrivateBufferReg); + } + + // We reserved the last registers for this. Shift it down to the end of those + // which were actually used. + // + // FIXME: It might be safer to use a pseudoregister before replacement. + + // FIXME: We should be able to eliminate unused input registers. 
We only + // cannot do this for the resources required for scratch access. For now we + // skip over user SGPRs and may leave unused holes. + + // We find the resource first because it has an alignment requirement. + if (ScratchRsrcReg == TRI->reservedPrivateSegmentBufferReg(MF)) { + MachineRegisterInfo &MRI = MF.getRegInfo(); + + unsigned NumPreloaded = MFI->getNumPreloadedSGPRs() / 4; + // Skip the last 2 elements because the last one is reserved for VCC, and + // this is the 2nd to last element already. + for (MCPhysReg Reg : getAllSGPR128().drop_back(2).slice(NumPreloaded)) { + // Pick the first unallocated one. Make sure we don't clobber the other + // reserved input we needed. + if (!MRI.isPhysRegUsed(Reg)) { + assert(MRI.isAllocatable(Reg)); + MRI.replaceRegWith(ScratchRsrcReg, Reg); + ScratchRsrcReg = Reg; + MFI->setScratchRSrcReg(ScratchRsrcReg); + break; + } + } + } + + if (ScratchWaveOffsetReg == TRI->reservedPrivateSegmentWaveByteOffsetReg(MF)) { + MachineRegisterInfo &MRI = MF.getRegInfo(); + // Skip the last 2 elements because the last one is reserved for VCC, and + // this is the 2nd to last element already. + unsigned NumPreloaded = MFI->getNumPreloadedSGPRs(); + for (MCPhysReg Reg : getAllSGPRs().drop_back(6).slice(NumPreloaded)) { + // Pick the first unallocated SGPR. Be careful not to pick an alias of the + // scratch descriptor, since we haven't added its uses yet. + if (!MRI.isPhysRegUsed(Reg)) { + assert(MRI.isAllocatable(Reg) && + !TRI->isSubRegisterEq(ScratchRsrcReg, Reg)); + + MRI.replaceRegWith(ScratchWaveOffsetReg, Reg); + ScratchWaveOffsetReg = Reg; + MFI->setScratchWaveOffsetReg(ScratchWaveOffsetReg); + break; + } + } + } + + + assert(!TRI->isSubRegister(ScratchRsrcReg, ScratchWaveOffsetReg)); + + const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32); + MachineBasicBlock::iterator I = MBB.begin(); + DebugLoc DL; + + if (PreloadedScratchWaveOffsetReg != ScratchWaveOffsetReg) { + // Make sure we emit the copy for the offset first. We may have chosen to copy + // the buffer resource into a register that aliases the input offset register. + BuildMI(MBB, I, DL, SMovB32, ScratchWaveOffsetReg) + .addReg(PreloadedScratchWaveOffsetReg, RegState::Kill); + } + + if (ST.isAmdHsaOS()) { + // Insert copies from argument register. + assert( + !TRI->isSubRegisterEq(PreloadedPrivateBufferReg, ScratchRsrcReg) && + !TRI->isSubRegisterEq(PreloadedPrivateBufferReg, ScratchWaveOffsetReg)); + + unsigned Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1); + unsigned Rsrc23 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2_sub3); + + unsigned Lo = TRI->getSubReg(PreloadedPrivateBufferReg, AMDGPU::sub0_sub1); + unsigned Hi = TRI->getSubReg(PreloadedPrivateBufferReg, AMDGPU::sub2_sub3); + + const MCInstrDesc &SMovB64 = TII->get(AMDGPU::S_MOV_B64); + + BuildMI(MBB, I, DL, SMovB64, Rsrc01) + .addReg(Lo, RegState::Kill); + BuildMI(MBB, I, DL, SMovB64, Rsrc23) + .addReg(Hi, RegState::Kill); + } else { + unsigned Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0); + unsigned Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1); + unsigned Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2); + unsigned Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3); + + // Use relocations to get the pointer, and setup the other bits manually.
+ uint64_t Rsrc23 = TII->getScratchRsrcWords23(); + BuildMI(MBB, I, DL, SMovB32, Rsrc0) + .addExternalSymbol("SCRATCH_RSRC_DWORD0") + .addReg(ScratchRsrcReg, RegState::ImplicitDefine); + + BuildMI(MBB, I, DL, SMovB32, Rsrc1) + .addExternalSymbol("SCRATCH_RSRC_DWORD1") + .addReg(ScratchRsrcReg, RegState::ImplicitDefine); + + BuildMI(MBB, I, DL, SMovB32, Rsrc2) + .addImm(Rsrc23 & 0xffffffff) + .addReg(ScratchRsrcReg, RegState::ImplicitDefine); + + BuildMI(MBB, I, DL, SMovB32, Rsrc3) + .addImm(Rsrc23 >> 32) + .addReg(ScratchRsrcReg, RegState::ImplicitDefine); + } + + // Make the register selected live throughout the function. + for (MachineBasicBlock &OtherBB : MF) { + if (&OtherBB == &MBB) + continue; + + OtherBB.addLiveIn(ScratchRsrcReg); + OtherBB.addLiveIn(ScratchWaveOffsetReg); + } +} + +void SIFrameLowering::processFunctionBeforeFrameFinalized( + MachineFunction &MF, + RegScavenger *RS) const { + MachineFrameInfo *MFI = MF.getFrameInfo(); + + if (!MFI->hasStackObjects()) + return; + + bool MayNeedScavengingEmergencySlot = MFI->hasStackObjects(); + + assert((RS || !MayNeedScavengingEmergencySlot) && + "RegScavenger required if spilling"); + + if (MayNeedScavengingEmergencySlot) { + int ScavengeFI = MFI->CreateSpillStackObject( + AMDGPU::SGPR_32RegClass.getSize(), + AMDGPU::SGPR_32RegClass.getAlignment()); + RS->addScavengingFrameIndex(ScavengeFI); + } +} diff --git a/contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.h b/contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.h new file mode 100644 index 0000000..a9152fd --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.h @@ -0,0 +1,34 @@ +//===--------------------- SIFrameLowering.h --------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_SIFRAMELOWERING_H +#define LLVM_LIB_TARGET_AMDGPU_SIFRAMELOWERING_H + +#include "AMDGPUFrameLowering.h" + +namespace llvm { + +class SIFrameLowering final : public AMDGPUFrameLowering { +public: + SIFrameLowering(StackDirection D, unsigned StackAl, int LAO, + unsigned TransAl = 1) : + AMDGPUFrameLowering(D, StackAl, LAO, TransAl) {} + ~SIFrameLowering() override {} + + void emitPrologue(MachineFunction &MF, + MachineBasicBlock &MBB) const override; + + void processFunctionBeforeFrameFinalized( + MachineFunction &MF, + RegScavenger *RS = nullptr) const override; +}; + +} + +#endif diff --git a/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index c2db9ff..0e043cb 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -20,6 +20,7 @@ #include "SIISelLowering.h" #include "AMDGPU.h" +#include "AMDGPUDiagnosticInfoUnsupported.h" #include "AMDGPUIntrinsicInfo.h" #include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" @@ -51,6 +52,9 @@ SITargetLowering::SITargetLowering(TargetMachine &TM, addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass); addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass); + addRegisterClass(MVT::v2i64, &AMDGPU::SReg_128RegClass); + addRegisterClass(MVT::v2f64, &AMDGPU::SReg_128RegClass); + addRegisterClass(MVT::v4i32, &AMDGPU::SReg_128RegClass); addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass); @@ -103,6 +107,7 @@ SITargetLowering::SITargetLowering(TargetMachine &TM, setOperationAction(ISD::SETCC, MVT::v4i1, Expand); setOperationAction(ISD::BSWAP, MVT::i32, Legal); + setOperationAction(ISD::BITREVERSE, MVT::i32, Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom); @@ -155,13 +160,30 @@ SITargetLowering::SITargetLowering(TargetMachine &TM, for (MVT VT : MVT::fp_valuetypes()) setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand); + setTruncStoreAction(MVT::i64, MVT::i32, Expand); setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand); setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand); setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand); + + setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand); + + setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand); + setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand); + setOperationAction(ISD::LOAD, MVT::i1, Custom); + setOperationAction(ISD::LOAD, MVT::v2i64, Promote); + AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32); + + setOperationAction(ISD::STORE, MVT::v2i64, Promote); + AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32); + + setOperationAction(ISD::ConstantPool, MVT::v2i64, Expand); + setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); setOperationAction(ISD::FrameIndex, MVT::i32, Custom); @@ -173,9 +195,14 @@ SITargetLowering::SITargetLowering(TargetMachine &TM, setOperationAction(ISD::SELECT_CC, MVT::i1, Expand); setOperationAction(ISD::SELECT, MVT::i1, Promote); + setOperationAction(ISD::TRUNCATE, MVT::v2i32, Expand); + + + setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand); + // We only support LOAD/STORE and vector manipulation ops for vectors // with > 4 elements. 
- for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32}) { + for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64}) { for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) { switch(Op) { case ISD::LOAD: @@ -186,6 +213,7 @@ SITargetLowering::SITargetLowering(TargetMachine &TM, case ISD::INSERT_VECTOR_ELT: case ISD::INSERT_SUBVECTOR: case ISD::EXTRACT_SUBVECTOR: + case ISD::SCALAR_TO_VECTOR: break; case ISD::CONCAT_VECTORS: setOperationAction(Op, VT, Custom); @@ -197,6 +225,22 @@ SITargetLowering::SITargetLowering(TargetMachine &TM, } } + // Most operations are naturally 32-bit vector operations. We only support + // load and store of i64 vectors, so promote v2i64 vector operations to v4i32. + for (MVT Vec64 : { MVT::v2i64, MVT::v2f64 }) { + setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote); + AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32); + + setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote); + AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32); + + setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote); + AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32); + + setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote); + AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32); + } + if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { setOperationAction(ISD::FTRUNC, MVT::f64, Legal); setOperationAction(ISD::FCEIL, MVT::f64, Legal); @@ -261,6 +305,41 @@ bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const { return AM.BaseOffs == 0 && (AM.Scale == 0 || AM.Scale == 1); } +bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const { + // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and + // additionally can do r + r + i with addr64. 32-bit has more addressing + // mode options. Depending on the resource constant, it can also do + // (i64 r0) + (i32 r1) * (i14 i). + // + // Private arrays end up using a scratch buffer most of the time, so also + // assume those use MUBUF instructions. Scratch loads / stores are currently + // implemented as mubuf instructions with offen bit set, so slightly + // different than the normal addr64. + if (!isUInt<12>(AM.BaseOffs)) + return false; + + // FIXME: Since we can split immediate into soffset and immediate offset, + // would it make sense to allow any immediate? + + switch (AM.Scale) { + case 0: // r + i or just i, depending on HasBaseReg. + return true; + case 1: + return true; // We have r + r or r + i. + case 2: + if (AM.HasBaseReg) { + // Reject 2 * r + r. + return false; + } + + // Allow 2 * r as r + r + // Or 2 * r + i is allowed as r + r + i. + return true; + default: // Don't allow n * r + return false; + } +} + bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS) const { @@ -269,7 +348,7 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL, return false; switch (AS) { - case AMDGPUAS::GLOBAL_ADDRESS: + case AMDGPUAS::GLOBAL_ADDRESS: { if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { // Assume that we will use FLAT for all global memory accesses // on VI. @@ -282,51 +361,51 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL, // because it has never been validated. return isLegalFlatAddressingMode(AM); } - // fall-through - case AMDGPUAS::PRIVATE_ADDRESS: - case AMDGPUAS::CONSTANT_ADDRESS: // XXX - Should we assume SMRD instructions?
- case AMDGPUAS::UNKNOWN_ADDRESS_SPACE: { - // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and - // additionally can do r + r + i with addr64. 32-bit has more addressing - // mode options. Depending on the resource constant, it can also do - // (i64 r0) + (i32 r1) * (i14 i). - // - // SMRD instructions have an 8-bit, dword offset. - // - // Assume nonunifom access, since the address space isn't enough to know - // what instruction we will use, and since we don't know if this is a load - // or store and scalar stores are only available on VI. - // - // We also know if we are doing an extload, we can't do a scalar load. - // - // Private arrays end up using a scratch buffer most of the time, so also - // assume those use MUBUF instructions. Scratch loads / stores are currently - // implemented as mubuf instructions with offen bit set, so slightly - // different than the normal addr64. - if (!isUInt<12>(AM.BaseOffs)) - return false; - // FIXME: Since we can split immediate into soffset and immediate offset, - // would it make sense to allow any immediate? + return isLegalMUBUFAddressingMode(AM); + } + case AMDGPUAS::CONSTANT_ADDRESS: { + // If the offset isn't a multiple of 4, it probably isn't going to be + // correctly aligned. + if (AM.BaseOffs % 4 != 0) + return isLegalMUBUFAddressingMode(AM); + + // There are no SMRD extloads, so if we have to do a small type access we + // will use a MUBUF load. + // FIXME?: We also need to do this if unaligned, but we don't know the + // alignment here. + if (DL.getTypeStoreSize(Ty) < 4) + return isLegalMUBUFAddressingMode(AM); + + if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) { + // SMRD instructions have an 8-bit, dword offset on SI. + if (!isUInt<8>(AM.BaseOffs / 4)) + return false; + } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) { + // On CI+, this can also be a 32-bit literal constant offset. If it fits + // in 8-bits, it can use a smaller encoding. + if (!isUInt<32>(AM.BaseOffs / 4)) + return false; + } else if (Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) { + // On VI, these use the SMEM format and the offset is 20-bit in bytes. + if (!isUInt<20>(AM.BaseOffs)) + return false; + } else + llvm_unreachable("unhandled generation"); - switch (AM.Scale) { - case 0: // r + i or just i, depending on HasBaseReg. + if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg. return true; - case 1: - return true; // We have r + r or r + i. - case 2: - if (AM.HasBaseReg) { - // Reject 2 * r + r. - return false; - } - // Allow 2 * r as r + r - // Or 2 * r + i is allowed as r + r + i. + if (AM.Scale == 1 && AM.HasBaseReg) return true; - default: // Don't allow n * r - return false; - } + + return false; } + + case AMDGPUAS::PRIVATE_ADDRESS: + case AMDGPUAS::UNKNOWN_ADDRESS_SPACE: + return isLegalMUBUFAddressingMode(AM); + case AMDGPUAS::LOCAL_ADDRESS: case AMDGPUAS::REGION_ADDRESS: { // Basic, single offset DS instructions allow a 16-bit unsigned immediate @@ -374,7 +453,10 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT, // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte // aligned, 8 byte access in a single operation using ds_read2/write2_b32 // with adjacent offsets. - return Align % 4 == 0; + bool AlignedBy4 = (Align % 4 == 0); + if (IsFast) + *IsFast = AlignedBy4; + return AlignedBy4; } // Smaller than dword value must be aligned. 
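A worked reading of the per-generation SMRD offset rules introduced in the hunk above (the thresholds are my arithmetic from the quoted predicates; note the BaseOffs % 4 and small-type checks fall back to MUBUF first, and the Gen enum below is a hypothetical stand-in for SOUTHERN_ISLANDS / SEA_ISLANDS / VOLCANIC_ISLANDS):

#include <cstdint>

// Standalone restatement of the SMRD immediate-offset checks, byte offsets:
//   SI: isUInt<8>(BaseOffs / 4)   -> up to 255 dwords = 1020 bytes
//   CI: isUInt<32>(BaseOffs / 4)  -> a 32-bit literal dword offset
//   VI: isUInt<20>(BaseOffs)      -> up to 1048575 bytes (just under 1 MiB)
enum class Gen { SI, CI, VI };

static bool isLegalSMRDImmOffset(Gen G, int64_t ByteOffs) {
  auto fitsUInt = [](unsigned Bits, int64_t V) {
    return V >= 0 && static_cast<uint64_t>(V) < (UINT64_C(1) << Bits);
  };
  switch (G) {
  case Gen::SI: return fitsUInt(8, ByteOffs / 4);   // dword-scaled on SI
  case Gen::CI: return fitsUInt(32, ByteOffs / 4);  // dword-scaled literal
  case Gen::VI: return fitsUInt(20, ByteOffs);      // byte offset on VI
  }
  return false;
}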
@@ -411,6 +493,32 @@ EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign, return MVT::Other; } +static bool isFlatGlobalAddrSpace(unsigned AS) { + return AS == AMDGPUAS::GLOBAL_ADDRESS || + AS == AMDGPUAS::FLAT_ADDRESS || + AS == AMDGPUAS::CONSTANT_ADDRESS; +} + +bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS, + unsigned DestAS) const { + return isFlatGlobalAddrSpace(SrcAS) && isFlatGlobalAddrSpace(DestAS); +} + + +bool SITargetLowering::isMemOpUniform(const SDNode *N) const { + const MemSDNode *MemNode = cast<MemSDNode>(N); + const Value *Ptr = MemNode->getMemOperand()->getValue(); + + // UndefValue means this is a load of a kernel input. These are uniform. + // Sometimes LDS instructions have constant pointers + if (isa<UndefValue>(Ptr) || isa<Argument>(Ptr) || isa<Constant>(Ptr) || + isa<GlobalValue>(Ptr)) + return true; + + const Instruction *I = dyn_cast_or_null<Instruction>(Ptr); + return I && I->getMetadata("amdgpu.uniform"); +} + TargetLoweringBase::LegalizeTypeAction SITargetLowering::getPreferredVectorAction(EVT VT) const { if (VT.getVectorNumElements() != 1 && VT.getScalarType().bitsLE(MVT::i16)) @@ -426,12 +534,6 @@ bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, return TII->isInlineConstant(Imm); } -static EVT toIntegerVT(EVT VT) { - if (VT.isVector()) - return VT.changeVectorElementTypeToInteger(); - return MVT::getIntegerVT(VT.getSizeInBits()); -} - SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, SDLoc SL, SDValue Chain, unsigned Offset, bool Signed) const { @@ -439,7 +541,7 @@ SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, MachineFunction &MF = DAG.getMachineFunction(); const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo*>(Subtarget->getRegisterInfo()); - unsigned InputPtrReg = TRI->getPreloadedValue(MF, SIRegisterInfo::INPUT_PTR); + unsigned InputPtrReg = TRI->getPreloadedValue(MF, SIRegisterInfo::KERNARG_SEGMENT_PTR); Type *Ty = VT.getTypeForEVT(*DAG.getContext()); @@ -455,30 +557,10 @@ SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, unsigned Align = DL.getABITypeAlignment(Ty); - if (VT != MemVT && VT.isFloatingPoint()) { - // Do an integer load and convert. - // FIXME: This is mostly because load legalization after type legalization - // doesn't handle FP extloads. - assert(VT.getScalarType() == MVT::f32 && - MemVT.getScalarType() == MVT::f16); - - EVT IVT = toIntegerVT(VT); - EVT MemIVT = toIntegerVT(MemVT); - SDValue Load = DAG.getLoad(ISD::UNINDEXED, ISD::ZEXTLOAD, - IVT, SL, Chain, Ptr, PtrOffset, PtrInfo, MemIVT, - false, // isVolatile - true, // isNonTemporal - true, // isInvariant - Align); // Alignment - SDValue Ops[] = { - DAG.getNode(ISD::FP16_TO_FP, SL, VT, Load), - Load.getValue(1) - }; - - return DAG.getMergeValues(Ops, SL); - } - ISD::LoadExtType ExtTy = Signed ? 
ISD::SEXTLOAD : ISD::ZEXTLOAD; + if (MemVT.isFloatingPoint()) + ExtTy = ISD::EXTLOAD; + return DAG.getLoad(ISD::UNINDEXED, ExtTy, VT, SL, Chain, Ptr, PtrOffset, PtrInfo, MemVT, false, // isVolatile @@ -497,8 +579,16 @@ SDValue SITargetLowering::LowerFormalArguments( MachineFunction &MF = DAG.getMachineFunction(); FunctionType *FType = MF.getFunction()->getFunctionType(); SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); + const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>(); + + if (Subtarget->isAmdHsaOS() && Info->getShaderType() != ShaderType::COMPUTE) { + const Function *Fn = MF.getFunction(); + DiagnosticInfoUnsupported NoGraphicsHSA(*Fn, "non-compute shaders with HSA"); + DAG.getContext()->diagnose(NoGraphicsHSA); + return SDValue(); + } - assert(CallConv == CallingConv::C); + // FIXME: We currently assume all calling conventions are kernels. SmallVector<ISD::InputArg, 16> Splits; BitVector Skipped(Ins.size()); @@ -513,7 +603,7 @@ SDValue SITargetLowering::LowerFormalArguments( assert((PSInputNum <= 15) && "Too many PS inputs!"); if (!Arg.Used) { - // We can savely skip PS inputs + // We can safely skip PS inputs Skipped.set(i); ++PSInputNum; continue; @@ -530,7 +620,7 @@ SDValue SITargetLowering::LowerFormalArguments( // We REALLY want the ORIGINAL number of vertex elements here, e.g. a // three or five element vertex only needs three or five registers, - // NOT four or eigth. + // NOT four or eight. Type *ParamType = FType->getParamType(Arg.getOrigArgIndex()); unsigned NumElements = ParamType->getVectorNumElements(); @@ -556,41 +646,30 @@ SDValue SITargetLowering::LowerFormalArguments( CCInfo.AllocateReg(AMDGPU::VGPR1); } - // The pointer to the list of arguments is stored in SGPR0, SGPR1 - // The pointer to the scratch buffer is stored in SGPR2, SGPR3 - if (Info->getShaderType() == ShaderType::COMPUTE) { - if (Subtarget->isAmdHsaOS()) - Info->NumUserSGPRs = 2; // FIXME: Need to support scratch buffers. - else - Info->NumUserSGPRs = 4; - - unsigned InputPtrReg = - TRI->getPreloadedValue(MF, SIRegisterInfo::INPUT_PTR); - unsigned InputPtrRegLo = - TRI->getPhysRegSubReg(InputPtrReg, &AMDGPU::SReg_32RegClass, 0); - unsigned InputPtrRegHi = - TRI->getPhysRegSubReg(InputPtrReg, &AMDGPU::SReg_32RegClass, 1); - - unsigned ScratchPtrReg = - TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_PTR); - unsigned ScratchPtrRegLo = - TRI->getPhysRegSubReg(ScratchPtrReg, &AMDGPU::SReg_32RegClass, 0); - unsigned ScratchPtrRegHi = - TRI->getPhysRegSubReg(ScratchPtrReg, &AMDGPU::SReg_32RegClass, 1); - - CCInfo.AllocateReg(InputPtrRegLo); - CCInfo.AllocateReg(InputPtrRegHi); - CCInfo.AllocateReg(ScratchPtrRegLo); - CCInfo.AllocateReg(ScratchPtrRegHi); - MF.addLiveIn(InputPtrReg, &AMDGPU::SReg_64RegClass); - MF.addLiveIn(ScratchPtrReg, &AMDGPU::SReg_64RegClass); - } - if (Info->getShaderType() == ShaderType::COMPUTE) { getOriginalFunctionArgs(DAG, DAG.getMachineFunction().getFunction(), Ins, Splits); } + // FIXME: How should these inputs interact with inreg / custom SGPR inputs? 
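For orientation before the register setup that follows, a back-of-the-envelope tally of the user SGPR inputs wired up below (my arithmetic; the sizes follow the register classes the code uses, SReg_128 = 4 SGPRs and SReg_64 = 2):

// Hypothetical tally of the HSA user-SGPR inputs allocated below.
constexpr unsigned PrivateSegmentBufferSGPRs = 4; // SReg_128 resource
constexpr unsigned DispatchPtrSGPRs = 2;          // SReg_64 pointer
constexpr unsigned KernargSegmentPtrSGPRs = 2;    // SReg_64 pointer
static_assert(PrivateSegmentBufferSGPRs + DispatchPtrSGPRs +
                  KernargSegmentPtrSGPRs == 8,
              "8 user SGPRs when all three inputs are enabled");
// The system SGPRs (workgroup IDs X/Y/Z, workgroup info, scratch wave byte
// offset) are allocated after these, one SReg_32 each, further below.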
+ if (Info->hasPrivateSegmentBuffer()) { + unsigned PrivateSegmentBufferReg = Info->addPrivateSegmentBuffer(*TRI); + MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SReg_128RegClass); + CCInfo.AllocateReg(PrivateSegmentBufferReg); + } + + if (Info->hasDispatchPtr()) { + unsigned DispatchPtrReg = Info->addDispatchPtr(*TRI); + MF.addLiveIn(DispatchPtrReg, &AMDGPU::SReg_64RegClass); + CCInfo.AllocateReg(DispatchPtrReg); + } + + if (Info->hasKernargSegmentPtr()) { + unsigned InputPtrReg = Info->addKernargSegmentPtr(*TRI); + MF.addLiveIn(InputPtrReg, &AMDGPU::SReg_64RegClass); + CCInfo.AllocateReg(InputPtrReg); + } + AnalyzeFormalArguments(CCInfo, Splits); SmallVector<SDValue, 16> Chains; @@ -617,7 +696,7 @@ SDValue SITargetLowering::LowerFormalArguments( Offset, Ins[i].Flags.isSExt()); Chains.push_back(Arg.getValue(1)); - const PointerType *ParamTy = + auto *ParamTy = dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex())); if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS && ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { @@ -678,10 +757,113 @@ SDValue SITargetLowering::LowerFormalArguments( InVals.push_back(Val); } - if (Info->getShaderType() != ShaderType::COMPUTE) { - unsigned ScratchIdx = CCInfo.getFirstUnallocated(ArrayRef<MCPhysReg>( - AMDGPU::SGPR_32RegClass.begin(), AMDGPU::SGPR_32RegClass.getNumRegs())); - Info->ScratchOffsetReg = AMDGPU::SGPR_32RegClass.getRegister(ScratchIdx); + // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read + // these from the dispatch pointer. + + // Start adding system SGPRs. + if (Info->hasWorkGroupIDX()) { + unsigned Reg = Info->addWorkGroupIDX(); + MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass); + CCInfo.AllocateReg(Reg); + } else + llvm_unreachable("work group id x is always enabled"); + + if (Info->hasWorkGroupIDY()) { + unsigned Reg = Info->addWorkGroupIDY(); + MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass); + CCInfo.AllocateReg(Reg); + } + + if (Info->hasWorkGroupIDZ()) { + unsigned Reg = Info->addWorkGroupIDZ(); + MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass); + CCInfo.AllocateReg(Reg); + } + + if (Info->hasWorkGroupInfo()) { + unsigned Reg = Info->addWorkGroupInfo(); + MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass); + CCInfo.AllocateReg(Reg); + } + + if (Info->hasPrivateSegmentWaveByteOffset()) { + // Scratch wave offset passed in system SGPR. + unsigned PrivateSegmentWaveByteOffsetReg + = Info->addPrivateSegmentWaveByteOffset(); + + MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass); + CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg); + } + + // Now that we've figured out where the scratch register inputs are, see if we + // should reserve the arguments and use them directly. + + bool HasStackObjects = MF.getFrameInfo()->hasStackObjects(); + + if (ST.isAmdHsaOS()) { + // TODO: Assume we will spill without optimizations. + if (HasStackObjects) { + // If we have stack objects, we unquestionably need the private buffer + // resource. For the HSA ABI, this will be the first 4 user SGPR + // inputs. We can reserve those and use them directly.
+ + unsigned PrivateSegmentBufferReg = TRI->getPreloadedValue( + MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER); + Info->setScratchRSrcReg(PrivateSegmentBufferReg); + + unsigned PrivateSegmentWaveByteOffsetReg = TRI->getPreloadedValue( + MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); + Info->setScratchWaveOffsetReg(PrivateSegmentWaveByteOffsetReg); + } else { + unsigned ReservedBufferReg + = TRI->reservedPrivateSegmentBufferReg(MF); + unsigned ReservedOffsetReg + = TRI->reservedPrivateSegmentWaveByteOffsetReg(MF); + + // We tentatively reserve the last registers (skipping the last two + // which may contain VCC). After register allocation, we'll replace + // these with the ones immediately after those which were really + // allocated. In the prologue copies will be inserted from the argument + // to these reserved registers. + Info->setScratchRSrcReg(ReservedBufferReg); + Info->setScratchWaveOffsetReg(ReservedOffsetReg); + } + } else { + unsigned ReservedBufferReg = TRI->reservedPrivateSegmentBufferReg(MF); + + // Without HSA, relocations are used for the scratch pointer and the + // buffer resource setup is always inserted in the prologue. Scratch wave + // offset is still in an input SGPR. + Info->setScratchRSrcReg(ReservedBufferReg); + + if (HasStackObjects) { + unsigned ScratchWaveOffsetReg = TRI->getPreloadedValue( + MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); + Info->setScratchWaveOffsetReg(ScratchWaveOffsetReg); + } else { + unsigned ReservedOffsetReg + = TRI->reservedPrivateSegmentWaveByteOffsetReg(MF); + Info->setScratchWaveOffsetReg(ReservedOffsetReg); + } + } + + if (Info->hasWorkItemIDX()) { + unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X); + MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass); + CCInfo.AllocateReg(Reg); + } else + llvm_unreachable("workitem id x should always be enabled"); + + if (Info->hasWorkItemIDY()) { + unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y); + MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass); + CCInfo.AllocateReg(Reg); + } + + if (Info->hasWorkItemIDZ()) { + unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Z); + MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass); + CCInfo.AllocateReg(Reg); } if (Chains.empty()) @@ -693,27 +875,11 @@ SDValue SITargetLowering::LowerFormalArguments( MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter( MachineInstr * MI, MachineBasicBlock * BB) const { - MachineBasicBlock::iterator I = *MI; - const SIInstrInfo *TII = - static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); - switch (MI->getOpcode()) { default: return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); case AMDGPU::BRANCH: return BB; - case AMDGPU::SI_RegisterStorePseudo: { - MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); - unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); - MachineInstrBuilder MIB = - BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::SI_RegisterStore), - Reg); - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) - MIB.addOperand(MI->getOperand(i)); - - MI->eraseFromParent(); - break; - } } return BB; } @@ -944,20 +1110,8 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI, const GlobalValue *GV = GSD->getGlobal(); MVT PtrVT = getPointerTy(DAG.getDataLayout(), GSD->getAddressSpace()); - SDValue Ptr = DAG.getNode(AMDGPUISD::CONST_DATA_PTR, DL, PtrVT); SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32); - - SDValue PtrLo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, 
Ptr, - DAG.getConstant(0, DL, MVT::i32)); - SDValue PtrHi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Ptr, - DAG.getConstant(1, DL, MVT::i32)); - - SDValue Lo = DAG.getNode(ISD::ADDC, DL, DAG.getVTList(MVT::i32, MVT::Glue), - PtrLo, GA); - SDValue Hi = DAG.getNode(ISD::ADDE, DL, DAG.getVTList(MVT::i32, MVT::Glue), - PtrHi, DAG.getConstant(0, DL, MVT::i32), - SDValue(Lo.getNode(), 1)); - return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Lo, Hi); + return DAG.getNode(AMDGPUISD::CONST_DATA_PTR, DL, PtrVT, GA); } SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain, SDLoc DL, @@ -977,6 +1131,18 @@ SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain, SDLoc DL, // a glue result. } +SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG, + SDValue Op, + MVT VT, + unsigned Offset) const { + SDLoc SL(Op); + SDValue Param = LowerParameter(DAG, MVT::i32, MVT::i32, SL, + DAG.getEntryNode(), Offset, false); + // The local size values will have the hi 16-bits as zero. + return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param, + DAG.getValueType(VT)); +} + SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); @@ -988,7 +1154,13 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SDLoc DL(Op); unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); + // TODO: Should this propagate fast-math-flags? + switch (IntrinsicID) { + case Intrinsic::amdgcn_dispatch_ptr: + return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, + TRI->getPreloadedValue(MF, SIRegisterInfo::DISPATCH_PTR), VT); + case Intrinsic::r600_read_ngroups_x: return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), SI::KernelInputOffsets::NGROUPS_X, false); @@ -1008,37 +1180,36 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), SI::KernelInputOffsets::GLOBAL_SIZE_Z, false); case Intrinsic::r600_read_local_size_x: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::LOCAL_SIZE_X, false); + return lowerImplicitZextParam(DAG, Op, MVT::i16, + SI::KernelInputOffsets::LOCAL_SIZE_X); case Intrinsic::r600_read_local_size_y: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::LOCAL_SIZE_Y, false); + return lowerImplicitZextParam(DAG, Op, MVT::i16, + SI::KernelInputOffsets::LOCAL_SIZE_Y); case Intrinsic::r600_read_local_size_z: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::LOCAL_SIZE_Z, false); - + return lowerImplicitZextParam(DAG, Op, MVT::i16, + SI::KernelInputOffsets::LOCAL_SIZE_Z); case Intrinsic::AMDGPU_read_workdim: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - getImplicitParameterOffset(MFI, GRID_DIM), false); - + // Really only 2 bits. 
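The lowerImplicitZextParam helper added above wraps the 32-bit parameter load in an ISD::AssertZext node. A hedged sketch of what that buys, using the names from the helper (Param, SL); the fold itself is generic DAG-combiner behavior, not something this patch adds:

    // AssertZext records that bits above the asserted type are zero, so
    // computeKnownBits reports the high 16 bits of a local-size query as
    // known-zero, and a user-level mask such as
    //   unsigned lsx = get_local_size(0) & 0xffff;
    // folds away with no v_and_b32 emitted.
    SDValue Known = DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
                                DAG.getValueType(MVT::i16));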
+ return lowerImplicitZextParam(DAG, Op, MVT::i8, + getImplicitParameterOffset(MFI, GRID_DIM)); case Intrinsic::r600_read_tgid_x: return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, - TRI->getPreloadedValue(MF, SIRegisterInfo::TGID_X), VT); + TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_X), VT); case Intrinsic::r600_read_tgid_y: return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, - TRI->getPreloadedValue(MF, SIRegisterInfo::TGID_Y), VT); + TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_Y), VT); case Intrinsic::r600_read_tgid_z: return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, - TRI->getPreloadedValue(MF, SIRegisterInfo::TGID_Z), VT); + TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_Z), VT); case Intrinsic::r600_read_tidig_x: return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass, - TRI->getPreloadedValue(MF, SIRegisterInfo::TIDIG_X), VT); + TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X), VT); case Intrinsic::r600_read_tidig_y: return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass, - TRI->getPreloadedValue(MF, SIRegisterInfo::TIDIG_Y), VT); + TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y), VT); case Intrinsic::r600_read_tidig_z: return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass, - TRI->getPreloadedValue(MF, SIRegisterInfo::TIDIG_Z), VT); + TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Z), VT); case AMDGPUIntrinsic::SI_load_const: { SDValue Ops[] = { Op.getOperand(1), @@ -1077,6 +1248,10 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, DAG.getConstant(2, DL, MVT::i32), // P0 Op.getOperand(1), Op.getOperand(2), Glue); } + case AMDGPUIntrinsic::SI_packf16: + if (Op.getOperand(1).isUndef() && Op.getOperand(2).isUndef()) + return DAG.getUNDEF(MVT::i32); + return Op; case AMDGPUIntrinsic::SI_fs_interp: { SDValue IJ = Op.getOperand(4); SDValue I = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, IJ, @@ -1092,6 +1267,19 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getNode(AMDGPUISD::INTERP_P2, DL, MVT::f32, P1, J, Op.getOperand(1), Op.getOperand(2), Glue); } + case Intrinsic::amdgcn_interp_p1: { + SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4)); + SDValue Glue = M0.getValue(1); + return DAG.getNode(AMDGPUISD::INTERP_P1, DL, MVT::f32, Op.getOperand(1), + Op.getOperand(2), Op.getOperand(3), Glue); + } + case Intrinsic::amdgcn_interp_p2: { + SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(5)); + SDValue Glue = SDValue(M0.getNode(), 1); + return DAG.getNode(AMDGPUISD::INTERP_P2, DL, MVT::f32, Op.getOperand(1), + Op.getOperand(2), Op.getOperand(3), Op.getOperand(4), + Glue); + } default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); } @@ -1152,16 +1340,29 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { "Custom lowering for non-i32 vectors hasn't been implemented."); unsigned NumElements = Op.getValueType().getVectorNumElements(); assert(NumElements != 2 && "v2 loads are supported for all address spaces."); + switch (Load->getAddressSpace()) { default: break; + case AMDGPUAS::CONSTANT_ADDRESS: + if (isMemOpUniform(Load)) + break; + // Non-uniform loads will be selected to MUBUF instructions, so they + // have the same legalization requirements as global and private + // loads. + // + // Fall-through case AMDGPUAS::GLOBAL_ADDRESS: case AMDGPUAS::PRIVATE_ADDRESS: + if (NumElements >= 8) + return SplitVectorLoad(Op, DAG); + // v4 loads are supported for private and global memory.
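The LowerLOAD hunk above stops scalarizing wide vector loads and splits them in half instead. A rough sketch of the halving step with standard SelectionDAG types — this paraphrases the idea, not the patch's exact SplitVectorLoad code:

    // A v8i32 load becomes two v4i32 loads, which buffer and DS
    // instructions can service directly, instead of eight scalar loads.
    EVT VT = Op.getValueType();                    // e.g. v8i32
    unsigned NumElts = VT.getVectorNumElements();  // 8
    EVT HalfVT = EVT::getVectorVT(*DAG.getContext(),
                                  VT.getVectorElementType(),
                                  NumElts / 2);    // v4i32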
if (NumElements <= 4) break; // fall-through case AMDGPUAS::LOCAL_ADDRESS: - return ScalarizeVectorLoad(Op, DAG); + // If properly aligned, if we split we might be able to use ds_read_b64. + return SplitVectorLoad(Op, DAG); } } @@ -1236,8 +1437,10 @@ SDValue SITargetLowering::LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const { if (Unsafe) { // Turn into multiply by the reciprocal. // x / y -> x * (1.0 / y) + SDNodeFlags Flags; + Flags.setUnsafeAlgebra(true); SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS); - return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip); + return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, &Flags); } return SDValue(); @@ -1274,6 +1477,8 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const { SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One); + // TODO: Should this propagate fast-math-flags? + r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3); SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1); @@ -1379,7 +1584,7 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { return Ret; if (VT.isVector() && VT.getVectorNumElements() >= 8) - return ScalarizeVectorStore(Op, DAG); + return SplitVectorStore(Op, DAG); if (VT == MVT::i1) return DAG.getTruncStore(Store->getChain(), DL, @@ -1393,6 +1598,7 @@ SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); EVT VT = Op.getValueType(); SDValue Arg = Op.getOperand(0); + // TODO: Should this propagate fast-math-flags? SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, DL, VT, DAG.getNode(ISD::FMUL, DL, VT, Arg, DAG.getConstantFP(0.5/M_PI, DL, @@ -2125,9 +2331,14 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI, static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); - TII->legalizeOperands(MI); - if (TII->isMIMG(MI->getOpcode())) { + if (TII->isVOP3(MI->getOpcode())) { + // Make sure constant bus requirements are respected. + TII->legalizeOperandsVOP3(MRI, MI); + return; + } + + if (TII->isMIMG(*MI)) { unsigned VReg = MI->getOperand(0).getReg(); unsigned Writemask = MI->getOperand(1).getImm(); unsigned BitsSet = 0; @@ -2169,53 +2380,38 @@ MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG, SDLoc DL, SDValue Ptr) const { const SIInstrInfo *TII = - static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); -#if 1 - // XXX - Workaround for moveToVALU not handling different register class - // inserts for REG_SEQUENCE. - - // Build the half of the subregister with the constants. - const SDValue Ops0[] = { - DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32), - buildSMovImm32(DAG, DL, 0), - DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32), - buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32), - DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32) - }; - - SDValue SubRegHi = SDValue(DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, - MVT::v2i32, Ops0), 0); - - // Combine the constants and the pointer. - const SDValue Ops1[] = { - DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32), - Ptr, - DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32), - SubRegHi, - DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32) - }; + static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); + + // Build the half of the subregister with the constants before building the + // full 128-bit register. 
If we are building multiple resource descriptors, + // this will allow CSEing of the 2-component register. + const SDValue Ops0[] = { + DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32), + buildSMovImm32(DAG, DL, 0), + DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32), + buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32), + DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32) + }; - return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1); -#else - const SDValue Ops[] = { - DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, MVT::i32), - Ptr, - DAG.getTargetConstant(AMDGPU::sub0_sub1, MVT::i32), - buildSMovImm32(DAG, DL, 0), - DAG.getTargetConstant(AMDGPU::sub2, MVT::i32), - buildSMovImm32(DAG, DL, TII->getDefaultRsrcFormat() >> 32), - DAG.getTargetConstant(AMDGPU::sub3, MVT::i32) - }; + SDValue SubRegHi = SDValue(DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, + MVT::v2i32, Ops0), 0); - return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops); + // Combine the constants and the pointer. + const SDValue Ops1[] = { + DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32), + Ptr, + DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32), + SubRegHi, + DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32) + }; -#endif + return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1); } /// \brief Return a resource descriptor with the 'Add TID' bit enabled -/// The TID (Thread ID) is multipled by the stride value (bits [61:48] -/// of the resource descriptor) to create an offset, which is added to the -/// resource ponter. +/// The TID (Thread ID) is multiplied by the stride value (bits [61:48] +/// of the resource descriptor) to create an offset, which is added to +/// the resource pointer. MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, SDLoc DL, SDValue Ptr, @@ -2248,15 +2444,6 @@ MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops); } -MachineSDNode *SITargetLowering::buildScratchRSRC(SelectionDAG &DAG, - SDLoc DL, - SDValue Ptr) const { - const SIInstrInfo *TII = - static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); - - return buildRSRC(DAG, DL, Ptr, 0, TII->getScratchRsrcWords23()); -} - SDValue SITargetLowering::CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, unsigned Reg, EVT VT) const { @@ -2274,13 +2461,41 @@ std::pair<unsigned, const TargetRegisterClass *> SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const { - if (Constraint == "r") { - switch(VT.SimpleTy) { - default: llvm_unreachable("Unhandled type for 'r' inline asm constraint"); - case MVT::i64: - return std::make_pair(0U, &AMDGPU::SGPR_64RegClass); - case MVT::i32: + + if (Constraint.size() == 1) { + switch (Constraint[0]) { + case 's': + case 'r': + switch (VT.getSizeInBits()) { + default: + return std::make_pair(0U, nullptr); + case 32: return std::make_pair(0U, &AMDGPU::SGPR_32RegClass); + case 64: + return std::make_pair(0U, &AMDGPU::SGPR_64RegClass); + case 128: + return std::make_pair(0U, &AMDGPU::SReg_128RegClass); + case 256: + return std::make_pair(0U, &AMDGPU::SReg_256RegClass); + } + + case 'v': + switch (VT.getSizeInBits()) { + default: + return std::make_pair(0U, nullptr); + case 32: + return std::make_pair(0U, &AMDGPU::VGPR_32RegClass); + case 64: + return std::make_pair(0U, &AMDGPU::VReg_64RegClass); + case 96: + return std::make_pair(0U, &AMDGPU::VReg_96RegClass); + case 
128: + return std::make_pair(0U, &AMDGPU::VReg_128RegClass); + case 256: + return std::make_pair(0U, &AMDGPU::VReg_256RegClass); + case 512: + return std::make_pair(0U, &AMDGPU::VReg_512RegClass); + } } } @@ -2301,3 +2516,16 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, } return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); } + +SITargetLowering::ConstraintType +SITargetLowering::getConstraintType(StringRef Constraint) const { + if (Constraint.size() == 1) { + switch (Constraint[0]) { + default: break; + case 's': + case 'v': + return C_RegisterClass; + } + } + return TargetLowering::getConstraintType(Constraint); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.h b/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.h index d84c32e..e2f8cb1 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -28,6 +28,9 @@ class SITargetLowering : public AMDGPUTargetLowering { SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const override; + SDValue lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op, + MVT VT, unsigned Offset) const; + SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const; @@ -57,6 +60,7 @@ class SITargetLowering : public AMDGPUTargetLowering { SDValue performSetCCCombine(SDNode *N, DAGCombinerInfo &DCI) const; bool isLegalFlatAddressingMode(const AddrMode &AM) const; + bool isLegalMUBUFAddressingMode(const AddrMode &AM) const; public: SITargetLowering(TargetMachine &tm, const AMDGPUSubtarget &STI); @@ -76,6 +80,9 @@ public: bool MemcpyStrSrc, MachineFunction &MF) const override; + bool isMemOpUniform(const SDNode *N) const; + bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override; + TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(EVT VT) const override; @@ -112,13 +119,10 @@ public: SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const; - MachineSDNode *buildScratchRSRC(SelectionDAG &DAG, - SDLoc DL, - SDValue Ptr) const; - std::pair<unsigned, const TargetRegisterClass *> getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override; + ConstraintType getConstraintType(StringRef Constraint) const override; SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, SDLoc DL, SDValue V) const; }; diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp b/contrib/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp index 90a37f1..821aada 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp @@ -91,7 +91,8 @@ private: bool isOpRelevant(MachineOperand &Op); /// \brief Get register interval an operand affects. 
- RegInterval getRegInterval(MachineOperand &Op); + RegInterval getRegInterval(const TargetRegisterClass *RC, + const MachineOperand &Reg) const; /// \brief Handle instructions' async components void pushInstruction(MachineBasicBlock &MBB, @@ -121,9 +122,13 @@ public: bool runOnMachineFunction(MachineFunction &MF) override; const char *getPassName() const override { - return "SI insert wait instructions"; + return "SI insert wait instructions"; } + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } }; } // End anonymous namespace @@ -138,9 +143,8 @@ FunctionPass *llvm::createSIInsertWaits(TargetMachine &tm) { } Counters SIInsertWaits::getHwCounts(MachineInstr &MI) { - - uint64_t TSFlags = TII->get(MI.getOpcode()).TSFlags; - Counters Result; + uint64_t TSFlags = MI.getDesc().TSFlags; + Counters Result = { { 0, 0, 0 } }; Result.Named.VM = !!(TSFlags & SIInstrFlags::VM_CNT); @@ -151,15 +155,22 @@ Counters SIInsertWaits::getHwCounts(MachineInstr &MI) { // LGKM may use larger values if (TSFlags & SIInstrFlags::LGKM_CNT) { - if (TII->isSMRD(MI.getOpcode())) { - - MachineOperand &Op = MI.getOperand(0); - assert(Op.isReg() && "First LGKM operand must be a register!"); - - unsigned Reg = Op.getReg(); - unsigned Size = TRI->getMinimalPhysRegClass(Reg)->getSize(); - Result.Named.LGKM = Size > 4 ? 2 : 1; - + if (TII->isSMRD(MI)) { + + if (MI.getNumOperands() != 0) { + assert(MI.getOperand(0).isReg() && + "First LGKM operand must be a register!"); + + // XXX - What if this is a write into a super register? + const TargetRegisterClass *RC = TII->getOpRegClass(MI, 0); + unsigned Size = RC->getSize(); + Result.Named.LGKM = Size > 4 ? 2 : 1; + } else { + // s_dcache_inv etc. do not have a destination register. Assume we + // want a wait on these. + // XXX - What is the right value? + Result.Named.LGKM = 1; + } } else { // DS Result.Named.LGKM = 1; @@ -173,9 +184,8 @@ Counters SIInsertWaits::getHwCounts(MachineInstr &MI) { } bool SIInsertWaits::isOpRelevant(MachineOperand &Op) { - // Constants are always irrelevant - if (!Op.isReg()) + if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg())) return false; // Defines are always relevant @@ -196,7 +206,7 @@ bool SIInsertWaits::isOpRelevant(MachineOperand &Op) { // operand comes before the value operand and it may have // multiple data operands.
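The getHwCounts change above sizes the LGKM increment from the destination operand's register class rather than the physical register. The rule it implements, restated as a small sketch (the helper name is illustrative only):

    // SMRD results wider than one dword count as two LGKM events; a
    // result-less SMRD (s_dcache_inv and friends) and every DS op count
    // as one.
    static unsigned lgkmIncrement(bool IsSMRD, unsigned DestSizeInBytes) {
      return (IsSMRD && DestSizeInBytes > 4) ? 2 : 1;
    }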
- if (TII->isDS(MI.getOpcode())) { + if (TII->isDS(MI)) { MachineOperand *Data = TII->getNamedOperand(MI, AMDGPU::OpName::data); if (Data && Op.isIdenticalTo(*Data)) return true; @@ -224,18 +234,13 @@ bool SIInsertWaits::isOpRelevant(MachineOperand &Op) { return false; } -RegInterval SIInsertWaits::getRegInterval(MachineOperand &Op) { - - if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg())) - return std::make_pair(0, 0); - - unsigned Reg = Op.getReg(); - unsigned Size = TRI->getMinimalPhysRegClass(Reg)->getSize(); - +RegInterval SIInsertWaits::getRegInterval(const TargetRegisterClass *RC, + const MachineOperand &Reg) const { + unsigned Size = RC->getSize(); assert(Size >= 4); RegInterval Result; - Result.first = TRI->getEncodingValue(Reg); + Result.first = TRI->getEncodingValue(Reg.getReg()); Result.second = Result.first + Size / 4; return Result; @@ -246,10 +251,13 @@ void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB, // Get the hardware counter increments and sum them up Counters Increment = getHwCounts(*I); + Counters Limit = ZeroCounts; unsigned Sum = 0; for (unsigned i = 0; i < 3; ++i) { LastIssued.Array[i] += Increment.Array[i]; + if (Increment.Array[i]) + Limit.Array[i] = LastIssued.Array[i]; Sum += Increment.Array[i]; } @@ -261,7 +269,7 @@ void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB, if (MBB.getParent()->getSubtarget<AMDGPUSubtarget>().getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { - // Any occurence of consecutive VMEM or SMEM instructions forms a VMEM + // Any occurrence of consecutive VMEM or SMEM instructions forms a VMEM // or SMEM clause, respectively. // // The temporary workaround is to break the clauses with S_NOP. @@ -270,7 +278,7 @@ void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB, // and destination registers don't overlap, e.g. this is illegal: // r0 = load r2 // r2 = load r0 - if ((LastOpcodeType == SMEM && TII->isSMRD(I->getOpcode())) || + if ((LastOpcodeType == SMEM && TII->isSMRD(*I)) || (LastOpcodeType == VMEM && Increment.Named.VM)) { // Insert a NOP to break the clause. BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_NOP)) @@ -278,7 +286,7 @@ void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB, LastInstWritesM0 = false; } - if (TII->isSMRD(I->getOpcode())) + if (TII->isSMRD(*I)) LastOpcodeType = SMEM; else if (Increment.Named.VM) LastOpcodeType = VMEM; @@ -290,21 +298,21 @@ void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB, } for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) { - MachineOperand &Op = I->getOperand(i); if (!isOpRelevant(Op)) continue; - RegInterval Interval = getRegInterval(Op); + const TargetRegisterClass *RC = TII->getOpRegClass(*I, i); + RegInterval Interval = getRegInterval(RC, Op); for (unsigned j = Interval.first; j < Interval.second; ++j) { // Remember which registers we define if (Op.isDef()) - DefinedRegs[j] = LastIssued; + DefinedRegs[j] = Limit; // and which one we are using if (Op.isUse()) - UsedRegs[j] = LastIssued; + UsedRegs[j] = Limit; } } } @@ -390,12 +398,18 @@ Counters SIInsertWaits::handleOperands(MachineInstr &MI) { if (MI.getOpcode() == AMDGPU::S_SENDMSG) return LastIssued; - // For each register affected by this - // instruction increase the result sequence + // For each register affected by this instruction increase the result + // sequence. + // + // TODO: We could probably just look at explicit operands if we removed VCC / + // EXEC from SMRD dest reg classes. 
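pushInstruction above switches the per-register bookkeeping from the global LastIssued counters to a per-instruction Limit. A sketch of the difference, paraphrasing the Counters union indexing used in the hunk:

    // Before: DefinedRegs[j] = LastIssued;  // a use waited on every counter
    // After:  only the counters this instruction actually bumped are kept:
    //   Counters Limit = ZeroCounts;
    //   for (unsigned i = 0; i < 3; ++i)
    //     if (Increment.Array[i])
    //       Limit.Array[i] = LastIssued.Array[i];
    //   DefinedRegs[j] = Limit;
    // so a later reader of the register waits only on counters the
    // defining instruction can still have outstanding.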
for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { - MachineOperand &Op = MI.getOperand(i); - RegInterval Interval = getRegInterval(Op); + if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg())) + continue; + + const TargetRegisterClass *RC = TII->getOpRegClass(MI, i); + RegInterval Interval = getRegInterval(RC, Op); for (unsigned j = Interval.first; j < Interval.second; ++j) { if (Op.isDef()) { diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstrFormats.td b/contrib/llvm/lib/Target/AMDGPU/SIInstrFormats.td index 211666a..0e883f6 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIInstrFormats.td +++ b/contrib/llvm/lib/Target/AMDGPU/SIInstrFormats.td @@ -41,6 +41,10 @@ class InstSI <dag outs, dag ins, string asm, list<dag> pattern> : field bits<1> WQM = 0; field bits<1> VGPRSpill = 0; + // This bit tells the assembler to use the 32-bit encoding in case it + // is unable to infer the encoding from the operands. + field bits<1> VOPAsmPrefer32Bit = 0; + // These need to be kept in sync with the enum in SIInstrFlags. let TSFlags{0} = VM_CNT; let TSFlags{1} = EXP_CNT; @@ -68,10 +72,8 @@ class InstSI <dag outs, dag ins, string asm, list<dag> pattern> : let TSFlags{19} = FLAT; let TSFlags{20} = WQM; let TSFlags{21} = VGPRSpill; + let TSFlags{22} = VOPAsmPrefer32Bit; - // Most instructions require adjustments after selection to satisfy - // operand requirements. - let hasPostISelHook = 1; let SchedRW = [Write32Bit]; } @@ -86,7 +88,6 @@ class Enc64 { } class VOPDstOperand <RegisterClass rc> : RegisterOperand <rc, "printVOPDst">; -def VOPDstVCC : VOPDstOperand <VCCReg>; let Uses = [EXEC] in { @@ -101,11 +102,11 @@ class VOPAnyCommon <dag outs, dag ins, string asm, list<dag> pattern> : } class VOPCCommon <dag ins, string asm, list<dag> pattern> : - VOPAnyCommon <(outs VOPDstVCC:$dst), ins, asm, pattern> { + VOPAnyCommon <(outs), ins, asm, pattern> { - let DisableEncoding = "$dst"; let VOPC = 1; let Size = 4; + let Defs = [VCC]; } class VOP1Common <dag outs, dag ins, string asm, list<dag> pattern> : @@ -138,6 +139,11 @@ class VOP3Common <dag outs, dag ins, string asm, list<dag> pattern> : let isCodeGenOnly = 0; int Size = 8; + + // Because SGPRs may be allowed if there are multiple operands, we + // need a post-isel hook to insert copies in order to avoid + // violating constant bus requirements. 
+ let hasPostISelHook = 1; } } // End Uses = [EXEC] @@ -222,6 +228,20 @@ class SMRDe <bits<5> op, bits<1> imm> : Enc32 { let Inst{31-27} = 0x18; //encoding } +class SMRD_IMMe_ci <bits<5> op> : Enc64 { + bits<7> sdst; + bits<7> sbase; + bits<32> offset; + + let Inst{7-0} = 0xff; + let Inst{8} = 0; + let Inst{14-9} = sbase{6-1}; + let Inst{21-15} = sdst; + let Inst{26-22} = op; + let Inst{31-27} = 0x18; //encoding + let Inst{63-32} = offset; +} + let SchedRW = [WriteSALU] in { class SOP1 <dag outs, dag ins, string asm, list<dag> pattern> : InstSI<outs, ins, asm, pattern> { @@ -249,13 +269,13 @@ class SOP2 <dag outs, dag ins, string asm, list<dag> pattern> : class SOPC <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> : InstSI<outs, ins, asm, pattern>, SOPCe <op> { - let DisableEncoding = "$dst"; let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; let SALU = 1; let SOPC = 1; let isCodeGenOnly = 0; + let Defs = [SCC]; let UseNamedOperandTable = 1; } @@ -598,15 +618,13 @@ class VINTRPCommon <dag outs, dag ins, string asm, list<dag> pattern> : // Vector I/O operations //===----------------------------------------------------------------------===// -let Uses = [EXEC] in { - class DS <dag outs, dag ins, string asm, list<dag> pattern> : InstSI <outs, ins, asm, pattern> { let LGKM_CNT = 1; let DS = 1; let UseNamedOperandTable = 1; - let Uses = [M0]; + let Uses = [M0, EXEC]; // Most instruction load and store data, so set this as the default. let mayLoad = 1; @@ -623,6 +641,7 @@ class MUBUF <dag outs, dag ins, string asm, list<dag> pattern> : let VM_CNT = 1; let EXP_CNT = 1; let MUBUF = 1; + let Uses = [EXEC]; let hasSideEffects = 0; let UseNamedOperandTable = 1; @@ -636,6 +655,7 @@ class MTBUF <dag outs, dag ins, string asm, list<dag> pattern> : let VM_CNT = 1; let EXP_CNT = 1; let MTBUF = 1; + let Uses = [EXEC]; let hasSideEffects = 0; let UseNamedOperandTable = 1; @@ -665,9 +685,7 @@ class MIMG <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> : let VM_CNT = 1; let EXP_CNT = 1; let MIMG = 1; + let Uses = [EXEC]; let hasSideEffects = 0; // XXX ???? 
} - - -} // End Uses = [EXEC] diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index cfd2c42..a08a5a8 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -82,6 +82,7 @@ bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr *MI, switch (MI->getOpcode()) { case AMDGPU::V_MOV_B32_e32: case AMDGPU::V_MOV_B32_e64: + case AMDGPU::V_MOV_B64_PSEUDO: return true; default: return false; @@ -204,7 +205,8 @@ bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg, unsigned &Offset, const TargetRegisterInfo *TRI) const { unsigned Opc = LdSt->getOpcode(); - if (isDS(Opc)) { + + if (isDS(*LdSt)) { const MachineOperand *OffsetImm = getNamedOperand(*LdSt, AMDGPU::OpName::offset); if (OffsetImm) { @@ -254,7 +256,7 @@ bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg, return false; } - if (isMUBUF(Opc) || isMTBUF(Opc)) { + if (isMUBUF(*LdSt) || isMTBUF(*LdSt)) { if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset) != -1) return false; @@ -270,7 +272,7 @@ bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg, return true; } - if (isSMRD(Opc)) { + if (isSMRD(*LdSt)) { const MachineOperand *OffsetImm = getNamedOperand(*LdSt, AMDGPU::OpName::offset); if (!OffsetImm) @@ -289,20 +291,18 @@ bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg, bool SIInstrInfo::shouldClusterLoads(MachineInstr *FirstLdSt, MachineInstr *SecondLdSt, unsigned NumLoads) const { - unsigned Opc0 = FirstLdSt->getOpcode(); - unsigned Opc1 = SecondLdSt->getOpcode(); - // TODO: This needs finer tuning if (NumLoads > 4) return false; - if (isDS(Opc0) && isDS(Opc1)) + if (isDS(*FirstLdSt) && isDS(*SecondLdSt)) return true; - if (isSMRD(Opc0) && isSMRD(Opc1)) + if (isSMRD(*FirstLdSt) && isSMRD(*SecondLdSt)) return true; - if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) + if ((isMUBUF(*FirstLdSt) || isMTBUF(*FirstLdSt)) && + (isMUBUF(*SecondLdSt) || isMTBUF(*SecondLdSt))) return true; return false; @@ -323,28 +323,45 @@ SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11, - AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15, 0 + AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15, + }; + + static const int16_t Sub0_15_64[] = { + AMDGPU::sub0_sub1, AMDGPU::sub2_sub3, + AMDGPU::sub4_sub5, AMDGPU::sub6_sub7, + AMDGPU::sub8_sub9, AMDGPU::sub10_sub11, + AMDGPU::sub12_sub13, AMDGPU::sub14_sub15, }; static const int16_t Sub0_7[] = { AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, - AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, 0 + AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, + }; + + static const int16_t Sub0_7_64[] = { + AMDGPU::sub0_sub1, AMDGPU::sub2_sub3, + AMDGPU::sub4_sub5, AMDGPU::sub6_sub7, }; static const int16_t Sub0_3[] = { - AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, 0 + AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, + }; + + static const int16_t Sub0_3_64[] = { + AMDGPU::sub0_sub1, AMDGPU::sub2_sub3, }; static const int16_t Sub0_2[] = { - AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, 0 + AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, }; static const int16_t Sub0_1[] = { - AMDGPU::sub0, AMDGPU::sub1, 0 + AMDGPU::sub0, AMDGPU::sub1, }; unsigned Opcode; - 
const int16_t *SubIndices; + ArrayRef<int16_t> SubIndices; + bool Forward; if (AMDGPU::SReg_32RegClass.contains(DestReg)) { assert(AMDGPU::SReg_32RegClass.contains(SrcReg)); @@ -360,7 +377,7 @@ SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, } else { // FIXME: Hack until VReg_1 removed. assert(AMDGPU::VGPR_32RegClass.contains(SrcReg)); - BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_I32_e32), AMDGPU::VCC) + BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_I32_e32)) .addImm(0) .addReg(SrcReg, getKillRegState(KillSrc)); } @@ -375,18 +392,18 @@ SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, } else if (AMDGPU::SReg_128RegClass.contains(DestReg)) { assert(AMDGPU::SReg_128RegClass.contains(SrcReg)); - Opcode = AMDGPU::S_MOV_B32; - SubIndices = Sub0_3; + Opcode = AMDGPU::S_MOV_B64; + SubIndices = Sub0_3_64; } else if (AMDGPU::SReg_256RegClass.contains(DestReg)) { assert(AMDGPU::SReg_256RegClass.contains(SrcReg)); - Opcode = AMDGPU::S_MOV_B32; - SubIndices = Sub0_7; + Opcode = AMDGPU::S_MOV_B64; + SubIndices = Sub0_7_64; } else if (AMDGPU::SReg_512RegClass.contains(DestReg)) { assert(AMDGPU::SReg_512RegClass.contains(SrcReg)); - Opcode = AMDGPU::S_MOV_B32; - SubIndices = Sub0_15; + Opcode = AMDGPU::S_MOV_B64; + SubIndices = Sub0_15_64; } else if (AMDGPU::VGPR_32RegClass.contains(DestReg)) { assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) || @@ -428,13 +445,27 @@ SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, llvm_unreachable("Can't copy register!"); } - while (unsigned SubIdx = *SubIndices++) { + if (RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg)) + Forward = true; + else + Forward = false; + + for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) { + unsigned SubIdx; + if (Forward) + SubIdx = SubIndices[Idx]; + else + SubIdx = SubIndices[SubIndices.size() - Idx - 1]; + MachineInstrBuilder Builder = BuildMI(MBB, MI, DL, get(Opcode), RI.getSubReg(DestReg, SubIdx)); - Builder.addReg(RI.getSubReg(SrcReg, SubIdx), getKillRegState(KillSrc)); + Builder.addReg(RI.getSubReg(SrcReg, SubIdx)); + + if (Idx == SubIndices.size() - 1) + Builder.addReg(SrcReg, RegState::Kill | RegState::Implicit); - if (*SubIndices) + if (Idx == 0) Builder.addReg(DestReg, RegState::Define | RegState::Implicit); } } @@ -471,6 +502,40 @@ unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const { return AMDGPU::COPY; } +static unsigned getSGPRSpillSaveOpcode(unsigned Size) { + switch (Size) { + case 4: + return AMDGPU::SI_SPILL_S32_SAVE; + case 8: + return AMDGPU::SI_SPILL_S64_SAVE; + case 16: + return AMDGPU::SI_SPILL_S128_SAVE; + case 32: + return AMDGPU::SI_SPILL_S256_SAVE; + case 64: + return AMDGPU::SI_SPILL_S512_SAVE; + default: + llvm_unreachable("unknown register size"); + } +} + +static unsigned getVGPRSpillSaveOpcode(unsigned Size) { + switch (Size) { + case 4: + return AMDGPU::SI_SPILL_V32_SAVE; + case 8: + return AMDGPU::SI_SPILL_V64_SAVE; + case 16: + return AMDGPU::SI_SPILL_V128_SAVE; + case 32: + return AMDGPU::SI_SPILL_V256_SAVE; + case 64: + return AMDGPU::SI_SPILL_V512_SAVE; + default: + llvm_unreachable("unknown register size"); + } +} + void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned SrcReg, bool isKill, @@ -481,47 +546,83 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); MachineFrameInfo *FrameInfo = MF->getFrameInfo(); DebugLoc DL = MBB.findDebugLoc(MI); - int Opcode = -1; + + unsigned Size = FrameInfo->getObjectSize(FrameIndex); + unsigned Align = 
FrameInfo->getObjectAlignment(FrameIndex); + MachinePointerInfo PtrInfo + = MachinePointerInfo::getFixedStack(*MF, FrameIndex); + MachineMemOperand *MMO + = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore, + Size, Align); if (RI.isSGPRClass(RC)) { + MFI->setHasSpilledSGPRs(); + // We are only allowed to create one new instruction when spilling // registers, so we need to use pseudo instruction for spilling // SGPRs. - switch (RC->getSize() * 8) { - case 32: Opcode = AMDGPU::SI_SPILL_S32_SAVE; break; - case 64: Opcode = AMDGPU::SI_SPILL_S64_SAVE; break; - case 128: Opcode = AMDGPU::SI_SPILL_S128_SAVE; break; - case 256: Opcode = AMDGPU::SI_SPILL_S256_SAVE; break; - case 512: Opcode = AMDGPU::SI_SPILL_S512_SAVE; break; - } - } else if(RI.hasVGPRs(RC) && ST.isVGPRSpillingEnabled(MFI)) { - MFI->setHasSpilledVGPRs(); - - switch(RC->getSize() * 8) { - case 32: Opcode = AMDGPU::SI_SPILL_V32_SAVE; break; - case 64: Opcode = AMDGPU::SI_SPILL_V64_SAVE; break; - case 96: Opcode = AMDGPU::SI_SPILL_V96_SAVE; break; - case 128: Opcode = AMDGPU::SI_SPILL_V128_SAVE; break; - case 256: Opcode = AMDGPU::SI_SPILL_V256_SAVE; break; - case 512: Opcode = AMDGPU::SI_SPILL_V512_SAVE; break; - } + unsigned Opcode = getSGPRSpillSaveOpcode(RC->getSize()); + BuildMI(MBB, MI, DL, get(Opcode)) + .addReg(SrcReg) // src + .addFrameIndex(FrameIndex) // frame_idx + .addMemOperand(MMO); + + return; } - if (Opcode != -1) { - FrameInfo->setObjectAlignment(FrameIndex, 4); - BuildMI(MBB, MI, DL, get(Opcode)) - .addReg(SrcReg) - .addFrameIndex(FrameIndex) - // Place-holder registers, these will be filled in by - // SIPrepareScratchRegs. - .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef) - .addReg(AMDGPU::SGPR0, RegState::Undef); - } else { + if (!ST.isVGPRSpillingEnabled(MFI)) { LLVMContext &Ctx = MF->getFunction()->getContext(); Ctx.emitError("SIInstrInfo::storeRegToStackSlot - Do not know how to" " spill register"); BuildMI(MBB, MI, DL, get(AMDGPU::KILL)) - .addReg(SrcReg); + .addReg(SrcReg); + + return; + } + + assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected"); + + unsigned Opcode = getVGPRSpillSaveOpcode(RC->getSize()); + MFI->setHasSpilledVGPRs(); + BuildMI(MBB, MI, DL, get(Opcode)) + .addReg(SrcReg) // src + .addFrameIndex(FrameIndex) // frame_idx + .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc + .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset + .addMemOperand(MMO); +} + +static unsigned getSGPRSpillRestoreOpcode(unsigned Size) { + switch (Size) { + case 4: + return AMDGPU::SI_SPILL_S32_RESTORE; + case 8: + return AMDGPU::SI_SPILL_S64_RESTORE; + case 16: + return AMDGPU::SI_SPILL_S128_RESTORE; + case 32: + return AMDGPU::SI_SPILL_S256_RESTORE; + case 64: + return AMDGPU::SI_SPILL_S512_RESTORE; + default: + llvm_unreachable("unknown register size"); + } +} + +static unsigned getVGPRSpillRestoreOpcode(unsigned Size) { + switch (Size) { + case 4: + return AMDGPU::SI_SPILL_V32_RESTORE; + case 8: + return AMDGPU::SI_SPILL_V64_RESTORE; + case 16: + return AMDGPU::SI_SPILL_V128_RESTORE; + case 32: + return AMDGPU::SI_SPILL_V256_RESTORE; + case 64: + return AMDGPU::SI_SPILL_V512_RESTORE; + default: + llvm_unreachable("unknown register size"); } } @@ -534,42 +635,43 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); MachineFrameInfo *FrameInfo = MF->getFrameInfo(); DebugLoc DL = MBB.findDebugLoc(MI); - int Opcode = -1; - - if (RI.isSGPRClass(RC)){ - switch(RC->getSize() * 8) { - case 32: Opcode = 
AMDGPU::SI_SPILL_S32_RESTORE; break; - case 64: Opcode = AMDGPU::SI_SPILL_S64_RESTORE; break; - case 128: Opcode = AMDGPU::SI_SPILL_S128_RESTORE; break; - case 256: Opcode = AMDGPU::SI_SPILL_S256_RESTORE; break; - case 512: Opcode = AMDGPU::SI_SPILL_S512_RESTORE; break; - } - } else if(RI.hasVGPRs(RC) && ST.isVGPRSpillingEnabled(MFI)) { - switch(RC->getSize() * 8) { - case 32: Opcode = AMDGPU::SI_SPILL_V32_RESTORE; break; - case 64: Opcode = AMDGPU::SI_SPILL_V64_RESTORE; break; - case 96: Opcode = AMDGPU::SI_SPILL_V96_RESTORE; break; - case 128: Opcode = AMDGPU::SI_SPILL_V128_RESTORE; break; - case 256: Opcode = AMDGPU::SI_SPILL_V256_RESTORE; break; - case 512: Opcode = AMDGPU::SI_SPILL_V512_RESTORE; break; - } - } + unsigned Align = FrameInfo->getObjectAlignment(FrameIndex); + unsigned Size = FrameInfo->getObjectSize(FrameIndex); + + MachinePointerInfo PtrInfo + = MachinePointerInfo::getFixedStack(*MF, FrameIndex); - if (Opcode != -1) { - FrameInfo->setObjectAlignment(FrameIndex, 4); + MachineMemOperand *MMO = MF->getMachineMemOperand( + PtrInfo, MachineMemOperand::MOLoad, Size, Align); + + if (RI.isSGPRClass(RC)) { + // FIXME: Maybe this should not include a memoperand because it will be + // lowered to non-memory instructions. + unsigned Opcode = getSGPRSpillRestoreOpcode(RC->getSize()); BuildMI(MBB, MI, DL, get(Opcode), DestReg) - .addFrameIndex(FrameIndex) - // Place-holder registers, these will be filled in by - // SIPrepareScratchRegs. - .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef) - .addReg(AMDGPU::SGPR0, RegState::Undef); + .addFrameIndex(FrameIndex) // frame_idx + .addMemOperand(MMO); - } else { + return; + } + + if (!ST.isVGPRSpillingEnabled(MFI)) { LLVMContext &Ctx = MF->getFunction()->getContext(); Ctx.emitError("SIInstrInfo::loadRegFromStackSlot - Do not know how to" " restore register"); BuildMI(MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), DestReg); + + return; } + + assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected"); + + unsigned Opcode = getVGPRSpillRestoreOpcode(RC->getSize()); + BuildMI(MBB, MI, DL, get(Opcode), DestReg) + .addFrameIndex(FrameIndex) // frame_idx + .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc + .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset + .addMemOperand(MMO); } /// \param @Offset Offset in bytes of the FrameIndex being spilled @@ -601,17 +703,21 @@ unsigned SIInstrInfo::calculateLDSSpillAddress(MachineBasicBlock &MBB, if (MFI->getShaderType() == ShaderType::COMPUTE && WorkGroupSize > WavefrontSize) { - unsigned TIDIGXReg = TRI->getPreloadedValue(*MF, SIRegisterInfo::TIDIG_X); - unsigned TIDIGYReg = TRI->getPreloadedValue(*MF, SIRegisterInfo::TIDIG_Y); - unsigned TIDIGZReg = TRI->getPreloadedValue(*MF, SIRegisterInfo::TIDIG_Z); + unsigned TIDIGXReg + = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_X); + unsigned TIDIGYReg + = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_Y); + unsigned TIDIGZReg + = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_Z); unsigned InputPtrReg = - TRI->getPreloadedValue(*MF, SIRegisterInfo::INPUT_PTR); + TRI->getPreloadedValue(*MF, SIRegisterInfo::KERNARG_SEGMENT_PTR); for (unsigned Reg : {TIDIGXReg, TIDIGYReg, TIDIGZReg}) { if (!Entry.isLiveIn(Reg)) Entry.addLiveIn(Reg); } RS->enterBasicBlock(&Entry); + // FIXME: Can we scavenge an SReg_64 and access the subregs? 
unsigned STmp0 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0); unsigned STmp1 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0); BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp0) @@ -667,8 +773,8 @@ unsigned SIInstrInfo::calculateLDSSpillAddress(MachineBasicBlock &MBB, return TmpReg; } -void SIInstrInfo::insertNOPs(MachineBasicBlock::iterator MI, - int Count) const { +void SIInstrInfo::insertWaitStates(MachineBasicBlock::iterator MI, + int Count) const { while (Count > 0) { int Arg; if (Count >= 8) @@ -687,26 +793,6 @@ bool SIInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { switch (MI->getOpcode()) { default: return AMDGPUInstrInfo::expandPostRAPseudo(MI); - case AMDGPU::SI_CONSTDATA_PTR: { - unsigned Reg = MI->getOperand(0).getReg(); - unsigned RegLo = RI.getSubReg(Reg, AMDGPU::sub0); - unsigned RegHi = RI.getSubReg(Reg, AMDGPU::sub1); - - BuildMI(MBB, MI, DL, get(AMDGPU::S_GETPC_B64), Reg); - - // Add 32-bit offset from this instruction to the start of the constant data. - BuildMI(MBB, MI, DL, get(AMDGPU::S_ADD_U32), RegLo) - .addReg(RegLo) - .addTargetIndex(AMDGPU::TI_CONSTDATA_START) - .addReg(AMDGPU::SCC, RegState::Define | RegState::Implicit); - BuildMI(MBB, MI, DL, get(AMDGPU::S_ADDC_U32), RegHi) - .addReg(RegHi) - .addImm(0) - .addReg(AMDGPU::SCC, RegState::Define | RegState::Implicit) - .addReg(AMDGPU::SCC, RegState::Implicit); - MI->eraseFromParent(); - break; - } case AMDGPU::SGPR_USE: // This is just a placeholder for register allocation. MI->eraseFromParent(); @@ -760,49 +846,90 @@ bool SIInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { MI->eraseFromParent(); break; } + + case AMDGPU::SI_CONSTDATA_PTR: { + const SIRegisterInfo *TRI = + static_cast<const SIRegisterInfo *>(ST.getRegisterInfo()); + MachineFunction &MF = *MBB.getParent(); + unsigned Reg = MI->getOperand(0).getReg(); + unsigned RegLo = TRI->getSubReg(Reg, AMDGPU::sub0); + unsigned RegHi = TRI->getSubReg(Reg, AMDGPU::sub1); + + // Create a bundle so these instructions won't be re-ordered by the + // post-RA scheduler. + MIBundleBuilder Bundler(MBB, MI); + Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg)); + + // Add 32-bit offset from this instruction to the start of the + // constant data. + Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo) + .addReg(RegLo) + .addOperand(MI->getOperand(1))); + Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi) + .addReg(RegHi) + .addImm(0)); + + llvm::finalizeBundle(MBB, Bundler.begin()); + + MI->eraseFromParent(); + break; + } } return true; } -MachineInstr *SIInstrInfo::commuteInstruction(MachineInstr *MI, - bool NewMI) const { - - if (MI->getNumOperands() < 3) - return nullptr; - +/// Commutes the operands in the given instruction. +/// The commutable operands are specified by their indices OpIdx0 and OpIdx1. +/// +/// Do not call this method for a non-commutable instruction or for +/// non-commutable pair of operand indices OpIdx0 and OpIdx1. +/// Even though the instruction is commutable, the method may still +/// fail to commute the operands, null pointer is returned in such cases. 
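One reason the new commuteInstructionImpl can fail even on instructions marked commutable: many VALU operations commute only through a partner opcode. A short sketch with real opcodes; the flow paraphrases the commuteOpcode check in the body that follows:

    // v_sub_f32 dst, a, b  <->  v_subrev_f32 dst, b, a
    int CommutedOpcode = commuteOpcode(*MI); // -1 when no partner exists
    if (CommutedOpcode == -1)
      return nullptr;                        // commuting is impossible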
+MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr *MI, + bool NewMI, + unsigned OpIdx0, + unsigned OpIdx1) const { int CommutedOpcode = commuteOpcode(*MI); if (CommutedOpcode == -1) return nullptr; int Src0Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::src0); - assert(Src0Idx != -1 && "Should always have src0 operand"); - MachineOperand &Src0 = MI->getOperand(Src0Idx); if (!Src0.isReg()) return nullptr; int Src1Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::src1); - if (Src1Idx == -1) + + if ((OpIdx0 != static_cast<unsigned>(Src0Idx) || + OpIdx1 != static_cast<unsigned>(Src1Idx)) && + (OpIdx0 != static_cast<unsigned>(Src1Idx) || + OpIdx1 != static_cast<unsigned>(Src0Idx))) return nullptr; MachineOperand &Src1 = MI->getOperand(Src1Idx); - // Make sure it's legal to commute operands for VOP2. - if (isVOP2(MI->getOpcode()) && - (!isOperandLegal(MI, Src0Idx, &Src1) || - !isOperandLegal(MI, Src1Idx, &Src0))) { - return nullptr; + + if (isVOP2(*MI)) { + const MCInstrDesc &InstrDesc = MI->getDesc(); + // For VOP2 instructions, any operand type is valid to use for src0. Make + // sure we can use the src1 as src0. + // + // We could be stricter here and only allow commuting if there is a reason + // to do so. i.e. if both operands are VGPRs there is no real benefit, + // although MachineCSE attempts to find matches by commuting. + const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); + if (!isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src0)) + return nullptr; } if (!Src1.isReg()) { // Allow commuting instructions with Imm operands. if (NewMI || !Src1.isImm() || - (!isVOP2(MI->getOpcode()) && !isVOP3(MI->getOpcode()))) { + (!isVOP2(*MI) && !isVOP3(*MI))) { return nullptr; } - // Be sure to copy the source modifiers to the right place. if (MachineOperand *Src0Mods = getNamedOperand(*MI, AMDGPU::OpName::src0_modifiers)) { @@ -832,7 +959,7 @@ MachineInstr *SIInstrInfo::commuteInstruction(MachineInstr *MI, Src1.ChangeToRegister(Reg, false); Src1.setSubReg(SubReg); } else { - MI = TargetInstrInfo::commuteInstruction(MI, NewMI); + MI = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx0, OpIdx1); } if (MI) @@ -845,8 +972,8 @@ MachineInstr *SIInstrInfo::commuteInstruction(MachineInstr *MI, // between the true commutable operands, and the base // TargetInstrInfo::commuteInstruction uses it. bool SIInstrInfo::findCommutedOpIndices(MachineInstr *MI, - unsigned &SrcOpIdx1, - unsigned &SrcOpIdx2) const { + unsigned &SrcOpIdx0, + unsigned &SrcOpIdx1) const { const MCInstrDesc &MCID = MI->getDesc(); if (!MCID.isCommutable()) return false; @@ -857,7 +984,8 @@ bool SIInstrInfo::findCommutedOpIndices(MachineInstr *MI, return false; // FIXME: Workaround TargetInstrInfo::commuteInstruction asserting on - // immediate. + // immediate. Also, immediate src0 operand is not handled in + // SIInstrInfo::commuteInstruction(); if (!MI->getOperand(Src0Idx).isReg()) return false; @@ -865,18 +993,22 @@ bool SIInstrInfo::findCommutedOpIndices(MachineInstr *MI, if (Src1Idx == -1) return false; - if (!MI->getOperand(Src1Idx).isReg()) - return false; - - // If any source modifiers are set, the generic instruction commuting won't - // understand how to copy the source modifiers. 
- if (hasModifiersSet(*MI, AMDGPU::OpName::src0_modifiers) || - hasModifiersSet(*MI, AMDGPU::OpName::src1_modifiers)) + MachineOperand &Src1 = MI->getOperand(Src1Idx); + if (Src1.isImm()) { + // SIInstrInfo::commuteInstruction() does support commuting the immediate + // operand src1 in 2 and 3 operand instructions. + if (!isVOP2(MI->getOpcode()) && !isVOP3(MI->getOpcode())) + return false; + } else if (Src1.isReg()) { + // If any source modifiers are set, the generic instruction commuting won't + // understand how to copy the source modifiers. + if (hasModifiersSet(*MI, AMDGPU::OpName::src0_modifiers) || + hasModifiersSet(*MI, AMDGPU::OpName::src1_modifiers)) + return false; + } else return false; - SrcOpIdx1 = Src0Idx; - SrcOpIdx2 = Src1Idx; - return true; + return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx); } MachineInstr *SIInstrInfo::buildMovInstr(MachineBasicBlock *MBB, @@ -898,11 +1030,6 @@ bool SIInstrInfo::isMov(unsigned Opcode) const { } } -bool -SIInstrInfo::isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const { - return RC != &AMDGPU::EXECRegRegClass; -} - static void removeModOperands(MachineInstr &MI) { unsigned Opc = MI.getOpcode(); int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc, @@ -984,9 +1111,6 @@ bool SIInstrInfo::FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI, AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)); } - UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc, - AMDGPU::OpName::src2)); - // ChangingToImmediate adds Src2 back to the instruction. Src2->ChangeToImmediate(Imm); removeModOperands(*UseMI); @@ -1045,18 +1169,6 @@ bool SIInstrInfo::FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI, return false; } -bool -SIInstrInfo::isTriviallyReMaterializable(const MachineInstr *MI, - AliasAnalysis *AA) const { - switch(MI->getOpcode()) { - default: return AMDGPUInstrInfo::isTriviallyReMaterializable(MI, AA); - case AMDGPU::S_MOV_B32: - case AMDGPU::S_MOV_B64: - case AMDGPU::V_MOV_B32_e32: - return MI->getOperand(1).isImm(); - } -} - static bool offsetsDoNotOverlap(int WidthA, int OffsetA, int WidthB, int OffsetB) { int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB; @@ -1088,9 +1200,6 @@ bool SIInstrInfo::checkInstOffsetsDoNotOverlap(MachineInstr *MIa, bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr *MIa, MachineInstr *MIb, AliasAnalysis *AA) const { - unsigned Opc0 = MIa->getOpcode(); - unsigned Opc1 = MIb->getOpcode(); - assert(MIa && (MIa->mayLoad() || MIa->mayStore()) && "MIa must load from or modify a memory location"); assert(MIb && (MIb->mayLoad() || MIb->mayStore()) && @@ -1105,32 +1214,32 @@ bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr *MIa, // TODO: Should we check the address space from the MachineMemOperand? That // would allow us to distinguish objects we know don't alias based on the - // underlying addres space, even if it was lowered to a different one, + // underlying address space, even if it was lowered to a different one, // e.g. private accesses lowered to use MUBUF instructions on a scratch // buffer. 
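The rewritten disjointness checks below reduce to a small pairwise matrix over instruction kinds. Summarized informally from the code that follows (the buffer/SMRD cross cases carry extra conditions; see the code):

    // DS          vs DS           -> compare offsets
    // MUBUF/MTBUF vs MUBUF/MTBUF  -> compare offsets
    // SMRD        vs SMRD         -> compare offsets
    // FLAT        vs FLAT         -> compare offsets
    // FLAT        vs anything else -> unknown (may alias)
    // remaining cross-kind pairs   -> treated as trivially disjoint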
- if (isDS(Opc0)) { - if (isDS(Opc1)) + if (isDS(*MIa)) { + if (isDS(*MIb)) return checkInstOffsetsDoNotOverlap(MIa, MIb); - return !isFLAT(Opc1); + return !isFLAT(*MIb); } - if (isMUBUF(Opc0) || isMTBUF(Opc0)) { - if (isMUBUF(Opc1) || isMTBUF(Opc1)) + if (isMUBUF(*MIa) || isMTBUF(*MIa)) { + if (isMUBUF(*MIb) || isMTBUF(*MIb)) return checkInstOffsetsDoNotOverlap(MIa, MIb); - return !isFLAT(Opc1) && !isSMRD(Opc1); + return !isFLAT(*MIb) && !isSMRD(*MIb); } - if (isSMRD(Opc0)) { - if (isSMRD(Opc1)) + if (isSMRD(*MIa)) { + if (isSMRD(*MIb)) return checkInstOffsetsDoNotOverlap(MIa, MIb); - return !isFLAT(Opc1) && !isMUBUF(Opc0) && !isMTBUF(Opc0); + return !isFLAT(*MIb) && !isMUBUF(*MIa) && !isMTBUF(*MIa); } - if (isFLAT(Opc0)) { - if (isFLAT(Opc1)) + if (isFLAT(*MIa)) { + if (isFLAT(*MIb)) return checkInstOffsetsDoNotOverlap(MIa, MIb); return false; @@ -1319,6 +1428,26 @@ bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI, return false; } +static unsigned findImplicitSGPRRead(const MachineInstr &MI) { + for (const MachineOperand &MO : MI.implicit_operands()) { + // We only care about reads. + if (MO.isDef()) + continue; + + switch (MO.getReg()) { + case AMDGPU::VCC: + case AMDGPU::M0: + case AMDGPU::FLAT_SCR: + return MO.getReg(); + + default: + break; + } + } + + return AMDGPU::NoRegister; +} + bool SIInstrInfo::verifyInstruction(const MachineInstr *MI, StringRef &ErrInfo) const { uint16_t Opcode = MI->getOpcode(); @@ -1335,7 +1464,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI, return false; } - // Make sure the register classes are correct + // Make sure the register classes are correct. for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) { if (MI->getOperand(i).isFPImm()) { ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast " @@ -1392,14 +1521,17 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI, // Verify VOP* - if (isVOP1(Opcode) || isVOP2(Opcode) || isVOP3(Opcode) || isVOPC(Opcode)) { + if (isVOP1(*MI) || isVOP2(*MI) || isVOP3(*MI) || isVOPC(*MI)) { // Only look at the true operands. Only a real operand can use the constant // bus, and we don't want to check pseudo-operands like the source modifier // flags. const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx }; unsigned ConstantBusCount = 0; - unsigned SGPRUsed = AMDGPU::NoRegister; + unsigned SGPRUsed = findImplicitSGPRRead(*MI); + if (SGPRUsed != AMDGPU::NoRegister) + ++ConstantBusCount; + for (int OpIdx : OpIndices) { if (OpIdx == -1) break; @@ -1435,6 +1567,16 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI, } } + // Make sure we aren't losing exec uses in the td files. This mostly requires + // being careful when using let Uses to try to add other use registers. 
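findImplicitSGPRRead above lets the verifier count implicit scalar reads against the constant bus budget. The rule it enforces, sketched with concrete instructions:

    // At most one SGPR (or literal constant) may be read through the
    // scalar operand bus by a single VALU instruction; implicit reads of
    // VCC, M0 and FLAT_SCR count against the same budget.
    //   v_add_f32  v0, s0, v1            ; one SGPR           -> legal
    //   v_add_f32  v0, s0, s1            ; two distinct SGPRs -> rejected
    //   v_addc_u32 v0, vcc, s0, v1, vcc  ; s0 + implicit VCC  -> rejected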
+ if (!isGenericOpcode(Opcode) && !isSALU(Opcode) && !isSMRD(Opcode)) { + const MachineOperand *Exec = MI->findRegisterUseOperand(AMDGPU::EXEC); + if (!Exec || !Exec->isImplicit()) { + ErrInfo = "VALU instruction does not implicitly read exec mask"; + return false; + } + } + return true; } @@ -1483,11 +1625,17 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) { case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e32; case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e32; case AMDGPU::S_LOAD_DWORD_IMM: - case AMDGPU::S_LOAD_DWORD_SGPR: return AMDGPU::BUFFER_LOAD_DWORD_ADDR64; + case AMDGPU::S_LOAD_DWORD_SGPR: + case AMDGPU::S_LOAD_DWORD_IMM_ci: + return AMDGPU::BUFFER_LOAD_DWORD_ADDR64; case AMDGPU::S_LOAD_DWORDX2_IMM: - case AMDGPU::S_LOAD_DWORDX2_SGPR: return AMDGPU::BUFFER_LOAD_DWORDX2_ADDR64; + case AMDGPU::S_LOAD_DWORDX2_SGPR: + case AMDGPU::S_LOAD_DWORDX2_IMM_ci: + return AMDGPU::BUFFER_LOAD_DWORDX2_ADDR64; case AMDGPU::S_LOAD_DWORDX4_IMM: - case AMDGPU::S_LOAD_DWORDX4_SGPR: return AMDGPU::BUFFER_LOAD_DWORDX4_ADDR64; + case AMDGPU::S_LOAD_DWORDX4_SGPR: + case AMDGPU::S_LOAD_DWORDX4_IMM_ci: + return AMDGPU::BUFFER_LOAD_DWORDX4_ADDR64; case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64; case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32; case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32; @@ -1562,17 +1710,21 @@ unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI, unsigned SubIdx, const TargetRegisterClass *SubRC) const { - assert(SuperReg.isReg()); - - unsigned NewSuperReg = MRI.createVirtualRegister(SuperRC); + MachineBasicBlock *MBB = MI->getParent(); + DebugLoc DL = MI->getDebugLoc(); unsigned SubReg = MRI.createVirtualRegister(SubRC); + if (SuperReg.getSubReg() == AMDGPU::NoSubRegister) { + BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg) + .addReg(SuperReg.getReg(), 0, SubIdx); + return SubReg; + } + // Just in case the super register is itself a sub-register, copy it to a new // value so we don't need to worry about merging its subreg index with the // SubIdx passed to this function. The register coalescer should be able to // eliminate this extra copy. 
- MachineBasicBlock *MBB = MI->getParent(); - DebugLoc DL = MI->getDebugLoc(); + unsigned NewSuperReg = MRI.createVirtualRegister(SuperRC); BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), NewSuperReg) .addReg(SuperReg.getReg(), 0, SuperReg.getSubReg()); @@ -1605,36 +1757,6 @@ MachineOperand SIInstrInfo::buildExtractSubRegOrImm( return MachineOperand::CreateReg(SubReg, false); } -unsigned SIInstrInfo::split64BitImm(SmallVectorImpl<MachineInstr *> &Worklist, - MachineBasicBlock::iterator MI, - MachineRegisterInfo &MRI, - const TargetRegisterClass *RC, - const MachineOperand &Op) const { - MachineBasicBlock *MBB = MI->getParent(); - DebugLoc DL = MI->getDebugLoc(); - unsigned LoDst = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); - unsigned HiDst = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); - unsigned Dst = MRI.createVirtualRegister(RC); - - MachineInstr *Lo = BuildMI(*MBB, MI, DL, get(AMDGPU::S_MOV_B32), - LoDst) - .addImm(Op.getImm() & 0xFFFFFFFF); - MachineInstr *Hi = BuildMI(*MBB, MI, DL, get(AMDGPU::S_MOV_B32), - HiDst) - .addImm(Op.getImm() >> 32); - - BuildMI(*MBB, MI, DL, get(TargetOpcode::REG_SEQUENCE), Dst) - .addReg(LoDst) - .addImm(AMDGPU::sub0) - .addReg(HiDst) - .addImm(AMDGPU::sub1); - - Worklist.push_back(Lo); - Worklist.push_back(Hi); - - return Dst; -} - // Change the order of operands from (0, 1, 2) to (0, 2, 1) void SIInstrInfo::swapOperands(MachineBasicBlock::iterator Inst) const { assert(Inst->getNumExplicitOperands() == 3); @@ -1643,6 +1765,41 @@ void SIInstrInfo::swapOperands(MachineBasicBlock::iterator Inst) const { Inst->addOperand(Op1); } +bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI, + const MCOperandInfo &OpInfo, + const MachineOperand &MO) const { + if (!MO.isReg()) + return false; + + unsigned Reg = MO.getReg(); + const TargetRegisterClass *RC = + TargetRegisterInfo::isVirtualRegister(Reg) ? + MRI.getRegClass(Reg) : + RI.getPhysRegClass(Reg); + + // In order to be legal, the common sub-class must be equal to the + // class of the current operand. For example: + // + // v_mov_b32 s0 ; Operand defined as vsrc_32 + // ; RI.getCommonSubClass(s0,vsrc_32) = sgpr ; LEGAL + // + // s_sendmsg 0, s0 ; Operand defined as m0reg + // ; RI.getCommonSubClass(s0,m0reg) = m0reg ; NOT LEGAL + + return RI.getCommonSubClass(RC, RI.getRegClass(OpInfo.RegClass)) == RC; +} + +bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI, + const MCOperandInfo &OpInfo, + const MachineOperand &MO) const { + if (MO.isReg()) + return isLegalRegOperand(MRI, OpInfo, MO); + + // Handle non-register types that are treated like immediates. + assert(MO.isImm() || MO.isTargetIndex() || MO.isFI()); + return true; +} + bool SIInstrInfo::isOperandLegal(const MachineInstr *MI, unsigned OpIdx, const MachineOperand *MO) const { const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); @@ -1653,7 +1810,7 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr *MI, unsigned OpIdx, if (!MO) MO = &MI->getOperand(OpIdx); - if (isVALU(InstDesc.Opcode) && + if (isVALU(*MI) && usesConstantBus(MRI, *MO, DefinedRC->getSize())) { unsigned SGPRUsed = MO->isReg() ? MO->getReg() : (unsigned)AMDGPU::NoRegister; @@ -1670,21 +1827,7 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr *MI, unsigned OpIdx, if (MO->isReg()) { assert(DefinedRC); - const TargetRegisterClass *RC = - TargetRegisterInfo::isVirtualRegister(MO->getReg()) ? 
- MRI.getRegClass(MO->getReg()) : - RI.getPhysRegClass(MO->getReg()); - - // In order to be legal, the common sub-class must be equal to the - // class of the current operand. For example: - // - // v_mov_b32 s0 ; Operand defined as vsrc_32 - // ; RI.getCommonSubClass(s0,vsrc_32) = sgpr ; LEGAL - // - // s_sendmsg 0, s0 ; Operand defined as m0reg - // ; RI.getCommonSubClass(s0,m0reg) = m0reg ; NOT LEGAL - - return RI.getCommonSubClass(RC, RI.getRegClass(OpInfo.RegClass)) == RC; + return isLegalRegOperand(MRI, OpInfo, *MO); } @@ -1699,81 +1842,143 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr *MI, unsigned OpIdx, return isImmOperandLegal(MI, OpIdx, *MO); } -void SIInstrInfo::legalizeOperands(MachineInstr *MI) const { - MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); +void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI, + MachineInstr *MI) const { + unsigned Opc = MI->getOpcode(); + const MCInstrDesc &InstrDesc = get(Opc); - int Src0Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), - AMDGPU::OpName::src0); - int Src1Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), - AMDGPU::OpName::src1); - int Src2Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), - AMDGPU::OpName::src2); + int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); + MachineOperand &Src1 = MI->getOperand(Src1Idx); - // Legalize VOP2 - if (isVOP2(MI->getOpcode()) && Src1Idx != -1) { - // Legalize src0 - if (!isOperandLegal(MI, Src0Idx)) + // If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32 + // we need to only have one constant bus use. + // + // Note we do not need to worry about literal constants here. They are + // disabled for the operand type for instructions because they will always + // violate the one constant bus use rule. + bool HasImplicitSGPR = findImplicitSGPRRead(*MI) != AMDGPU::NoRegister; + if (HasImplicitSGPR) { + int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); + MachineOperand &Src0 = MI->getOperand(Src0Idx); + + if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg())) legalizeOpWithMove(MI, Src0Idx); + } - // Legalize src1 - if (isOperandLegal(MI, Src1Idx)) - return; + // VOP2 src0 instructions support all operand types, so we don't need to check + // their legality. If src1 is already legal, we don't need to do anything. + if (isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src1)) + return; - // Usually src0 of VOP2 instructions allow more types of inputs - // than src1, so try to commute the instruction to decrease our - // chances of having to insert a MOV instruction to legalize src1. - if (MI->isCommutable()) { - if (commuteInstruction(MI)) - // If we are successful in commuting, then we know MI is legal, so - // we are done. - return; - } + // We do not use commuteInstruction here because it is too aggressive and will + // commute if it is possible. We only want to commute here if it improves + // legality. This can be called a fairly large number of times so don't waste + // compile time pointlessly swapping and checking legality again. + if (HasImplicitSGPR || !MI->isCommutable()) { + legalizeOpWithMove(MI, Src1Idx); + return; + } + + int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); + MachineOperand &Src0 = MI->getOperand(Src0Idx); + // If src0 can be used as src1, commuting will make the operands legal. + // Otherwise we have to give up and insert a move. + // + // TODO: Other immediate-like operand kinds could be commuted if there was a + // MachineOperand::ChangeTo* for them. 
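For readers following legalizeOperandsVOP2 above: src1 of a VOP2 encoding only accepts a VGPR, while src0 also takes SGPRs and immediates, so swapping the operands is usually cheaper than inserting a v_mov. A minimal model of that commute-only-when-it-helps policy follows; the enum and helper names are hypothetical, not the backend's actual data structures.

    #include <cassert>

    enum class Kind { SGPR, VGPR, Imm };

    // src1 of a VOP2 must be a VGPR; src0 may be a VGPR, an SGPR or an imm.
    inline bool legalSrc1(Kind k) { return k == Kind::VGPR; }

    // Returns true if the instruction is legal after (possibly) swapping,
    // false if a copy into a VGPR is unavoidable. Mirrors the policy of the
    // hunk above, not its exact mechanics.
    bool legalizeByCommute(Kind &src0, Kind &src1, bool commutable) {
      if (legalSrc1(src1))
        return true;                  // already fine, don't touch it
      if (!commutable || !legalSrc1(src0))
        return false;                 // caller must insert a VGPR copy instead
      Kind t = src0; src0 = src1; src1 = t;   // commute: src0 <-> src1
      return true;
    }

    int main() {
      Kind a = Kind::VGPR, b = Kind::SGPR;
      assert(legalizeByCommute(a, b, true) && b == Kind::VGPR); // swap fixed it
      Kind c = Kind::SGPR, d = Kind::Imm;
      assert(!legalizeByCommute(c, d, true)); // no VGPR to swap in: need a copy
    }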
+ if ((!Src1.isImm() && !Src1.isReg()) || + !isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src0)) { legalizeOpWithMove(MI, Src1Idx); return; } - // XXX - Do any VOP3 instructions read VCC? - // Legalize VOP3 - if (isVOP3(MI->getOpcode())) { - int VOP3Idx[3] = { Src0Idx, Src1Idx, Src2Idx }; + int CommutedOpc = commuteOpcode(*MI); + if (CommutedOpc == -1) { + legalizeOpWithMove(MI, Src1Idx); + return; + } - // Find the one SGPR operand we are allowed to use. - unsigned SGPRReg = findUsedSGPR(MI, VOP3Idx); + MI->setDesc(get(CommutedOpc)); - for (unsigned i = 0; i < 3; ++i) { - int Idx = VOP3Idx[i]; - if (Idx == -1) - break; - MachineOperand &MO = MI->getOperand(Idx); + unsigned Src0Reg = Src0.getReg(); + unsigned Src0SubReg = Src0.getSubReg(); + bool Src0Kill = Src0.isKill(); - if (MO.isReg()) { - if (!RI.isSGPRClass(MRI.getRegClass(MO.getReg()))) - continue; // VGPRs are legal + if (Src1.isImm()) + Src0.ChangeToImmediate(Src1.getImm()); + else if (Src1.isReg()) { + Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill()); + Src0.setSubReg(Src1.getSubReg()); + } else + llvm_unreachable("Should only have register or immediate operands"); - assert(MO.getReg() != AMDGPU::SCC && "SCC operand to VOP3 instruction"); + Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill); + Src1.setSubReg(Src0SubReg); +} - if (SGPRReg == AMDGPU::NoRegister || SGPRReg == MO.getReg()) { - SGPRReg = MO.getReg(); - // We can use one SGPR in each VOP3 instruction. - continue; - } - } else if (!isLiteralConstant(MO, getOpSize(MI->getOpcode(), Idx))) { - // If it is not a register and not a literal constant, then it must be - // an inline constant which is always legal. - continue; - } - // If we make it this far, then the operand is not legal and we must - // legalize it. - legalizeOpWithMove(MI, Idx); +// Legalize VOP3 operands. Because all operand types are supported for any +// operand, and since literal constants are not allowed and should never be +// seen, we only need to worry about inserting copies if we use multiple SGPR +// operands. +void SIInstrInfo::legalizeOperandsVOP3( + MachineRegisterInfo &MRI, + MachineInstr *MI) const { + unsigned Opc = MI->getOpcode(); + + int VOP3Idx[3] = { + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0), + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1), + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2) + }; + + // Find the one SGPR operand we are allowed to use. + unsigned SGPRReg = findUsedSGPR(MI, VOP3Idx); + + for (unsigned i = 0; i < 3; ++i) { + int Idx = VOP3Idx[i]; + if (Idx == -1) + break; + MachineOperand &MO = MI->getOperand(Idx); + + // We should never see a VOP3 instruction with an illegal immediate operand. + if (!MO.isReg()) + continue; + + if (!RI.isSGPRClass(MRI.getRegClass(MO.getReg()))) + continue; // VGPRs are legal + + if (SGPRReg == AMDGPU::NoRegister || SGPRReg == MO.getReg()) { + SGPRReg = MO.getReg(); + // We can use one SGPR in each VOP3 instruction. + continue; } + + // If we make it this far, then the operand is not legal and we must + // legalize it. 
+    legalizeOpWithMove(MI, Idx);
+  }
+}
+
+void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
+  MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
+
+  // Legalize VOP2
+  if (isVOP2(*MI)) {
+    legalizeOperandsVOP2(MRI, MI);
+    return;
+  }
+
+  // Legalize VOP3
+  if (isVOP3(*MI)) {
+    legalizeOperandsVOP3(MRI, MI);
+    return;
   }
 
   // Legalize REG_SEQUENCE and PHI
   // The register class of the operands must be the same type as the register
   // class of the output.
-  if (MI->getOpcode() == AMDGPU::REG_SEQUENCE ||
-      MI->getOpcode() == AMDGPU::PHI) {
+  if (MI->getOpcode() == AMDGPU::PHI) {
     const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;
     for (unsigned i = 1, e = MI->getNumOperands(); i != e; i+=2) {
       if (!MI->getOperand(i).isReg() ||
@@ -1802,26 +2007,53 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
     }
 
     // Update all the operands so they have the same type.
-    for (unsigned i = 1, e = MI->getNumOperands(); i != e; i+=2) {
-      if (!MI->getOperand(i).isReg() ||
-          !TargetRegisterInfo::isVirtualRegister(MI->getOperand(i).getReg()))
+    for (unsigned I = 1, E = MI->getNumOperands(); I != E; I += 2) {
+      MachineOperand &Op = MI->getOperand(I);
+      if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg()))
         continue;
       unsigned DstReg = MRI.createVirtualRegister(RC);
-      MachineBasicBlock *InsertBB;
-      MachineBasicBlock::iterator Insert;
-      if (MI->getOpcode() == AMDGPU::REG_SEQUENCE) {
-        InsertBB = MI->getParent();
-        Insert = MI;
-      } else {
-        // MI is a PHI instruction.
-        InsertBB = MI->getOperand(i + 1).getMBB();
-        Insert = InsertBB->getFirstTerminator();
+
+      // MI is a PHI instruction.
+      MachineBasicBlock *InsertBB = MI->getOperand(I + 1).getMBB();
+      MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator();
+
+      BuildMI(*InsertBB, Insert, MI->getDebugLoc(), get(AMDGPU::COPY), DstReg)
+        .addOperand(Op);
+      Op.setReg(DstReg);
+    }
+  }
+
+  // REG_SEQUENCE doesn't really require operand legalization, but if one has a
+  // VGPR dest type and SGPR sources, insert copies so all operands are
+  // VGPRs. This seems to help operand folding / the register coalescer.
+  if (MI->getOpcode() == AMDGPU::REG_SEQUENCE) {
+    MachineBasicBlock *MBB = MI->getParent();
+    const TargetRegisterClass *DstRC = getOpRegClass(*MI, 0);
+    if (RI.hasVGPRs(DstRC)) {
+      // Update all the operands so they are VGPR register classes. These may
+      // not be the same register class because REG_SEQUENCE supports mixing
+      // subregister index types e.g. sub0_sub1 + sub2 + sub3
+      for (unsigned I = 1, E = MI->getNumOperands(); I != E; I += 2) {
+        MachineOperand &Op = MI->getOperand(I);
+        if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg()))
+          continue;
+
+        const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg());
+        const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC);
+        if (VRC == OpRC)
+          continue;
+
+        unsigned DstReg = MRI.createVirtualRegister(VRC);
+
+        BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::COPY), DstReg)
+          .addOperand(Op);
+
+        Op.setReg(DstReg);
+        Op.setIsKill();
       }
-      BuildMI(*InsertBB, Insert, MI->getDebugLoc(),
-              get(AMDGPU::COPY), DstReg)
-              .addOperand(MI->getOperand(i));
-      MI->getOperand(i).setReg(DstReg);
     }
+
+    return;
   }
 
   // Legalize INSERT_SUBREG
@@ -1858,15 +2090,10 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
   }
 
   MachineBasicBlock &MBB = *MI->getParent();
-  // Extract the ptr from the resource descriptor.
-
-  // SRsrcPtrLo = srsrc:sub0
-  unsigned SRsrcPtrLo = buildExtractSubReg(MI, MRI, *SRsrc,
-      &AMDGPU::VReg_128RegClass, AMDGPU::sub0, &AMDGPU::VGPR_32RegClass);
-  // SRsrcPtrHi = srsrc:sub1
-  unsigned SRsrcPtrHi = buildExtractSubReg(MI, MRI, *SRsrc,
-      &AMDGPU::VReg_128RegClass, AMDGPU::sub1, &AMDGPU::VGPR_32RegClass);
+  // Extract the ptr from the resource descriptor.
+  unsigned SRsrcPtr = buildExtractSubReg(MI, MRI, *SRsrc,
+    &AMDGPU::VReg_128RegClass, AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);
 
   // Create an empty resource descriptor
   unsigned Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
@@ -1891,80 +2118,112 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
       .addImm(RsrcDataFormat >> 32);
 
   // NewSRsrc = {Zero64, SRsrcFormat}
-  BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
-          NewSRsrc)
-          .addReg(Zero64)
-          .addImm(AMDGPU::sub0_sub1)
-          .addReg(SRsrcFormatLo)
-          .addImm(AMDGPU::sub2)
-          .addReg(SRsrcFormatHi)
-          .addImm(AMDGPU::sub3);
+  BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewSRsrc)
+    .addReg(Zero64)
+    .addImm(AMDGPU::sub0_sub1)
+    .addReg(SRsrcFormatLo)
+    .addImm(AMDGPU::sub2)
+    .addReg(SRsrcFormatHi)
+    .addImm(AMDGPU::sub3);
 
   MachineOperand *VAddr = getNamedOperand(*MI, AMDGPU::OpName::vaddr);
   unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
-  unsigned NewVAddrLo;
-  unsigned NewVAddrHi;
   if (VAddr) {
     // This is already an ADDR64 instruction so we need to add the pointer
     // extracted from the resource descriptor to the current value of VAddr.
-    NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
-    NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
-
-    // NewVaddrLo = SRsrcPtrLo + VAddr:sub0
-    BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::V_ADD_I32_e32),
-            NewVAddrLo)
-            .addReg(SRsrcPtrLo)
-            .addReg(VAddr->getReg(), 0, AMDGPU::sub0)
-            .addReg(AMDGPU::VCC, RegState::ImplicitDefine);
-
-    // NewVaddrHi = SRsrcPtrHi + VAddr:sub1
-    BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::V_ADDC_U32_e32),
-            NewVAddrHi)
-            .addReg(SRsrcPtrHi)
-            .addReg(VAddr->getReg(), 0, AMDGPU::sub1)
-            .addReg(AMDGPU::VCC, RegState::ImplicitDefine)
-            .addReg(AMDGPU::VCC, RegState::Implicit);
-
+    unsigned NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+    unsigned NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+
+    // NewVaddrLo = SRsrcPtr:sub0 + VAddr:sub0
+    DebugLoc DL = MI->getDebugLoc();
+    BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), NewVAddrLo)
+      .addReg(SRsrcPtr, 0, AMDGPU::sub0)
+      .addReg(VAddr->getReg(), 0, AMDGPU::sub0);
+
+    // NewVaddrHi = SRsrcPtr:sub1 + VAddr:sub1
+    BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e32), NewVAddrHi)
+      .addReg(SRsrcPtr, 0, AMDGPU::sub1)
+      .addReg(VAddr->getReg(), 0, AMDGPU::sub1);
+
+    // NewVaddr = {NewVaddrHi, NewVaddrLo}
+    BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
+      .addReg(NewVAddrLo)
+      .addImm(AMDGPU::sub0)
+      .addReg(NewVAddrHi)
+      .addImm(AMDGPU::sub1);
   } else {
     // This instruction is the _OFFSET variant, so we need to convert it to
     // ADDR64.
+    assert(MBB.getParent()->getSubtarget<AMDGPUSubtarget>().getGeneration()
+           < AMDGPUSubtarget::VOLCANIC_ISLANDS &&
+           "FIXME: Need to emit flat atomics here");
+
     MachineOperand *VData = getNamedOperand(*MI, AMDGPU::OpName::vdata);
     MachineOperand *Offset = getNamedOperand(*MI, AMDGPU::OpName::offset);
     MachineOperand *SOffset = getNamedOperand(*MI, AMDGPU::OpName::soffset);
-
-    // Create the new instruction.
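A note on the descriptor rewrite above: a MUBUF resource descriptor (V#) is 128 bits whose low quadword carries the 48-bit base address, and the patch now moves that whole low half (sub0_sub1) into the address calculation while substituting a descriptor with a zero base and the default data format in dwords 2-3. Below is a plain-integer illustration of the same split, reusing the RSRC_DATA_FORMAT constant this patch declares in SIInstrInfo.h; the struct layout is a sketch, not the backend's representation.

    #include <cassert>
    #include <cstdint>

    struct Rsrc128 { uint64_t lo, hi; };  // dwords 0-1 and 2-3 of the V#

    const uint64_t RSRC_DATA_FORMAT = 0xf00000000000ULL; // as in the patch

    // Bits 47:0 of the low quadword are the base address; the code above
    // moves the whole low half (sub0_sub1) into NewVAddr.
    uint64_t extractBasePtr(const Rsrc128 &r) {
      return r.lo;
    }

    // Replacement descriptor: zero base, default format in dwords 2-3.
    Rsrc128 makeNullDescriptor() {
      return { 0, RSRC_DATA_FORMAT };
    }

    int main() {
      Rsrc128 r = { 0x0000123456789abcULL, 0xdeadbeefULL };
      uint64_t base = extractBasePtr(r); // what gets folded into NewVAddr
      Rsrc128 n = makeNullDescriptor();  // what becomes NewSRsrc
      assert(base == 0x0000123456789abcULL);
      assert(n.lo == 0 && n.hi == RSRC_DATA_FORMAT);
    }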
     unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI->getOpcode());
-    MachineInstr *Addr64 =
-      BuildMI(MBB, MI, MI->getDebugLoc(), get(Addr64Opcode))
-              .addOperand(*VData)
-              .addReg(AMDGPU::NoRegister) // Dummy value for vaddr.
-                                          // This will be replaced later
-                                          // with the new value of vaddr.
-              .addOperand(*SRsrc)
-              .addOperand(*SOffset)
-              .addOperand(*Offset)
-              .addImm(0) // glc
-              .addImm(0) // slc
-              .addImm(0); // tfe
+
+    // Atomics with return have an additional tied operand and are
+    // missing some of the special bits.
+    MachineOperand *VDataIn = getNamedOperand(*MI, AMDGPU::OpName::vdata_in);
+    MachineInstr *Addr64;
+
+    if (!VDataIn) {
+      // Regular buffer load / store.
+      MachineInstrBuilder MIB
+        = BuildMI(MBB, MI, MI->getDebugLoc(), get(Addr64Opcode))
+        .addOperand(*VData)
+        .addReg(AMDGPU::NoRegister) // Dummy value for vaddr.
+                                    // This will be replaced later
+                                    // with the new value of vaddr.
+        .addOperand(*SRsrc)
+        .addOperand(*SOffset)
+        .addOperand(*Offset);
+
+      // Atomics do not have this operand.
+      if (const MachineOperand *GLC
+          = getNamedOperand(*MI, AMDGPU::OpName::glc)) {
+        MIB.addImm(GLC->getImm());
+      }
+
+      MIB.addImm(getNamedImmOperand(*MI, AMDGPU::OpName::slc));
+
+      if (const MachineOperand *TFE
+          = getNamedOperand(*MI, AMDGPU::OpName::tfe)) {
+        MIB.addImm(TFE->getImm());
+      }
+
+      MIB.setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+      Addr64 = MIB;
+    } else {
+      // Atomics with return.
+      Addr64 = BuildMI(MBB, MI, MI->getDebugLoc(), get(Addr64Opcode))
+        .addOperand(*VData)
+        .addOperand(*VDataIn)
+        .addReg(AMDGPU::NoRegister) // Dummy value for vaddr.
+                                    // This will be replaced later
+                                    // with the new value of vaddr.
+        .addOperand(*SRsrc)
+        .addOperand(*SOffset)
+        .addOperand(*Offset)
+        .addImm(getNamedImmOperand(*MI, AMDGPU::OpName::slc))
+        .setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+    }
 
     MI->removeFromParent();
     MI = Addr64;
 
-    NewVAddrLo = SRsrcPtrLo;
-    NewVAddrHi = SRsrcPtrHi;
+    // NewVaddr = {NewVaddrHi, NewVaddrLo}
+    BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
+      .addReg(SRsrcPtr, 0, AMDGPU::sub0)
+      .addImm(AMDGPU::sub0)
+      .addReg(SRsrcPtr, 0, AMDGPU::sub1)
+      .addImm(AMDGPU::sub1);
+
     VAddr = getNamedOperand(*MI, AMDGPU::OpName::vaddr);
     SRsrc = getNamedOperand(*MI, AMDGPU::OpName::srsrc);
   }
 
-  // NewVaddr = {NewVaddrHi, NewVaddrLo}
-  BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
-          NewVAddr)
-          .addReg(NewVAddrLo)
-          .addImm(AMDGPU::sub0)
-          .addReg(NewVAddrHi)
-          .addImm(AMDGPU::sub1);
-
-  // Update the instruction to use NewVaddr
   VAddr->setReg(NewVAddr);
   // Update the instruction to use NewSRsrc
@@ -2028,53 +2287,64 @@ void SIInstrInfo::splitSMRD(MachineInstr *MI,
                   .addOperand(*SOff);
     unsigned OffsetSGPR = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
     BuildMI(*MBB, MI, DL, get(AMDGPU::S_ADD_I32), OffsetSGPR)
-            .addOperand(*SOff)
-            .addImm(HalfSize);
-    Hi = BuildMI(*MBB, MI, DL, get(HalfSGPROp))
+      .addReg(SOff->getReg(), 0, SOff->getSubReg())
+      .addImm(HalfSize);
+    Hi = BuildMI(*MBB, MI, DL, get(HalfSGPROp), RegHi)
             .addReg(SBase->getReg(), getKillRegState(IsKill), SBase->getSubReg())
             .addReg(OffsetSGPR);
   }
 
   unsigned SubLo, SubHi;
+  const TargetRegisterClass *NewDstRC;
   switch (HalfSize) {
   case 4:
     SubLo = AMDGPU::sub0;
     SubHi = AMDGPU::sub1;
+    NewDstRC = &AMDGPU::VReg_64RegClass;
     break;
   case 8:
     SubLo = AMDGPU::sub0_sub1;
     SubHi = AMDGPU::sub2_sub3;
+    NewDstRC = &AMDGPU::VReg_128RegClass;
     break;
   case 16:
     SubLo = AMDGPU::sub0_sub1_sub2_sub3;
     SubHi = AMDGPU::sub4_sub5_sub6_sub7;
+    NewDstRC =
&AMDGPU::VReg_256RegClass; break; case 32: SubLo = AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7; SubHi = AMDGPU::sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15; + NewDstRC = &AMDGPU::VReg_512RegClass; break; default: llvm_unreachable("Unhandled HalfSize"); } - BuildMI(*MBB, MI, DL, get(AMDGPU::REG_SEQUENCE)) - .addOperand(MI->getOperand(0)) - .addReg(RegLo) - .addImm(SubLo) - .addReg(RegHi) - .addImm(SubHi); + unsigned OldDst = MI->getOperand(0).getReg(); + unsigned NewDst = MRI.createVirtualRegister(NewDstRC); + + MRI.replaceRegWith(OldDst, NewDst); + + BuildMI(*MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewDst) + .addReg(RegLo) + .addImm(SubLo) + .addReg(RegHi) + .addImm(SubHi); } -void SIInstrInfo::moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI) const { +void SIInstrInfo::moveSMRDToVALU(MachineInstr *MI, + MachineRegisterInfo &MRI, + SmallVectorImpl<MachineInstr *> &Worklist) const { MachineBasicBlock *MBB = MI->getParent(); - switch (MI->getOpcode()) { - case AMDGPU::S_LOAD_DWORD_IMM: - case AMDGPU::S_LOAD_DWORD_SGPR: - case AMDGPU::S_LOAD_DWORDX2_IMM: - case AMDGPU::S_LOAD_DWORDX2_SGPR: - case AMDGPU::S_LOAD_DWORDX4_IMM: - case AMDGPU::S_LOAD_DWORDX4_SGPR: { + int DstIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst); + assert(DstIdx != -1); + unsigned DstRCID = get(MI->getOpcode()).OpInfo[DstIdx].RegClass; + switch(RI.getRegClass(DstRCID)->getSize()) { + case 4: + case 8: + case 16: { unsigned NewOpcode = getVALUOp(*MI); unsigned RegOffset; unsigned ImmOffset; @@ -2118,53 +2388,55 @@ void SIInstrInfo::moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI) con BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord3) .addImm(RsrcDataFormat >> 32); BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), SRsrc) - .addReg(DWord0) - .addImm(AMDGPU::sub0) - .addReg(DWord1) - .addImm(AMDGPU::sub1) - .addReg(DWord2) - .addImm(AMDGPU::sub2) - .addReg(DWord3) - .addImm(AMDGPU::sub3); - MI->setDesc(get(NewOpcode)); - if (MI->getOperand(2).isReg()) { - MI->getOperand(2).setReg(SRsrc); - } else { - MI->getOperand(2).ChangeToRegister(SRsrc, false); - } - MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(0)); - MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(ImmOffset)); - MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(0)); // glc - MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(0)); // slc - MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(0)); // tfe - - const TargetRegisterClass *NewDstRC = - RI.getRegClass(get(NewOpcode).OpInfo[0].RegClass); - - unsigned DstReg = MI->getOperand(0).getReg(); + .addReg(DWord0) + .addImm(AMDGPU::sub0) + .addReg(DWord1) + .addImm(AMDGPU::sub1) + .addReg(DWord2) + .addImm(AMDGPU::sub2) + .addReg(DWord3) + .addImm(AMDGPU::sub3); + + const MCInstrDesc &NewInstDesc = get(NewOpcode); + const TargetRegisterClass *NewDstRC + = RI.getRegClass(NewInstDesc.OpInfo[0].RegClass); unsigned NewDstReg = MRI.createVirtualRegister(NewDstRC); + unsigned DstReg = MI->getOperand(0).getReg(); MRI.replaceRegWith(DstReg, NewDstReg); + + MachineInstr *NewInst = + BuildMI(*MBB, MI, MI->getDebugLoc(), NewInstDesc, NewDstReg) + .addOperand(MI->getOperand(1)) // sbase + .addReg(SRsrc) + .addImm(0) + .addImm(ImmOffset) + .addImm(0) // glc + .addImm(0) // slc + .addImm(0) // tfe + .setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); + MI->eraseFromParent(); + + legalizeOperands(NewInst); + addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist); break; } - case 
AMDGPU::S_LOAD_DWORDX8_IMM: - case AMDGPU::S_LOAD_DWORDX8_SGPR: { + case 32: { MachineInstr *Lo, *Hi; splitSMRD(MI, &AMDGPU::SReg_128RegClass, AMDGPU::S_LOAD_DWORDX4_IMM, AMDGPU::S_LOAD_DWORDX4_SGPR, Lo, Hi); MI->eraseFromParent(); - moveSMRDToVALU(Lo, MRI); - moveSMRDToVALU(Hi, MRI); + moveSMRDToVALU(Lo, MRI, Worklist); + moveSMRDToVALU(Hi, MRI, Worklist); break; } - case AMDGPU::S_LOAD_DWORDX16_IMM: - case AMDGPU::S_LOAD_DWORDX16_SGPR: { + case 64: { MachineInstr *Lo, *Hi; splitSMRD(MI, &AMDGPU::SReg_256RegClass, AMDGPU::S_LOAD_DWORDX8_IMM, AMDGPU::S_LOAD_DWORDX8_SGPR, Lo, Hi); MI->eraseFromParent(); - moveSMRDToVALU(Lo, MRI); - moveSMRDToVALU(Hi, MRI); + moveSMRDToVALU(Lo, MRI, Worklist); + moveSMRDToVALU(Hi, MRI, Worklist); break; } } @@ -2185,51 +2457,28 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { // Handle some special cases switch (Opcode) { default: - if (isSMRD(Inst->getOpcode())) { - moveSMRDToVALU(Inst, MRI); + if (isSMRD(*Inst)) { + moveSMRDToVALU(Inst, MRI, Worklist); + continue; } break; - case AMDGPU::S_MOV_B64: { - DebugLoc DL = Inst->getDebugLoc(); - - // If the source operand is a register we can replace this with a - // copy. - if (Inst->getOperand(1).isReg()) { - MachineInstr *Copy = BuildMI(*MBB, Inst, DL, get(TargetOpcode::COPY)) - .addOperand(Inst->getOperand(0)) - .addOperand(Inst->getOperand(1)); - Worklist.push_back(Copy); - } else { - // Otherwise, we need to split this into two movs, because there is - // no 64-bit VALU move instruction. - unsigned Reg = Inst->getOperand(0).getReg(); - unsigned Dst = split64BitImm(Worklist, - Inst, - MRI, - MRI.getRegClass(Reg), - Inst->getOperand(1)); - MRI.replaceRegWith(Reg, Dst); - } - Inst->eraseFromParent(); - continue; - } case AMDGPU::S_AND_B64: - splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32); + splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_AND_B32_e64); Inst->eraseFromParent(); continue; case AMDGPU::S_OR_B64: - splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32); + splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_OR_B32_e64); Inst->eraseFromParent(); continue; case AMDGPU::S_XOR_B64: - splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32); + splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_XOR_B32_e64); Inst->eraseFromParent(); continue; case AMDGPU::S_NOT_B64: - splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32); + splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::V_NOT_B32_e32); Inst->eraseFromParent(); continue; @@ -2281,6 +2530,11 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { } break; + case AMDGPU::S_ABS_I32: + lowerScalarAbs(Worklist, Inst); + Inst->eraseFromParent(); + continue; + case AMDGPU::S_BFE_U64: case AMDGPU::S_BFM_B64: llvm_unreachable("Moving this op to VALU not implemented"); @@ -2319,7 +2573,7 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { Inst->addOperand(MachineOperand::CreateImm(0)); } - addDescImplicitUseDef(NewDesc, Inst); + Inst->addImplicitDefUseOperands(*Inst->getParent()->getParent()); if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) { const MachineOperand &OffsetWidthOp = Inst->getOperand(2); @@ -2337,27 +2591,9 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { } // Update the destination register class. 
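The S_AND_B64/S_OR_B64/S_XOR_B64 cases above can now lower directly to two 32-bit VALU operations because each half of a bitwise result depends only on the matching halves of the inputs. Here is a self-contained check of that decomposition, mirroring what splitScalar64BitBinaryOp does with sub0/sub1 halves and a REG_SEQUENCE recombine (pure C++, illustrative only):

    #include <cassert>
    #include <cstdint>

    // Split a 64-bit bitwise op into two 32-bit halves and recombine, the
    // same decomposition splitScalar64BitBinaryOp performs with registers.
    template <typename Op32>
    uint64_t split64(uint64_t a, uint64_t b, Op32 op) {
      uint32_t lo = op(uint32_t(a), uint32_t(b));             // LoHalf
      uint32_t hi = op(uint32_t(a >> 32), uint32_t(b >> 32)); // HiHalf
      return (uint64_t(hi) << 32) | lo;          // REG_SEQUENCE sub0/sub1
    }

    int main() {
      auto and32 = [](uint32_t x, uint32_t y) { return x & y; };
      auto xor32 = [](uint32_t x, uint32_t y) { return x ^ y; };
      uint64_t a = 0x0123456789abcdefULL, b = 0xfedcba9876543210ULL;
      assert(split64(a, b, and32) == (a & b));
      assert(split64(a, b, xor32) == (a ^ b));
    }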
- - const TargetRegisterClass *NewDstRC = getOpRegClass(*Inst, 0); - - switch (Opcode) { - // For target instructions, getOpRegClass just returns the virtual - // register class associated with the operand, so we need to find an - // equivalent VGPR register class in order to move the instruction to the - // VALU. - case AMDGPU::COPY: - case AMDGPU::PHI: - case AMDGPU::REG_SEQUENCE: - case AMDGPU::INSERT_SUBREG: - if (RI.hasVGPRs(NewDstRC)) - continue; - NewDstRC = RI.getEquivalentVGPRClass(NewDstRC); - if (!NewDstRC) - continue; - break; - default: - break; - } + const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(*Inst); + if (!NewDstRC) + continue; unsigned DstReg = Inst->getOperand(0).getReg(); unsigned NewDstReg = MRI.createVirtualRegister(NewDstRC); @@ -2366,13 +2602,7 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { // Legalize the operands legalizeOperands(Inst); - for (MachineRegisterInfo::use_iterator I = MRI.use_begin(NewDstReg), - E = MRI.use_end(); I != E; ++I) { - MachineInstr &UseMI = *I->getParent(); - if (!canReadVGPR(UseMI, I.getOperandNo())) { - Worklist.push_back(&UseMI); - } - } + addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist); } } @@ -2390,6 +2620,30 @@ const TargetRegisterClass *SIInstrInfo::getIndirectAddrRegClass() const { return &AMDGPU::VGPR_32RegClass; } +void SIInstrInfo::lowerScalarAbs(SmallVectorImpl<MachineInstr *> &Worklist, + MachineInstr *Inst) const { + MachineBasicBlock &MBB = *Inst->getParent(); + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + MachineBasicBlock::iterator MII = Inst; + DebugLoc DL = Inst->getDebugLoc(); + + MachineOperand &Dest = Inst->getOperand(0); + MachineOperand &Src = Inst->getOperand(1); + unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + + BuildMI(MBB, MII, DL, get(AMDGPU::V_SUB_I32_e32), TmpReg) + .addImm(0) + .addReg(Src.getReg()); + + BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg) + .addReg(Src.getReg()) + .addReg(TmpReg); + + MRI.replaceRegWith(Dest.getReg(), ResultReg); + addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); +} + void SIInstrInfo::splitScalar64BitUnaryOp( SmallVectorImpl<MachineInstr *> &Worklist, MachineInstr *Inst, @@ -2414,20 +2668,21 @@ void SIInstrInfo::splitScalar64BitUnaryOp( AMDGPU::sub0, Src0SubRC); const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg()); - const TargetRegisterClass *DestSubRC = RI.getSubRegClass(DestRC, AMDGPU::sub0); + const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC); + const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0); - unsigned DestSub0 = MRI.createVirtualRegister(DestRC); - MachineInstr *LoHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub0) + unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC); + BuildMI(MBB, MII, DL, InstDesc, DestSub0) .addOperand(SrcReg0Sub0); MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC); - unsigned DestSub1 = MRI.createVirtualRegister(DestSubRC); - MachineInstr *HiHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub1) + unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC); + BuildMI(MBB, MII, DL, InstDesc, DestSub1) .addOperand(SrcReg0Sub1); - unsigned FullDestReg = MRI.createVirtualRegister(DestRC); + unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC); BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) .addReg(DestSub0) 
     .addImm(AMDGPU::sub0)
@@ -2436,10 +2691,11 @@ void SIInstrInfo::splitScalar64BitUnaryOp(
 
   MRI.replaceRegWith(Dest.getReg(), FullDestReg);
 
-  // Try to legalize the operands in case we need to swap the order to keep it
-  // valid.
-  Worklist.push_back(LoHalf);
-  Worklist.push_back(HiHalf);
+  // We don't need to legalizeOperands here because for a single operand, src0
+  // will support any kind of input.
+
+  // Move all users of this moved value.
+  addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
 }
 
 void SIInstrInfo::splitScalar64BitBinaryOp(
@@ -2474,9 +2730,10 @@ void SIInstrInfo::splitScalar64BitBinaryOp(
                                                        AMDGPU::sub0, Src1SubRC);
 
   const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
-  const TargetRegisterClass *DestSubRC = RI.getSubRegClass(DestRC, AMDGPU::sub0);
+  const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
+  const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0);
 
-  unsigned DestSub0 = MRI.createVirtualRegister(DestRC);
+  unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
   MachineInstr *LoHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub0)
     .addOperand(SrcReg0Sub0)
     .addOperand(SrcReg1Sub0);
@@ -2486,12 +2743,12 @@ void SIInstrInfo::splitScalar64BitBinaryOp(
   MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
                                                        AMDGPU::sub1, Src1SubRC);
 
-  unsigned DestSub1 = MRI.createVirtualRegister(DestSubRC);
+  unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
   MachineInstr *HiHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub1)
     .addOperand(SrcReg0Sub1)
     .addOperand(SrcReg1Sub1);
 
-  unsigned FullDestReg = MRI.createVirtualRegister(DestRC);
+  unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC);
   BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
     .addReg(DestSub0)
     .addImm(AMDGPU::sub0)
@@ -2502,8 +2759,11 @@ void SIInstrInfo::splitScalar64BitBinaryOp(
 
   // Try to legalize the operands in case we need to swap the order to keep it
   // valid.
-  Worklist.push_back(LoHalf);
-  Worklist.push_back(HiHalf);
+  legalizeOperands(LoHalf);
+  legalizeOperands(HiHalf);
+
+  // Move all users of this moved value.
+  addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
 }
 
 void SIInstrInfo::splitScalar64BitBCNT(SmallVectorImpl<MachineInstr *> &Worklist,
@@ -2532,18 +2792,19 @@ void SIInstrInfo::splitScalar64BitBCNT(SmallVectorImpl<MachineInstr *> &Worklist
   MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
                                                       AMDGPU::sub1, SrcSubRC);
 
-  MachineInstr *First = BuildMI(MBB, MII, DL, InstDesc, MidReg)
+  BuildMI(MBB, MII, DL, InstDesc, MidReg)
     .addOperand(SrcRegSub0)
     .addImm(0);
 
-  MachineInstr *Second = BuildMI(MBB, MII, DL, InstDesc, ResultReg)
+  BuildMI(MBB, MII, DL, InstDesc, ResultReg)
     .addOperand(SrcRegSub1)
     .addReg(MidReg);
 
   MRI.replaceRegWith(Dest.getReg(), ResultReg);
 
-  Worklist.push_back(First);
-  Worklist.push_back(Second);
+  // We don't need to legalize operands here. src0 for either instruction can be
+  // an SGPR, and the second input is unused or determined here.
+ addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); } void SIInstrInfo::splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist, @@ -2587,6 +2848,7 @@ void SIInstrInfo::splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist, .addImm(AMDGPU::sub1); MRI.replaceRegWith(Dest.getReg(), ResultReg); + addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); return; } @@ -2605,33 +2867,53 @@ void SIInstrInfo::splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist, .addImm(AMDGPU::sub1); MRI.replaceRegWith(Dest.getReg(), ResultReg); + addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); } -void SIInstrInfo::addDescImplicitUseDef(const MCInstrDesc &NewDesc, - MachineInstr *Inst) const { - // Add the implict and explicit register definitions. - if (NewDesc.ImplicitUses) { - for (unsigned i = 0; NewDesc.ImplicitUses[i]; ++i) { - unsigned Reg = NewDesc.ImplicitUses[i]; - Inst->addOperand(MachineOperand::CreateReg(Reg, false, true)); +void SIInstrInfo::addUsersToMoveToVALUWorklist( + unsigned DstReg, + MachineRegisterInfo &MRI, + SmallVectorImpl<MachineInstr *> &Worklist) const { + for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg), + E = MRI.use_end(); I != E; ++I) { + MachineInstr &UseMI = *I->getParent(); + if (!canReadVGPR(UseMI, I.getOperandNo())) { + Worklist.push_back(&UseMI); } } +} - if (NewDesc.ImplicitDefs) { - for (unsigned i = 0; NewDesc.ImplicitDefs[i]; ++i) { - unsigned Reg = NewDesc.ImplicitDefs[i]; - Inst->addOperand(MachineOperand::CreateReg(Reg, true, true)); - } +const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass( + const MachineInstr &Inst) const { + const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0); + + switch (Inst.getOpcode()) { + // For target instructions, getOpRegClass just returns the virtual register + // class associated with the operand, so we need to find an equivalent VGPR + // register class in order to move the instruction to the VALU. + case AMDGPU::COPY: + case AMDGPU::PHI: + case AMDGPU::REG_SEQUENCE: + case AMDGPU::INSERT_SUBREG: + if (RI.hasVGPRs(NewDstRC)) + return nullptr; + + NewDstRC = RI.getEquivalentVGPRClass(NewDstRC); + if (!NewDstRC) + return nullptr; + return NewDstRC; + default: + return NewDstRC; } } +// Find the one SGPR operand we are allowed to use. unsigned SIInstrInfo::findUsedSGPR(const MachineInstr *MI, int OpIndices[3]) const { - const MCInstrDesc &Desc = get(MI->getOpcode()); + const MCInstrDesc &Desc = MI->getDesc(); // Find the one SGPR operand we are allowed to use. - unsigned SGPRReg = AMDGPU::NoRegister; - + // // First we need to consider the instruction's operand requirements before // legalizing. Some operands are required to be SGPRs, such as implicit uses // of VCC, but we are still bound by the constant bus requirement to only use @@ -2639,17 +2921,9 @@ unsigned SIInstrInfo::findUsedSGPR(const MachineInstr *MI, // // If the operand's class is an SGPR, we can never move it. - for (const MachineOperand &MO : MI->implicit_operands()) { - // We only care about reads. 
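On splitScalar64BitBCNT above: V_BCNT_U32_B32 computes countbits(src0) + src1, so a 64-bit scalar bit count (S_BCNT1_I32_B64) becomes two chained 32-bit counts, the second accumulating the first. A runnable sketch of that identity, with toy functions rather than the instruction definitions:

    #include <cassert>
    #include <cstdint>

    // 32-bit popcount with an accumulator operand, modeling V_BCNT_U32_B32
    // (dst = countbits(src0) + src1).
    uint32_t bcnt_u32_b32(uint32_t src0, uint32_t src1) {
      uint32_t n = 0;
      while (src0) { src0 &= src0 - 1; ++n; }  // clear lowest set bit
      return n + src1;
    }

    // The two-instruction sequence splitScalar64BitBCNT emits
    // (MidReg, then ResultReg).
    uint32_t bcnt1_b64(uint64_t src) {
      uint32_t mid = bcnt_u32_b32(uint32_t(src), 0);  // low half, acc = 0
      return bcnt_u32_b32(uint32_t(src >> 32), mid);  // high half + mid
    }

    int main() {
      assert(bcnt1_b64(0) == 0);
      assert(bcnt1_b64(~0ULL) == 64);
      assert(bcnt1_b64(0x8000000180000001ULL) == 4);
    }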
- if (MO.isDef()) - continue; - - if (MO.getReg() == AMDGPU::VCC) - return AMDGPU::VCC; - - if (MO.getReg() == AMDGPU::FLAT_SCR) - return AMDGPU::FLAT_SCR; - } + unsigned SGPRReg = findImplicitSGPRRead(*MI); + if (SGPRReg != AMDGPU::NoRegister) + return SGPRReg; unsigned UsedSGPRs[3] = { AMDGPU::NoRegister }; const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); @@ -2660,15 +2934,22 @@ unsigned SIInstrInfo::findUsedSGPR(const MachineInstr *MI, break; const MachineOperand &MO = MI->getOperand(Idx); - if (RI.isSGPRClassID(Desc.OpInfo[Idx].RegClass)) - SGPRReg = MO.getReg(); + if (!MO.isReg()) + continue; - if (MO.isReg() && RI.isSGPRClass(MRI.getRegClass(MO.getReg()))) - UsedSGPRs[i] = MO.getReg(); - } + // Is this operand statically required to be an SGPR based on the operand + // constraints? + const TargetRegisterClass *OpRC = RI.getRegClass(Desc.OpInfo[Idx].RegClass); + bool IsRequiredSGPR = RI.isSGPRClass(OpRC); + if (IsRequiredSGPR) + return MO.getReg(); - if (SGPRReg != AMDGPU::NoRegister) - return SGPRReg; + // If this could be a VGPR or an SGPR, Check the dynamic register class. + unsigned Reg = MO.getReg(); + const TargetRegisterClass *RegRC = MRI.getRegClass(Reg); + if (RI.isSGPRClass(RegRC)) + UsedSGPRs[i] = Reg; + } // We don't have a required SGPR operand, so we have a bit more freedom in // selecting operands to move. @@ -2680,6 +2961,9 @@ unsigned SIInstrInfo::findUsedSGPR(const MachineInstr *MI, // V_FMA_F32 v0, s0, s0, s0 -> No moves // V_FMA_F32 v0, s0, s1, s0 -> Move s1 + // TODO: If some of the operands are 64-bit SGPRs and some 32, we should + // prefer those. + if (UsedSGPRs[0] != AMDGPU::NoRegister) { if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2]) SGPRReg = UsedSGPRs[0]; @@ -2720,7 +3004,7 @@ MachineInstrBuilder SIInstrInfo::buildIndirectRead( unsigned IndirectBaseReg = AMDGPU::VGPR_32RegClass.getRegister( getIndirectIndexBegin(*MBB->getParent())); - return BuildMI(*MBB, I, DL, get(AMDGPU::SI_INDIRECT_SRC)) + return BuildMI(*MBB, I, DL, get(AMDGPU::SI_INDIRECT_SRC_V1)) .addOperand(I->getOperand(0)) .addOperand(I->getOperand(1)) .addReg(IndirectBaseReg) diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.h index 5053786..307ef67 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -39,14 +39,11 @@ private: unsigned SubIdx, const TargetRegisterClass *SubRC) const; - unsigned split64BitImm(SmallVectorImpl<MachineInstr *> &Worklist, - MachineBasicBlock::iterator MI, - MachineRegisterInfo &MRI, - const TargetRegisterClass *RC, - const MachineOperand &Op) const; - void swapOperands(MachineBasicBlock::iterator Inst) const; + void lowerScalarAbs(SmallVectorImpl<MachineInstr *> &Worklist, + MachineInstr *Inst) const; + void splitScalar64BitUnaryOp(SmallVectorImpl<MachineInstr *> &Worklist, MachineInstr *Inst, unsigned Opcode) const; @@ -58,13 +55,24 @@ private: void splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist, MachineInstr *Inst) const; - void addDescImplicitUseDef(const MCInstrDesc &Desc, MachineInstr *MI) const; + void addUsersToMoveToVALUWorklist( + unsigned Reg, MachineRegisterInfo &MRI, + SmallVectorImpl<MachineInstr *> &Worklist) const; + + const TargetRegisterClass * + getDestEquivalentVGPRClass(const MachineInstr &Inst) const; bool checkInstOffsetsDoNotOverlap(MachineInstr *MIa, MachineInstr *MIb) const; unsigned findUsedSGPR(const MachineInstr *MI, int OpIndices[3]) const; +protected: + MachineInstr 
*commuteInstructionImpl(MachineInstr *MI, + bool NewMI, + unsigned OpIdx0, + unsigned OpIdx1) const override; + public: explicit SIInstrInfo(const AMDGPUSubtarget &st); @@ -117,17 +125,14 @@ public: // register. If there is no hardware instruction that can store to \p // DstRC, then AMDGPU::COPY is returned. unsigned getMovOpcode(const TargetRegisterClass *DstRC) const; + + LLVM_READONLY int commuteOpcode(const MachineInstr &MI) const; - MachineInstr *commuteInstruction(MachineInstr *MI, - bool NewMI = false) const override; bool findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1, unsigned &SrcOpIdx2) const override; - bool isTriviallyReMaterializable(const MachineInstr *MI, - AliasAnalysis *AA = nullptr) const; - bool areMemAccessesTriviallyDisjoint( MachineInstr *MIa, MachineInstr *MIb, AliasAnalysis *AA = nullptr) const override; @@ -137,8 +142,6 @@ public: unsigned DstReg, unsigned SrcReg) const override; bool isMov(unsigned Opcode) const override; - bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const override; - bool FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI, unsigned Reg, MachineRegisterInfo *MRI) const final; @@ -148,78 +151,154 @@ public: MachineBasicBlock::iterator &MI, LiveVariables *LV) const override; + static bool isSALU(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::SALU; + } + bool isSALU(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::SALU; } + static bool isVALU(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::VALU; + } + bool isVALU(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::VALU; } + static bool isSOP1(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::SOP1; + } + bool isSOP1(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::SOP1; } + static bool isSOP2(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::SOP2; + } + bool isSOP2(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::SOP2; } + static bool isSOPC(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::SOPC; + } + bool isSOPC(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::SOPC; } + static bool isSOPK(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::SOPK; + } + bool isSOPK(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::SOPK; } + static bool isSOPP(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::SOPP; + } + bool isSOPP(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::SOPP; } + static bool isVOP1(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::VOP1; + } + bool isVOP1(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::VOP1; } + static bool isVOP2(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::VOP2; + } + bool isVOP2(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::VOP2; } + static bool isVOP3(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::VOP3; + } + bool isVOP3(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::VOP3; } + static bool isVOPC(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::VOPC; + } + bool isVOPC(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::VOPC; } + static bool isMUBUF(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::MUBUF; + } + bool isMUBUF(uint16_t Opcode) const { return 
get(Opcode).TSFlags & SIInstrFlags::MUBUF; } + static bool isMTBUF(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::MTBUF; + } + bool isMTBUF(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::MTBUF; } + static bool isSMRD(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::SMRD; + } + bool isSMRD(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::SMRD; } + static bool isDS(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::DS; + } + bool isDS(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::DS; } + static bool isMIMG(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::MIMG; + } + bool isMIMG(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::MIMG; } + static bool isFLAT(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::FLAT; + } + bool isFLAT(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::FLAT; } + static bool isWQM(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::WQM; + } + bool isWQM(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::WQM; } + static bool isVGPRSpill(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::VGPRSpill; + } + bool isVGPRSpill(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::VGPRSpill; } @@ -302,6 +381,26 @@ public: bool isOperandLegal(const MachineInstr *MI, unsigned OpIdx, const MachineOperand *MO = nullptr) const; + /// \brief Check if \p MO would be a valid operand for the given operand + /// definition \p OpInfo. Note this does not attempt to validate constant bus + /// restrictions (e.g. literal constant usage). + bool isLegalVSrcOperand(const MachineRegisterInfo &MRI, + const MCOperandInfo &OpInfo, + const MachineOperand &MO) const; + + /// \brief Check if \p MO (a register operand) is a legal register for the + /// given operand description. + bool isLegalRegOperand(const MachineRegisterInfo &MRI, + const MCOperandInfo &OpInfo, + const MachineOperand &MO) const; + + /// \brief Legalize operands in \p MI by either commuting it or inserting a + /// copy of src1. + void legalizeOperandsVOP2(MachineRegisterInfo &MRI, MachineInstr *MI) const; + + /// \brief Fix operands in \p MI to satisfy constant bus requirements. + void legalizeOperandsVOP3(MachineRegisterInfo &MRI, MachineInstr *MI) const; + /// \brief Legalize all operands in this instruction. This function may /// create new instruction and insert them before \p MI. void legalizeOperands(MachineInstr *MI) const; @@ -312,7 +411,8 @@ public: unsigned HalfImmOp, unsigned HalfSGPROp, MachineInstr *&Lo, MachineInstr *&Hi) const; - void moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI) const; + void moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI, + SmallVectorImpl<MachineInstr *> &Worklist) const; /// \brief Replace this instruction's opcode with the equivalent VALU /// opcode. This function will also move the users of \p MI to the @@ -341,29 +441,49 @@ public: void LoadM0(MachineInstr *MoveRel, MachineBasicBlock::iterator I, unsigned SavReg, unsigned IndexReg) const; - void insertNOPs(MachineBasicBlock::iterator MI, int Count) const; + void insertWaitStates(MachineBasicBlock::iterator MI, int Count) const; /// \brief Returns the operand named \p Op. If \p MI does not have an /// operand named \c Op, this function returns nullptr. 
+  LLVM_READONLY
   MachineOperand *getNamedOperand(MachineInstr &MI, unsigned OperandName) const;
 
+  LLVM_READONLY
   const MachineOperand *getNamedOperand(const MachineInstr &MI,
                                         unsigned OpName) const {
     return getNamedOperand(const_cast<MachineInstr &>(MI), OpName);
   }
 
+  /// Get required immediate operand
+  int64_t getNamedImmOperand(const MachineInstr &MI, unsigned OpName) const {
+    int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
+    return MI.getOperand(Idx).getImm();
+  }
+
   uint64_t getDefaultRsrcDataFormat() const;
   uint64_t getScratchRsrcWords23() const;
 };
 
 namespace AMDGPU {
-
+  LLVM_READONLY
   int getVOPe64(uint16_t Opcode);
+
+  LLVM_READONLY
   int getVOPe32(uint16_t Opcode);
+
+  LLVM_READONLY
   int getCommuteRev(uint16_t Opcode);
+
+  LLVM_READONLY
   int getCommuteOrig(uint16_t Opcode);
+
+  LLVM_READONLY
   int getAddr64Inst(uint16_t Opcode);
+
+  LLVM_READONLY
   int getAtomicRetOp(uint16_t Opcode);
+
+  LLVM_READONLY
   int getAtomicNoRetOp(uint16_t Opcode);
 
   const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL;
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 8d8110b..10f2adde 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -8,9 +8,9 @@
 //===----------------------------------------------------------------------===//
 def isCI : Predicate<"Subtarget->getGeneration() "
                      ">= AMDGPUSubtarget::SEA_ISLANDS">;
-def isVI : Predicate <
-  "Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS">,
-  AssemblerPredicate<"FeatureGCN3Encoding">;
+def isCIOnly : Predicate<"Subtarget->getGeneration() =="
+                         "AMDGPUSubtarget::SEA_ISLANDS">,
+  AssemblerPredicate <"FeatureSeaIslands">;
 
 def DisableInst : Predicate <"false">, AssemblerPredicate<"FeatureDisable">;
 
@@ -69,6 +69,15 @@ class sopk <bits<5> si, bits<5> vi = si> {
   field bits<5> VI = vi;
 }
 
+// Specify an SMRD opcode for SI and SMEM opcode for VI
+
+// FIXME: This should really be bits<5> si, Tablegen crashes if a
+// parameter's default value is another parameter with a different bit size
+class smrd<bits<8> si, bits<8> vi = si> {
+  field bits<5> SI = si{4-0};
+  field bits<8> VI = vi;
+}
+
 // Except for the NONE field, this must be kept in sync with the SISubtarget enum
 // in AMDGPUInstrInfo.cpp
 def SISubtarget {
@@ -121,9 +130,20 @@ def SIsampled : SDSample<"AMDGPUISD::SAMPLED">;
 def SIsamplel : SDSample<"AMDGPUISD::SAMPLEL">;
 
 def SIconstdata_ptr : SDNode<
-  "AMDGPUISD::CONST_DATA_PTR", SDTypeProfile <1, 0, [SDTCisVT<0, i64>]>
+  "AMDGPUISD::CONST_DATA_PTR", SDTypeProfile <1, 1, [SDTCisVT<0, i64>,
+                                                     SDTCisVT<0, i64>]>
 >;
 
+def mubuf_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{
+  return isGlobalLoad(cast<LoadSDNode>(N)) ||
+         isConstantLoad(cast<LoadSDNode>(N), -1);
+}]>;
+
+def smrd_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{
+  return isConstantLoad(cast<LoadSDNode>(N), -1) &&
+         static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpUniform(N);
+}]>;
+
 //===----------------------------------------------------------------------===//
 // SDNodes and PatFrag for local loads and stores to enable s_mov_b32 m0, -1
 // to be glued to the memory instructions.
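The SIInstrInfo.h hunk above finishes converting the opcode-number predicates into static TSFlags queries, so callers can classify a MachineInstr directly instead of round-tripping through getOpcode(). The mechanism is just a per-opcode flag word tested against a mask; the following is a self-contained model of the pattern, where the flag values and the Instr/InstrDesc types are invented for the example (the real bits come from tablegen):

    #include <cassert>
    #include <cstdint>

    // Toy mirror of SIInstrFlags-style bit assignments (values invented).
    namespace Flags {
      const uint64_t SALU  = 1ull << 0;
      const uint64_t VALU  = 1ull << 1;
      const uint64_t VOP2  = 1ull << 2;
      const uint64_t MUBUF = 1ull << 3;
    }

    struct InstrDesc { uint64_t TSFlags; };
    struct Instr {
      const InstrDesc *Desc;
      const InstrDesc &getDesc() const { return *Desc; }
    };

    // Static predicate in the style of SIInstrInfo::isVOP2(const MachineInstr &).
    static bool isVOP2(const Instr &MI) {
      return MI.getDesc().TSFlags & Flags::VOP2;
    }

    int main() {
      InstrDesc vAdd { Flags::VALU | Flags::VOP2 };
      InstrDesc sMov { Flags::SALU };
      Instr a{&vAdd}, b{&sMov};
      assert(isVOP2(a) && !isVOP2(b));
    }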
@@ -328,9 +348,9 @@ class SGPRImm <dag frag> : PatLeaf<frag, [{ static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo()); for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end(); U != E; ++U) { - if (SIRI->isSGPRClass(getOperandRegClass(*U, U.getOperandNo()))) { + const TargetRegisterClass *RC = getOperandRegClass(*U, U.getOperandNo()); + if (RC && SIRI->isSGPRClass(RC)) return true; - } } return false; }]>; @@ -354,6 +374,8 @@ def sopp_brtarget : Operand<OtherVT> { let ParserMatchClass = SoppBrTarget; } +def const_ga : Operand<iPTR>; + include "SIInstrFormats.td" include "VIInstrFormats.td" @@ -393,7 +415,7 @@ def GDS01MatchClass : GDSBaseMatchClass <"parseDSOff01OptionalOps">; class GLCBaseMatchClass <string parser> : AsmOperandClass { let Name = "GLC"#parser; let PredicateMethod = "isImm"; - let ParserMethod = parser; + let ParserMethod = parser; let RenderMethod = "addImmOperands"; } @@ -436,6 +458,17 @@ def ClampMatchClass : AsmOperandClass { let RenderMethod = "addImmOperands"; } +class SMRDOffsetBaseMatchClass <string predicate> : AsmOperandClass { + let Name = "SMRDOffset"#predicate; + let PredicateMethod = predicate; + let RenderMethod = "addImmOperands"; +} + +def SMRDOffsetMatchClass : SMRDOffsetBaseMatchClass <"isSMRDOffset">; +def SMRDLiteralOffsetMatchClass : SMRDOffsetBaseMatchClass < + "isSMRDLiteralOffset" +>; + let OperandType = "OPERAND_IMMEDIATE" in { def offen : Operand<i1> { @@ -510,6 +543,16 @@ def ClampMod : Operand <i1> { let ParserMatchClass = ClampMatchClass; } +def smrd_offset : Operand <i32> { + let PrintMethod = "printU32ImmOperand"; + let ParserMatchClass = SMRDOffsetMatchClass; +} + +def smrd_literal_offset : Operand <i32> { + let PrintMethod = "printU32ImmOperand"; + let ParserMatchClass = SMRDLiteralOffsetMatchClass; +} + } // End OperandType = "OPERAND_IMMEDIATE" def VOPDstS64 : VOPDstOperand <SReg_64>; @@ -528,6 +571,13 @@ def MUBUFScratch : ComplexPattern<i64, 4, "SelectMUBUFScratch">; def MUBUFOffset : ComplexPattern<i64, 6, "SelectMUBUFOffset">; def MUBUFOffsetAtomic : ComplexPattern<i64, 4, "SelectMUBUFOffset">; +def SMRDImm : ComplexPattern<i64, 2, "SelectSMRDImm">; +def SMRDImm32 : ComplexPattern<i64, 2, "SelectSMRDImm32">; +def SMRDSgpr : ComplexPattern<i64, 2, "SelectSMRDSgpr">; +def SMRDBufferImm : ComplexPattern<i32, 1, "SelectSMRDBufferImm">; +def SMRDBufferImm32 : ComplexPattern<i32, 1, "SelectSMRDBufferImm32">; +def SMRDBufferSgpr : ComplexPattern<i32, 1, "SelectSMRDBufferSgpr">; + def VOP3Mods0 : ComplexPattern<untyped, 4, "SelectVOP3Mods0">; def VOP3NoMods0 : ComplexPattern<untyped, 4, "SelectVOP3NoMods0">; def VOP3Mods0Clamp : ComplexPattern<untyped, 3, "SelectVOP3Mods0Clamp">; @@ -717,19 +767,6 @@ class SOP2_Real_vi<sop2 op, string opName, dag outs, dag ins, string asm> : let AssemblerPredicates = [isVI]; } -multiclass SOP2_SELECT_32 <sop2 op, string opName, list<dag> pattern> { - def "" : SOP2_Pseudo <opName, (outs SReg_32:$dst), - (ins SSrc_32:$src0, SSrc_32:$src1, SCCReg:$scc), pattern>; - - def _si : SOP2_Real_si <op, opName, (outs SReg_32:$dst), - (ins SSrc_32:$src0, SSrc_32:$src1, SCCReg:$scc), - opName#" $dst, $src0, $src1 [$scc]">; - - def _vi : SOP2_Real_vi <op, opName, (outs SReg_32:$dst), - (ins SSrc_32:$src0, SSrc_32:$src1, SCCReg:$scc), - opName#" $dst, $src0, $src1 [$scc]">; -} - multiclass SOP2_m <sop2 op, string opName, dag outs, dag ins, string asm, list<dag> pattern> { @@ -758,8 +795,10 @@ multiclass SOP2_64_32 <sop2 op, string opName, list<dag> pattern> : SOP2_m < class SOPC_Helper <bits<7> op, 
RegisterOperand rc, ValueType vt, string opName, PatLeaf cond> : SOPC < - op, (outs SCCReg:$dst), (ins rc:$src0, rc:$src1), - opName#" $src0, $src1", []>; + op, (outs), (ins rc:$src0, rc:$src1), + opName#" $src0, $src1", []> { + let Defs = [SCC]; +} class SOPC_32<bits<7> op, string opName, PatLeaf cond = COND_NULL> : SOPC_Helper<op, SSrc_32, i32, opName, cond>; @@ -812,15 +851,20 @@ multiclass SOPK_32 <sopk op, string opName, list<dag> pattern> { } multiclass SOPK_SCC <sopk op, string opName, list<dag> pattern> { - def "" : SOPK_Pseudo <opName, (outs SCCReg:$dst), - (ins SReg_32:$src0, u16imm:$src1), pattern>; + def "" : SOPK_Pseudo <opName, (outs), + (ins SReg_32:$src0, u16imm:$src1), pattern> { + let Defs = [SCC]; + } + - let DisableEncoding = "$dst" in { - def _si : SOPK_Real_si <op, opName, (outs SCCReg:$dst), - (ins SReg_32:$sdst, u16imm:$simm16), opName#" $sdst, $simm16">; + def _si : SOPK_Real_si <op, opName, (outs), + (ins SReg_32:$sdst, u16imm:$simm16), opName#" $sdst, $simm16"> { + let Defs = [SCC]; + } - def _vi : SOPK_Real_vi <op, opName, (outs SCCReg:$dst), - (ins SReg_32:$sdst, u16imm:$simm16), opName#" $sdst, $simm16">; + def _vi : SOPK_Real_vi <op, opName, (outs), + (ins SReg_32:$sdst, u16imm:$simm16), opName#" $sdst, $simm16"> { + let Defs = [SCC]; } } @@ -868,35 +912,68 @@ class SMRD_Real_si <bits<5> op, string opName, bit imm, dag outs, dag ins, } class SMRD_Real_vi <bits<8> op, string opName, bit imm, dag outs, dag ins, - string asm> : - SMRD <outs, ins, asm, []>, + string asm, list<dag> pattern = []> : + SMRD <outs, ins, asm, pattern>, SMEMe_vi <op, imm>, SIMCInstr<opName, SISubtarget.VI> { let AssemblerPredicates = [isVI]; } -multiclass SMRD_m <bits<5> op, string opName, bit imm, dag outs, dag ins, +multiclass SMRD_m <smrd op, string opName, bit imm, dag outs, dag ins, string asm, list<dag> pattern> { def "" : SMRD_Pseudo <opName, outs, ins, pattern>; - def _si : SMRD_Real_si <op, opName, imm, outs, ins, asm>; + def _si : SMRD_Real_si <op.SI, opName, imm, outs, ins, asm>; // glc is only applicable to scalar stores, which are not yet // implemented. 
let glc = 0 in { - def _vi : SMRD_Real_vi <{0, 0, 0, op}, opName, imm, outs, ins, asm>; + def _vi : SMRD_Real_vi <op.VI, opName, imm, outs, ins, asm>; } } -multiclass SMRD_Helper <bits<5> op, string opName, RegisterClass baseClass, +multiclass SMRD_Inval <smrd op, string opName, + SDPatternOperator node> { + let hasSideEffects = 1, mayStore = 1 in { + def "" : SMRD_Pseudo <opName, (outs), (ins), [(node)]>; + + let sbase = 0, offset = 0 in { + let sdst = 0 in { + def _si : SMRD_Real_si <op.SI, opName, 0, (outs), (ins), opName>; + } + + let glc = 0, sdata = 0 in { + def _vi : SMRD_Real_vi <op.VI, opName, 0, (outs), (ins), opName>; + } + } + } +} + +class SMEM_Inval <bits<8> op, string opName, SDPatternOperator node> : + SMRD_Real_vi<op, opName, 0, (outs), (ins), opName, [(node)]> { + let hasSideEffects = 1; + let mayStore = 1; + let sbase = 0; + let sdata = 0; + let glc = 0; + let offset = 0; +} + +multiclass SMRD_Helper <smrd op, string opName, RegisterClass baseClass, RegisterClass dstClass> { defm _IMM : SMRD_m < op, opName#"_IMM", 1, (outs dstClass:$dst), - (ins baseClass:$sbase, u32imm:$offset), + (ins baseClass:$sbase, smrd_offset:$offset), opName#" $dst, $sbase, $offset", [] >; + def _IMM_ci : SMRD < + (outs dstClass:$dst), (ins baseClass:$sbase, smrd_literal_offset:$offset), + opName#" $dst, $sbase, $offset", []>, SMRD_IMMe_ci <op.SI> { + let AssemblerPredicates = [isCIOnly]; + } + defm _SGPR : SMRD_m < op, opName#"_SGPR", 0, (outs dstClass:$dst), (ins baseClass:$sbase, SReg_32:$soff), @@ -922,11 +999,12 @@ def InputModsNoDefault : Operand <i32> { let ParserMatchClass = InputModsMatchClass; } -class getNumSrcArgs<ValueType Src1, ValueType Src2> { +class getNumSrcArgs<ValueType Src0, ValueType Src1, ValueType Src2> { int ret = - !if (!eq(Src1.Value, untyped.Value), 1, // VOP1 + !if (!eq(Src0.Value, untyped.Value), 0, + !if (!eq(Src1.Value, untyped.Value), 1, // VOP1 !if (!eq(Src2.Value, untyped.Value), 2, // VOP2 - 3)); // VOP3 + 3))); // VOP3 } // Returns the register class to use for the destination of VOP[123C] @@ -934,28 +1012,37 @@ class getNumSrcArgs<ValueType Src1, ValueType Src2> { class getVALUDstForVT<ValueType VT> { RegisterOperand ret = !if(!eq(VT.Size, 32), VOPDstOperand<VGPR_32>, !if(!eq(VT.Size, 64), VOPDstOperand<VReg_64>, - VOPDstOperand<SReg_64>)); // else VT == i1 + !if(!eq(VT.Size, 16), VOPDstOperand<VGPR_32>, + VOPDstOperand<SReg_64>))); // else VT == i1 } // Returns the register class to use for source 0 of VOP[12C] // instructions for the given VT. class getVOPSrc0ForVT<ValueType VT> { - RegisterOperand ret = !if(!eq(VT.Size, 32), VSrc_32, VSrc_64); + RegisterOperand ret = !if(!eq(VT.Size, 64), VSrc_64, VSrc_32); } // Returns the register class to use for source 1 of VOP[12C] for the // given VT. class getVOPSrc1ForVT<ValueType VT> { - RegisterClass ret = !if(!eq(VT.Size, 32), VGPR_32, VReg_64); + RegisterClass ret = !if(!eq(VT.Size, 64), VReg_64, VGPR_32); } // Returns the register class to use for sources of VOP3 instructions for the // given VT. class getVOP3SrcForVT<ValueType VT> { - RegisterOperand ret = !if(!eq(VT.Size, 32), VCSrc_32, VCSrc_64); + RegisterOperand ret = + !if(!eq(VT.Size, 64), + VCSrc_64, + !if(!eq(VT.Value, i1.Value), + SCSrc_64, + VCSrc_32 + ) + ); } // Returns 1 if the source arguments have modifiers, 0 if they do not. +// XXX - do f16 instructions? 
class hasModifiers<ValueType SrcVT> { bit ret = !if(!eq(SrcVT.Value, f32.Value), 1, !if(!eq(SrcVT.Value, f64.Value), 1, 0)); @@ -1009,17 +1096,20 @@ class getIns64 <RegisterOperand Src0RC, RegisterOperand Src1RC, // Returns the assembly string for the inputs and outputs of a VOP[12C] // instruction. This does not add the _e32 suffix, so it can be reused // by getAsm64. -class getAsm32 <int NumSrcArgs> { +class getAsm32 <bit HasDst, int NumSrcArgs> { + string dst = "$dst"; + string src0 = ", $src0"; string src1 = ", $src1"; string src2 = ", $src2"; - string ret = "$dst, $src0"# - !if(!eq(NumSrcArgs, 1), "", src1)# - !if(!eq(NumSrcArgs, 3), src2, ""); + string ret = !if(HasDst, dst, "") # + !if(!eq(NumSrcArgs, 1), src0, "") # + !if(!eq(NumSrcArgs, 2), src0#src1, "") # + !if(!eq(NumSrcArgs, 3), src0#src1#src2, ""); } // Returns the assembly string for the inputs and outputs of a VOP3 // instruction. -class getAsm64 <int NumSrcArgs, bit HasModifiers> { +class getAsm64 <bit HasDst, int NumSrcArgs, bit HasModifiers> { string src0 = !if(!eq(NumSrcArgs, 1), "$src0_modifiers", "$src0_modifiers,"); string src1 = !if(!eq(NumSrcArgs, 1), "", !if(!eq(NumSrcArgs, 2), " $src1_modifiers", @@ -1027,11 +1117,10 @@ class getAsm64 <int NumSrcArgs, bit HasModifiers> { string src2 = !if(!eq(NumSrcArgs, 3), " $src2_modifiers", ""); string ret = !if(!eq(HasModifiers, 0), - getAsm32<NumSrcArgs>.ret, + getAsm32<HasDst, NumSrcArgs>.ret, "$dst, "#src0#src1#src2#"$clamp"#"$omod"); } - class VOPProfile <list<ValueType> _ArgVT> { field list<ValueType> ArgVT = _ArgVT; @@ -1047,29 +1136,38 @@ class VOPProfile <list<ValueType> _ArgVT> { field RegisterOperand Src1RC64 = getVOP3SrcForVT<Src1VT>.ret; field RegisterOperand Src2RC64 = getVOP3SrcForVT<Src2VT>.ret; - field int NumSrcArgs = getNumSrcArgs<Src1VT, Src2VT>.ret; + field bit HasDst = !if(!eq(DstVT.Value, untyped.Value), 0, 1); + field bit HasDst32 = HasDst; + field int NumSrcArgs = getNumSrcArgs<Src0VT, Src1VT, Src2VT>.ret; field bit HasModifiers = hasModifiers<Src0VT>.ret; - field dag Outs = (outs DstRC:$dst); + field dag Outs = !if(HasDst,(outs DstRC:$dst),(outs)); + + // VOP3b instructions are a special case with a second explicit + // output. This is manually overridden for them. + field dag Outs32 = Outs; + field dag Outs64 = Outs; field dag Ins32 = getIns32<Src0RC32, Src1RC32, NumSrcArgs>.ret; field dag Ins64 = getIns64<Src0RC64, Src1RC64, Src2RC64, NumSrcArgs, HasModifiers>.ret; - field string Asm32 = getAsm32<NumSrcArgs>.ret; - field string Asm64 = getAsm64<NumSrcArgs, HasModifiers>.ret; + field string Asm32 = getAsm32<HasDst, NumSrcArgs>.ret; + field string Asm64 = getAsm64<HasDst, NumSrcArgs, HasModifiers>.ret; } // FIXME: I think these F16/I16 profiles will need to use f16/i16 types in order // for the instruction patterns to work. 
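// Illustrative sketch of how the f16 change below interacts with the
// helpers above (V_EXAMPLE_F16 is a hypothetical name, not in the patch):
//   def V_EXAMPLE_F16 : VOPProfile <[f16, f16, untyped, untyped]>;
// DstVT.Size == 16 now hits the new 16-bit case in getVALUDstForVT
// (VOPDstOperand<VGPR_32>), while getVOPSrc0ForVT still falls through
// to VSrc_32 because only the 64-bit case is special-cased there.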
-def VOP_F16_F16 : VOPProfile <[f32, f32, untyped, untyped]>; -def VOP_F16_I16 : VOPProfile <[f32, i32, untyped, untyped]>; -def VOP_I16_F16 : VOPProfile <[i32, f32, untyped, untyped]>; +def VOP_F16_F16 : VOPProfile <[f16, f16, untyped, untyped]>; +def VOP_F16_I16 : VOPProfile <[f16, i32, untyped, untyped]>; +def VOP_I16_F16 : VOPProfile <[i32, f16, untyped, untyped]>; -def VOP_F16_F16_F16 : VOPProfile <[f32, f32, f32, untyped]>; -def VOP_F16_F16_I16 : VOPProfile <[f32, f32, i32, untyped]>; +def VOP_F16_F16_F16 : VOPProfile <[f16, f16, f16, untyped]>; +def VOP_F16_F16_I16 : VOPProfile <[f16, f16, i32, untyped]>; def VOP_I16_I16_I16 : VOPProfile <[i32, i32, i32, untyped]>; +def VOP_NONE : VOPProfile <[untyped, untyped, untyped, untyped]>; + def VOP_F32_F32 : VOPProfile <[f32, f32, untyped, untyped]>; def VOP_F32_F64 : VOPProfile <[f32, f64, untyped, untyped]>; def VOP_F32_I32 : VOPProfile <[f32, i32, untyped, untyped]>; @@ -1087,25 +1185,76 @@ def VOP_F64_F64_I32 : VOPProfile <[f64, f64, i32, untyped]>; def VOP_I32_F32_F32 : VOPProfile <[i32, f32, f32, untyped]>; def VOP_I32_F32_I32 : VOPProfile <[i32, f32, i32, untyped]>; def VOP_I32_I32_I32 : VOPProfile <[i32, i32, i32, untyped]>; -def VOP_I32_I32_I32_VCC : VOPProfile <[i32, i32, i32, untyped]> { + +// Write out to vcc or arbitrary SGPR. +def VOP2b_I32_I1_I32_I32 : VOPProfile<[i32, i32, i32, untyped]> { + let Asm32 = "$dst, vcc, $src0, $src1"; + let Asm64 = "$dst, $sdst, $src0, $src1"; + let Outs32 = (outs DstRC:$dst); + let Outs64 = (outs DstRC:$dst, SReg_64:$sdst); +} + +// Write out to vcc or arbitrary SGPR and read in from vcc or +// arbitrary SGPR. +def VOP2b_I32_I1_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1]> { + // We use VCSrc_32 to exclude literal constants, even though the + // encoding normally allows them since the implicit VCC use means + // using one would always violate the constant bus + // restriction. SGPRs are still allowed because it should + // technically be possible to use VCC again as src0. let Src0RC32 = VCSrc_32; + let Asm32 = "$dst, vcc, $src0, $src1, vcc"; + let Asm64 = "$dst, $sdst, $src0, $src1, $src2"; + let Outs32 = (outs DstRC:$dst); + let Outs64 = (outs DstRC:$dst, SReg_64:$sdst); + + // Suppress src2 implied by type since the 32-bit encoding uses an + // implicit VCC use. + let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1); } -def VOP_I1_F32_I32 : VOPProfile <[i1, f32, i32, untyped]> { - let Ins64 = (ins InputModsNoDefault:$src0_modifiers, Src0RC64:$src0, Src1RC64:$src1); - let Asm64 = "$dst, $src0_modifiers, $src1"; +class VOP3b_Profile<ValueType vt> : VOPProfile<[vt, vt, vt, vt]> { + let Outs64 = (outs DstRC:$vdst, SReg_64:$sdst); + let Asm64 = "$vdst, $sdst, $src0_modifiers, $src1_modifiers, $src2_modifiers"#"$clamp"#"$omod"; +} + +def VOP3b_F32_I1_F32_F32_F32 : VOP3b_Profile<f32> { + // FIXME: Hack to stop printing _e64 + let DstRC = RegisterOperand<VGPR_32>; } -def VOP_I1_F64_I32 : VOPProfile <[i1, f64, i32, untyped]> { +def VOP3b_F64_I1_F64_F64_F64 : VOP3b_Profile<f64> { + // FIXME: Hack to stop printing _e64 + let DstRC = RegisterOperand<VReg_64>; +} + +// VOPC instructions are a special case because for the 32-bit +// encoding, we want to display the implicit vcc write as if it were +// an explicit $dst. +class VOPC_Profile<ValueType vt0, ValueType vt1 = vt0> : VOPProfile <[i1, vt0, vt1, untyped]> { + let Asm32 = "vcc, $src0, $src1"; + // The destination for 32-bit encoding is implicit. 
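  // As a minimal example of what this means for printing: a compare such
  // as "v_cmp_lt_f32_e32 vcc, v0, v1" shows vcc in the asm string above,
  // but vcc is an implicit def of the VOPC encoding rather than a real
  // $dst operand, which is why HasDst32 is cleared just below.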
+ let HasDst32 = 0; +} + +class VOPC_Class_Profile<ValueType vt> : VOPC_Profile<vt, i32> { let Ins64 = (ins InputModsNoDefault:$src0_modifiers, Src0RC64:$src0, Src1RC64:$src1); let Asm64 = "$dst, $src0_modifiers, $src1"; } +def VOPC_I1_F32_F32 : VOPC_Profile<f32>; +def VOPC_I1_F64_F64 : VOPC_Profile<f64>; +def VOPC_I1_I32_I32 : VOPC_Profile<i32>; +def VOPC_I1_I64_I64 : VOPC_Profile<i64>; + +def VOPC_I1_F32_I32 : VOPC_Class_Profile<f32>; +def VOPC_I1_F64_I32 : VOPC_Class_Profile<f64>; + def VOP_I64_I64_I32 : VOPProfile <[i64, i64, i32, untyped]>; def VOP_I64_I32_I64 : VOPProfile <[i64, i32, i64, untyped]>; def VOP_I64_I64_I64 : VOPProfile <[i64, i64, i64, untyped]>; def VOP_CNDMASK : VOPProfile <[i32, i32, i32, untyped]> { - let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, VCCReg:$src2); + let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1); let Ins64 = (ins Src0RC64:$src0, Src1RC64:$src1, SSrc_64:$src2); let Asm64 = "$dst, $src0, $src1, $src2"; } @@ -1119,13 +1268,60 @@ def VOP_MAC : VOPProfile <[f32, f32, f32, f32]> { let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, VGPR_32:$src2); let Ins64 = getIns64<Src0RC64, Src1RC64, RegisterOperand<VGPR_32>, 3, HasModifiers>.ret; - let Asm32 = getAsm32<2>.ret; - let Asm64 = getAsm64<2, HasModifiers>.ret; + let Asm32 = getAsm32<1, 2>.ret; + let Asm64 = getAsm64<1, 2, HasModifiers>.ret; } def VOP_F64_F64_F64_F64 : VOPProfile <[f64, f64, f64, f64]>; def VOP_I32_I32_I32_I32 : VOPProfile <[i32, i32, i32, i32]>; def VOP_I64_I32_I32_I64 : VOPProfile <[i64, i32, i32, i64]>; +class SIInstAlias <string asm, Instruction inst, VOPProfile p> : + InstAlias <asm, (inst)>, PredicateControl { + + field bit isCompare; + field bit isCommutable; + + let ResultInst = + !if (p.HasDst32, + !if (!eq(p.NumSrcArgs, 0), + // 1 dst, 0 src + (inst p.DstRC:$dst), + !if (!eq(p.NumSrcArgs, 1), + // 1 dst, 1 src + (inst p.DstRC:$dst, p.Src0RC32:$src0), + !if (!eq(p.NumSrcArgs, 2), + // 1 dst, 2 src + (inst p.DstRC:$dst, p.Src0RC32:$src0, p.Src1RC32:$src1), + // else - unreachable + (inst)))), + // else + !if (!eq(p.NumSrcArgs, 2), + // 0 dst, 2 src + (inst p.Src0RC32:$src0, p.Src1RC32:$src1), + !if (!eq(p.NumSrcArgs, 1), + // 0 dst, 1 src + (inst p.Src0RC32:$src1), + // else + // 0 dst, 0 src + (inst)))); +} + +class SIInstAliasSI <string asm, string op_name, VOPProfile p> : + SIInstAlias <asm, !cast<Instruction>(op_name#"_e32_si"), p> { + let AssemblerPredicate = SIAssemblerPredicate; +} + +class SIInstAliasVI <string asm, string op_name, VOPProfile p> : + SIInstAlias <asm, !cast<Instruction>(op_name#"_e32_vi"), p> { + let AssemblerPredicates = [isVI]; +} + +multiclass SIInstAliasBuilder <string asm, VOPProfile p> { + + def : SIInstAliasSI <asm, NAME, p>; + + def : SIInstAliasVI <asm, NAME, p>; +} class VOP <string opName> { string OpName = opName; @@ -1165,20 +1361,22 @@ class VOP1_Real_vi <string opName, vop1 op, dag outs, dag ins, string asm> : let AssemblerPredicates = [isVI]; } -multiclass VOP1_m <vop1 op, dag outs, dag ins, string asm, list<dag> pattern, - string opName> { - def "" : VOP1_Pseudo <outs, ins, pattern, opName>; +multiclass VOP1_m <vop1 op, string opName, VOPProfile p, list<dag> pattern, + string asm = opName#p.Asm32> { + def "" : VOP1_Pseudo <p.Outs, p.Ins32, pattern, opName>; - def _si : VOP1_Real_si <opName, op, outs, ins, asm>; + def _si : VOP1_Real_si <opName, op, p.Outs, p.Ins32, asm>; + + def _vi : VOP1_Real_vi <opName, op, p.Outs, p.Ins32, asm>; - def _vi : VOP1_Real_vi <opName, op, outs, ins, asm>; } -multiclass VOP1SI_m <vop1 op, dag outs, dag ins, string 
asm, list<dag> pattern, - string opName> { - def "" : VOP1_Pseudo <outs, ins, pattern, opName>; +multiclass VOP1SI_m <vop1 op, string opName, VOPProfile p, list<dag> pattern, + string asm = opName#p.Asm32> { + + def "" : VOP1_Pseudo <p.Outs, p.Ins32, pattern, opName>; - def _si : VOP1_Real_si <opName, op, outs, ins, asm>; + def _si : VOP1_Real_si <opName, op, p.Outs, p.Ins32, asm>; } class VOP2_Pseudo <dag outs, dag ins, list<dag> pattern, string opName> : @@ -1202,22 +1400,24 @@ class VOP2_Real_vi <string opName, vop2 op, dag outs, dag ins, string asm> : let AssemblerPredicates = [isVI]; } -multiclass VOP2SI_m <vop2 op, dag outs, dag ins, string asm, list<dag> pattern, - string opName, string revOp> { - def "" : VOP2_Pseudo <outs, ins, pattern, opName>, +multiclass VOP2SI_m <vop2 op, string opName, VOPProfile p, list<dag> pattern, + string revOp> { + + def "" : VOP2_Pseudo <p.Outs32, p.Ins32, pattern, opName>, VOP2_REV<revOp#"_e32", !eq(revOp, opName)>; - def _si : VOP2_Real_si <opName, op, outs, ins, asm>; + def _si : VOP2_Real_si <opName, op, p.Outs32, p.Ins32, p.Asm32>; } -multiclass VOP2_m <vop2 op, dag outs, dag ins, string asm, list<dag> pattern, - string opName, string revOp> { - def "" : VOP2_Pseudo <outs, ins, pattern, opName>, +multiclass VOP2_m <vop2 op, string opName, VOPProfile p, list <dag> pattern, + string revOp> { + + def "" : VOP2_Pseudo <p.Outs32, p.Ins32, pattern, opName>, VOP2_REV<revOp#"_e32", !eq(revOp, opName)>; - def _si : VOP2_Real_si <opName, op, outs, ins, asm>; + def _si : VOP2_Real_si <opName, op, p.Outs32, p.Ins32, p.Asm32>; - def _vi : VOP2_Real_vi <opName, op, outs, ins, asm>; + def _vi : VOP2_Real_vi <opName, op, p.Outs32, p.Ins32, p.Asm32>; } @@ -1250,6 +1450,9 @@ class VOP3_Pseudo <dag outs, dag ins, list<dag> pattern, string opName> : MnemonicAlias<opName#"_e64", opName> { let isPseudo = 1; let isCodeGenOnly = 1; + + field bit vdst; + field bit src0; } class VOP3_Real_si <bits<9> op, dag outs, dag ins, string asm, string opName> : @@ -1295,22 +1498,6 @@ multiclass VOP3_m <vop op, dag outs, dag ins, string asm, list<dag> pattern, HasMods>; } -// VOP3_m without source modifiers -multiclass VOP3_m_nomods <vop op, dag outs, dag ins, string asm, list<dag> pattern, - string opName, int NumSrcArgs, bit HasMods = 1> { - - def "" : VOP3_Pseudo <outs, ins, pattern, opName>; - - let src0_modifiers = 0, - src1_modifiers = 0, - src2_modifiers = 0, - clamp = 0, - omod = 0 in { - def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName>; - def _vi : VOP3_Real_vi <op.VI3, outs, ins, asm, opName>; - } -} - multiclass VOP3_1_m <vop op, dag outs, dag ins, string asm, list<dag> pattern, string opName, bit HasMods = 1> { @@ -1335,7 +1522,7 @@ multiclass VOP3SI_1_m <vop op, dag outs, dag ins, string asm, multiclass VOP3_2_m <vop op, dag outs, dag ins, string asm, list<dag> pattern, string opName, string revOp, - bit HasMods = 1, bit UseFullOp = 0> { + bit HasMods = 1> { def "" : VOP3_Pseudo <outs, ins, pattern, opName>, VOP2_REV<revOp#"_e64", !eq(revOp, opName)>; @@ -1349,7 +1536,7 @@ multiclass VOP3_2_m <vop op, dag outs, dag ins, string asm, multiclass VOP3SI_2_m <vop op, dag outs, dag ins, string asm, list<dag> pattern, string opName, string revOp, - bit HasMods = 1, bit UseFullOp = 0> { + bit HasMods = 1> { def "" : VOP3_Pseudo <outs, ins, pattern, opName>, VOP2_REV<revOp#"_e64", !eq(revOp, opName)>; @@ -1360,54 +1547,41 @@ multiclass VOP3SI_2_m <vop op, dag outs, dag ins, string asm, // No VI instruction. This class is for SI only. 
} -// XXX - Is v_div_scale_{f32|f64} only available in vop3b without -// option of implicit vcc use? -multiclass VOP3b_2_m <vop op, dag outs, dag ins, string asm, - list<dag> pattern, string opName, string revOp, - bit HasMods = 1, bit UseFullOp = 0> { - def "" : VOP3_Pseudo <outs, ins, pattern, opName>, - VOP2_REV<revOp#"_e64", !eq(revOp, opName)>; - - // The VOP2 variant puts the carry out into VCC, the VOP3 variant - // can write it into any SGPR. We currently don't use the carry out, - // so for now hardcode it to VCC as well. - let sdst = SIOperand.VCC, Defs = [VCC] in { - def _si : VOP3b_Real_si <op.SI3, outs, ins, asm, opName>, - VOP3DisableFields<1, 0, HasMods>; - - def _vi : VOP3b_Real_vi <op.VI3, outs, ins, asm, opName>, - VOP3DisableFields<1, 0, HasMods>; - } // End sdst = SIOperand.VCC, Defs = [VCC] -} - -multiclass VOP3b_3_m <vop op, dag outs, dag ins, string asm, - list<dag> pattern, string opName, string revOp, - bit HasMods = 1, bit UseFullOp = 0> { +// Two operand VOP3b instruction that may have a 3rd SGPR bool operand +// instead of an implicit VCC as in the VOP2b format. +multiclass VOP3b_2_3_m <vop op, dag outs, dag ins, string asm, + list<dag> pattern, string opName, string revOp, + bit HasMods = 1, bit useSrc2Input = 0> { def "" : VOP3_Pseudo <outs, ins, pattern, opName>; - def _si : VOP3b_Real_si <op.SI3, outs, ins, asm, opName>, - VOP3DisableFields<1, 1, HasMods>; + VOP3DisableFields<1, useSrc2Input, HasMods>; def _vi : VOP3b_Real_vi <op.VI3, outs, ins, asm, opName>, - VOP3DisableFields<1, 1, HasMods>; + VOP3DisableFields<1, useSrc2Input, HasMods>; } multiclass VOP3_C_m <vop op, dag outs, dag ins, string asm, list<dag> pattern, string opName, - bit HasMods, bit defExec, string revOp> { + bit HasMods, bit defExec, + string revOp, list<SchedReadWrite> sched> { def "" : VOP3_Pseudo <outs, ins, pattern, opName>, - VOP2_REV<revOp#"_e64", !eq(revOp, opName)>; + VOP2_REV<revOp#"_e64", !eq(revOp, opName)> { + let Defs = !if(defExec, [EXEC], []); + let SchedRW = sched; + } def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName>, VOP3DisableFields<1, 0, HasMods> { let Defs = !if(defExec, [EXEC], []); + let SchedRW = sched; } def _vi : VOP3_Real_vi <op.VI3, outs, ins, asm, opName>, VOP3DisableFields<1, 0, HasMods> { let Defs = !if(defExec, [EXEC], []); + let SchedRW = sched; } } @@ -1432,32 +1606,28 @@ multiclass VOP2SI_3VI_m <vop3 op, string opName, dag outs, dag ins, } } -multiclass VOP1_Helper <vop1 op, string opName, dag outs, - dag ins32, string asm32, list<dag> pat32, - dag ins64, string asm64, list<dag> pat64, - bit HasMods> { +multiclass VOP1_Helper <vop1 op, string opName, VOPProfile p, list<dag> pat32, + list<dag> pat64> { - defm _e32 : VOP1_m <op, outs, ins32, opName#asm32, pat32, opName>; + defm _e32 : VOP1_m <op, opName, p, pat32>; - defm _e64 : VOP3_1_m <op, outs, ins64, opName#asm64, pat64, opName, HasMods>; + defm _e64 : VOP3_1_m <op, p.Outs, p.Ins64, opName#p.Asm64, pat64, opName, + p.HasModifiers>; } multiclass VOP1Inst <vop1 op, string opName, VOPProfile P, SDPatternOperator node = null_frag> : VOP1_Helper < - op, opName, P.Outs, - P.Ins32, P.Asm32, [], - P.Ins64, P.Asm64, + op, opName, P, [], !if(P.HasModifiers, [(set P.DstVT:$dst, (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod))))], - [(set P.DstVT:$dst, (node P.Src0VT:$src0))]), - P.HasModifiers + [(set P.DstVT:$dst, (node P.Src0VT:$src0))]) >; multiclass VOP1InstSI <vop1 op, string opName, VOPProfile P, SDPatternOperator node = null_frag> { - defm _e32 : VOP1SI_m 
<op, P.Outs, P.Ins32, opName#P.Asm32, [], opName>; + defm _e32 : VOP1SI_m <op, opName, P, []>; defm _e64 : VOP3SI_1_m <op, P.Outs, P.Ins64, opName#P.Asm64, !if(P.HasModifiers, @@ -1467,36 +1637,33 @@ multiclass VOP1InstSI <vop1 op, string opName, VOPProfile P, opName, P.HasModifiers>; } -multiclass VOP2_Helper <vop2 op, string opName, dag outs, - dag ins32, string asm32, list<dag> pat32, - dag ins64, string asm64, list<dag> pat64, - string revOp, bit HasMods> { - defm _e32 : VOP2_m <op, outs, ins32, asm32, pat32, opName, revOp>; +multiclass VOP2_Helper <vop2 op, string opName, VOPProfile p, list<dag> pat32, + list<dag> pat64, string revOp> { - defm _e64 : VOP3_2_m <op, - outs, ins64, opName#asm64, pat64, opName, revOp, HasMods - >; + defm _e32 : VOP2_m <op, opName, p, pat32, revOp>; + + defm _e64 : VOP3_2_m <op, p.Outs, p.Ins64, opName#p.Asm64, pat64, opName, + revOp, p.HasModifiers>; } multiclass VOP2Inst <vop2 op, string opName, VOPProfile P, SDPatternOperator node = null_frag, string revOp = opName> : VOP2_Helper < - op, opName, P.Outs, - P.Ins32, P.Asm32, [], - P.Ins64, P.Asm64, + op, opName, P, [], !if(P.HasModifiers, [(set P.DstVT:$dst, (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)), (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))], [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1))]), - revOp, P.HasModifiers + revOp >; multiclass VOP2InstSI <vop2 op, string opName, VOPProfile P, SDPatternOperator node = null_frag, string revOp = opName> { - defm _e32 : VOP2SI_m <op, P.Outs, P.Ins32, P.Asm32, [], opName, revOp>; + + defm _e32 : VOP2SI_m <op, opName, P, [], revOp>; defm _e64 : VOP3SI_2_m <op, P.Outs, P.Ins64, opName#P.Asm64, !if(P.HasModifiers, @@ -1508,58 +1675,55 @@ multiclass VOP2InstSI <vop2 op, string opName, VOPProfile P, opName, revOp, P.HasModifiers>; } -multiclass VOP2b_Helper <vop2 op, string opName, dag outs, - dag ins32, string asm32, list<dag> pat32, - dag ins64, string asm64, list<dag> pat64, - string revOp, bit HasMods> { +multiclass VOP2b_Helper <vop2 op, string opName, VOPProfile p, + list<dag> pat32, list<dag> pat64, + string revOp, bit useSGPRInput> { - defm _e32 : VOP2_m <op, outs, ins32, asm32, pat32, opName, revOp>; + let SchedRW = [Write32Bit, WriteSALU] in { + let Uses = !if(useSGPRInput, [VCC, EXEC], [EXEC]), Defs = [VCC] in { + defm _e32 : VOP2_m <op, opName, p, pat32, revOp>; + } - defm _e64 : VOP3b_2_m <op, - outs, ins64, opName#asm64, pat64, opName, revOp, HasMods - >; + defm _e64 : VOP3b_2_3_m <op, p.Outs64, p.Ins64, opName#p.Asm64, pat64, + opName, revOp, p.HasModifiers, useSGPRInput>; + } } multiclass VOP2bInst <vop2 op, string opName, VOPProfile P, SDPatternOperator node = null_frag, string revOp = opName> : VOP2b_Helper < - op, opName, P.Outs, - P.Ins32, P.Asm32, [], - P.Ins64, P.Asm64, + op, opName, P, [], !if(P.HasModifiers, [(set P.DstVT:$dst, (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)), (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))], [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1))]), - revOp, P.HasModifiers + revOp, !eq(P.NumSrcArgs, 3) >; // A VOP2 instruction that is VOP3-only on VI. 
-multiclass VOP2_VI3_Helper <vop23 op, string opName, dag outs, - dag ins32, string asm32, list<dag> pat32, - dag ins64, string asm64, list<dag> pat64, - string revOp, bit HasMods> { - defm _e32 : VOP2SI_m <op, outs, ins32, asm32, pat32, opName, revOp>; +multiclass VOP2_VI3_Helper <vop23 op, string opName, VOPProfile p, + list<dag> pat32, list<dag> pat64, string revOp> { - defm _e64 : VOP3_2_m <op, outs, ins64, opName#asm64, pat64, opName, - revOp, HasMods>; + defm _e32 : VOP2SI_m <op, opName, p, pat32, revOp>; + + defm _e64 : VOP3_2_m <op, p.Outs, p.Ins64, opName#p.Asm64, pat64, opName, + revOp, p.HasModifiers>; } multiclass VOP2_VI3_Inst <vop23 op, string opName, VOPProfile P, SDPatternOperator node = null_frag, string revOp = opName> : VOP2_VI3_Helper < - op, opName, P.Outs, - P.Ins32, P.Asm32, [], - P.Ins64, P.Asm64, + op, opName, P, [], !if(P.HasModifiers, [(set P.DstVT:$dst, (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)), (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))], [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1))]), - revOp, P.HasModifiers + revOp >; multiclass VOP2MADK <vop2 op, string opName, list<dag> pattern = []> { @@ -1583,64 +1747,75 @@ let isCodeGenOnly = 0 in { } // End isCodeGenOnly = 0 } -class VOPC_Pseudo <dag outs, dag ins, list<dag> pattern, string opName> : +class VOPC_Pseudo <dag ins, list<dag> pattern, string opName> : VOPCCommon <ins, "", pattern>, VOP <opName>, - SIMCInstr<opName#"_e32", SISubtarget.NONE>, - MnemonicAlias<opName#"_e32", opName> { + SIMCInstr<opName#"_e32", SISubtarget.NONE> { let isPseudo = 1; let isCodeGenOnly = 1; } -multiclass VOPC_m <vopc op, dag outs, dag ins, string asm, list<dag> pattern, - string opName, bit DefExec, string revOpName = ""> { - def "" : VOPC_Pseudo <outs, ins, pattern, opName>; - - def _si : VOPC<op.SI, ins, asm, []>, - SIMCInstr <opName#"_e32", SISubtarget.SI> { - let Defs = !if(DefExec, [EXEC], []); - let hasSideEffects = DefExec; - let AssemblerPredicates = [isSICI]; +multiclass VOPC_m <vopc op, dag ins, string op_asm, list<dag> pattern, + string opName, bit DefExec, VOPProfile p, + list<SchedReadWrite> sched, + string revOpName = "", string asm = opName#"_e32 "#op_asm, + string alias_asm = opName#" "#op_asm> { + def "" : VOPC_Pseudo <ins, pattern, opName> { + let Defs = !if(DefExec, [VCC, EXEC], [VCC]); + let SchedRW = sched; } - def _vi : VOPC<op.VI, ins, asm, []>, - SIMCInstr <opName#"_e32", SISubtarget.VI> { - let Defs = !if(DefExec, [EXEC], []); - let hasSideEffects = DefExec; - let AssemblerPredicates = [isVI]; - } + let AssemblerPredicates = [isSICI] in { + def _si : VOPC<op.SI, ins, asm, []>, + SIMCInstr <opName#"_e32", SISubtarget.SI> { + let Defs = !if(DefExec, [VCC, EXEC], [VCC]); + let hasSideEffects = DefExec; + let SchedRW = sched; + } + + } // End AssemblerPredicates = [isSICI] + + let AssemblerPredicates = [isVI] in { + def _vi : VOPC<op.VI, ins, asm, []>, + SIMCInstr <opName#"_e32", SISubtarget.VI> { + let Defs = !if(DefExec, [VCC, EXEC], [VCC]); + let hasSideEffects = DefExec; + let SchedRW = sched; + } + + } // End AssemblerPredicates = [isVI] + + defm : SIInstAliasBuilder<alias_asm, p>; } -multiclass VOPC_Helper <vopc op, string opName, - dag ins32, string asm32, list<dag> pat32, - dag out64, dag ins64, string asm64, list<dag> pat64, - bit HasMods, bit DefExec, string revOp> { - defm _e32 : VOPC_m <op, (outs), ins32, opName#asm32, pat32, opName, DefExec>; +multiclass VOPC_Helper <vopc op, string opName, list<dag> pat32, + list<dag> 
pat64, bit DefExec, string revOp, + VOPProfile p, list<SchedReadWrite> sched> { + defm _e32 : VOPC_m <op, p.Ins32, p.Asm32, pat32, opName, DefExec, p, sched>; - defm _e64 : VOP3_C_m <op, out64, ins64, opName#asm64, pat64, - opName, HasMods, DefExec, revOp>; + defm _e64 : VOP3_C_m <op, (outs VOPDstS64:$dst), p.Ins64, opName#p.Asm64, pat64, + opName, p.HasModifiers, DefExec, revOp, sched>; } // Special case for class instructions which only have modifiers on // the 1st source operand. -multiclass VOPC_Class_Helper <vopc op, string opName, - dag ins32, string asm32, list<dag> pat32, - dag out64, dag ins64, string asm64, list<dag> pat64, - bit HasMods, bit DefExec, string revOp> { - defm _e32 : VOPC_m <op, (outs), ins32, opName#asm32, pat32, opName, DefExec>; - - defm _e64 : VOP3_C_m <op, out64, ins64, opName#asm64, pat64, - opName, HasMods, DefExec, revOp>, +multiclass VOPC_Class_Helper <vopc op, string opName, list<dag> pat32, + list<dag> pat64, bit DefExec, string revOp, + VOPProfile p, list<SchedReadWrite> sched> { + defm _e32 : VOPC_m <op, p.Ins32, p.Asm32, pat32, opName, DefExec, p, sched>; + + defm _e64 : VOP3_C_m <op, (outs VOPDstS64:$dst), p.Ins64, opName#p.Asm64, pat64, + opName, p.HasModifiers, DefExec, revOp, sched>, VOP3DisableModFields<1, 0, 0>; } multiclass VOPCInst <vopc op, string opName, VOPProfile P, PatLeaf cond = COND_NULL, string revOp = opName, - bit DefExec = 0> : VOPC_Helper < - op, opName, - P.Ins32, P.Asm32, [], - (outs VOPDstS64:$dst), P.Ins64, P.Asm64, + bit DefExec = 0, + list<SchedReadWrite> sched = [Write32Bit]> : + VOPC_Helper < + op, opName, [], !if(P.HasModifiers, [(set i1:$dst, (setcc (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, @@ -1648,51 +1823,51 @@ multiclass VOPCInst <vopc op, string opName, (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)), cond))], [(set i1:$dst, (setcc P.Src0VT:$src0, P.Src1VT:$src1, cond))]), - P.HasModifiers, DefExec, revOp + DefExec, revOp, P, sched >; multiclass VOPCClassInst <vopc op, string opName, VOPProfile P, - bit DefExec = 0> : VOPC_Class_Helper < - op, opName, - P.Ins32, P.Asm32, [], - (outs VOPDstS64:$dst), P.Ins64, P.Asm64, + bit DefExec = 0, + list<SchedReadWrite> sched> : VOPC_Class_Helper < + op, opName, [], !if(P.HasModifiers, [(set i1:$dst, (AMDGPUfp_class (P.Src0VT (VOP3Mods0Clamp0OMod P.Src0VT:$src0, i32:$src0_modifiers)), P.Src1VT:$src1))], [(set i1:$dst, (AMDGPUfp_class P.Src0VT:$src0, P.Src1VT:$src1))]), - P.HasModifiers, DefExec, opName + DefExec, opName, P, sched >; multiclass VOPC_F32 <vopc op, string opName, PatLeaf cond = COND_NULL, string revOp = opName> : - VOPCInst <op, opName, VOP_F32_F32_F32, cond, revOp>; + VOPCInst <op, opName, VOPC_I1_F32_F32, cond, revOp>; multiclass VOPC_F64 <vopc op, string opName, PatLeaf cond = COND_NULL, string revOp = opName> : - VOPCInst <op, opName, VOP_F64_F64_F64, cond, revOp>; + VOPCInst <op, opName, VOPC_I1_F64_F64, cond, revOp, 0, [WriteDoubleAdd]>; multiclass VOPC_I32 <vopc op, string opName, PatLeaf cond = COND_NULL, string revOp = opName> : - VOPCInst <op, opName, VOP_I32_I32_I32, cond, revOp>; + VOPCInst <op, opName, VOPC_I1_I32_I32, cond, revOp>; multiclass VOPC_I64 <vopc op, string opName, PatLeaf cond = COND_NULL, string revOp = opName> : - VOPCInst <op, opName, VOP_I64_I64_I64, cond, revOp>; + VOPCInst <op, opName, VOPC_I1_I64_I64, cond, revOp, 0, [Write64Bit]>; multiclass VOPCX <vopc op, string opName, VOPProfile P, PatLeaf cond = COND_NULL, + list<SchedReadWrite> sched, string revOp = ""> - : VOPCInst <op, opName, P, cond, revOp, 1>; + 
: VOPCInst <op, opName, P, cond, revOp, 1, sched>; multiclass VOPCX_F32 <vopc op, string opName, string revOp = opName> : - VOPCX <op, opName, VOP_F32_F32_F32, COND_NULL, revOp>; + VOPCX <op, opName, VOPC_I1_F32_F32, COND_NULL, [Write32Bit], revOp>; multiclass VOPCX_F64 <vopc op, string opName, string revOp = opName> : - VOPCX <op, opName, VOP_F64_F64_F64, COND_NULL, revOp>; + VOPCX <op, opName, VOPC_I1_F64_F64, COND_NULL, [WriteDoubleAdd], revOp>; multiclass VOPCX_I32 <vopc op, string opName, string revOp = opName> : - VOPCX <op, opName, VOP_I32_I32_I32, COND_NULL, revOp>; + VOPCX <op, opName, VOPC_I1_I32_I32, COND_NULL, [Write32Bit], revOp>; multiclass VOPCX_I64 <vopc op, string opName, string revOp = opName> : - VOPCX <op, opName, VOP_I64_I64_I64, COND_NULL, revOp>; + VOPCX <op, opName, VOPC_I1_I64_I64, COND_NULL, [Write64Bit], revOp>; multiclass VOP3_Helper <vop3 op, string opName, dag outs, dag ins, string asm, list<dag> pat, int NumSrcArgs, bit HasMods> : VOP3_m < @@ -1700,16 +1875,16 @@ multiclass VOP3_Helper <vop3 op, string opName, dag outs, dag ins, string asm, >; multiclass VOPC_CLASS_F32 <vopc op, string opName> : - VOPCClassInst <op, opName, VOP_I1_F32_I32, 0>; + VOPCClassInst <op, opName, VOPC_I1_F32_I32, 0, [Write32Bit]>; multiclass VOPCX_CLASS_F32 <vopc op, string opName> : - VOPCClassInst <op, opName, VOP_I1_F32_I32, 1>; + VOPCClassInst <op, opName, VOPC_I1_F32_I32, 1, [Write32Bit]>; multiclass VOPC_CLASS_F64 <vopc op, string opName> : - VOPCClassInst <op, opName, VOP_I1_F64_I32, 0>; + VOPCClassInst <op, opName, VOPC_I1_F64_I32, 0, [WriteDoubleAdd]>; multiclass VOPCX_CLASS_F64 <vopc op, string opName> : - VOPCClassInst <op, opName, VOP_I1_F64_I32, 1>; + VOPCClassInst <op, opName, VOPC_I1_F64_I32, 1, [WriteDoubleAdd]>; multiclass VOP3Inst <vop3 op, string opName, VOPProfile P, SDPatternOperator node = null_frag> : VOP3_Helper < @@ -1761,25 +1936,13 @@ multiclass VOP3_VCC_Inst <vop3 op, string opName, 3, 1 >; -multiclass VOP3b_Helper <vop op, RegisterClass vrc, RegisterOperand arc, - string opName, list<dag> pattern> : - VOP3b_3_m < - op, (outs vrc:$vdst, SReg_64:$sdst), - (ins InputModsNoDefault:$src0_modifiers, arc:$src0, - InputModsNoDefault:$src1_modifiers, arc:$src1, - InputModsNoDefault:$src2_modifiers, arc:$src2, - ClampMod:$clamp, omod:$omod), - opName#" $vdst, $sdst, $src0_modifiers, $src1_modifiers, $src2_modifiers"#"$clamp"#"$omod", pattern, - opName, opName, 1, 1 +multiclass VOP3bInst <vop op, string opName, VOPProfile P, list<dag> pattern = []> : + VOP3b_2_3_m < + op, P.Outs64, P.Ins64, + opName#" "#P.Asm64, pattern, + opName, "", 1, 1 >; -multiclass VOP3b_64 <vop3 op, string opName, list<dag> pattern> : - VOP3b_Helper <op, VReg_64, VSrc_64, opName, pattern>; - -multiclass VOP3b_32 <vop3 op, string opName, list<dag> pattern> : - VOP3b_Helper <op, VGPR_32, VSrc_32, opName, pattern>; - - class Vop3ModPat<Instruction Inst, VOPProfile P, SDPatternOperator node> : Pat< (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)), (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)), @@ -1925,12 +2088,14 @@ multiclass DS_1A1D_RET <bits<8> op, string opName, RegisterClass rc, dag ins = (ins VGPR_32:$addr, rc:$data0, ds_offset:$offset, gds:$gds), string asm = opName#" $vdst, $addr, $data0"#"$offset$gds"> { - def "" : DS_Pseudo <opName, outs, ins, []>, - AtomicNoRet<noRetOp, 1>; + let hasPostISelHook = 1 in { + def "" : DS_Pseudo <opName, outs, ins, []>, + AtomicNoRet<noRetOp, 1>; - let data1 = 0 in { - def _si : DS_Off16_Real_si <op, 
opName, outs, ins, asm>; - def _vi : DS_Off16_Real_vi <op, opName, outs, ins, asm>; + let data1 = 0 in { + def _si : DS_Off16_Real_si <op, opName, outs, ins, asm>; + def _vi : DS_Off16_Real_vi <op, opName, outs, ins, asm>; + } } } @@ -1939,11 +2104,13 @@ multiclass DS_1A2D_RET_m <bits<8> op, string opName, RegisterClass rc, dag outs = (outs rc:$vdst), string asm = opName#" $vdst, $addr, $data0, $data1"#"$offset"#"$gds"> { - def "" : DS_Pseudo <opName, outs, ins, []>, - AtomicNoRet<noRetOp, 1>; + let hasPostISelHook = 1 in { + def "" : DS_Pseudo <opName, outs, ins, []>, + AtomicNoRet<noRetOp, 1>; - def _si : DS_Off16_Real_si <op, opName, outs, ins, asm>; - def _vi : DS_Off16_Real_vi <op, opName, outs, ins, asm>; + def _si : DS_Off16_Real_si <op, opName, outs, ins, asm>; + def _vi : DS_Off16_Real_vi <op, opName, outs, ins, asm>; + } } multiclass DS_1A2D_RET <bits<8> op, string asm, RegisterClass rc, @@ -2214,7 +2381,7 @@ multiclass MUBUF_Atomic <mubuf op, string name, RegisterClass rc, defm _ADDR64 : MUBUFAtomicAddr64_m < op, name#"_addr64", (outs), - (ins rc:$vdata, SReg_128:$srsrc, VReg_64:$vaddr, + (ins rc:$vdata, VReg_64:$vaddr, SReg_128:$srsrc, SCSrc_32:$soffset, mbuf_offset:$offset, slc:$slc), name#" $vdata, $vaddr, $srsrc, $soffset addr64"#"$offset"#"$slc", [], 0 >; @@ -2233,7 +2400,7 @@ multiclass MUBUF_Atomic <mubuf op, string name, RegisterClass rc, defm _RTN_ADDR64 : MUBUFAtomicAddr64_m < op, name#"_rtn_addr64", (outs rc:$vdata), - (ins rc:$vdata_in, SReg_128:$srsrc, VReg_64:$vaddr, + (ins rc:$vdata_in, VReg_64:$vaddr, SReg_128:$srsrc, SCSrc_32:$soffset, mbuf_offset:$offset, slc:$slc), name#" $vdata, $vaddr, $srsrc, $soffset addr64"#"$offset"#" glc"#"$slc", [(set vt:$vdata, @@ -2245,7 +2412,7 @@ multiclass MUBUF_Atomic <mubuf op, string name, RegisterClass rc, op, name#"_rtn_offset", (outs rc:$vdata), (ins rc:$vdata_in, SReg_128:$srsrc, SCSrc_32:$soffset, mbuf_offset:$offset, slc:$slc), - name#" $vdata, $srsrc, $soffset"#"$offset"#" glc $slc", + name#" $vdata, $srsrc, $soffset"#"$offset"#" glc$slc", [(set vt:$vdata, (atomic (MUBUFOffsetAtomic v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$slc), vt:$vdata_in))], 1 @@ -2256,6 +2423,8 @@ multiclass MUBUF_Atomic <mubuf op, string name, RegisterClass rc, } // mayStore = 1, mayLoad = 1, hasPostISelHook = 1 } +// FIXME: tfe can't be an operand because it requires a separate +// opcode because it needs an N+1 register class dest register. multiclass MUBUF_Load_Helper <mubuf op, string name, RegisterClass regClass, ValueType load_vt = i32, SDPatternOperator ld = null_frag> { @@ -2368,47 +2537,121 @@ multiclass MUBUF_Store_Helper <mubuf op, string name, RegisterClass vdataClass, } // End mayLoad = 0, mayStore = 1 } -class FLAT_Load_Helper <bits<7> op, string asm, RegisterClass regClass> : - FLAT <op, (outs regClass:$vdst), - (ins VReg_64:$addr, glc_flat:$glc, slc_flat:$slc, tfe_flat:$tfe), - asm#" $vdst, $addr"#"$glc"#"$slc"#"$tfe", []> { - let data = 0; - let mayLoad = 1; +// For cache invalidation instructions. +multiclass MUBUF_Invalidate <mubuf op, string opName, SDPatternOperator node> { + let hasSideEffects = 1, mayStore = 1, AsmMatchConverter = "" in { + def "" : MUBUF_Pseudo <opName, (outs), (ins), [(node)]>; + + // Set everything to 0. 
+ let offset = 0, offen = 0, idxen = 0, glc = 0, vaddr = 0, + vdata = 0, srsrc = 0, slc = 0, tfe = 0, soffset = 0 in { + let addr64 = 0 in { + def _si : MUBUF_Real_si <op, opName, (outs), (ins), opName>; + } + + def _vi : MUBUF_Real_vi <op, opName, (outs), (ins), opName>; + } + } // End hasSideEffects = 1, mayStore = 1, AsmMatchConverter = "" } -class FLAT_Store_Helper <bits<7> op, string name, RegisterClass vdataClass> : - FLAT <op, (outs), (ins vdataClass:$data, VReg_64:$addr, - glc_flat:$glc, slc_flat:$slc, tfe_flat:$tfe), - name#" $data, $addr"#"$glc"#"$slc"#"$tfe", - []> { +//===----------------------------------------------------------------------===// +// FLAT classes +//===----------------------------------------------------------------------===// + +class flat <bits<7> ci, bits<7> vi = ci> { + field bits<7> CI = ci; + field bits<7> VI = vi; +} - let mayLoad = 0; - let mayStore = 1; +class FLAT_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> : + FLAT <0, outs, ins, "", pattern>, + SIMCInstr<opName, SISubtarget.NONE> { + let isPseudo = 1; + let isCodeGenOnly = 1; +} - // Encoding - let vdst = 0; +class FLAT_Real_ci <bits<7> op, string opName, dag outs, dag ins, string asm> : + FLAT <op, outs, ins, asm, []>, + SIMCInstr<opName, SISubtarget.SI> { + let AssemblerPredicate = isCIOnly; } -multiclass FLAT_ATOMIC <bits<7> op, string name, RegisterClass vdst_rc, - RegisterClass data_rc = vdst_rc> { +class FLAT_Real_vi <bits<7> op, string opName, dag outs, dag ins, string asm> : + FLAT <op, outs, ins, asm, []>, + SIMCInstr<opName, SISubtarget.VI> { + let AssemblerPredicate = VIAssemblerPredicate; +} - let mayLoad = 1, mayStore = 1 in { - def "" : FLAT <op, (outs), - (ins VReg_64:$addr, data_rc:$data, slc_flat_atomic:$slc, - tfe_flat_atomic:$tfe), - name#" $addr, $data"#"$slc"#"$tfe", []>, - AtomicNoRet <NAME, 0> { - let glc = 0; - let vdst = 0; - } +multiclass FLAT_AtomicRet_m <flat op, dag outs, dag ins, string asm, + list<dag> pattern> { + def "" : FLAT_Pseudo <NAME#"_RTN", outs, ins, pattern>, + AtomicNoRet <NAME, 1>; - def _RTN : FLAT <op, (outs vdst_rc:$vdst), - (ins VReg_64:$addr, data_rc:$data, slc_flat_atomic:$slc, - tfe_flat_atomic:$tfe), - name#" $vdst, $addr, $data glc"#"$slc"#"$tfe", []>, - AtomicNoRet <NAME, 1> { - let glc = 1; - } + def _ci : FLAT_Real_ci <op.CI, NAME#"_RTN", outs, ins, asm>; + + def _vi : FLAT_Real_vi <op.VI, NAME#"_RTN", outs, ins, asm>; +} + +multiclass FLAT_Load_Helper <flat op, string asm_name, + RegisterClass regClass, + dag outs = (outs regClass:$vdst), + dag ins = (ins VReg_64:$addr, glc_flat:$glc, slc_flat:$slc, tfe_flat:$tfe), + string asm = asm_name#" $vdst, $addr"#"$glc"#"$slc"#"$tfe"> { + + let data = 0, mayLoad = 1 in { + + def "" : FLAT_Pseudo <NAME, outs, ins, []>; + + def _ci : FLAT_Real_ci <op.CI, NAME, outs, ins, asm>; + + def _vi : FLAT_Real_vi <op.VI, NAME, outs, ins, asm>; + } +} + +multiclass FLAT_Store_Helper <flat op, string asm_name, + RegisterClass vdataClass, + dag outs = (outs), + dag ins = (ins vdataClass:$data, VReg_64:$addr, glc_flat:$glc, + slc_flat:$slc, tfe_flat:$tfe), + string asm = asm_name#" $data, $addr"#"$glc"#"$slc"#"$tfe"> { + + let mayLoad = 0, mayStore = 1, vdst = 0 in { + + def "" : FLAT_Pseudo <NAME, outs, ins, []>; + + def _ci : FLAT_Real_ci <op.CI, NAME, outs, ins, asm>; + + def _vi : FLAT_Real_vi <op.VI, NAME, outs, ins, asm>; + } +} + +multiclass FLAT_ATOMIC <flat op, string asm_name, RegisterClass vdst_rc, + RegisterClass data_rc = vdst_rc, + dag outs_noret = (outs), + string asm_noret = asm_name#" 
$addr, $data"#"$slc"#"$tfe"> { + + let mayLoad = 1, mayStore = 1, glc = 0, vdst = 0 in { + def "" : FLAT_Pseudo <NAME, outs_noret, + (ins VReg_64:$addr, data_rc:$data, + slc_flat_atomic:$slc, tfe_flat_atomic:$tfe), []>, + AtomicNoRet <NAME, 0>; + + def _ci : FLAT_Real_ci <op.CI, NAME, outs_noret, + (ins VReg_64:$addr, data_rc:$data, + slc_flat_atomic:$slc, tfe_flat_atomic:$tfe), + asm_noret>; + + def _vi : FLAT_Real_vi <op.VI, NAME, outs_noret, + (ins VReg_64:$addr, data_rc:$data, + slc_flat_atomic:$slc, tfe_flat_atomic:$tfe), + asm_noret>; + } + + let glc = 1, hasPostISelHook = 1 in { + defm _RTN : FLAT_AtomicRet_m <op, (outs vdst_rc:$vdst), + (ins VReg_64:$addr, data_rc:$data, slc_flat_atomic:$slc, + tfe_flat_atomic:$tfe), + asm_name#" $vdst, $addr, $data glc"#"$slc"#"$tfe", []>; } } diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td b/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td index e0eeea9..6f653c7 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -30,7 +30,9 @@ def isGCN : Predicate<"Subtarget->getGeneration() " ">= AMDGPUSubtarget::SOUTHERN_ISLANDS">, AssemblerPredicate<"FeatureGCN">; def isSI : Predicate<"Subtarget->getGeneration() " - "== AMDGPUSubtarget::SOUTHERN_ISLANDS">; + "== AMDGPUSubtarget::SOUTHERN_ISLANDS">, + AssemblerPredicate<"FeatureSouthernIslands">; + def has16BankLDS : Predicate<"Subtarget->getLDSBankCount() == 16">; def has32BankLDS : Predicate<"Subtarget->getLDSBankCount() == 32">; @@ -62,36 +64,38 @@ let mayLoad = 1 in { // We are using the SGPR_32 and not the SReg_32 register class for 32-bit // SMRD instructions, because the SGPR_32 register class does not include M0 // and writing to M0 from an SMRD instruction will hang the GPU. -defm S_LOAD_DWORD : SMRD_Helper <0x00, "s_load_dword", SReg_64, SGPR_32>; -defm S_LOAD_DWORDX2 : SMRD_Helper <0x01, "s_load_dwordx2", SReg_64, SReg_64>; -defm S_LOAD_DWORDX4 : SMRD_Helper <0x02, "s_load_dwordx4", SReg_64, SReg_128>; -defm S_LOAD_DWORDX8 : SMRD_Helper <0x03, "s_load_dwordx8", SReg_64, SReg_256>; -defm S_LOAD_DWORDX16 : SMRD_Helper <0x04, "s_load_dwordx16", SReg_64, SReg_512>; +defm S_LOAD_DWORD : SMRD_Helper <smrd<0x00>, "s_load_dword", SReg_64, SGPR_32>; +defm S_LOAD_DWORDX2 : SMRD_Helper <smrd<0x01>, "s_load_dwordx2", SReg_64, SReg_64>; +defm S_LOAD_DWORDX4 : SMRD_Helper <smrd<0x02>, "s_load_dwordx4", SReg_64, SReg_128>; +defm S_LOAD_DWORDX8 : SMRD_Helper <smrd<0x03>, "s_load_dwordx8", SReg_64, SReg_256>; +defm S_LOAD_DWORDX16 : SMRD_Helper <smrd<0x04>, "s_load_dwordx16", SReg_64, SReg_512>; defm S_BUFFER_LOAD_DWORD : SMRD_Helper < - 0x08, "s_buffer_load_dword", SReg_128, SGPR_32 + smrd<0x08>, "s_buffer_load_dword", SReg_128, SGPR_32 >; defm S_BUFFER_LOAD_DWORDX2 : SMRD_Helper < - 0x09, "s_buffer_load_dwordx2", SReg_128, SReg_64 + smrd<0x09>, "s_buffer_load_dwordx2", SReg_128, SReg_64 >; defm S_BUFFER_LOAD_DWORDX4 : SMRD_Helper < - 0x0a, "s_buffer_load_dwordx4", SReg_128, SReg_128 + smrd<0x0a>, "s_buffer_load_dwordx4", SReg_128, SReg_128 >; defm S_BUFFER_LOAD_DWORDX8 : SMRD_Helper < - 0x0b, "s_buffer_load_dwordx8", SReg_128, SReg_256 + smrd<0x0b>, "s_buffer_load_dwordx8", SReg_128, SReg_256 >; defm S_BUFFER_LOAD_DWORDX16 : SMRD_Helper < - 0x0c, "s_buffer_load_dwordx16", SReg_128, SReg_512 + smrd<0x0c>, "s_buffer_load_dwordx16", SReg_128, SReg_512 >; } // mayLoad = 1 //def S_MEMTIME : SMRD_ <0x0000001e, "s_memtime", []>; -//def S_DCACHE_INV : SMRD_ <0x0000001f, "s_dcache_inv", []>; + +defm S_DCACHE_INV : SMRD_Inval <smrd<0x1f, 
0x20>, "s_dcache_inv", + int_amdgcn_s_dcache_inv>; //===----------------------------------------------------------------------===// // SOP1 Instructions @@ -123,7 +127,7 @@ let Defs = [SCC] in { defm S_BREV_B32 : SOP1_32 <sop1<0x0b, 0x08>, "s_brev_b32", - [(set i32:$dst, (AMDGPUbrev i32:$src0))] + [(set i32:$dst, (bitreverse i32:$src0))] >; defm S_BREV_B64 : SOP1_64 <sop1<0x0c, 0x09>, "s_brev_b64", []>; @@ -183,10 +187,14 @@ defm S_XNOR_SAVEEXEC_B64 : SOP1_64 <sop1<0x2b, 0x27>, "s_xnor_saveexec_b64", []> defm S_QUADMASK_B32 : SOP1_32 <sop1<0x2c, 0x28>, "s_quadmask_b32", []>; defm S_QUADMASK_B64 : SOP1_64 <sop1<0x2d, 0x29>, "s_quadmask_b64", []>; + +let Uses = [M0] in { defm S_MOVRELS_B32 : SOP1_32 <sop1<0x2e, 0x2a>, "s_movrels_b32", []>; defm S_MOVRELS_B64 : SOP1_64 <sop1<0x2f, 0x2b>, "s_movrels_b64", []>; defm S_MOVRELD_B32 : SOP1_32 <sop1<0x30, 0x2c>, "s_movreld_b32", []>; defm S_MOVRELD_B64 : SOP1_64 <sop1<0x31, 0x2d>, "s_movreld_b64", []>; +} // End Uses = [M0] + defm S_CBRANCH_JOIN : SOP1_1 <sop1<0x32, 0x2e>, "s_cbranch_join", []>; defm S_MOV_REGRD_B32 : SOP1_32 <sop1<0x33, 0x2f>, "s_mov_regrd_b32", []>; let Defs = [SCC] in { @@ -354,7 +362,7 @@ def S_CMP_LE_U32 : SOPC_32 <0x0000000b, "s_cmp_le_u32">; // SOPK Instructions //===----------------------------------------------------------------------===// -let isReMaterializable = 1 in { +let isReMaterializable = 1, isMoveImm = 1 in { defm S_MOVK_I32 : SOPK_32 <sopk<0x00>, "s_movk_i32", []>; } // End isReMaterializable = 1 let Uses = [SCC] in { @@ -438,36 +446,38 @@ def S_BRANCH : SOPP < let isBarrier = 1; } -let DisableEncoding = "$scc" in { +let Uses = [SCC] in { def S_CBRANCH_SCC0 : SOPP < - 0x00000004, (ins sopp_brtarget:$simm16, SCCReg:$scc), + 0x00000004, (ins sopp_brtarget:$simm16), "s_cbranch_scc0 $simm16" >; def S_CBRANCH_SCC1 : SOPP < - 0x00000005, (ins sopp_brtarget:$simm16, SCCReg:$scc), + 0x00000005, (ins sopp_brtarget:$simm16), "s_cbranch_scc1 $simm16" >; -} // End DisableEncoding = "$scc" +} // End Uses = [SCC] +let Uses = [VCC] in { def S_CBRANCH_VCCZ : SOPP < - 0x00000006, (ins sopp_brtarget:$simm16, VCCReg:$vcc), + 0x00000006, (ins sopp_brtarget:$simm16), "s_cbranch_vccz $simm16" >; def S_CBRANCH_VCCNZ : SOPP < - 0x00000007, (ins sopp_brtarget:$simm16, VCCReg:$vcc), + 0x00000007, (ins sopp_brtarget:$simm16), "s_cbranch_vccnz $simm16" >; +} // End Uses = [VCC] -let DisableEncoding = "$exec" in { +let Uses = [EXEC] in { def S_CBRANCH_EXECZ : SOPP < - 0x00000008, (ins sopp_brtarget:$simm16, EXECReg:$exec), + 0x00000008, (ins sopp_brtarget:$simm16), "s_cbranch_execz $simm16" >; def S_CBRANCH_EXECNZ : SOPP < - 0x00000009, (ins sopp_brtarget:$simm16, EXECReg:$exec), + 0x00000009, (ins sopp_brtarget:$simm16), "s_cbranch_execnz $simm16" >; -} // End DisableEncoding = "$exec" +} // End Uses = [EXEC] } // End isBranch = 1 @@ -477,11 +487,11 @@ let hasSideEffects = 1 in { def S_BARRIER : SOPP <0x0000000a, (ins), "s_barrier", [(int_AMDGPU_barrier_local)] > { + let SchedRW = [WriteBarrier]; let simm16 = 0; - let isBarrier = 1; - let hasCtrlDep = 1; let mayLoad = 1; let mayStore = 1; + let isConvergent = 1; } def S_WAITCNT : SOPP <0x0000000c, (ins WAIT_FLAG:$simm16), "s_waitcnt $simm16">; @@ -805,9 +815,6 @@ defm DS_CMPST_RTN_B32 : DS_1A2D_RET <0x30, "ds_cmpst_rtn_b32", VGPR_32, "ds_cmps defm DS_CMPST_RTN_F32 : DS_1A2D_RET <0x31, "ds_cmpst_rtn_f32", VGPR_32, "ds_cmpst_f32">; defm DS_MIN_RTN_F32 : DS_1A2D_RET <0x32, "ds_min_rtn_f32", VGPR_32, "ds_min_f32">; defm DS_MAX_RTN_F32 : DS_1A2D_RET <0x33, "ds_max_rtn_f32", VGPR_32, 
"ds_max_f32">; -let SubtargetPredicate = isCI in { -defm DS_WRAP_RTN_F32 : DS_1A1D_RET <0x34, "ds_wrap_rtn_f32", VGPR_32, "ds_wrap_f32">; -} // End isCI defm DS_SWIZZLE_B32 : DS_1A_RET <0x35, "ds_swizzle_b32", VGPR_32>; let mayStore = 0 in { defm DS_READ_B32 : DS_1A_RET <0x36, "ds_read_b32", VGPR_32>; @@ -905,11 +912,6 @@ defm DS_WRITE_SRC2_B64 : DS_1A <0xcc, "ds_write_src2_b64">; defm DS_MIN_SRC2_F64 : DS_1A <0xd2, "ds_min_src2_f64">; defm DS_MAX_SRC2_F64 : DS_1A <0xd3, "ds_max_src2_f64">; -//let SubtargetPredicate = isCI in { -// DS_CONDXCHG32_RTN_B64 -// DS_CONDXCHG32_RTN_B128 -//} // End isCI - //===----------------------------------------------------------------------===// // MUBUF Instructions //===----------------------------------------------------------------------===// @@ -951,13 +953,13 @@ defm BUFFER_LOAD_SSHORT : MUBUF_Load_Helper < mubuf<0x0b, 0x13>, "buffer_load_sshort", VGPR_32, i32, sextloadi16_global >; defm BUFFER_LOAD_DWORD : MUBUF_Load_Helper < - mubuf<0x0c, 0x14>, "buffer_load_dword", VGPR_32, i32, global_load + mubuf<0x0c, 0x14>, "buffer_load_dword", VGPR_32, i32, mubuf_load >; defm BUFFER_LOAD_DWORDX2 : MUBUF_Load_Helper < - mubuf<0x0d, 0x15>, "buffer_load_dwordx2", VReg_64, v2i32, global_load + mubuf<0x0d, 0x15>, "buffer_load_dwordx2", VReg_64, v2i32, mubuf_load >; defm BUFFER_LOAD_DWORDX4 : MUBUF_Load_Helper < - mubuf<0x0e, 0x17>, "buffer_load_dwordx4", VReg_128, v4i32, global_load + mubuf<0x0e, 0x17>, "buffer_load_dwordx4", VReg_128, v4i32, mubuf_load >; defm BUFFER_STORE_BYTE : MUBUF_Store_Helper < @@ -1034,9 +1036,12 @@ defm BUFFER_ATOMIC_XOR : MUBUF_Atomic < //def BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_X2 <mubuf<0x5e>, "buffer_atomic_fcmpswap_x2", []>; // isn't on VI //def BUFFER_ATOMIC_FMIN_X2 : MUBUF_X2 <mubuf<0x5f>, "buffer_atomic_fmin_x2", []>; // isn't on VI //def BUFFER_ATOMIC_FMAX_X2 : MUBUF_X2 <mubuf<0x60>, "buffer_atomic_fmax_x2", []>; // isn't on VI -//def BUFFER_WBINVL1_SC : MUBUF_WBINVL1 <mubuf<0x70>, "buffer_wbinvl1_sc", []>; // isn't on CI & VI -//def BUFFER_WBINVL1_VOL : MUBUF_WBINVL1 <mubuf<0x70, 0x3f>, "buffer_wbinvl1_vol", []>; // isn't on SI -//def BUFFER_WBINVL1 : MUBUF_WBINVL1 <mubuf<0x71, 0x3e>, "buffer_wbinvl1", []>; + +let SubtargetPredicate = isSI in { +defm BUFFER_WBINVL1_SC : MUBUF_Invalidate <mubuf<0x70>, "buffer_wbinvl1_sc", int_amdgcn_buffer_wbinvl1_sc>; // isn't on CI & VI +} + +defm BUFFER_WBINVL1 : MUBUF_Invalidate <mubuf<0x71, 0x3e>, "buffer_wbinvl1", int_amdgcn_buffer_wbinvl1>; //===----------------------------------------------------------------------===// // MTBUF Instructions @@ -1155,8 +1160,8 @@ defm IMAGE_SAMPLE_C_CD_CL_O : MIMG_Sampler <0x0000006f, "image_sample_c_cd_cl_o" // VOP1 Instructions //===----------------------------------------------------------------------===// -let vdst = 0, src0 = 0 in { -defm V_NOP : VOP1_m <vop1<0x0>, (outs), (ins), "v_nop", [], "v_nop">; +let vdst = 0, src0 = 0, VOPAsmPrefer32Bit = 1 in { +defm V_NOP : VOP1Inst <vop1<0x0>, "v_nop", VOP_NONE>; } let isMoveImm = 1, isReMaterializable = 1, isAsCheapAsAMove = 1 in { @@ -1292,7 +1297,9 @@ defm V_SQRT_F64 : VOP1Inst <vop1<0x34, 0x28>, "v_sqrt_f64", VOP_F64_F64, fsqrt >; -} // let SchedRW = [WriteDouble] +} // End SchedRW = [WriteDouble] + +let SchedRW = [WriteQuarterRate32] in { defm V_SIN_F32 : VOP1Inst <vop1<0x35, 0x29>, "v_sin_f32", VOP_F32_F32, AMDGPUsin @@ -1300,6 +1307,9 @@ defm V_SIN_F32 : VOP1Inst <vop1<0x35, 0x29>, "v_sin_f32", defm V_COS_F32 : VOP1Inst <vop1<0x36, 0x2a>, "v_cos_f32", VOP_F32_F32, AMDGPUcos >; + +} // End SchedRW = 
[WriteQuarterRate32] + defm V_NOT_B32 : VOP1Inst <vop1<0x37, 0x2b>, "v_not_b32", VOP_I32_I32>; defm V_BFREV_B32 : VOP1Inst <vop1<0x38, 0x2c>, "v_bfrev_b32", VOP_I32_I32>; defm V_FFBH_U32 : VOP1Inst <vop1<0x39, 0x2d>, "v_ffbh_u32", VOP_I32_I32>; @@ -1308,24 +1318,33 @@ defm V_FFBH_I32 : VOP1Inst <vop1<0x3b, 0x2f>, "v_ffbh_i32", VOP_I32_I32>; defm V_FREXP_EXP_I32_F64 : VOP1Inst <vop1<0x3c,0x30>, "v_frexp_exp_i32_f64", VOP_I32_F64 >; + +let SchedRW = [WriteDoubleAdd] in { defm V_FREXP_MANT_F64 : VOP1Inst <vop1<0x3d, 0x31>, "v_frexp_mant_f64", VOP_F64_F64 >; -defm V_FRACT_F64 : VOP1Inst <vop1<0x3e, 0x32>, "v_fract_f64", VOP_F64_F64>; + +defm V_FRACT_F64 : VOP1Inst <vop1<0x3e, 0x32>, "v_fract_f64", + VOP_F64_F64 +>; +} // End SchedRW = [WriteDoubleAdd] + + defm V_FREXP_EXP_I32_F32 : VOP1Inst <vop1<0x3f, 0x33>, "v_frexp_exp_i32_f32", VOP_I32_F32 >; defm V_FREXP_MANT_F32 : VOP1Inst <vop1<0x40, 0x34>, "v_frexp_mant_f32", VOP_F32_F32 >; -let vdst = 0, src0 = 0 in { -defm V_CLREXCP : VOP1_m <vop1<0x41,0x35>, (outs), (ins), "v_clrexcp", [], - "v_clrexcp" ->; +let vdst = 0, src0 = 0, VOPAsmPrefer32Bit = 1 in { +defm V_CLREXCP : VOP1Inst <vop1<0x41,0x35>, "v_clrexcp", VOP_NONE>; } + +let Uses = [M0, EXEC] in { defm V_MOVRELD_B32 : VOP1Inst <vop1<0x42, 0x36>, "v_movreld_b32", VOP_I32_I32>; defm V_MOVRELS_B32 : VOP1Inst <vop1<0x43, 0x37>, "v_movrels_b32", VOP_I32_I32>; defm V_MOVRELSD_B32 : VOP1Inst <vop1<0x44, 0x38>, "v_movrelsd_b32", VOP_I32_I32>; +} // End Uses = [M0, EXEC] // These instructions only exist on SI and CI let SubtargetPredicate = isSICI in { @@ -1343,7 +1362,7 @@ defm V_RSQ_LEGACY_F32 : VOP1InstSI <vop1<0x2d>, "v_rsq_legacy_f32", VOP_F32_F32, AMDGPUrsq_legacy >; -} // End let SchedRW = [WriteQuarterRate32] +} // End SchedRW = [WriteQuarterRate32] let SchedRW = [WriteDouble] in { @@ -1360,7 +1379,7 @@ defm V_RSQ_CLAMP_F64 : VOP1InstSI <vop1<0x32>, "v_rsq_clamp_f64", // VINTRP Instructions //===----------------------------------------------------------------------===// -let Uses = [M0] in { +let Uses = [M0, EXEC] in { // FIXME: Specify SchedRW for VINTRP instructions. @@ -1405,16 +1424,14 @@ defm V_INTERP_MOV_F32 : VINTRP_m < [(set f32:$dst, (AMDGPUinterp_mov (i32 imm:$src0), (i32 imm:$attr_chan), (i32 imm:$attr)))]>; -} // End Uses = [M0] +} // End Uses = [M0, EXEC] //===----------------------------------------------------------------------===// // VOP2 Instructions //===----------------------------------------------------------------------===// multiclass V_CNDMASK <vop2 op, string name> { - defm _e32 : VOP2_m < - op, VOP_CNDMASK.Outs, VOP_CNDMASK.Ins32, VOP_CNDMASK.Asm32, [], - name, name>; + defm _e32 : VOP2_m <op, name, VOP_CNDMASK, [], name>; defm _e64 : VOP3_m < op, VOP_CNDMASK.Outs, VOP_CNDMASK.Ins64, @@ -1500,34 +1517,32 @@ let isCommutable = 1 in { defm V_MADAK_F32 : VOP2MADK <vop2<0x21, 0x18>, "v_madak_f32">; } // End isCommutable = 1 -let isCommutable = 1, Defs = [VCC] in { // Carry-out goes to VCC +let isCommutable = 1 in { // No patterns so that the scalar instructions are always selected. // The scalar versions will be replaced with vector when needed later. // V_ADD_I32, V_SUB_I32, and V_SUBREV_I32 were renamed to *_U32 in VI, // but the VI instructions behave the same as the SI versions.
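// A short illustration of the VOP2b profiles used below (operand values
// are hypothetical): in the _e32 form the carry-out is the implicit VCC,
// printed as "v_add_i32_e32 v0, vcc, v1, v2", while the VOP3b _e64 form
// takes an explicit SGPR pair, e.g. "v_add_i32_e64 v0, s[0:1], v1, v2",
// writing the sum to v0 and the per-lane carry bits to s[0:1].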
defm V_ADD_I32 : VOP2bInst <vop2<0x25, 0x19>, "v_add_i32", - VOP_I32_I32_I32, add + VOP2b_I32_I1_I32_I32 >; -defm V_SUB_I32 : VOP2bInst <vop2<0x26, 0x1a>, "v_sub_i32", VOP_I32_I32_I32>; +defm V_SUB_I32 : VOP2bInst <vop2<0x26, 0x1a>, "v_sub_i32", VOP2b_I32_I1_I32_I32>; defm V_SUBREV_I32 : VOP2bInst <vop2<0x27, 0x1b>, "v_subrev_i32", - VOP_I32_I32_I32, null_frag, "v_sub_i32" + VOP2b_I32_I1_I32_I32, null_frag, "v_sub_i32" >; -let Uses = [VCC] in { // Carry-in comes from VCC defm V_ADDC_U32 : VOP2bInst <vop2<0x28, 0x1c>, "v_addc_u32", - VOP_I32_I32_I32_VCC + VOP2b_I32_I1_I32_I32_I1 >; defm V_SUBB_U32 : VOP2bInst <vop2<0x29, 0x1d>, "v_subb_u32", - VOP_I32_I32_I32_VCC + VOP2b_I32_I1_I32_I32_I1 >; defm V_SUBBREV_U32 : VOP2bInst <vop2<0x2a, 0x1e>, "v_subbrev_u32", - VOP_I32_I32_I32_VCC, null_frag, "v_subb_u32" + VOP2b_I32_I1_I32_I32_I1, null_frag, "v_subb_u32" >; -} // End Uses = [VCC] -} // End isCommutable = 1, Defs = [VCC] +} // End isCommutable = 1 defm V_READLANE_B32 : VOP2SI_3VI_m < vop3 <0x001, 0x289>, @@ -1575,10 +1590,10 @@ defm V_BCNT_U32_B32 : VOP2_VI3_Inst <vop23<0x22, 0x28b>, "v_bcnt_u32_b32", VOP_I32_I32_I32 >; defm V_MBCNT_LO_U32_B32 : VOP2_VI3_Inst <vop23<0x23, 0x28c>, "v_mbcnt_lo_u32_b32", - VOP_I32_I32_I32 + VOP_I32_I32_I32, int_amdgcn_mbcnt_lo >; defm V_MBCNT_HI_U32_B32 : VOP2_VI3_Inst <vop23<0x24, 0x28d>, "v_mbcnt_hi_u32_b32", - VOP_I32_I32_I32 + VOP_I32_I32_I32, int_amdgcn_mbcnt_hi >; defm V_LDEXP_F32 : VOP2_VI3_Inst <vop23<0x2b, 0x288>, "v_ldexp_f32", VOP_F32_F32_I32, AMDGPUldexp @@ -1704,15 +1719,15 @@ defm V_DIV_FIXUP_F32 : VOP3Inst < vop3<0x15f, 0x1de>, "v_div_fixup_f32", VOP_F32_F32_F32_F32, AMDGPUdiv_fixup >; -let SchedRW = [WriteDouble] in { +let SchedRW = [WriteDoubleAdd] in { defm V_DIV_FIXUP_F64 : VOP3Inst < vop3<0x160, 0x1df>, "v_div_fixup_f64", VOP_F64_F64_F64_F64, AMDGPUdiv_fixup >; -} // let SchedRW = [WriteDouble] +} // End SchedRW = [WriteDouble] -let SchedRW = [WriteDouble] in { +let SchedRW = [WriteDoubleAdd] in { let isCommutable = 1 in { defm V_ADD_F64 : VOP3Inst <vop3<0x164, 0x280>, "v_add_f64", @@ -1735,7 +1750,7 @@ defm V_LDEXP_F64 : VOP3Inst <vop3<0x168, 0x284>, "v_ldexp_f64", VOP_F64_F64_I32, AMDGPUldexp >; -} // let SchedRW = [WriteDouble] +} // let SchedRW = [WriteDoubleAdd] let isCommutable = 1, SchedRW = [WriteQuarterRate32] in { @@ -1756,16 +1771,21 @@ defm V_MUL_HI_I32 : VOP3Inst <vop3<0x16c, 0x287>, "v_mul_hi_i32", } // isCommutable = 1, SchedRW = [WriteQuarterRate32] let SchedRW = [WriteFloatFMA, WriteSALU] in { -defm V_DIV_SCALE_F32 : VOP3b_32 <vop3<0x16d, 0x1e0>, "v_div_scale_f32", []>; +defm V_DIV_SCALE_F32 : VOP3bInst <vop3<0x16d, 0x1e0>, "v_div_scale_f32", + VOP3b_F32_I1_F32_F32_F32 +>; } let SchedRW = [WriteDouble, WriteSALU] in { // Double precision division pre-scale. 
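// Why a VOP3b profile fits here: v_div_scale_* produces both the scaled
// operand in $vdst and an i1 condition in an SGPR pair ($sdst), matching
// VOP3b_F64_I1_F64_F64_F64 below; the patch defines it only through
// VOP3bInst, i.e. there is no _e32 form with an implicit-VCC-only result.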
-defm V_DIV_SCALE_F64 : VOP3b_64 <vop3<0x16e, 0x1e1>, "v_div_scale_f64", []>; +defm V_DIV_SCALE_F64 : VOP3bInst <vop3<0x16e, 0x1e1>, "v_div_scale_f64", + VOP3b_F64_I1_F64_F64_F64 +>; } // let SchedRW = [WriteDouble] -let isCommutable = 1, Uses = [VCC] in { +let isCommutable = 1, Uses = [VCC, EXEC] in { +let SchedRW = [WriteFloatFMA] in { // v_div_fmas_f32: // result = src0 * src1 + src2 // if (vcc) @@ -1774,6 +1794,7 @@ let isCommutable = 1, Uses = [VCC] in { defm V_DIV_FMAS_F32 : VOP3_VCC_Inst <vop3<0x16f, 0x1e2>, "v_div_fmas_f32", VOP_F32_F32_F32_F32, AMDGPUdiv_fmas >; +} let SchedRW = [WriteDouble] in { // v_div_fmas_f64: @@ -1786,7 +1807,7 @@ defm V_DIV_FMAS_F64 : VOP3_VCC_Inst <vop3<0x170, 0x1e3>, "v_div_fmas_f64", >; } // End SchedRW = [WriteDouble] -} // End isCommutable = 1 +} // End isCommutable = 1, Uses = [VCC, EXEC] //def V_MSAD_U8 : VOP3_U8 <0x00000171, "v_msad_u8", []>; //def V_QSAD_U8 : VOP3_U8 <0x00000172, "v_qsad_u8", []>; @@ -1835,13 +1856,13 @@ def V_CNDMASK_B64_PSEUDO : VOP3Common <(outs VReg_64:$dst), (ins VSrc_64:$src0, VSrc_64:$src1, SSrc_64:$src2), "", [] >; -let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in { +let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in { // 64-bit vector move instruction. This is mainly used by the SIFoldOperands // pass to enable folding of inline immediates. def V_MOV_B64_PSEUDO : InstSI <(outs VReg_64:$dst), (ins VSrc_64:$src0), "", []>; } // end let hasSideEffects = 0, mayLoad = 0, mayStore = 0 -let hasSideEffects = 1 in { +let hasSideEffects = 1, SALU = 1 in { def SGPR_USE : InstSI <(outs),(ins), "", []>; } @@ -1921,39 +1942,9 @@ def SI_KILL : InstSI < let Uses = [EXEC], Defs = [EXEC,VCC,M0] in { -//defm SI_ : RegisterLoadStore <VGPR_32, FRAMEri, ADDRIndirect>; - -let UseNamedOperandTable = 1 in { - -def SI_RegisterLoad : InstSI < +class SI_INDIRECT_SRC<RegisterClass rc> : InstSI < (outs VGPR_32:$dst, SReg_64:$temp), - (ins FRAMEri32:$addr, i32imm:$chan), - "", [] -> { - let isRegisterLoad = 1; - let mayLoad = 1; -} - -class SIRegStore<dag outs> : InstSI < - outs, - (ins VGPR_32:$val, FRAMEri32:$addr, i32imm:$chan), - "", [] -> { - let isRegisterStore = 1; - let mayStore = 1; -} - -let usesCustomInserter = 1 in { -def SI_RegisterStorePseudo : SIRegStore<(outs)>; -} // End usesCustomInserter = 1 -def SI_RegisterStore : SIRegStore<(outs SReg_64:$temp)>; - - -} // End UseNamedOperandTable = 1 - -def SI_INDIRECT_SRC : InstSI < - (outs VGPR_32:$dst, SReg_64:$temp), - (ins unknown:$src, VSrc_32:$idx, i32imm:$off), + (ins rc:$src, VSrc_32:$idx, i32imm:$off), "si_indirect_src $dst, $temp, $src, $idx, $off", [] >; @@ -1967,6 +1958,13 @@ class SI_INDIRECT_DST<RegisterClass rc> : InstSI < let Constraints = "$src = $dst"; } +// TODO: We can support indirect SGPR access. 
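// Sketch of the per-width pseudos defined next (the reading of $temp is
// an assumption based on the enclosing Defs = [EXEC,VCC,M0]): e.g.
// SI_INDIRECT_SRC_V4 extracts one 32-bit lane of a VReg_128 source,
// selected by $idx plus the constant $off, into a VGPR_32 $dst, with
// $temp serving as a scratch SGPR pair for the EXEC-manipulating
// expansion.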
+def SI_INDIRECT_SRC_V1 : SI_INDIRECT_SRC<VGPR_32>; +def SI_INDIRECT_SRC_V2 : SI_INDIRECT_SRC<VReg_64>; +def SI_INDIRECT_SRC_V4 : SI_INDIRECT_SRC<VReg_128>; +def SI_INDIRECT_SRC_V8 : SI_INDIRECT_SRC<VReg_256>; +def SI_INDIRECT_SRC_V16 : SI_INDIRECT_SRC<VReg_512>; + def SI_INDIRECT_DST_V1 : SI_INDIRECT_DST<VGPR_32>; def SI_INDIRECT_DST_V2 : SI_INDIRECT_DST<VReg_64>; def SI_INDIRECT_DST_V4 : SI_INDIRECT_DST<VReg_128>; @@ -1977,19 +1975,24 @@ def SI_INDIRECT_DST_V16 : SI_INDIRECT_DST<VReg_512>; multiclass SI_SPILL_SGPR <RegisterClass sgpr_class> { - let UseNamedOperandTable = 1 in { + let UseNamedOperandTable = 1, Uses = [EXEC] in { def _SAVE : InstSI < (outs), - (ins sgpr_class:$src, i32imm:$frame_idx, SReg_128:$scratch_rsrc, - SReg_32:$scratch_offset), + (ins sgpr_class:$src, i32imm:$frame_idx), "", [] - >; + > { + let mayStore = 1; + let mayLoad = 0; + } def _RESTORE : InstSI < (outs sgpr_class:$dst), - (ins i32imm:$frame_idx, SReg_128:$scratch_rsrc, SReg_32:$scratch_offset), + (ins i32imm:$frame_idx), "", [] - >; + > { + let mayStore = 0; + let mayLoad = 1; + } } // End UseNamedOperandTable = 1 } @@ -2003,19 +2006,25 @@ defm SI_SPILL_S256 : SI_SPILL_SGPR <SReg_256>; defm SI_SPILL_S512 : SI_SPILL_SGPR <SReg_512>; multiclass SI_SPILL_VGPR <RegisterClass vgpr_class> { - let UseNamedOperandTable = 1, VGPRSpill = 1 in { + let UseNamedOperandTable = 1, VGPRSpill = 1, Uses = [EXEC] in { def _SAVE : InstSI < (outs), (ins vgpr_class:$src, i32imm:$frame_idx, SReg_128:$scratch_rsrc, SReg_32:$scratch_offset), "", [] - >; + > { + let mayStore = 1; + let mayLoad = 0; + } def _RESTORE : InstSI < (outs vgpr_class:$dst), (ins i32imm:$frame_idx, SReg_128:$scratch_rsrc, SReg_32:$scratch_offset), "", [] - >; + > { + let mayStore = 0; + let mayLoad = 1; + } } // End UseNamedOperandTable = 1, VGPRSpill = 1 } @@ -2030,9 +2039,11 @@ let Defs = [SCC] in { def SI_CONSTDATA_PTR : InstSI < (outs SReg_64:$dst), - (ins), - "", [(set SReg_64:$dst, (i64 SIconstdata_ptr))] ->; + (ins const_ga:$ptr), + "", [(set SReg_64:$dst, (i64 (SIconstdata_ptr (tglobaladdr:$ptr))))] +> { + let SALU = 1; +} } // End Defs = [SCC] @@ -2072,84 +2083,63 @@ def : Pat < // SMRD Patterns //===----------------------------------------------------------------------===// -multiclass SMRD_Pattern <SMRD Instr_IMM, SMRD Instr_SGPR, ValueType vt> { +multiclass SMRD_Pattern <string Instr, ValueType vt> { - // 1. SI-CI: Offset as 8bit DWORD immediate + // 1. IMM offset def : Pat < - (constant_load (add i64:$sbase, (i64 IMM8bitDWORD:$offset))), - (vt (Instr_IMM $sbase, (as_dword_i32imm $offset))) + (smrd_load (SMRDImm i64:$sbase, i32:$offset)), + (vt (!cast<SMRD>(Instr#"_IMM") $sbase, $offset)) >; - // 2. Offset loaded in an 32bit SGPR + // 2. SGPR offset def : Pat < - (constant_load (add i64:$sbase, (i64 IMM32bit:$offset))), - (vt (Instr_SGPR $sbase, (S_MOV_B32 (i32 (as_i32imm $offset))))) + (smrd_load (SMRDSgpr i64:$sbase, i32:$offset)), + (vt (!cast<SMRD>(Instr#"_SGPR") $sbase, $offset)) >; - // 3. No offset at all def : Pat < - (constant_load i64:$sbase), - (vt (Instr_IMM $sbase, 0)) - >; + (smrd_load (SMRDImm32 i64:$sbase, i32:$offset)), + (vt (!cast<SMRD>(Instr#"_IMM_ci") $sbase, $offset)) + > { + let Predicates = [isCIOnly]; + } } -multiclass SMRD_Pattern_vi <SMRD Instr_IMM, SMRD Instr_SGPR, ValueType vt> { - - // 1. VI: Offset as 20bit immediate in bytes - def : Pat < - (constant_load (add i64:$sbase, (i64 IMM20bit:$offset))), - (vt (Instr_IMM $sbase, (as_i32imm $offset))) - >; - - // 2. 
Offset loaded in an 32bit SGPR
-  def : Pat <
-    (constant_load (add i64:$sbase, (i64 IMM32bit:$offset))),
-    (vt (Instr_SGPR $sbase, (S_MOV_B32 (i32 (as_i32imm $offset)))))
-  >;
-
-  // 3. No offset at all
-  def : Pat <
-    (constant_load i64:$sbase),
-    (vt (Instr_IMM $sbase, 0))
-  >;
-}
-
-let Predicates = [isSICI] in {
-defm : SMRD_Pattern <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, f32>;
-defm : SMRD_Pattern <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, i32>;
-defm : SMRD_Pattern <S_LOAD_DWORDX2_IMM, S_LOAD_DWORDX2_SGPR, v2i32>;
-defm : SMRD_Pattern <S_LOAD_DWORDX4_IMM, S_LOAD_DWORDX4_SGPR, v4i32>;
-defm : SMRD_Pattern <S_LOAD_DWORDX8_IMM, S_LOAD_DWORDX8_SGPR, v32i8>;
-defm : SMRD_Pattern <S_LOAD_DWORDX8_IMM, S_LOAD_DWORDX8_SGPR, v8i32>;
-defm : SMRD_Pattern <S_LOAD_DWORDX16_IMM, S_LOAD_DWORDX16_SGPR, v16i32>;
-} // End Predicates = [isSICI]
+// Global and constant loads can be selected to either MUBUF or SMRD
+// instructions, but SMRD instructions are faster so we want the instruction
+// selector to prefer those.
+let AddedComplexity = 100 in {

-let Predicates = [isVI] in {
-defm : SMRD_Pattern_vi <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, f32>;
-defm : SMRD_Pattern_vi <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, i32>;
-defm : SMRD_Pattern_vi <S_LOAD_DWORDX2_IMM, S_LOAD_DWORDX2_SGPR, v2i32>;
-defm : SMRD_Pattern_vi <S_LOAD_DWORDX4_IMM, S_LOAD_DWORDX4_SGPR, v4i32>;
-defm : SMRD_Pattern_vi <S_LOAD_DWORDX8_IMM, S_LOAD_DWORDX8_SGPR, v32i8>;
-defm : SMRD_Pattern_vi <S_LOAD_DWORDX8_IMM, S_LOAD_DWORDX8_SGPR, v8i32>;
-defm : SMRD_Pattern_vi <S_LOAD_DWORDX16_IMM, S_LOAD_DWORDX16_SGPR, v16i32>;
-} // End Predicates = [isVI]
+defm : SMRD_Pattern <"S_LOAD_DWORD", i32>;
+defm : SMRD_Pattern <"S_LOAD_DWORDX2", v2i32>;
+defm : SMRD_Pattern <"S_LOAD_DWORDX4", v4i32>;
+defm : SMRD_Pattern <"S_LOAD_DWORDX8", v32i8>;
+defm : SMRD_Pattern <"S_LOAD_DWORDX8", v8i32>;
+defm : SMRD_Pattern <"S_LOAD_DWORDX16", v16i32>;

-let Predicates = [isSICI] in {
+// 1. Offset as an immediate
+def : Pat <
+  (SIload_constant v4i32:$sbase, (SMRDBufferImm i32:$offset)),
+  (S_BUFFER_LOAD_DWORD_IMM $sbase, $offset)
+>;

-// 1. Offset as 8bit DWORD immediate
+// 2. Offset loaded in a 32-bit SGPR
 def : Pat <
-  (SIload_constant v4i32:$sbase, IMM8bitDWORD:$offset),
-  (S_BUFFER_LOAD_DWORD_IMM $sbase, (as_dword_i32imm $offset))
+  (SIload_constant v4i32:$sbase, (SMRDBufferSgpr i32:$offset)),
+  (S_BUFFER_LOAD_DWORD_SGPR $sbase, $offset)
 >;

-} // End Predicates = [isSICI]
+let Predicates = [isCI] in {

-// 2. Offset loaded in an 32bit SGPR
 def : Pat <
-  (SIload_constant v4i32:$sbase, imm:$offset),
-  (S_BUFFER_LOAD_DWORD_SGPR $sbase, (S_MOV_B32 imm:$offset))
+  (SIload_constant v4i32:$sbase, (SMRDBufferImm32 i32:$offset)),
+  (S_BUFFER_LOAD_DWORD_IMM_ci $sbase, $offset)
 >;

+} // End Predicates = [isCI]
+
+} // End let AddedComplexity = 100
+
 //===----------------------------------------------------------------------===//
 // SOP1 Patterns
 //===----------------------------------------------------------------------===//

@@ -2161,6 +2151,11 @@ def : Pat <
     (S_MOV_B32 0), sub1))
 >;

+def : Pat <
+  (i32 (smax i32:$x, (i32 (ineg i32:$x)))),
+  (S_ABS_I32 $x)
+>;
+
 //===----------------------------------------------------------------------===//
 // SOP2 Patterns
 //===----------------------------------------------------------------------===//
@@ -2488,6 +2483,11 @@ def : Pat <
 /********** Extraction, Insertion, Building and Casting  **********/
 /********** ============================================ **********/

+//def : Extract_Element<i64, v2i64, 0, sub0_sub1>;
+//def : Extract_Element<i64, v2i64, 1, sub2_sub3>;
+//def : Extract_Element<f64, v2f64, 0, sub0_sub1>;
+//def : Extract_Element<f64, v2f64, 1, sub2_sub3>;
+
 foreach Index = 0-2 in {
   def Extract_Element_v2i32_#Index : Extract_Element <
     i32, v2i32, Index, !cast<SubRegIndex>(sub#Index)
@@ -2568,11 +2568,25 @@ def : BitConvert <v2i32, i64, VReg_64>;
 def : BitConvert <i64, v2i32, VReg_64>;
 def : BitConvert <v2f32, i64, VReg_64>;
 def : BitConvert <i64, v2f32, VReg_64>;
+def : BitConvert <v2f32, f64, VReg_64>;
 def : BitConvert <v2i32, f64, VReg_64>;
+def : BitConvert <f64, v2f32, VReg_64>;
 def : BitConvert <f64, v2i32, VReg_64>;

 def : BitConvert <v4f32, v4i32, VReg_128>;
 def : BitConvert <v4i32, v4f32, VReg_128>;
+
+def : BitConvert <v2i64, v4i32, SReg_128>;
+def : BitConvert <v4i32, v2i64, SReg_128>;
+
+def : BitConvert <v2f64, v4f32, VReg_128>;
+def : BitConvert <v2f64, v4i32, VReg_128>;
+def : BitConvert <v4f32, v2f64, VReg_128>;
+def : BitConvert <v4i32, v2f64, VReg_128>;
+
+
+
+
 def : BitConvert <v8f32, v8i32, SReg_256>;
 def : BitConvert <v8i32, v8f32, SReg_256>;
 def : BitConvert <v8i32, v32i8, SReg_256>;
@@ -2601,10 +2615,9 @@ def : Pat <

 // Prevent expanding both fneg and fabs.

-// FIXME: Should use S_OR_B32
 def : Pat <
   (fneg (fabs f32:$src)),
-  (V_OR_B32_e32 $src, (V_MOV_B32_e32 0x80000000)) /* Set sign bit */
+  (S_OR_B32 $src, 0x80000000) /* Set sign bit */
 >;

 // FIXME: Should use S_OR_B32
@@ -2836,10 +2849,6 @@ class DSAtomicRetPat<DS inst, ValueType vt, PatFrag frag> : Pat <
 // -1. For the non-rtn variants, the manual says it does
 // DS[A] = (DS[A] >= D0) ? 0 : DS[A] + 1, and setting D0 to uint_max
 // will always do the increment so I'm assuming it's the same.
-//
-// We also load this -1 with s_mov_b32 / s_mov_b64 even though this
-// needs to be a VGPR. The SGPR copy pass will fix this, and it's
-// easier since there is no v_mov_b64.
 class DSAtomicIncRetPat<DS inst, ValueType vt,
                         Instruction LoadImm, PatFrag frag> : Pat <
   (frag (DS1Addr1Offset i32:$ptr, i32:$offset), (vt 1)),
@@ -2855,9 +2864,9 @@ class DSAtomicCmpXChg <DS inst, ValueType vt, PatFrag frag> : Pat <

 // 32-bit atomics.
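Why the all-ones sentinel below works: a behavioral sketch of the DS_INC rule quoted in the comment above (standalone C++, assuming unsigned 32-bit lanes; illustration only):

    #include <cstdint>
    // DS[A] = (DS[A] >= D0) ? 0 : DS[A] + 1; with D0 = 0xffffffff the compare
    // can only trip at the wrap point, so the update degenerates to an
    // unconditional atomic add of 1.
    uint32_t dsIncSemantics(uint32_t Mem, uint32_t D0) {
      return (Mem >= D0) ? 0 : Mem + 1;
    }

The patterns below therefore materialize -1 with V_MOV_B32_e32 / V_MOV_B64_PSEUDO, since DS data operands must live in VGPRs; the old S_MOV form depended on the SGPR copy-fixup pass to repair it afterwards.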
def : DSAtomicIncRetPat<DS_INC_RTN_U32, i32, - S_MOV_B32, si_atomic_load_add_local>; + V_MOV_B32_e32, si_atomic_load_add_local>; def : DSAtomicIncRetPat<DS_DEC_RTN_U32, i32, - S_MOV_B32, si_atomic_load_sub_local>; + V_MOV_B32_e32, si_atomic_load_sub_local>; def : DSAtomicRetPat<DS_WRXCHG_RTN_B32, i32, si_atomic_swap_local>; def : DSAtomicRetPat<DS_ADD_RTN_U32, i32, si_atomic_load_add_local>; @@ -2874,9 +2883,9 @@ def : DSAtomicCmpXChg<DS_CMPST_RTN_B32, i32, si_atomic_cmp_swap_32_local>; // 64-bit atomics. def : DSAtomicIncRetPat<DS_INC_RTN_U64, i64, - S_MOV_B64, si_atomic_load_add_local>; + V_MOV_B64_PSEUDO, si_atomic_load_add_local>; def : DSAtomicIncRetPat<DS_DEC_RTN_U64, i64, - S_MOV_B64, si_atomic_load_sub_local>; + V_MOV_B64_PSEUDO, si_atomic_load_sub_local>; def : DSAtomicRetPat<DS_WRXCHG_RTN_B64, i64, si_atomic_swap_local>; def : DSAtomicRetPat<DS_ADD_RTN_U64, i64, si_atomic_load_add_local>; @@ -3019,90 +3028,46 @@ def : MTBUF_StoreResource <v2i32, 2, TBUFFER_STORE_FORMAT_XY>; def : MTBUF_StoreResource <v4i32, 3, TBUFFER_STORE_FORMAT_XYZ>; def : MTBUF_StoreResource <v4i32, 4, TBUFFER_STORE_FORMAT_XYZW>; -let SubtargetPredicate = isCI in { - -defm V_QSAD_PK_U16_U8 : VOP3Inst <vop3<0x173>, "v_qsad_pk_u16_u8", - VOP_I32_I32_I32 ->; -defm V_MQSAD_U16_U8 : VOP3Inst <vop3<0x172>, "v_mqsad_u16_u8", - VOP_I32_I32_I32 ->; -defm V_MQSAD_U32_U8 : VOP3Inst <vop3<0x175>, "v_mqsad_u32_u8", - VOP_I32_I32_I32 ->; - -let isCommutable = 1 in { -defm V_MAD_U64_U32 : VOP3Inst <vop3<0x176>, "v_mad_u64_u32", - VOP_I64_I32_I32_I64 ->; - -// XXX - Does this set VCC? -defm V_MAD_I64_I32 : VOP3Inst <vop3<0x177>, "v_mad_i64_i32", - VOP_I64_I32_I32_I64 ->; -} // End isCommutable = 1 - -// Remaining instructions: -// FLAT_* -// S_CBRANCH_CDBGUSER -// S_CBRANCH_CDBGSYS -// S_CBRANCH_CDBGSYS_OR_USER -// S_CBRANCH_CDBGSYS_AND_USER -// S_DCACHE_INV_VOL -// DS_NOP -// DS_GWS_SEMA_RELEASE_ALL -// DS_WRAP_RTN_B32 -// DS_CNDXCHG32_RTN_B64 -// DS_WRITE_B96 -// DS_WRITE_B128 -// DS_CONDXCHG32_RTN_B128 -// DS_READ_B96 -// DS_READ_B128 -// BUFFER_LOAD_DWORDX3 -// BUFFER_STORE_DWORDX3 - -} // End isCI - /********** ====================== **********/ /********** Indirect adressing **********/ /********** ====================== **********/ -multiclass SI_INDIRECT_Pattern <ValueType vt, ValueType eltvt, SI_INDIRECT_DST IndDst> { +multiclass SI_INDIRECT_Pattern <ValueType vt, ValueType eltvt, string VecSize> { // 1. Extract with offset def : Pat< - (eltvt (vector_extract vt:$vec, (add i32:$idx, imm:$off))), - (SI_INDIRECT_SRC $vec, $idx, imm:$off) + (eltvt (extractelt vt:$vec, (add i32:$idx, imm:$off))), + (!cast<Instruction>("SI_INDIRECT_SRC_"#VecSize) $vec, $idx, imm:$off) >; // 2. Extract without offset def : Pat< - (eltvt (vector_extract vt:$vec, i32:$idx)), - (SI_INDIRECT_SRC $vec, $idx, 0) + (eltvt (extractelt vt:$vec, i32:$idx)), + (!cast<Instruction>("SI_INDIRECT_SRC_"#VecSize) $vec, $idx, 0) >; // 3. Insert with offset def : Pat< - (vector_insert vt:$vec, eltvt:$val, (add i32:$idx, imm:$off)), - (IndDst $vec, $idx, imm:$off, $val) + (insertelt vt:$vec, eltvt:$val, (add i32:$idx, imm:$off)), + (!cast<Instruction>("SI_INDIRECT_DST_"#VecSize) $vec, $idx, imm:$off, $val) >; // 4. 
Insert without offset def : Pat< - (vector_insert vt:$vec, eltvt:$val, i32:$idx), - (IndDst $vec, $idx, 0, $val) + (insertelt vt:$vec, eltvt:$val, i32:$idx), + (!cast<Instruction>("SI_INDIRECT_DST_"#VecSize) $vec, $idx, 0, $val) >; } -defm : SI_INDIRECT_Pattern <v2f32, f32, SI_INDIRECT_DST_V2>; -defm : SI_INDIRECT_Pattern <v4f32, f32, SI_INDIRECT_DST_V4>; -defm : SI_INDIRECT_Pattern <v8f32, f32, SI_INDIRECT_DST_V8>; -defm : SI_INDIRECT_Pattern <v16f32, f32, SI_INDIRECT_DST_V16>; +defm : SI_INDIRECT_Pattern <v2f32, f32, "V2">; +defm : SI_INDIRECT_Pattern <v4f32, f32, "V4">; +defm : SI_INDIRECT_Pattern <v8f32, f32, "V8">; +defm : SI_INDIRECT_Pattern <v16f32, f32, "V16">; -defm : SI_INDIRECT_Pattern <v2i32, i32, SI_INDIRECT_DST_V2>; -defm : SI_INDIRECT_Pattern <v4i32, i32, SI_INDIRECT_DST_V4>; -defm : SI_INDIRECT_Pattern <v8i32, i32, SI_INDIRECT_DST_V8>; -defm : SI_INDIRECT_Pattern <v16i32, i32, SI_INDIRECT_DST_V16>; +defm : SI_INDIRECT_Pattern <v2i32, i32, "V2">; +defm : SI_INDIRECT_Pattern <v4i32, i32, "V4">; +defm : SI_INDIRECT_Pattern <v8i32, i32, "V8">; +defm : SI_INDIRECT_Pattern <v16i32, i32, "V16">; //===----------------------------------------------------------------------===// // Conversion Patterns @@ -3215,12 +3180,12 @@ def : Pat < def : Pat < (i1 (trunc i32:$a)), - (V_CMP_EQ_I32_e64 (V_AND_B32_e64 (i32 1), $a), 1) + (V_CMP_EQ_I32_e64 (S_AND_B32 (i32 1), $a), 1) >; def : Pat < (i1 (trunc i64:$a)), - (V_CMP_EQ_I32_e64 (V_AND_B32_e64 (i32 1), + (V_CMP_EQ_I32_e64 (S_AND_B32 (i32 1), (EXTRACT_SUBREG $a, sub0)), 1) >; @@ -3301,24 +3266,6 @@ def : Pat < } // End Predicates = [isSI] -let Predicates = [isCI] in { - -// Convert (x - floor(x)) to fract(x) -def : Pat < - (f32 (fsub (f32 (VOP3Mods f32:$x, i32:$mods)), - (f32 (ffloor (f32 (VOP3Mods f32:$x, i32:$mods)))))), - (V_FRACT_F32_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE) ->; - -// Convert (x + (-floor(x))) to fract(x) -def : Pat < - (f64 (fadd (f64 (VOP3Mods f64:$x, i32:$mods)), - (f64 (fneg (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))))))), - (V_FRACT_F64_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE) ->; - -} // End Predicates = [isCI] - //============================================================================// // Miscellaneous Optimization Patterns //============================================================================// diff --git a/contrib/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/contrib/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp index c319b32..126f624 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp @@ -103,6 +103,10 @@ public: return "SI Lower control flow instructions"; } + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } }; } // End anonymous namespace @@ -140,8 +144,7 @@ void SILowerControlFlowPass::Skip(MachineInstr &From, MachineOperand &To) { DebugLoc DL = From.getDebugLoc(); BuildMI(*From.getParent(), &From, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ)) - .addOperand(To) - .addReg(AMDGPU::EXEC); + .addOperand(To); } void SILowerControlFlowPass::SkipIfDead(MachineInstr &MI) { @@ -159,8 +162,7 @@ void SILowerControlFlowPass::SkipIfDead(MachineInstr &MI) { // If the exec mask is non-zero, skip the next two instructions BuildMI(MBB, Insert, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) - .addImm(3) - .addReg(AMDGPU::EXEC); + .addImm(3); // Exec mask is zero: Export to NULL target... 
BuildMI(MBB, Insert, DL, TII->get(AMDGPU::EXP)) @@ -269,8 +271,7 @@ void SILowerControlFlowPass::Loop(MachineInstr &MI) { .addReg(Src); BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) - .addOperand(MI.getOperand(1)) - .addReg(AMDGPU::EXEC); + .addOperand(MI.getOperand(1)); MI.eraseFromParent(); } @@ -316,7 +317,7 @@ void SILowerControlFlowPass::Kill(MachineInstr &MI) { .addImm(0); } } else { - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMPX_LE_F32_e32), AMDGPU::VCC) + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMPX_LE_F32_e32)) .addImm(0) .addOperand(Op); } @@ -362,9 +363,9 @@ void SILowerControlFlowPass::LoadM0(MachineInstr &MI, MachineInstr *MovRel, int .addReg(AMDGPU::VCC_LO); // Compare the just read M0 value to all possible Idx values - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e32), AMDGPU::VCC) - .addReg(AMDGPU::M0) - .addReg(Idx); + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e32)) + .addReg(AMDGPU::M0) + .addReg(Idx); // Update EXEC, save the original EXEC value to VCC BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), AMDGPU::VCC) @@ -385,8 +386,7 @@ void SILowerControlFlowPass::LoadM0(MachineInstr &MI, MachineInstr *MovRel, int // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) - .addImm(-7) - .addReg(AMDGPU::EXEC); + .addImm(-7); // Restore EXEC BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC) @@ -438,7 +438,6 @@ void SILowerControlFlowPass::IndirectSrc(MachineInstr &MI) { MachineInstr *MovRel = BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst) .addReg(Reg) - .addReg(AMDGPU::M0, RegState::Implicit) .addReg(Vec, RegState::Implicit); LoadM0(MI, MovRel, Off); @@ -460,7 +459,6 @@ void SILowerControlFlowPass::IndirectDst(MachineInstr &MI) { BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELD_B32_e32)) .addReg(Reg, RegState::Define) .addReg(Val) - .addReg(AMDGPU::M0, RegState::Implicit) .addReg(Dst, RegState::Implicit); LoadM0(MI, MovRel, Off); @@ -486,11 +484,11 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) { Next = std::next(I); MachineInstr &MI = *I; - if (TII->isWQM(MI.getOpcode()) || TII->isDS(MI.getOpcode())) + if (TII->isWQM(MI) || TII->isDS(MI)) NeedWQM = true; // Flat uses m0 in case it needs to access LDS. 
- if (TII->isFLAT(MI.getOpcode())) + if (TII->isFLAT(MI)) NeedFlat = true; switch (MI.getOpcode()) { @@ -541,7 +539,11 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) { Branch(MI); break; - case AMDGPU::SI_INDIRECT_SRC: + case AMDGPU::SI_INDIRECT_SRC_V1: + case AMDGPU::SI_INDIRECT_SRC_V2: + case AMDGPU::SI_INDIRECT_SRC_V4: + case AMDGPU::SI_INDIRECT_SRC_V8: + case AMDGPU::SI_INDIRECT_SRC_V16: IndirectSrc(MI); break; diff --git a/contrib/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp b/contrib/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp index 67421e2..a2fa5fd 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp @@ -48,6 +48,7 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<MachineDominatorTree>(); + AU.addPreserved<MachineDominatorTree>(); AU.setPreservesCFG(); MachineFunctionPass::getAnalysisUsage(AU); } diff --git a/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index 587ea63..935aad4 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -29,10 +29,114 @@ void SIMachineFunctionInfo::anchor() {} SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) : AMDGPUMachineFunction(MF), TIDReg(AMDGPU::NoRegister), - HasSpilledVGPRs(false), + ScratchRSrcReg(AMDGPU::NoRegister), + ScratchWaveOffsetReg(AMDGPU::NoRegister), + PrivateSegmentBufferUserSGPR(AMDGPU::NoRegister), + DispatchPtrUserSGPR(AMDGPU::NoRegister), + QueuePtrUserSGPR(AMDGPU::NoRegister), + KernargSegmentPtrUserSGPR(AMDGPU::NoRegister), + DispatchIDUserSGPR(AMDGPU::NoRegister), + FlatScratchInitUserSGPR(AMDGPU::NoRegister), + PrivateSegmentSizeUserSGPR(AMDGPU::NoRegister), + GridWorkGroupCountXUserSGPR(AMDGPU::NoRegister), + GridWorkGroupCountYUserSGPR(AMDGPU::NoRegister), + GridWorkGroupCountZUserSGPR(AMDGPU::NoRegister), + WorkGroupIDXSystemSGPR(AMDGPU::NoRegister), + WorkGroupIDYSystemSGPR(AMDGPU::NoRegister), + WorkGroupIDZSystemSGPR(AMDGPU::NoRegister), + WorkGroupInfoSystemSGPR(AMDGPU::NoRegister), + PrivateSegmentWaveByteOffsetSystemSGPR(AMDGPU::NoRegister), + LDSWaveSpillSize(0), PSInputAddr(0), NumUserSGPRs(0), - LDSWaveSpillSize(0) { } + NumSystemSGPRs(0), + HasSpilledSGPRs(false), + HasSpilledVGPRs(false), + PrivateSegmentBuffer(false), + DispatchPtr(false), + QueuePtr(false), + DispatchID(false), + KernargSegmentPtr(false), + FlatScratchInit(false), + GridWorkgroupCountX(false), + GridWorkgroupCountY(false), + GridWorkgroupCountZ(false), + WorkGroupIDX(true), + WorkGroupIDY(false), + WorkGroupIDZ(false), + WorkGroupInfo(false), + PrivateSegmentWaveByteOffset(false), + WorkItemIDX(true), + WorkItemIDY(false), + WorkItemIDZ(false) { + const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>(); + const Function *F = MF.getFunction(); + + const MachineFrameInfo *FrameInfo = MF.getFrameInfo(); + + if (getShaderType() == ShaderType::COMPUTE) + KernargSegmentPtr = true; + + if (F->hasFnAttribute("amdgpu-work-group-id-y")) + WorkGroupIDY = true; + + if (F->hasFnAttribute("amdgpu-work-group-id-z")) + WorkGroupIDZ = true; + + if (F->hasFnAttribute("amdgpu-work-item-id-y")) + WorkItemIDY = true; + + if (F->hasFnAttribute("amdgpu-work-item-id-z")) + WorkItemIDZ = true; + + bool MaySpill = ST.isVGPRSpillingEnabled(this); + bool HasStackObjects = FrameInfo->hasStackObjects(); + + if (HasStackObjects || MaySpill) + PrivateSegmentWaveByteOffset = 
true; + + if (ST.isAmdHsaOS()) { + if (HasStackObjects || MaySpill) + PrivateSegmentBuffer = true; + + if (F->hasFnAttribute("amdgpu-dispatch-ptr")) + DispatchPtr = true; + } + + // X, XY, and XYZ are the only supported combinations, so make sure Y is + // enabled if Z is. + if (WorkItemIDZ) + WorkItemIDY = true; +} + +unsigned SIMachineFunctionInfo::addPrivateSegmentBuffer( + const SIRegisterInfo &TRI) { + PrivateSegmentBufferUserSGPR = TRI.getMatchingSuperReg( + getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_128RegClass); + NumUserSGPRs += 4; + return PrivateSegmentBufferUserSGPR; +} + +unsigned SIMachineFunctionInfo::addDispatchPtr(const SIRegisterInfo &TRI) { + DispatchPtrUserSGPR = TRI.getMatchingSuperReg( + getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass); + NumUserSGPRs += 2; + return DispatchPtrUserSGPR; +} + +unsigned SIMachineFunctionInfo::addQueuePtr(const SIRegisterInfo &TRI) { + QueuePtrUserSGPR = TRI.getMatchingSuperReg( + getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass); + NumUserSGPRs += 2; + return QueuePtrUserSGPR; +} + +unsigned SIMachineFunctionInfo::addKernargSegmentPtr(const SIRegisterInfo &TRI) { + KernargSegmentPtrUserSGPR = TRI.getMatchingSuperReg( + getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass); + NumUserSGPRs += 2; + return KernargSegmentPtrUserSGPR; +} SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg( MachineFunction *MF, @@ -53,7 +157,6 @@ SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg( if (!LaneVGPRs.count(LaneVGPRIdx)) { unsigned LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass); LaneVGPRs[LaneVGPRIdx] = LaneVGPR; - MRI.setPhysRegUsed(LaneVGPR); // Add this register as live-in to all blocks to avoid machine verifer // complaining about use of an undefined physical register. diff --git a/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h index 667da4c..9c528d6 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -26,13 +26,83 @@ class MachineRegisterInfo; /// This class keeps track of the SPI_SP_INPUT_ADDR config register, which /// tells the hardware which interpolation parameters to load. class SIMachineFunctionInfo : public AMDGPUMachineFunction { + // FIXME: This should be removed and getPreloadedValue moved here. + friend struct SIRegisterInfo; void anchor() override; unsigned TIDReg; - bool HasSpilledVGPRs; + + // Registers that may be reserved for spilling purposes. These may be the same + // as the input registers. + unsigned ScratchRSrcReg; + unsigned ScratchWaveOffsetReg; + + // Input registers setup for the HSA ABI. + // User SGPRs in allocation order. + unsigned PrivateSegmentBufferUserSGPR; + unsigned DispatchPtrUserSGPR; + unsigned QueuePtrUserSGPR; + unsigned KernargSegmentPtrUserSGPR; + unsigned DispatchIDUserSGPR; + unsigned FlatScratchInitUserSGPR; + unsigned PrivateSegmentSizeUserSGPR; + unsigned GridWorkGroupCountXUserSGPR; + unsigned GridWorkGroupCountYUserSGPR; + unsigned GridWorkGroupCountZUserSGPR; + + // System SGPRs in allocation order. 
+ unsigned WorkGroupIDXSystemSGPR; + unsigned WorkGroupIDYSystemSGPR; + unsigned WorkGroupIDZSystemSGPR; + unsigned WorkGroupInfoSystemSGPR; + unsigned PrivateSegmentWaveByteOffsetSystemSGPR; public: + // FIXME: Make private + unsigned LDSWaveSpillSize; + unsigned PSInputAddr; + std::map<unsigned, unsigned> LaneVGPRs; + unsigned ScratchOffsetReg; + unsigned NumUserSGPRs; + unsigned NumSystemSGPRs; + +private: + bool HasSpilledSGPRs; + bool HasSpilledVGPRs; + + // Feature bits required for inputs passed in user SGPRs. + bool PrivateSegmentBuffer : 1; + bool DispatchPtr : 1; + bool QueuePtr : 1; + bool DispatchID : 1; + bool KernargSegmentPtr : 1; + bool FlatScratchInit : 1; + bool GridWorkgroupCountX : 1; + bool GridWorkgroupCountY : 1; + bool GridWorkgroupCountZ : 1; + + // Feature bits required for inputs passed in system SGPRs. + bool WorkGroupIDX : 1; // Always initialized. + bool WorkGroupIDY : 1; + bool WorkGroupIDZ : 1; + bool WorkGroupInfo : 1; + bool PrivateSegmentWaveByteOffset : 1; + + bool WorkItemIDX : 1; // Always initialized. + bool WorkItemIDY : 1; + bool WorkItemIDZ : 1; + + MCPhysReg getNextUserSGPR() const { + assert(NumSystemSGPRs == 0 && "System SGPRs must be added after user SGPRs"); + return AMDGPU::SGPR0 + NumUserSGPRs; + } + + MCPhysReg getNextSystemSGPR() const { + return AMDGPU::SGPR0 + NumUserSGPRs + NumSystemSGPRs; + } + +public: struct SpilledReg { unsigned VGPR; int Lane; @@ -46,16 +116,162 @@ public: SIMachineFunctionInfo(const MachineFunction &MF); SpilledReg getSpilledReg(MachineFunction *MF, unsigned FrameIndex, unsigned SubIdx); - unsigned PSInputAddr; - unsigned NumUserSGPRs; - std::map<unsigned, unsigned> LaneVGPRs; - unsigned LDSWaveSpillSize; - unsigned ScratchOffsetReg; bool hasCalculatedTID() const { return TIDReg != AMDGPU::NoRegister; }; unsigned getTIDReg() const { return TIDReg; }; void setTIDReg(unsigned Reg) { TIDReg = Reg; } - bool hasSpilledVGPRs() const { return HasSpilledVGPRs; } - void setHasSpilledVGPRs(bool Spill = true) { HasSpilledVGPRs = Spill; } + + // Add user SGPRs. + unsigned addPrivateSegmentBuffer(const SIRegisterInfo &TRI); + unsigned addDispatchPtr(const SIRegisterInfo &TRI); + unsigned addQueuePtr(const SIRegisterInfo &TRI); + unsigned addKernargSegmentPtr(const SIRegisterInfo &TRI); + + // Add system SGPRs. 
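These adders hand out SGPRs strictly in ABI order: the user-SGPR adders declared above bump NumUserSGPRs, and the system-SGPR adders that follow below always land after every user SGPR (getNextUserSGPR asserts exactly that). A hedged sketch of a hypothetical driver, using only accessors visible in this patch:

    // Illustration only: request kernel inputs in allocation order.
    void allocateInputSGPRs(SIMachineFunctionInfo &MFI,
                            const SIRegisterInfo &TRI) {
      if (MFI.hasPrivateSegmentBuffer())
        MFI.addPrivateSegmentBuffer(TRI); // consumes 4 user SGPRs
      if (MFI.hasDispatchPtr())
        MFI.addDispatchPtr(TRI);          // consumes 2 user SGPRs
      if (MFI.hasKernargSegmentPtr())
        MFI.addKernargSegmentPtr(TRI);    // consumes 2 user SGPRs
      // System SGPRs may only be added once all user SGPRs are in place.
      if (MFI.hasWorkGroupIDX())
        MFI.addWorkGroupIDX();
      if (MFI.hasPrivateSegmentWaveByteOffset())
        MFI.addPrivateSegmentWaveByteOffset();
    }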
+ unsigned addWorkGroupIDX() { + WorkGroupIDXSystemSGPR = getNextSystemSGPR(); + NumSystemSGPRs += 1; + return WorkGroupIDXSystemSGPR; + } + + unsigned addWorkGroupIDY() { + WorkGroupIDYSystemSGPR = getNextSystemSGPR(); + NumSystemSGPRs += 1; + return WorkGroupIDYSystemSGPR; + } + + unsigned addWorkGroupIDZ() { + WorkGroupIDZSystemSGPR = getNextSystemSGPR(); + NumSystemSGPRs += 1; + return WorkGroupIDZSystemSGPR; + } + + unsigned addWorkGroupInfo() { + WorkGroupInfoSystemSGPR = getNextSystemSGPR(); + NumSystemSGPRs += 1; + return WorkGroupInfoSystemSGPR; + } + + unsigned addPrivateSegmentWaveByteOffset() { + PrivateSegmentWaveByteOffsetSystemSGPR = getNextSystemSGPR(); + NumSystemSGPRs += 1; + return PrivateSegmentWaveByteOffsetSystemSGPR; + } + + bool hasPrivateSegmentBuffer() const { + return PrivateSegmentBuffer; + } + + bool hasDispatchPtr() const { + return DispatchPtr; + } + + bool hasQueuePtr() const { + return QueuePtr; + } + + bool hasDispatchID() const { + return DispatchID; + } + + bool hasKernargSegmentPtr() const { + return KernargSegmentPtr; + } + + bool hasFlatScratchInit() const { + return FlatScratchInit; + } + + bool hasGridWorkgroupCountX() const { + return GridWorkgroupCountX; + } + + bool hasGridWorkgroupCountY() const { + return GridWorkgroupCountY; + } + + bool hasGridWorkgroupCountZ() const { + return GridWorkgroupCountZ; + } + + bool hasWorkGroupIDX() const { + return WorkGroupIDX; + } + + bool hasWorkGroupIDY() const { + return WorkGroupIDY; + } + + bool hasWorkGroupIDZ() const { + return WorkGroupIDZ; + } + + bool hasWorkGroupInfo() const { + return WorkGroupInfo; + } + + bool hasPrivateSegmentWaveByteOffset() const { + return PrivateSegmentWaveByteOffset; + } + + bool hasWorkItemIDX() const { + return WorkItemIDX; + } + + bool hasWorkItemIDY() const { + return WorkItemIDY; + } + + bool hasWorkItemIDZ() const { + return WorkItemIDZ; + } + + unsigned getNumUserSGPRs() const { + return NumUserSGPRs; + } + + unsigned getNumPreloadedSGPRs() const { + return NumUserSGPRs + NumSystemSGPRs; + } + + unsigned getPrivateSegmentWaveByteOffsetSystemSGPR() const { + return PrivateSegmentWaveByteOffsetSystemSGPR; + } + + /// \brief Returns the physical register reserved for use as the resource + /// descriptor for scratch accesses. 
+ unsigned getScratchRSrcReg() const { + return ScratchRSrcReg; + } + + void setScratchRSrcReg(unsigned Reg) { + assert(Reg != AMDGPU::NoRegister && "Should never be unset"); + ScratchRSrcReg = Reg; + } + + unsigned getScratchWaveOffsetReg() const { + return ScratchWaveOffsetReg; + } + + void setScratchWaveOffsetReg(unsigned Reg) { + assert(Reg != AMDGPU::NoRegister && "Should never be unset"); + ScratchWaveOffsetReg = Reg; + } + + bool hasSpilledSGPRs() const { + return HasSpilledSGPRs; + } + + void setHasSpilledSGPRs(bool Spill = true) { + HasSpilledSGPRs = Spill; + } + + bool hasSpilledVGPRs() const { + return HasSpilledVGPRs; + } + + void setHasSpilledVGPRs(bool Spill = true) { + HasSpilledVGPRs = Spill; + } unsigned getMaximumWorkGroupSize(const MachineFunction &MF) const; }; diff --git a/contrib/llvm/lib/Target/AMDGPU/SIPrepareScratchRegs.cpp b/contrib/llvm/lib/Target/AMDGPU/SIPrepareScratchRegs.cpp deleted file mode 100644 index 2cd600d..0000000 --- a/contrib/llvm/lib/Target/AMDGPU/SIPrepareScratchRegs.cpp +++ /dev/null @@ -1,193 +0,0 @@ -//===-- SIPrepareScratchRegs.cpp - Use predicates for control flow --------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// -/// This pass loads scratch pointer and scratch offset into a register or a -/// frame index which can be used anywhere in the program. These values will -/// be used for spilling VGPRs. -/// -//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "AMDGPUSubtarget.h" -#include "SIDefines.h" -#include "SIInstrInfo.h" -#include "SIMachineFunctionInfo.h" -#include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/RegisterScavenging.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/LLVMContext.h" - -using namespace llvm; - -namespace { - -class SIPrepareScratchRegs : public MachineFunctionPass { - -private: - static char ID; - -public: - SIPrepareScratchRegs() : MachineFunctionPass(ID) { } - - bool runOnMachineFunction(MachineFunction &MF) override; - - const char *getPassName() const override { - return "SI prepare scratch registers"; - } - -}; - -} // End anonymous namespace - -char SIPrepareScratchRegs::ID = 0; - -FunctionPass *llvm::createSIPrepareScratchRegs() { - return new SIPrepareScratchRegs(); -} - -bool SIPrepareScratchRegs::runOnMachineFunction(MachineFunction &MF) { - SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); - const SIInstrInfo *TII = - static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo()); - const SIRegisterInfo *TRI = &TII->getRegisterInfo(); - MachineRegisterInfo &MRI = MF.getRegInfo(); - MachineFrameInfo *FrameInfo = MF.getFrameInfo(); - MachineBasicBlock *Entry = MF.begin(); - MachineBasicBlock::iterator I = Entry->begin(); - DebugLoc DL = I->getDebugLoc(); - - // FIXME: If we don't have enough VGPRs for SGPR spilling we will need to - // run this pass. 
- if (!MFI->hasSpilledVGPRs()) - return false; - - unsigned ScratchPtrPreloadReg = - TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_PTR); - unsigned ScratchOffsetPreloadReg = - TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_WAVE_OFFSET); - - if (!Entry->isLiveIn(ScratchPtrPreloadReg)) - Entry->addLiveIn(ScratchPtrPreloadReg); - - if (!Entry->isLiveIn(ScratchOffsetPreloadReg)) - Entry->addLiveIn(ScratchOffsetPreloadReg); - - // Load the scratch offset. - unsigned ScratchOffsetReg = - TRI->findUnusedRegister(MRI, &AMDGPU::SGPR_32RegClass); - int ScratchOffsetFI = -1; - - if (ScratchOffsetReg != AMDGPU::NoRegister) { - // Found an SGPR to use - MRI.setPhysRegUsed(ScratchOffsetReg); - BuildMI(*Entry, I, DL, TII->get(AMDGPU::S_MOV_B32), ScratchOffsetReg) - .addReg(ScratchOffsetPreloadReg); - } else { - // No SGPR is available, we must spill. - ScratchOffsetFI = FrameInfo->CreateSpillStackObject(4,4); - BuildMI(*Entry, I, DL, TII->get(AMDGPU::SI_SPILL_S32_SAVE)) - .addReg(ScratchOffsetPreloadReg) - .addFrameIndex(ScratchOffsetFI) - .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef) - .addReg(AMDGPU::SGPR0, RegState::Undef); - } - - - // Now that we have the scratch pointer and offset values, we need to - // add them to all the SI_SPILL_V* instructions. - - RegScavenger RS; - unsigned ScratchRsrcFI = FrameInfo->CreateSpillStackObject(16, 4); - RS.addScavengingFrameIndex(ScratchRsrcFI); - - for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); - BI != BE; ++BI) { - - MachineBasicBlock &MBB = *BI; - // Add the scratch offset reg as a live-in so that the register scavenger - // doesn't re-use it. - if (!MBB.isLiveIn(ScratchOffsetReg) && - ScratchOffsetReg != AMDGPU::NoRegister) - MBB.addLiveIn(ScratchOffsetReg); - RS.enterBasicBlock(&MBB); - - for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); - I != E; ++I) { - MachineInstr &MI = *I; - RS.forward(I); - DebugLoc DL = MI.getDebugLoc(); - if (!TII->isVGPRSpill(MI.getOpcode())) - continue; - - // Scratch resource - unsigned ScratchRsrcReg = - RS.scavengeRegister(&AMDGPU::SReg_128RegClass, 0); - - uint64_t Rsrc23 = TII->getScratchRsrcWords23(); - - unsigned Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0); - unsigned Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1); - unsigned Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2); - unsigned Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3); - - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc0) - .addExternalSymbol("SCRATCH_RSRC_DWORD0") - .addReg(ScratchRsrcReg, RegState::ImplicitDefine); - - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc1) - .addExternalSymbol("SCRATCH_RSRC_DWORD1") - .addReg(ScratchRsrcReg, RegState::ImplicitDefine); - - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc2) - .addImm(Rsrc23 & 0xffffffff) - .addReg(ScratchRsrcReg, RegState::ImplicitDefine); - - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc3) - .addImm(Rsrc23 >> 32) - .addReg(ScratchRsrcReg, RegState::ImplicitDefine); - - // Scratch Offset - if (ScratchOffsetReg == AMDGPU::NoRegister) { - ScratchOffsetReg = RS.scavengeRegister(&AMDGPU::SGPR_32RegClass, 0); - BuildMI(MBB, I, DL, TII->get(AMDGPU::SI_SPILL_S32_RESTORE), - ScratchOffsetReg) - .addFrameIndex(ScratchOffsetFI) - .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef) - .addReg(AMDGPU::SGPR0, RegState::Undef); - } else if (!MBB.isLiveIn(ScratchOffsetReg)) { - MBB.addLiveIn(ScratchOffsetReg); - } - - if (ScratchRsrcReg == AMDGPU::NoRegister || - ScratchOffsetReg == AMDGPU::NoRegister) { - 
LLVMContext &Ctx = MF.getFunction()->getContext(); - Ctx.emitError("ran out of SGPRs for spilling VGPRs"); - ScratchRsrcReg = AMDGPU::SGPR0; - ScratchOffsetReg = AMDGPU::SGPR0; - } - MI.getOperand(2).setReg(ScratchRsrcReg); - MI.getOperand(2).setIsKill(true); - MI.getOperand(2).setIsUndef(false); - MI.getOperand(3).setReg(ScratchOffsetReg); - MI.getOperand(3).setIsUndef(false); - MI.getOperand(3).setIsKill(false); - MI.addOperand(MachineOperand::CreateReg(Rsrc0, false, true, true)); - MI.addOperand(MachineOperand::CreateReg(Rsrc1, false, true, true)); - MI.addOperand(MachineOperand::CreateReg(Rsrc2, false, true, true)); - MI.addOperand(MachineOperand::CreateReg(Rsrc3, false, true, true)); - } - } - return true; -} diff --git a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index e9e8412..3cdffef 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -12,7 +12,6 @@ // //===----------------------------------------------------------------------===// - #include "SIRegisterInfo.h" #include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" @@ -33,6 +32,40 @@ void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved, unsigned Reg) co Reserved.set(*R); } +unsigned SIRegisterInfo::reservedPrivateSegmentBufferReg( + const MachineFunction &MF) const { + const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>(); + if (ST.hasSGPRInitBug()) { + unsigned BaseIdx = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 4 - 4; + unsigned BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx)); + return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SReg_128RegClass); + } + + if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + // 98/99 need to be reserved for flat_scr, and 100/101 for vcc. This is the + // next sgpr128 down. + return AMDGPU::SGPR92_SGPR93_SGPR94_SGPR95; + } + + return AMDGPU::SGPR96_SGPR97_SGPR98_SGPR99; +} + +unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg( + const MachineFunction &MF) const { + const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>(); + if (ST.hasSGPRInitBug()) { + unsigned Idx = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 4 - 5; + return AMDGPU::SGPR_32RegClass.getRegister(Idx); + } + + if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + // Next register before reservations for flat_scr and vcc. + return AMDGPU::SGPR97; + } + + return AMDGPU::SGPR95; +} + BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { BitVector Reserved(getNumRegs()); Reserved.set(AMDGPU::INDIRECT_BASE_ADDR); @@ -42,13 +75,22 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { reserveRegisterTuples(Reserved, AMDGPU::EXEC); reserveRegisterTuples(Reserved, AMDGPU::FLAT_SCR); - // Reserve some VGPRs to use as temp registers in case we have to spill VGPRs - reserveRegisterTuples(Reserved, AMDGPU::VGPR254); - reserveRegisterTuples(Reserved, AMDGPU::VGPR255); + // Reserve the last 2 registers so we will always have at least 2 more that + // will physically contain VCC. + reserveRegisterTuples(Reserved, AMDGPU::SGPR102_SGPR103); + + const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>(); + + if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + // SI/CI have 104 SGPRs. VI has 102. We need to shift down the reservation + // for VCC/FLAT_SCR. 
+ reserveRegisterTuples(Reserved, AMDGPU::SGPR98_SGPR99); + reserveRegisterTuples(Reserved, AMDGPU::SGPR100_SGPR101); + } // Tonga and Iceland can only allocate a fixed number of SGPRs due // to a hw bug. - if (MF.getSubtarget<AMDGPUSubtarget>().hasSGPRInitBug()) { + if (ST.hasSGPRInitBug()) { unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs(); // Reserve some SGPRs for FLAT_SCRATCH and VCC (4 SGPRs). // Assume XNACK_MASK is unused. @@ -60,34 +102,57 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { } } + const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + + unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg(); + if (ScratchWaveOffsetReg != AMDGPU::NoRegister) { + // Reserve 1 SGPR for scratch wave offset in case we need to spill. + reserveRegisterTuples(Reserved, ScratchWaveOffsetReg); + } + + unsigned ScratchRSrcReg = MFI->getScratchRSrcReg(); + if (ScratchRSrcReg != AMDGPU::NoRegister) { + // Reserve 4 SGPRs for the scratch buffer resource descriptor in case we need + // to spill. + // TODO: May need to reserve a VGPR if doing LDS spilling. + reserveRegisterTuples(Reserved, ScratchRSrcReg); + assert(!isSubRegister(ScratchRSrcReg, ScratchWaveOffsetReg)); + } + return Reserved; } unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF, unsigned Idx) const { - const AMDGPUSubtarget &STI = MF.getSubtarget<AMDGPUSubtarget>(); // FIXME: We should adjust the max number of waves based on LDS size. unsigned SGPRLimit = getNumSGPRsAllowed(STI.getGeneration(), STI.getMaxWavesPerCU()); unsigned VGPRLimit = getNumVGPRsAllowed(STI.getMaxWavesPerCU()); + unsigned VSLimit = SGPRLimit + VGPRLimit; + for (regclass_iterator I = regclass_begin(), E = regclass_end(); I != E; ++I) { + const TargetRegisterClass *RC = *I; - unsigned NumSubRegs = std::max((int)(*I)->getSize() / 4, 1); + unsigned NumSubRegs = std::max((int)RC->getSize() / 4, 1); unsigned Limit; - if (isSGPRClass(*I)) { + if (isPseudoRegClass(RC)) { + // FIXME: This is a hack. We should never be considering the pressure of + // these since no virtual register should ever have this class. + Limit = VSLimit; + } else if (isSGPRClass(RC)) { Limit = SGPRLimit / NumSubRegs; } else { Limit = VGPRLimit / NumSubRegs; } - const int *Sets = getRegClassPressureSets(*I); + const int *Sets = getRegClassPressureSets(RC); assert(Sets); for (unsigned i = 0; Sets[i] != -1; ++i) { - if (Sets[i] == (int)Idx) + if (Sets[i] == (int)Idx) return Limit; } } @@ -174,17 +239,17 @@ void SIRegisterInfo::buildScratchLoadStore(MachineBasicBlock::iterator MI, unsigned SubReg = NumSubRegs > 1 ? 
getPhysRegSubReg(Value, &AMDGPU::VGPR_32RegClass, i) : Value; - bool IsKill = (i == e - 1); BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp)) - .addReg(SubReg, getDefRegState(IsLoad)) - .addReg(ScratchRsrcReg, getKillRegState(IsKill)) - .addReg(SOffset) - .addImm(Offset) - .addImm(0) // glc - .addImm(0) // slc - .addImm(0) // tfe - .addReg(Value, RegState::Implicit | getDefRegState(IsLoad)); + .addReg(SubReg, getDefRegState(IsLoad)) + .addReg(ScratchRsrcReg) + .addReg(SOffset) + .addImm(Offset) + .addImm(0) // glc + .addImm(0) // slc + .addImm(0) // tfe + .addReg(Value, RegState::Implicit | getDefRegState(IsLoad)) + .setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); } } @@ -228,6 +293,9 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, .addReg(SubReg) .addImm(Spill.Lane); + // FIXME: Since this spills to another register instead of an actual + // frame index, we should delete the frame index when all references to + // it are fixed. } MI->eraseFromParent(); break; @@ -263,16 +331,17 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, // TODO: only do this when it is needed switch (MF->getSubtarget<AMDGPUSubtarget>().getGeneration()) { case AMDGPUSubtarget::SOUTHERN_ISLANDS: - // "VALU writes SGPR" -> "SMRD reads that SGPR" needs "S_NOP 3" on SI - TII->insertNOPs(MI, 3); + // "VALU writes SGPR" -> "SMRD reads that SGPR" needs 4 wait states + // ("S_NOP 3") on SI + TII->insertWaitStates(MI, 4); break; case AMDGPUSubtarget::SEA_ISLANDS: break; default: // VOLCANIC_ISLANDS and later - // "VALU writes SGPR -> VMEM reads that SGPR" needs "S_NOP 4" on VI - // and later. This also applies to VALUs which write VCC, but we're - // unlikely to see VMEM use VCC. - TII->insertNOPs(MI, 4); + // "VALU writes SGPR -> VMEM reads that SGPR" needs 5 wait states + // ("S_NOP 4") on VI and later. This also applies to VALUs which write + // VCC, but we're unlikely to see VMEM use VCC. + TII->insertWaitStates(MI, 5); } MI->eraseFromParent(); @@ -322,22 +391,16 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, } } -const TargetRegisterClass * SIRegisterInfo::getCFGStructurizerRegClass( - MVT VT) const { - switch(VT.SimpleTy) { - default: - case MVT::i32: return &AMDGPU::VGPR_32RegClass; - } -} - unsigned SIRegisterInfo::getHWRegIndex(unsigned Reg) const { return getEncodingValue(Reg) & 0xff; } +// FIXME: This is very slow. It might be worth creating a map from physreg to +// register class. const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const { assert(!TargetRegisterInfo::isVirtualRegister(Reg)); - static const TargetRegisterClass *BaseClasses[] = { + static const TargetRegisterClass *const BaseClasses[] = { &AMDGPU::VGPR_32RegClass, &AMDGPU::SReg_32RegClass, &AMDGPU::VReg_64RegClass, @@ -359,33 +422,45 @@ const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const { return nullptr; } +// TODO: It might be helpful to have some target specific flags in +// TargetRegisterClass to mark which classes are VGPRs to make this trivial. 
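Usage note (an assumption-level sketch, not from the patch) ahead of the size-based rewrite of hasVGPRs that follows: callers can combine these helpers to classify any register, virtual or physical.

    // getPhysRegClass and hasVGPRs are the SIRegisterInfo methods in this
    // file; virtual registers carry their class in MachineRegisterInfo.
    static bool isVectorRegister(const SIRegisterInfo &TRI,
                                 const MachineRegisterInfo &MRI, unsigned Reg) {
      const TargetRegisterClass *RC = TargetRegisterInfo::isVirtualRegister(Reg)
                                          ? MRI.getRegClass(Reg)
                                          : TRI.getPhysRegClass(Reg);
      return TRI.hasVGPRs(RC);
    }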
 bool SIRegisterInfo::hasVGPRs(const TargetRegisterClass *RC) const {
-  return getCommonSubClass(&AMDGPU::VGPR_32RegClass, RC) ||
-         getCommonSubClass(&AMDGPU::VReg_64RegClass, RC) ||
-         getCommonSubClass(&AMDGPU::VReg_96RegClass, RC) ||
-         getCommonSubClass(&AMDGPU::VReg_128RegClass, RC) ||
-         getCommonSubClass(&AMDGPU::VReg_256RegClass, RC) ||
-         getCommonSubClass(&AMDGPU::VReg_512RegClass, RC);
+  switch (RC->getSize()) {
+  case 4:
+    return getCommonSubClass(&AMDGPU::VGPR_32RegClass, RC) != nullptr;
+  case 8:
+    return getCommonSubClass(&AMDGPU::VReg_64RegClass, RC) != nullptr;
+  case 12:
+    return getCommonSubClass(&AMDGPU::VReg_96RegClass, RC) != nullptr;
+  case 16:
+    return getCommonSubClass(&AMDGPU::VReg_128RegClass, RC) != nullptr;
+  case 32:
+    return getCommonSubClass(&AMDGPU::VReg_256RegClass, RC) != nullptr;
+  case 64:
+    return getCommonSubClass(&AMDGPU::VReg_512RegClass, RC) != nullptr;
+  default:
+    llvm_unreachable("Invalid register class size");
+  }
 }

 const TargetRegisterClass *SIRegisterInfo::getEquivalentVGPRClass(
                                          const TargetRegisterClass *SRC) const {
-  if (hasVGPRs(SRC)) {
-    return SRC;
-  } else if (SRC == &AMDGPU::SCCRegRegClass) {
-    return &AMDGPU::VCCRegRegClass;
-  } else if (getCommonSubClass(SRC, &AMDGPU::SGPR_32RegClass)) {
-    return &AMDGPU::VGPR_32RegClass;
-  } else if (getCommonSubClass(SRC, &AMDGPU::SGPR_64RegClass)) {
-    return &AMDGPU::VReg_64RegClass;
-  } else if (getCommonSubClass(SRC, &AMDGPU::SReg_128RegClass)) {
-    return &AMDGPU::VReg_128RegClass;
-  } else if (getCommonSubClass(SRC, &AMDGPU::SReg_256RegClass)) {
-    return &AMDGPU::VReg_256RegClass;
-  } else if (getCommonSubClass(SRC, &AMDGPU::SReg_512RegClass)) {
-    return &AMDGPU::VReg_512RegClass;
-  }
-  return nullptr;
+  switch (SRC->getSize()) {
+  case 4:
+    return &AMDGPU::VGPR_32RegClass;
+  case 8:
+    return &AMDGPU::VReg_64RegClass;
+  case 12:
+    return &AMDGPU::VReg_96RegClass;
+  case 16:
+    return &AMDGPU::VReg_128RegClass;
+  case 32:
+    return &AMDGPU::VReg_256RegClass;
+  case 64:
+    return &AMDGPU::VReg_512RegClass;
+  default:
+    llvm_unreachable("Invalid register class size");
+  }
 }

 const TargetRegisterClass *SIRegisterInfo::getSubRegClass(
@@ -402,6 +477,30 @@ const TargetRegisterClass *SIRegisterInfo::getSubRegClass(
   }
 }

+bool SIRegisterInfo::shouldRewriteCopySrc(
+  const TargetRegisterClass *DefRC,
+  unsigned DefSubReg,
+  const TargetRegisterClass *SrcRC,
+  unsigned SrcSubReg) const {
+  // We want to prefer the smallest register class possible, so we don't want to
+  // stop and rewrite on anything that looks like a subregister
+  // extract. Operations mostly don't care about the super register class, so we
+  // only want to stop on the most basic of copies between the same register
+  // class.
+  //
+  // e.g. if we have something like
+  // vreg0 = ...
+  // vreg1 = ...
+  // vreg2 = REG_SEQUENCE vreg0, sub0, vreg1, sub1, vreg2, sub2
+  // vreg3 = COPY vreg2, sub0
+  //
+  // We want to look through the COPY to find:
+  //  => vreg3 = COPY vreg0
+
+  // Plain copy.
+  return getCommonSubClass(DefRC, SrcRC) != nullptr;
+}
+
 unsigned SIRegisterInfo::getPhysRegSubReg(unsigned Reg,
                                           const TargetRegisterClass *SubRC,
                                           unsigned Channel) const {
@@ -462,30 +561,47 @@ bool SIRegisterInfo::opCanUseInlineConstant(unsigned OpType) const {
   return OpType == AMDGPU::OPERAND_REG_INLINE_C;
 }

+// FIXME: Most of these are flexible with HSA and we don't need to reserve them
+// as input registers if unused. Whether the dispatch ptr is necessary should be
+// easy to detect from used intrinsics. Scratch setup is harder to know.
unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF, enum PreloadedValue Value) const { const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>(); + (void)ST; switch (Value) { - case SIRegisterInfo::TGID_X: - return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 0); - case SIRegisterInfo::TGID_Y: - return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 1); - case SIRegisterInfo::TGID_Z: - return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 2); - case SIRegisterInfo::SCRATCH_WAVE_OFFSET: - if (MFI->getShaderType() != ShaderType::COMPUTE) - return MFI->ScratchOffsetReg; - return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 4); - case SIRegisterInfo::SCRATCH_PTR: - return AMDGPU::SGPR2_SGPR3; - case SIRegisterInfo::INPUT_PTR: - return AMDGPU::SGPR0_SGPR1; - case SIRegisterInfo::TIDIG_X: + case SIRegisterInfo::WORKGROUP_ID_X: + assert(MFI->hasWorkGroupIDX()); + return MFI->WorkGroupIDXSystemSGPR; + case SIRegisterInfo::WORKGROUP_ID_Y: + assert(MFI->hasWorkGroupIDY()); + return MFI->WorkGroupIDYSystemSGPR; + case SIRegisterInfo::WORKGROUP_ID_Z: + assert(MFI->hasWorkGroupIDZ()); + return MFI->WorkGroupIDZSystemSGPR; + case SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET: + return MFI->PrivateSegmentWaveByteOffsetSystemSGPR; + case SIRegisterInfo::PRIVATE_SEGMENT_BUFFER: + assert(ST.isAmdHsaOS() && "Non-HSA ABI currently uses relocations"); + assert(MFI->hasPrivateSegmentBuffer()); + return MFI->PrivateSegmentBufferUserSGPR; + case SIRegisterInfo::KERNARG_SEGMENT_PTR: + assert(MFI->hasKernargSegmentPtr()); + return MFI->KernargSegmentPtrUserSGPR; + case SIRegisterInfo::DISPATCH_PTR: + assert(MFI->hasDispatchPtr()); + return MFI->DispatchPtrUserSGPR; + case SIRegisterInfo::QUEUE_PTR: + llvm_unreachable("not implemented"); + case SIRegisterInfo::WORKITEM_ID_X: + assert(MFI->hasWorkItemIDX()); return AMDGPU::VGPR0; - case SIRegisterInfo::TIDIG_Y: + case SIRegisterInfo::WORKITEM_ID_Y: + assert(MFI->hasWorkItemIDY()); return AMDGPU::VGPR1; - case SIRegisterInfo::TIDIG_Z: + case SIRegisterInfo::WORKITEM_ID_Z: + assert(MFI->hasWorkItemIDZ()); return AMDGPU::VGPR2; } llvm_unreachable("unexpected preloaded value type"); @@ -496,12 +612,9 @@ unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF, // AMDGPU::NoRegister. unsigned SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI, const TargetRegisterClass *RC) const { - - for (TargetRegisterClass::iterator I = RC->begin(), E = RC->end(); - I != E; ++I) { - if (!MRI.isPhysRegUsed(*I)) - return *I; - } + for (unsigned Reg : *RC) + if (!MRI.isPhysRegUsed(Reg)) + return Reg; return AMDGPU::NoRegister; } diff --git a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.h index 7da6de2..1795237 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.h @@ -18,6 +18,7 @@ #include "AMDGPURegisterInfo.h" #include "AMDGPUSubtarget.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/Support/Debug.h" namespace llvm { @@ -29,6 +30,15 @@ private: public: SIRegisterInfo(); + /// Return the end register initially reserved for the scratch buffer in case + /// spilling is needed. + unsigned reservedPrivateSegmentBufferReg(const MachineFunction &MF) const; + + /// Return the end register initially reserved for the scratch wave offset in + /// case spilling is needed. 
+  unsigned reservedPrivateSegmentWaveByteOffsetReg(
+    const MachineFunction &MF) const;
+
   BitVector getReservedRegs(const MachineFunction &MF) const override;

   unsigned getRegPressureSetLimit(const MachineFunction &MF,
@@ -40,10 +50,6 @@ public:
                            unsigned FIOperandNum,
                            RegScavenger *RS) const override;

-  /// \brief get the register class of the specified type to use in the
-  /// CFGStructurizer
-  const TargetRegisterClass * getCFGStructurizerRegClass(MVT VT) const override;
-
   unsigned getHWRegIndex(unsigned Reg) const override;

   /// \brief Return the 'base' register class for this register.
@@ -52,23 +58,30 @@ public:

   /// \returns true if this class contains only SGPR registers
   bool isSGPRClass(const TargetRegisterClass *RC) const {
-    if (!RC)
-      return false;
-
     return !hasVGPRs(RC);
   }

   /// \returns true if this class ID contains only SGPR registers
   bool isSGPRClassID(unsigned RCID) const {
-    if (static_cast<int>(RCID) == -1)
-      return false;
-
     return isSGPRClass(getRegClass(RCID));
   }

+  bool isSGPRReg(const MachineRegisterInfo &MRI, unsigned Reg) const {
+    if (TargetRegisterInfo::isVirtualRegister(Reg))
+      return isSGPRClass(MRI.getRegClass(Reg));
+    return isSGPRClass(getPhysRegClass(Reg));
+  }
+
   /// \returns true if this class contains VGPR registers.
   bool hasVGPRs(const TargetRegisterClass *RC) const;

+  /// returns true if this is a pseudoregister class combination of VGPRs and
+  /// SGPRs for operand modeling. FIXME: We should set isAllocatable = 0 on
+  /// them.
+  static bool isPseudoRegClass(const TargetRegisterClass *RC) {
+    return RC == &AMDGPU::VS_32RegClass || RC == &AMDGPU::VS_64RegClass;
+  }
+
   /// \returns A VGPR reg class with the same width as \p SRC
   const TargetRegisterClass *getEquivalentVGPRClass(
                                           const TargetRegisterClass *SRC) const;
@@ -79,6 +92,11 @@ public:
   const TargetRegisterClass *getSubRegClass(const TargetRegisterClass *RC,
                                             unsigned SubIdx) const;

+  bool shouldRewriteCopySrc(const TargetRegisterClass *DefRC,
+                            unsigned DefSubReg,
+                            const TargetRegisterClass *SrcRC,
+                            unsigned SrcSubReg) const override;
+
   /// \p Channel This is the register channel (e.g. a value from 0-16), not the
   /// SubReg index.
   /// \returns The sub-register of Reg that is in Channel.
@@ -91,19 +109,25 @@ public:

   /// \returns True if operands defined with this operand type can accept
   /// an inline constant. i.e. An integer value in the range (-16, 64) or
-  /// -4.0f, -2.0f, -1.0f, -0.5f, 0.0f, 0.5f, 1.0f, 2.0f, 4.0f. 
+  /// -4.0f, -2.0f, -1.0f, -0.5f, 0.0f, 0.5f, 1.0f, 2.0f, 4.0f.
   bool opCanUseInlineConstant(unsigned OpType) const;

   enum PreloadedValue {
-    TGID_X,
-    TGID_Y,
-    TGID_Z,
-    SCRATCH_WAVE_OFFSET,
-    SCRATCH_PTR,
-    INPUT_PTR,
-    TIDIG_X,
-    TIDIG_Y,
-    TIDIG_Z
+    // SGPRS:
+    PRIVATE_SEGMENT_BUFFER = 0,
+    DISPATCH_PTR = 1,
+    QUEUE_PTR = 2,
+    KERNARG_SEGMENT_PTR = 3,
+    WORKGROUP_ID_X = 10,
+    WORKGROUP_ID_Y = 11,
+    WORKGROUP_ID_Z = 12,
+    PRIVATE_SEGMENT_WAVE_BYTE_OFFSET = 14,
+
+    // VGPRS:
+    FIRST_VGPR_VALUE = 15,
+    WORKITEM_ID_X = FIRST_VGPR_VALUE,
+    WORKITEM_ID_Y = 16,
+    WORKITEM_ID_Z = 17
   };

   /// \brief Returns the physical register that \p Value is stored in.
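For reference, a short usage sketch of getPreloadedValue with the renamed PreloadedValue entries (hypothetical caller; it relies only on interfaces visible in this patch):

    // The rewritten getPreloadedValue asserts that the requested input was
    // actually reserved, so consult the SIMachineFunctionInfo bit first.
    static unsigned getWorkGroupIDXReg(const MachineFunction &MF,
                                       const SIRegisterInfo &TRI) {
      const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
      assert(MFI->hasWorkGroupIDX() && "workgroup ID X input not reserved");
      return TRI.getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_X);
    }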
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index 2a9017f..bfaf937 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -10,10 +10,13 @@ //===----------------------------------------------------------------------===// // Declarations that describe the SI registers //===----------------------------------------------------------------------===// - -class SIReg <string n, bits<16> encoding = 0> : Register<n> { +class SIReg <string n, bits<16> regIdx = 0> : Register<n>, + DwarfRegNum<[!cast<int>(HWEncoding)]> { let Namespace = "AMDGPU"; - let HWEncoding = encoding; + + // This is the not yet the complete register encoding. An additional + // bit is set for VGPRs. + let HWEncoding = regIdx; } // Special Registers @@ -21,7 +24,8 @@ def VCC_LO : SIReg<"vcc_lo", 106>; def VCC_HI : SIReg<"vcc_hi", 107>; // VCC for 64-bit instructions -def VCC : RegisterWithSubRegs<"vcc", [VCC_LO, VCC_HI]> { +def VCC : RegisterWithSubRegs<"vcc", [VCC_LO, VCC_HI]>, + DwarfRegAlias<VCC_LO> { let Namespace = "AMDGPU"; let SubRegIndices = [sub0, sub1]; let HWEncoding = 106; @@ -30,7 +34,8 @@ def VCC : RegisterWithSubRegs<"vcc", [VCC_LO, VCC_HI]> { def EXEC_LO : SIReg<"exec_lo", 126>; def EXEC_HI : SIReg<"exec_hi", 127>; -def EXEC : RegisterWithSubRegs<"EXEC", [EXEC_LO, EXEC_HI]> { +def EXEC : RegisterWithSubRegs<"EXEC", [EXEC_LO, EXEC_HI]>, + DwarfRegAlias<EXEC_LO> { let Namespace = "AMDGPU"; let SubRegIndices = [sub0, sub1]; let HWEncoding = 126; @@ -39,18 +44,29 @@ def EXEC : RegisterWithSubRegs<"EXEC", [EXEC_LO, EXEC_HI]> { def SCC : SIReg<"scc", 253>; def M0 : SIReg <"m0", 124>; -def FLAT_SCR_LO : SIReg<"flat_scr_lo", 104>; // Offset in units of 256-bytes. -def FLAT_SCR_HI : SIReg<"flat_scr_hi", 105>; // Size is the per-thread scratch size, in bytes. +multiclass FLAT_SCR_LOHI_m <string n, bits<16> ci_e, bits<16> vi_e> { + def _ci : SIReg<n, ci_e>; + def _vi : SIReg<n, vi_e>; + def "" : SIReg<"", 0>; +} -// Pair to indicate location of scratch space for flat accesses. -def FLAT_SCR : RegisterWithSubRegs <"flat_scr", [FLAT_SCR_LO, FLAT_SCR_HI]> { +class FlatReg <Register lo, Register hi, bits<16> encoding> : + RegisterWithSubRegs<"flat_scratch", [lo, hi]>, + DwarfRegAlias<lo> { let Namespace = "AMDGPU"; let SubRegIndices = [sub0, sub1]; - let HWEncoding = 104; + let HWEncoding = encoding; } +defm FLAT_SCR_LO : FLAT_SCR_LOHI_m<"flat_scratch_lo", 104, 102>; // Offset in units of 256-bytes. +defm FLAT_SCR_HI : FLAT_SCR_LOHI_m<"flat_scratch_hi", 105, 103>; // Size is the per-thread scratch size, in bytes. + +def FLAT_SCR_ci : FlatReg<FLAT_SCR_LO_ci, FLAT_SCR_HI_ci, 104>; +def FLAT_SCR_vi : FlatReg<FLAT_SCR_LO_vi, FLAT_SCR_HI_vi, 102>; +def FLAT_SCR : FlatReg<FLAT_SCR_LO, FLAT_SCR_HI, 0>; + // SGPR registers -foreach Index = 0-101 in { +foreach Index = 0-103 in { def SGPR#Index : SIReg <"SGPR"#Index, Index>; } @@ -65,25 +81,27 @@ foreach Index = 0-255 in { // Groupings using register classes and tuples //===----------------------------------------------------------------------===// +// TODO: Do we need to set DwarfRegAlias on register tuples? 
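A note on the RegisterTuples syntax in the groupings below: (decimate SGPR_32, 2) keeps every second register starting at SGPR0, and (decimate (shl SGPR_32, 1), 2) every second starting at SGPR1, so the zipped sub0/sub1 operand lists form aligned even/odd pairs. With the range widened to SGPR0-103, SGPR_64Regs enumerates s[0:1] through s[102:103] (standalone sketch, illustration only):

    #include <cstdio>
    // Prints the 52 even-aligned SGPR pairs covered by SGPR_64Regs.
    int main() {
      for (unsigned Lo = 0; Lo <= 102; Lo += 2)
        std::printf("s[%u:%u]\n", Lo, Lo + 1);
      return 0;
    }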
+
 // SGPR 32-bit registers
 def SGPR_32 : RegisterClass<"AMDGPU", [i32, f32], 32,
-                            (add (sequence "SGPR%u", 0, 101))>;
+                            (add (sequence "SGPR%u", 0, 103))>;
 
 // SGPR 64-bit registers
 def SGPR_64Regs : RegisterTuples<[sub0, sub1],
-                             [(add (decimate (trunc SGPR_32, 101), 2)),
+                             [(add (decimate SGPR_32, 2)),
                               (add (decimate (shl SGPR_32, 1), 2))]>;
 
 // SGPR 128-bit registers
 def SGPR_128 : RegisterTuples<[sub0, sub1, sub2, sub3],
-                              [(add (decimate (trunc SGPR_32, 99), 4)),
+                              [(add (decimate SGPR_32, 4)),
                                (add (decimate (shl SGPR_32, 1), 4)),
                                (add (decimate (shl SGPR_32, 2), 4)),
                                (add (decimate (shl SGPR_32, 3), 4))]>;
 
 // SGPR 256-bit registers
 def SGPR_256 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7],
-                              [(add (decimate (trunc SGPR_32, 95), 4)),
+                              [(add (decimate SGPR_32, 4)),
                                (add (decimate (shl SGPR_32, 1), 4)),
                                (add (decimate (shl SGPR_32, 2), 4)),
                                (add (decimate (shl SGPR_32, 3), 4)),
@@ -95,7 +113,7 @@ def SGPR_256 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7],
 // SGPR 512-bit registers
 def SGPR_512 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7,
                                sub8, sub9, sub10, sub11, sub12, sub13, sub14, sub15],
-                              [(add (decimate (trunc SGPR_32, 87), 4)),
+                              [(add (decimate SGPR_32, 4)),
                                (add (decimate (shl SGPR_32, 1), 4)),
                                (add (decimate (shl SGPR_32, 2), 4)),
                                (add (decimate (shl SGPR_32, 3), 4)),
@@ -174,44 +192,57 @@ class RegImmMatcher<string name> : AsmOperandClass {
   let RenderMethod = "addRegOrImmOperands";
 }
 
-// Special register classes for predicates and the M0 register
-def SCCReg : RegisterClass<"AMDGPU", [i32, i1], 32, (add SCC)> {
-  let CopyCost = -1; // Theoretically it is possible to read from SCC,
-                     // but it should never be necessary.
-}
-
-def VCCReg : RegisterClass<"AMDGPU", [i64, i1], 64, (add VCC)>;
-def EXECReg : RegisterClass<"AMDGPU", [i64, i1], 64, (add EXEC)>;
-
 // Register class for all scalar registers (SGPRs + Special Registers)
 def SReg_32 : RegisterClass<"AMDGPU", [i32, f32], 32,
   (add SGPR_32, M0, VCC_LO, VCC_HI, EXEC_LO, EXEC_HI, FLAT_SCR_LO, FLAT_SCR_HI)
 >;
 
-def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64], 64, (add SGPR_64Regs)>;
+def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64], 32, (add SGPR_64Regs)>;
 
-def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, i1], 64,
-  (add SGPR_64, VCCReg, EXECReg, FLAT_SCR)
+def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, i1], 32,
+  (add SGPR_64, VCC, EXEC, FLAT_SCR)
 >;
 
-def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v16i8], 128, (add SGPR_128)>;
+def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32, (add SGPR_128)> {
+  // Requires 2 s_mov_b64 to copy
+  let CopyCost = 2;
+}
 
-def SReg_256 : RegisterClass<"AMDGPU", [v32i8, v8i32, v8f32], 256, (add SGPR_256)>;
+def SReg_256 : RegisterClass<"AMDGPU", [v32i8, v8i32, v8f32], 32, (add SGPR_256)> {
+  // Requires 4 s_mov_b64 to copy
+  let CopyCost = 4;
+}
 
-def SReg_512 : RegisterClass<"AMDGPU", [v64i8, v16i32], 512, (add SGPR_512)>;
+def SReg_512 : RegisterClass<"AMDGPU", [v64i8, v16i32], 32, (add SGPR_512)> {
+  // Requires 8 s_mov_b64 to copy
+  let CopyCost = 8;
+}
 
 // Register class for all vector registers (VGPRs + Interpolation Registers)
-def VReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32], 64, (add VGPR_64)>;
+def VReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32], 32, (add VGPR_64)> {
+  // Requires 2 v_mov_b32 to copy
+  let CopyCost = 2;
+}
 
-def VReg_96 : RegisterClass<"AMDGPU", [untyped], 96, (add VGPR_96)> {
+def VReg_96 : RegisterClass<"AMDGPU", [untyped], 32, (add VGPR_96)> {
   let Size = 96;
+
+  // Requires 3 v_mov_b32 to copy
+  let CopyCost = 3;
 }
 
-def VReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32], 128, (add VGPR_128)>;
+def VReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64, v2f64], 32, (add VGPR_128)> {
+  // Requires 4 v_mov_b32 to copy
+  let CopyCost = 4;
+}
 
-def VReg_256 : RegisterClass<"AMDGPU", [v32i8, v8i32, v8f32], 256, (add VGPR_256)>;
+def VReg_256 : RegisterClass<"AMDGPU", [v32i8, v8i32, v8f32], 32, (add VGPR_256)> {
+  let CopyCost = 8;
+}
 
-def VReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 512, (add VGPR_512)>;
+def VReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32, (add VGPR_512)> {
+  let CopyCost = 16;
+}
 
 def VReg_1 : RegisterClass<"AMDGPU", [i1], 32, (add VGPR_32)> {
   let Size = 32;
@@ -253,7 +284,9 @@ def SCSrc_32 : RegInlineOperand<SReg_32> {
 
 def VS_32 : RegisterClass<"AMDGPU", [i32, f32], 32, (add VGPR_32, SReg_32)>;
 
-def VS_64 : RegisterClass<"AMDGPU", [i64, f64], 64, (add VReg_64, SReg_64)>;
+def VS_64 : RegisterClass<"AMDGPU", [i64, f64], 32, (add VReg_64, SReg_64)> {
+  let CopyCost = 2;
+}
 
 def VSrc_32 : RegisterOperand<VS_32> {
   let OperandNamespace = "AMDGPU";
@@ -282,3 +315,13 @@ def VCSrc_64 : RegisterOperand<VS_64> {
   let OperandType = "OPERAND_REG_INLINE_C";
   let ParserMatchClass = RegImmMatcher<"VCSrc64">;
 }
+
+//===----------------------------------------------------------------------===//
+// SCSrc_* Operands with an SGPR or an inline constant
+//===----------------------------------------------------------------------===//
+
+def SCSrc_64 : RegisterOperand<SReg_64> {
+  let OperandNamespace = "AMDGPU";
+  let OperandType = "OPERAND_REG_INLINE_C";
+  let ParserMatchClass = RegImmMatcher<"SCSrc64">;
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/SISchedule.td b/contrib/llvm/lib/Target/AMDGPU/SISchedule.td
index 9b1f676..cd77e51 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SISchedule.td
+++ b/contrib/llvm/lib/Target/AMDGPU/SISchedule.td
@@ -17,16 +17,28 @@ def WriteLDS : SchedWrite;
 def WriteSALU : SchedWrite;
 def WriteSMEM : SchedWrite;
 def WriteVMEM : SchedWrite;
+def WriteBarrier : SchedWrite;
 
 // Vector ALU instructions
 def Write32Bit : SchedWrite;
 def WriteQuarterRate32 : SchedWrite;
+def WriteFullOrQuarterRate32 : SchedWrite;
 
 def WriteFloatFMA : SchedWrite;
 
-def WriteDouble : SchedWrite;
+// Slow quarter rate f64 instruction.
+def WriteDouble : SchedWrite;
+
+// half rate f64 instruction (same as v_add_f64)
 def WriteDoubleAdd : SchedWrite;
 
+// Half rate 64-bit instructions.
+def Write64Bit : SchedWrite;
+
+// FIXME: Should there be a class for instructions which are VALU
+// instructions and have VALU rates, but write to the SALU (i.e. VOPC
+// instructions)
+
 def SIFullSpeedModel : SchedMachineModel;
 def SIQuarterSpeedModel : SchedMachineModel;
 
@@ -53,7 +65,7 @@ class HWVALUWriteRes<SchedWrite write, int latency> :
 
 // The latency numbers are taken from AMD Accelerated Parallel Processing
-// guide. They may not be acurate.
+// guide. They may not be accurate.
 
 // The latency values are 1 / (operations / cycle) / 4.
 multiclass SICommonWriteRes {
@@ -64,8 +76,10 @@ multiclass SICommonWriteRes {
   def : HWWriteRes<WriteSALU, [HWSALU], 1>;
   def : HWWriteRes<WriteSMEM, [HWLGKM], 10>; // XXX: Guessed ???
   def : HWWriteRes<WriteVMEM, [HWVMEM], 450>; // 300 - 600
+  def : HWWriteRes<WriteBarrier, [HWBranch], 500>; // XXX: Guessed ???
 
   def : HWVALUWriteRes<Write32Bit, 1>;
+  def : HWVALUWriteRes<Write64Bit, 2>;
   def : HWVALUWriteRes<WriteQuarterRate32, 4>;
 }
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/contrib/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
index 5d00bdd..4f0913f 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -141,8 +141,7 @@ static void foldImmediates(MachineInstr &MI, const SIInstrInfo *TII,
   if (!MRI.isSSA())
     return;
 
-  assert(TII->isVOP1(MI.getOpcode()) || TII->isVOP2(MI.getOpcode()) ||
-         TII->isVOPC(MI.getOpcode()));
+  assert(TII->isVOP1(MI) || TII->isVOP2(MI) || TII->isVOPC(MI));
 
   const SIRegisterInfo &TRI = TII->getRegisterInfo();
   int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
@@ -187,6 +186,21 @@ static void foldImmediates(MachineInstr &MI, const SIInstrInfo *TII,
 
 }
 
+// Copy MachineOperand with all flags except setting it as implicit.
+static MachineOperand copyRegOperandAsImplicit(const MachineOperand &Orig) {
+  assert(!Orig.isImplicit());
+  return MachineOperand::CreateReg(Orig.getReg(),
+                                   Orig.isDef(),
+                                   true,
+                                   Orig.isKill(),
+                                   Orig.isDead(),
+                                   Orig.isUndef(),
+                                   Orig.isEarlyClobber(),
+                                   Orig.getSubReg(),
+                                   Orig.isDebug(),
+                                   Orig.isInternalRead());
+}
+
 bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
   MachineRegisterInfo &MRI = MF.getRegInfo();
   const SIInstrInfo *TII =
@@ -236,14 +250,10 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
       if (TII->isVOPC(Op32)) {
         unsigned DstReg = MI.getOperand(0).getReg();
         if (TargetRegisterInfo::isVirtualRegister(DstReg)) {
-          // VOPC instructions can only write to the VCC register. We can't
-          // force them to use VCC here, because the register allocator has
-          // trouble with sequences like this, which cause the allocator to run
-          // out of registers if vreg0 and vreg1 belong to the VCCReg register
-          // class:
-          // vreg0 = VOPC;
-          // vreg1 = VOPC;
-          // S_AND_B64 vreg0, vreg1
+          // VOPC instructions can only write to the VCC register. We can't
+          // force them to use VCC here, because this is only one register and
+          // cannot deal with sequences which would require multiple copies of
+          // VCC, e.g. S_AND_B64 (vcc = V_CMP_...), (vcc = V_CMP_...)
           //
           // So, instead of forcing the instruction to write to VCC, we provide
           // a hint to the register allocator to use VCC and then we will run
@@ -272,13 +282,22 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
       }
 
       // We can shrink this instruction
-      DEBUG(dbgs() << "Shrinking "; MI.dump(); dbgs() << '\n';);
+      DEBUG(dbgs() << "Shrinking " << MI);
 
       MachineInstrBuilder Inst32 =
           BuildMI(MBB, I, MI.getDebugLoc(), TII->get(Op32));
 
-      // dst
-      Inst32.addOperand(MI.getOperand(0));
+      // Add the dst operand if the 32-bit encoding also has an explicit $dst.
+      // For VOPC instructions, this is replaced by an implicit def of vcc.
+      int Op32DstIdx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::dst);
+      if (Op32DstIdx != -1) {
+        // dst
+        Inst32.addOperand(MI.getOperand(0));
+      } else {
+        assert(MI.getOperand(0).getReg() == AMDGPU::VCC &&
+               "Unexpected case");
+      }
+
       Inst32.addOperand(*TII->getNamedOperand(MI, AMDGPU::OpName::src0));
 
@@ -288,9 +307,19 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
         Inst32.addOperand(*Src1);
 
       const MachineOperand *Src2 =
-          TII->getNamedOperand(MI, AMDGPU::OpName::src2);
-      if (Src2)
-        Inst32.addOperand(*Src2);
+          TII->getNamedOperand(MI, AMDGPU::OpName::src2);
+      if (Src2) {
+        int Op32Src2Idx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2);
+        if (Op32Src2Idx != -1) {
+          Inst32.addOperand(*Src2);
+        } else {
+          // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
+          // replaced with an implicit read of vcc.
+          assert(Src2->getReg() == AMDGPU::VCC &&
+                 "Unexpected missing register operand");
+          Inst32.addOperand(copyRegOperandAsImplicit(*Src2));
+        }
+      }
 
       ++NumInstructionsShrunk;
       MI.eraseFromParent();
diff --git a/contrib/llvm/lib/Target/AMDGPU/SITypeRewriter.cpp b/contrib/llvm/lib/Target/AMDGPU/SITypeRewriter.cpp
index 591ce85..dbdc76b 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SITypeRewriter.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/SITypeRewriter.cpp
@@ -22,6 +22,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "AMDGPU.h"
+#include "Utils/AMDGPUBaseInfo.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InstVisitor.h"
 
@@ -61,14 +62,7 @@ bool SITypeRewriter::doInitialization(Module &M) {
 }
 
 bool SITypeRewriter::runOnFunction(Function &F) {
-  Attribute A = F.getFnAttribute("ShaderType");
-
-  unsigned ShaderType = ShaderType::COMPUTE;
-  if (A.isStringAttribute()) {
-    StringRef Str = A.getValueAsString();
-    Str.getAsInteger(0, ShaderType);
-  }
-  if (ShaderType == ShaderType::COMPUTE)
+  if (AMDGPU::getShaderType(F) == ShaderType::COMPUTE)
     return false;
 
   visit(F);
diff --git a/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index b76b400..add415e 100644
--- a/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -7,12 +7,23 @@
 //
 //===----------------------------------------------------------------------===//
 #include "AMDGPUBaseInfo.h"
+#include "AMDGPU.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/SubtargetFeature.h"
 
 #define GET_SUBTARGETINFO_ENUM
 #include "AMDGPUGenSubtargetInfo.inc"
 #undef GET_SUBTARGETINFO_ENUM
 
+#define GET_REGINFO_ENUM
+#include "AMDGPUGenRegisterInfo.inc"
+#undef GET_REGINFO_ENUM
+
 namespace llvm {
 namespace AMDGPU {
 
@@ -56,5 +67,91 @@ void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header,
   Header.private_segment_alignment = 4;
 }
 
+MCSection *getHSATextSection(MCContext &Ctx) {
+  return Ctx.getELFSection(".hsatext", ELF::SHT_PROGBITS,
+                           ELF::SHF_ALLOC | ELF::SHF_WRITE |
+                           ELF::SHF_EXECINSTR |
+                           ELF::SHF_AMDGPU_HSA_AGENT |
+                           ELF::SHF_AMDGPU_HSA_CODE);
+}
+
+MCSection *getHSADataGlobalAgentSection(MCContext &Ctx) {
+  return Ctx.getELFSection(".hsadata_global_agent", ELF::SHT_PROGBITS,
+                           ELF::SHF_ALLOC | ELF::SHF_WRITE |
+                           ELF::SHF_AMDGPU_HSA_GLOBAL |
+                           ELF::SHF_AMDGPU_HSA_AGENT);
+}
+
+MCSection *getHSADataGlobalProgramSection(MCContext &Ctx) {
+  return Ctx.getELFSection(".hsadata_global_program", ELF::SHT_PROGBITS,
+                           ELF::SHF_ALLOC | ELF::SHF_WRITE |
+                           ELF::SHF_AMDGPU_HSA_GLOBAL);
+}
+
+MCSection *getHSARodataReadonlyAgentSection(MCContext &Ctx) {
+  return Ctx.getELFSection(".hsarodata_readonly_agent", ELF::SHT_PROGBITS,
+                           ELF::SHF_ALLOC | ELF::SHF_AMDGPU_HSA_READONLY |
+                           ELF::SHF_AMDGPU_HSA_AGENT);
+}
+
+bool isGroupSegment(const GlobalValue *GV) {
+  return GV->getType()->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
+}
+
+bool isGlobalSegment(const GlobalValue *GV) {
+  return GV->getType()->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;
+}
+
+bool isReadOnlySegment(const GlobalValue *GV) {
+  return GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS;
+}
+
+static const char ShaderTypeAttribute[] = "ShaderType";
+
+unsigned getShaderType(const Function &F) {
+  Attribute A = F.getFnAttribute(ShaderTypeAttribute);
+  unsigned ShaderType = ShaderType::COMPUTE;
+
+  if (A.isStringAttribute()) {
+    StringRef Str = A.getValueAsString();
+    if (Str.getAsInteger(0, ShaderType)) {
+      LLVMContext &Ctx = F.getContext();
+      Ctx.emitError("can't parse shader type");
+    }
+  }
+  return ShaderType;
+}
+
+bool isSI(const MCSubtargetInfo &STI) {
+  return STI.getFeatureBits()[AMDGPU::FeatureSouthernIslands];
+}
+
+bool isCI(const MCSubtargetInfo &STI) {
+  return STI.getFeatureBits()[AMDGPU::FeatureSeaIslands];
+}
+
+bool isVI(const MCSubtargetInfo &STI) {
+  return STI.getFeatureBits()[AMDGPU::FeatureVolcanicIslands];
+}
+
+unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI) {
+
+  switch(Reg) {
+  default: break;
+  case AMDGPU::FLAT_SCR:
+    assert(!isSI(STI));
+    return isCI(STI) ? AMDGPU::FLAT_SCR_ci : AMDGPU::FLAT_SCR_vi;
+
+  case AMDGPU::FLAT_SCR_LO:
+    assert(!isSI(STI));
+    return isCI(STI) ? AMDGPU::FLAT_SCR_LO_ci : AMDGPU::FLAT_SCR_LO_vi;
+
+  case AMDGPU::FLAT_SCR_HI:
+    assert(!isSI(STI));
+    return isCI(STI) ? AMDGPU::FLAT_SCR_HI_ci : AMDGPU::FLAT_SCR_HI_vi;
+  }
+  return Reg;
+}
+
 } // End namespace AMDGPU
 } // End namespace llvm
diff --git a/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index f57028c..19419a2 100644
--- a/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -15,6 +15,11 @@
 namespace llvm {
 
 class FeatureBitset;
+class Function;
+class GlobalValue;
+class MCContext;
+class MCSection;
+class MCSubtargetInfo;
 
 namespace AMDGPU {
 
@@ -27,6 +32,27 @@ struct IsaVersion {
 IsaVersion getIsaVersion(const FeatureBitset &Features);
 void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header,
                                const FeatureBitset &Features);
+MCSection *getHSATextSection(MCContext &Ctx);
+
+MCSection *getHSADataGlobalAgentSection(MCContext &Ctx);
+
+MCSection *getHSADataGlobalProgramSection(MCContext &Ctx);
+
+MCSection *getHSARodataReadonlyAgentSection(MCContext &Ctx);
+
+bool isGroupSegment(const GlobalValue *GV);
+bool isGlobalSegment(const GlobalValue *GV);
+bool isReadOnlySegment(const GlobalValue *GV);
+
+unsigned getShaderType(const Function &F);
+
+bool isSI(const MCSubtargetInfo &STI);
+bool isCI(const MCSubtargetInfo &STI);
+bool isVI(const MCSubtargetInfo &STI);
+
+/// If \p Reg is a pseudo reg, return the correct hardware register given
+/// \p STI otherwise return \p Reg.
+unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI);
 
 } // end namespace AMDGPU
 } // end namespace llvm
diff --git a/contrib/llvm/lib/Target/AMDGPU/VIInstructions.td b/contrib/llvm/lib/Target/AMDGPU/VIInstructions.td
index aca4673..20a026a 100644
--- a/contrib/llvm/lib/Target/AMDGPU/VIInstructions.td
+++ b/contrib/llvm/lib/Target/AMDGPU/VIInstructions.td
@@ -73,8 +73,8 @@ defm V_MIN_I16 : VOP2Inst <vop2<0,0x32>, "v_min_i16", VOP_I16_I16_I16>;
 } // End isCommutable = 1
 
 defm V_LDEXP_F16 : VOP2Inst <vop2<0,0x33>, "v_ldexp_f16", VOP_F16_F16_I16>;
 
-// Aliases to simplify matching of floating-pint instructions that are VOP2 on
-// SI and VOP3 on VI.
+// Aliases to simplify matching of floating-point instructions that
+// are VOP2 on SI and VOP3 on VI.
 
 class SI2_VI3Alias <string name, Instruction inst> : InstAlias <
   name#" $dst, $src0, $src1",
@@ -89,60 +89,15 @@ def : SI2_VI3Alias <"v_cvt_pknorm_i16_f32", V_CVT_PKNORM_I16_F32_e64_vi>;
 def : SI2_VI3Alias <"v_cvt_pknorm_u16_f32", V_CVT_PKNORM_U16_F32_e64_vi>;
 def : SI2_VI3Alias <"v_cvt_pkrtz_f16_f32", V_CVT_PKRTZ_F16_F32_e64_vi>;
 
-} // End SIAssemblerPredicate = DisableInst, SubtargetPredicate = isVI
-
 //===----------------------------------------------------------------------===//
-// SMEM Patterns
+// SMEM Instructions
 //===----------------------------------------------------------------------===//
 
-let Predicates = [isVI] in {
+def S_DCACHE_WB : SMEM_Inval <0x21,
+  "s_dcache_wb", int_amdgcn_s_dcache_wb>;
 
-// 1. Offset as 20bit DWORD immediate
-def : Pat <
-  (SIload_constant v4i32:$sbase, IMM20bit:$offset),
-  (S_BUFFER_LOAD_DWORD_IMM $sbase, (as_i32imm $offset))
->;
-
-// Patterns for global loads with no offset
-class FlatLoadPat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat <
-  (vt (node i64:$addr)),
-  (inst $addr, 0, 0, 0)
->;
-
-def : FlatLoadPat <FLAT_LOAD_UBYTE, az_extloadi8_global, i32>;
-def : FlatLoadPat <FLAT_LOAD_SBYTE, sextloadi8_global, i32>;
-def : FlatLoadPat <FLAT_LOAD_USHORT, az_extloadi16_global, i32>;
-def : FlatLoadPat <FLAT_LOAD_SSHORT, sextloadi16_global, i32>;
-def : FlatLoadPat <FLAT_LOAD_DWORD, global_load, i32>;
-def : FlatLoadPat <FLAT_LOAD_DWORDX2, global_load, v2i32>;
-def : FlatLoadPat <FLAT_LOAD_DWORDX4, global_load, v4i32>;
-
-class FlatStorePat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat <
-  (node vt:$data, i64:$addr),
-  (inst $data, $addr, 0, 0, 0)
->;
-
-def : FlatStorePat <FLAT_STORE_BYTE, truncstorei8_global, i32>;
-def : FlatStorePat <FLAT_STORE_SHORT, truncstorei16_global, i32>;
-def : FlatStorePat <FLAT_STORE_DWORD, global_store, i32>;
-def : FlatStorePat <FLAT_STORE_DWORDX2, global_store, v2i32>;
-def : FlatStorePat <FLAT_STORE_DWORDX4, global_store, v4i32>;
-
-class FlatAtomicPat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat <
-  (vt (node i64:$addr, vt:$data)),
-  (inst $addr, $data, 0, 0)
->;
-
-def : FlatAtomicPat <FLAT_ATOMIC_ADD_RTN, atomic_add_global, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_AND_RTN, atomic_and_global, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_SUB_RTN, atomic_sub_global, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_SMAX_RTN, atomic_max_global, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_UMAX_RTN, atomic_umax_global, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_SMIN_RTN, atomic_min_global, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_UMIN_RTN, atomic_umin_global, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_OR_RTN, atomic_or_global, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_SWAP_RTN, atomic_swap_global, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_XOR_RTN, atomic_xor_global, i32>;
+def S_DCACHE_WB_VOL : SMEM_Inval <0x23,
+  "s_dcache_wb_vol", int_amdgcn_s_dcache_wb_vol>;
 
-} // End Predicates = [isVI]
+} // End SIAssemblerPredicate = DisableInst, SubtargetPredicate = isVI
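The getMCReg() helper added to AMDGPUBaseInfo above is what lets the rest of the backend keep using the single FLAT_SCR pseudo register while the MC layer substitutes the generation-specific hardware register (encoding 104 on CI, 102 on VI, per the SIRegisterInfo.td change earlier in this diff). An illustrative sketch of a caller, assuming only the declarations visible in the diff; the wrapper function itself is hypothetical:

    #include "Utils/AMDGPUBaseInfo.h"
    #include "llvm/MC/MCSubtargetInfo.h"

    using namespace llvm;

    // Resolve the FLAT_SCR pseudo register to the hardware register for the
    // current subtarget: FLAT_SCR_ci on Sea Islands, FLAT_SCR_vi on Volcanic
    // Islands. Southern Islands has no flat scratch register, so calling this
    // on SI would trip the assert inside getMCReg().
    static unsigned resolveFlatScratch(const MCSubtargetInfo &STI) {
      return AMDGPU::getMCReg(AMDGPU::FLAT_SCR, STI);
    }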