Diffstat (limited to 'contrib/llvm/lib/Target/NVPTX')
-rw-r--r--  contrib/llvm/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp   |   2
-rw-r--r--  contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp    |   3
-rw-r--r--  contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h      |   4
-rw-r--r--  contrib/llvm/lib/Target/NVPTX/NVPTX.h                            |   2
-rw-r--r--  contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp                | 143
-rw-r--r--  contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h                  |   4
-rw-r--r--  contrib/llvm/lib/Target/NVPTX/NVPTXFavorNonGenericAddrSpaces.cpp | 207
-rw-r--r--  contrib/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp              |   4
-rw-r--r--  contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp              |   3
-rw-r--r--  contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.h                |   9
-rw-r--r--  contrib/llvm/lib/Target/NVPTX/NVPTXLowerKernelArgs.cpp           | 170
-rw-r--r--  contrib/llvm/lib/Target/NVPTX/NVPTXLowerStructArgs.cpp           | 136
-rw-r--r--  contrib/llvm/lib/Target/NVPTX/NVPTXMCExpr.cpp                    |  13
-rw-r--r--  contrib/llvm/lib/Target/NVPTX/NVPTXMCExpr.h                      |  24
-rw-r--r--  contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp             |  13
15 files changed, 453 insertions(+), 284 deletions(-)
diff --git a/contrib/llvm/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp b/contrib/llvm/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp
index ac92df9..4594c22 100644
--- a/contrib/llvm/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp
+++ b/contrib/llvm/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp
@@ -85,7 +85,7 @@ void NVPTXInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
O << markup("<imm:") << formatImm(Op.getImm()) << markup(">");
} else {
assert(Op.isExpr() && "Unknown operand kind in printOperand");
- O << *Op.getExpr();
+ Op.getExpr()->print(O, &MAI);
}
}
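
The one-line change above is part of a tree-wide MC API migration in this import: expressions and symbols no longer stream through operator<<, but print through an explicit MCAsmInfo so the target's assembly dialect is known. A minimal sketch of the new call form (LLVM 3.7-era headers assumed; the helper name is illustrative):

#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/Support/raw_ostream.h"

static void printExprOperand(const llvm::MCExpr *E, const llvm::MCAsmInfo &MAI,
                             llvm::raw_ostream &OS) {
  // Before: OS << *E;  After: the explicit form used throughout this diff.
  E->print(OS, &MAI);
}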
diff --git a/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp b/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp
index b9df3d1..ef36c13 100644
--- a/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp
+++ b/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp
@@ -25,8 +25,7 @@ static cl::opt<bool> CompileForDebugging("debug-compile",
void NVPTXMCAsmInfo::anchor() {}
-NVPTXMCAsmInfo::NVPTXMCAsmInfo(StringRef TT) {
- Triple TheTriple(TT);
+NVPTXMCAsmInfo::NVPTXMCAsmInfo(const Triple &TheTriple) {
if (TheTriple.getArch() == Triple::nvptx64) {
PointerSize = CalleeSaveStackSlotSize = 8;
}
diff --git a/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h b/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h
index c324286..b432e06 100644
--- a/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h
+++ b/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h
@@ -18,12 +18,12 @@
namespace llvm {
class Target;
-class StringRef;
+class Triple;
class NVPTXMCAsmInfo : public MCAsmInfo {
virtual void anchor();
public:
- explicit NVPTXMCAsmInfo(StringRef TT);
+ explicit NVPTXMCAsmInfo(const Triple &TheTriple);
};
} // namespace llvm
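
A hedged sketch of the rationale for the constructor change: the caller already holds a parsed llvm::Triple, so passing it by reference avoids re-parsing a StringRef in every MCAsmInfo subclass. The helper below is hypothetical and only restates the check the new constructor performs:

#include "llvm/ADT/Triple.h"

static bool wantsEightBytePointers(const llvm::Triple &TheTriple) {
  // Same test NVPTXMCAsmInfo's new constructor uses to set PointerSize = 8.
  return TheTriple.getArch() == llvm::Triple::nvptx64;
}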
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTX.h b/contrib/llvm/lib/Target/NVPTX/NVPTX.h
index 382525d..477b0ba 100644
--- a/contrib/llvm/lib/Target/NVPTX/NVPTX.h
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTX.h
@@ -69,7 +69,7 @@ ModulePass *createNVVMReflectPass(const StringMap<int>& Mapping);
MachineFunctionPass *createNVPTXPrologEpilogPass();
MachineFunctionPass *createNVPTXReplaceImageHandlesPass();
FunctionPass *createNVPTXImageOptimizerPass();
-FunctionPass *createNVPTXLowerStructArgsPass();
+FunctionPass *createNVPTXLowerKernelArgsPass(const NVPTXTargetMachine *TM);
bool isImageOrSamplerVal(const Value *, const Module *);
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index 3bbea40..298b992 100644
--- a/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -266,7 +266,7 @@ bool NVPTXAsmPrinter::lowerOperand(const MachineOperand &MO,
MCOp = MCOperand::createImm(MO.getImm());
break;
case MachineOperand::MO_MachineBasicBlock:
- MCOp = MCOperand::createExpr(MCSymbolRefExpr::Create(
+ MCOp = MCOperand::createExpr(MCSymbolRefExpr::create(
MO.getMBB()->getSymbol(), OutContext));
break;
case MachineOperand::MO_ExternalSymbol:
@@ -283,11 +283,11 @@ bool NVPTXAsmPrinter::lowerOperand(const MachineOperand &MO,
default: report_fatal_error("Unsupported FP type"); break;
case Type::FloatTyID:
MCOp = MCOperand::createExpr(
- NVPTXFloatMCExpr::CreateConstantFPSingle(Val, OutContext));
+ NVPTXFloatMCExpr::createConstantFPSingle(Val, OutContext));
break;
case Type::DoubleTyID:
MCOp = MCOperand::createExpr(
- NVPTXFloatMCExpr::CreateConstantFPDouble(Val, OutContext));
+ NVPTXFloatMCExpr::createConstantFPDouble(Val, OutContext));
break;
}
break;
@@ -334,7 +334,7 @@ unsigned NVPTXAsmPrinter::encodeVirtualRegister(unsigned Reg) {
MCOperand NVPTXAsmPrinter::GetSymbolRef(const MCSymbol *Symbol) {
const MCExpr *Expr;
- Expr = MCSymbolRefExpr::Create(Symbol, MCSymbolRefExpr::VK_None,
+ Expr = MCSymbolRefExpr::create(Symbol, MCSymbolRefExpr::VK_None,
OutContext);
return MCOperand::createExpr(Expr);
}
@@ -418,9 +418,8 @@ void NVPTXAsmPrinter::printReturnValStr(const MachineFunction &MF,
bool NVPTXAsmPrinter::isLoopHeaderOfNoUnroll(
const MachineBasicBlock &MBB) const {
MachineLoopInfo &LI = getAnalysis<MachineLoopInfo>();
- // TODO: isLoopHeader() should take "const MachineBasicBlock *".
// We insert .pragma "nounroll" only to the loop header.
- if (!LI.isLoopHeader(const_cast<MachineBasicBlock *>(&MBB)))
+ if (!LI.isLoopHeader(&MBB))
return false;
// llvm.loop.unroll.disable is marked on the back edges of a loop. Therefore,
@@ -468,7 +467,7 @@ void NVPTXAsmPrinter::EmitFunctionEntryLabel() {
printReturnValStr(*MF, O);
}
- O << *CurrentFnSym;
+ CurrentFnSym->print(O, MAI);
emitFunctionParamList(*MF, O);
@@ -625,7 +624,8 @@ void NVPTXAsmPrinter::emitDeclaration(const Function *F, raw_ostream &O) {
else
O << ".func ";
printReturnValStr(F, O);
- O << *getSymbol(F) << "\n";
+ getSymbol(F)->print(O, MAI);
+ O << "\n";
emitFunctionParamList(F, O);
O << ";\n";
}
@@ -1172,7 +1172,7 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar,
else
O << getPTXFundamentalTypeStr(ETy, false);
O << " ";
- O << *getSymbol(GVar);
+ getSymbol(GVar)->print(O, MAI);
  // PTX allows variable initialization only for constant and global state
  // spaces.
@@ -1189,11 +1189,9 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar,
// The frontend adds zero-initializer to variables that don't have an
// initial value, so skip warning for this case.
if (!GVar->getInitializer()->isNullValue()) {
- std::string warnMsg =
- ("initial value of '" + GVar->getName() +
- "' is not allowed in addrspace(" +
- Twine(llvm::utostr_32(PTy->getAddressSpace())) + ")").str();
- report_fatal_error(warnMsg.c_str());
+ report_fatal_error("initial value of '" + GVar->getName() +
+ "' is not allowed in addrspace(" +
+ Twine(PTy->getAddressSpace()) + ")");
}
}
}
@@ -1220,15 +1218,21 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar,
bufferAggregateConstant(Initializer, &aggBuffer);
if (aggBuffer.numSymbols) {
if (static_cast<const NVPTXTargetMachine &>(TM).is64Bit()) {
- O << " .u64 " << *getSymbol(GVar) << "[";
+ O << " .u64 ";
+ getSymbol(GVar)->print(O, MAI);
+ O << "[";
O << ElementSize / 8;
} else {
- O << " .u32 " << *getSymbol(GVar) << "[";
+ O << " .u32 ";
+ getSymbol(GVar)->print(O, MAI);
+ O << "[";
O << ElementSize / 4;
}
O << "]";
} else {
- O << " .b8 " << *getSymbol(GVar) << "[";
+ O << " .b8 ";
+ getSymbol(GVar)->print(O, MAI);
+ O << "[";
O << ElementSize;
O << "]";
}
@@ -1236,7 +1240,8 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar,
aggBuffer.print();
O << "}";
} else {
- O << " .b8 " << *getSymbol(GVar);
+ O << " .b8 ";
+ getSymbol(GVar)->print(O, MAI);
if (ElementSize) {
O << "[";
O << ElementSize;
@@ -1244,7 +1249,8 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar,
}
}
} else {
- O << " .b8 " << *getSymbol(GVar);
+ O << " .b8 ";
+ getSymbol(GVar)->print(O, MAI);
if (ElementSize) {
O << "[";
O << ElementSize;
@@ -1351,7 +1357,7 @@ void NVPTXAsmPrinter::emitPTXGlobalVariable(const GlobalVariable *GVar,
O << " .";
O << getPTXFundamentalTypeStr(ETy);
O << " ";
- O << *getSymbol(GVar);
+ getSymbol(GVar)->print(O, MAI);
return;
}
@@ -1366,9 +1372,11 @@ void NVPTXAsmPrinter::emitPTXGlobalVariable(const GlobalVariable *GVar,
case Type::ArrayTyID:
case Type::VectorTyID:
ElementSize = TD->getTypeStoreSize(ETy);
- O << " .b8 " << *getSymbol(GVar) << "[";
+ O << " .b8 ";
+ getSymbol(GVar)->print(O, MAI);
+ O << "[";
if (ElementSize) {
- O << itostr(ElementSize);
+ O << ElementSize;
}
O << "]";
break;
@@ -1408,11 +1416,13 @@ static unsigned int getOpenCLAlignment(const DataLayout *TD, Type *Ty) {
void NVPTXAsmPrinter::printParamName(Function::const_arg_iterator I,
int paramIndex, raw_ostream &O) {
- O << *getSymbol(I->getParent()) << "_param_" << paramIndex;
+ getSymbol(I->getParent())->print(O, MAI);
+ O << "_param_" << paramIndex;
}
void NVPTXAsmPrinter::printParamName(int paramIndex, raw_ostream &O) {
- O << *CurrentFnSym << "_param_" << paramIndex;
+ CurrentFnSym->print(O, MAI);
+ O << "_param_" << paramIndex;
}
void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) {
@@ -1446,21 +1456,24 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) {
O << "\t.param .u64 .ptr .surfref ";
else
O << "\t.param .surfref ";
- O << *CurrentFnSym << "_param_" << paramIndex;
+ CurrentFnSym->print(O, MAI);
+ O << "_param_" << paramIndex;
}
else { // Default image is read_only
if (nvptxSubtarget->hasImageHandles())
O << "\t.param .u64 .ptr .texref ";
else
O << "\t.param .texref ";
- O << *CurrentFnSym << "_param_" << paramIndex;
+ CurrentFnSym->print(O, MAI);
+ O << "_param_" << paramIndex;
}
} else {
if (nvptxSubtarget->hasImageHandles())
O << "\t.param .u64 .ptr .samplerref ";
else
O << "\t.param .samplerref ";
- O << *CurrentFnSym << "_param_" << paramIndex;
+ CurrentFnSym->print(O, MAI);
+ O << "_param_" << paramIndex;
}
continue;
}
@@ -1716,10 +1729,10 @@ void NVPTXAsmPrinter::printScalarConstant(const Constant *CPV, raw_ostream &O) {
}
if (EmitGeneric && !isa<Function>(CPV) && !IsNonGenericPointer) {
O << "generic(";
- O << *getSymbol(GVar);
+ getSymbol(GVar)->print(O, MAI);
O << ")";
} else {
- O << *getSymbol(GVar);
+ getSymbol(GVar)->print(O, MAI);
}
return;
}
@@ -1733,20 +1746,44 @@ void NVPTXAsmPrinter::printScalarConstant(const Constant *CPV, raw_ostream &O) {
if (const GlobalValue *GVar = dyn_cast<GlobalValue>(v)) {
if (EmitGeneric && !isa<Function>(v) && !IsNonGenericPointer) {
O << "generic(";
- O << *getSymbol(GVar);
+ getSymbol(GVar)->print(O, MAI);
O << ")";
} else {
- O << *getSymbol(GVar);
+ getSymbol(GVar)->print(O, MAI);
}
return;
} else {
- O << *lowerConstant(CPV);
+ lowerConstant(CPV)->print(O, MAI);
return;
}
}
llvm_unreachable("Not scalar type found in printScalarConstant()");
}
+// These utility functions ensure we get the right sequence of bytes for a
+// given type, even on big-endian machines.
+template <typename T> static void ConvertIntToBytes(unsigned char *p, T val) {
+ int64_t vp = (int64_t)val;
+ for (unsigned i = 0; i < sizeof(T); ++i) {
+ p[i] = (unsigned char)vp;
+ vp >>= 8;
+ }
+}
+static void ConvertFloatToBytes(unsigned char *p, float val) {
+ int32_t *vp = (int32_t *)&val;
+ for (unsigned i = 0; i < sizeof(int32_t); ++i) {
+ p[i] = (unsigned char)*vp;
+ *vp >>= 8;
+ }
+}
+static void ConvertDoubleToBytes(unsigned char *p, double val) {
+ int64_t *vp = (int64_t *)&val;
+ for (unsigned i = 0; i < sizeof(int64_t); ++i) {
+ p[i] = (unsigned char)*vp;
+ *vp >>= 8;
+ }
+}
+
void NVPTXAsmPrinter::bufferLEByte(const Constant *CPV, int Bytes,
AggBuffer *aggBuffer) {
@@ -1760,30 +1797,30 @@ void NVPTXAsmPrinter::bufferLEByte(const Constant *CPV, int Bytes,
return;
}
- unsigned char *ptr;
+ unsigned char ptr[8];
switch (CPV->getType()->getTypeID()) {
case Type::IntegerTyID: {
const Type *ETy = CPV->getType();
if (ETy == Type::getInt8Ty(CPV->getContext())) {
unsigned char c = (unsigned char)cast<ConstantInt>(CPV)->getZExtValue();
- ptr = &c;
+ ConvertIntToBytes<>(ptr, c);
aggBuffer->addBytes(ptr, 1, Bytes);
} else if (ETy == Type::getInt16Ty(CPV->getContext())) {
short int16 = (short)cast<ConstantInt>(CPV)->getZExtValue();
- ptr = (unsigned char *)&int16;
+ ConvertIntToBytes<>(ptr, int16);
aggBuffer->addBytes(ptr, 2, Bytes);
} else if (ETy == Type::getInt32Ty(CPV->getContext())) {
if (const ConstantInt *constInt = dyn_cast<ConstantInt>(CPV)) {
int int32 = (int)(constInt->getZExtValue());
- ptr = (unsigned char *)&int32;
+ ConvertIntToBytes<>(ptr, int32);
aggBuffer->addBytes(ptr, 4, Bytes);
break;
} else if (const ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(CPV)) {
if (const ConstantInt *constInt = dyn_cast<ConstantInt>(
ConstantFoldConstantExpression(Cexpr, *TD))) {
int int32 = (int)(constInt->getZExtValue());
- ptr = (unsigned char *)&int32;
+ ConvertIntToBytes<>(ptr, int32);
aggBuffer->addBytes(ptr, 4, Bytes);
break;
}
@@ -1798,14 +1835,14 @@ void NVPTXAsmPrinter::bufferLEByte(const Constant *CPV, int Bytes,
} else if (ETy == Type::getInt64Ty(CPV->getContext())) {
if (const ConstantInt *constInt = dyn_cast<ConstantInt>(CPV)) {
long long int64 = (long long)(constInt->getZExtValue());
- ptr = (unsigned char *)&int64;
+ ConvertIntToBytes<>(ptr, int64);
aggBuffer->addBytes(ptr, 8, Bytes);
break;
} else if (const ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(CPV)) {
if (const ConstantInt *constInt = dyn_cast<ConstantInt>(
ConstantFoldConstantExpression(Cexpr, *TD))) {
long long int64 = (long long)(constInt->getZExtValue());
- ptr = (unsigned char *)&int64;
+ ConvertIntToBytes<>(ptr, int64);
aggBuffer->addBytes(ptr, 8, Bytes);
break;
}
@@ -1827,11 +1864,11 @@ void NVPTXAsmPrinter::bufferLEByte(const Constant *CPV, int Bytes,
const Type *Ty = CFP->getType();
if (Ty == Type::getFloatTy(CPV->getContext())) {
float float32 = (float) CFP->getValueAPF().convertToFloat();
- ptr = (unsigned char *)&float32;
+ ConvertFloatToBytes(ptr, float32);
aggBuffer->addBytes(ptr, 4, Bytes);
} else if (Ty == Type::getDoubleTy(CPV->getContext())) {
double float64 = CFP->getValueAPF().convertToDouble();
- ptr = (unsigned char *)&float64;
+ ConvertDoubleToBytes(ptr, float64);
aggBuffer->addBytes(ptr, 8, Bytes);
} else {
llvm_unreachable("unsupported fp const type");
@@ -1993,16 +2030,16 @@ NVPTXAsmPrinter::lowerConstantForGV(const Constant *CV, bool ProcessingGeneric)
MCContext &Ctx = OutContext;
if (CV->isNullValue() || isa<UndefValue>(CV))
- return MCConstantExpr::Create(0, Ctx);
+ return MCConstantExpr::create(0, Ctx);
if (const ConstantInt *CI = dyn_cast<ConstantInt>(CV))
- return MCConstantExpr::Create(CI->getZExtValue(), Ctx);
+ return MCConstantExpr::create(CI->getZExtValue(), Ctx);
if (const GlobalValue *GV = dyn_cast<GlobalValue>(CV)) {
const MCSymbolRefExpr *Expr =
- MCSymbolRefExpr::Create(getSymbol(GV), Ctx);
+ MCSymbolRefExpr::create(getSymbol(GV), Ctx);
if (ProcessingGeneric) {
- return NVPTXGenericMCSymbolRefExpr::Create(Expr, Ctx);
+ return NVPTXGenericMCSymbolRefExpr::create(Expr, Ctx);
} else {
return Expr;
}
@@ -2059,7 +2096,7 @@ NVPTXAsmPrinter::lowerConstantForGV(const Constant *CV, bool ProcessingGeneric)
return Base;
int64_t Offset = OffsetAI.getSExtValue();
- return MCBinaryExpr::CreateAdd(Base, MCConstantExpr::Create(Offset, Ctx),
+ return MCBinaryExpr::createAdd(Base, MCConstantExpr::create(Offset, Ctx),
Ctx);
}
@@ -2102,8 +2139,8 @@ NVPTXAsmPrinter::lowerConstantForGV(const Constant *CV, bool ProcessingGeneric)
// the high bits so we are sure to get a proper truncation if the input is
// a constant expr.
unsigned InBits = DL.getTypeAllocSizeInBits(Op->getType());
- const MCExpr *MaskExpr = MCConstantExpr::Create(~0ULL >> (64-InBits), Ctx);
- return MCBinaryExpr::CreateAnd(OpExpr, MaskExpr, Ctx);
+ const MCExpr *MaskExpr = MCConstantExpr::create(~0ULL >> (64-InBits), Ctx);
+ return MCBinaryExpr::createAnd(OpExpr, MaskExpr, Ctx);
}
// The MC library also has a right-shift operator, but it isn't consistently
@@ -2113,7 +2150,7 @@ NVPTXAsmPrinter::lowerConstantForGV(const Constant *CV, bool ProcessingGeneric)
const MCExpr *RHS = lowerConstantForGV(CE->getOperand(1), ProcessingGeneric);
switch (CE->getOpcode()) {
default: llvm_unreachable("Unknown binary operator constant cast expr");
- case Instruction::Add: return MCBinaryExpr::CreateAdd(LHS, RHS, Ctx);
+ case Instruction::Add: return MCBinaryExpr::createAdd(LHS, RHS, Ctx);
}
}
}
@@ -2123,7 +2160,7 @@ NVPTXAsmPrinter::lowerConstantForGV(const Constant *CV, bool ProcessingGeneric)
void NVPTXAsmPrinter::printMCExpr(const MCExpr &Expr, raw_ostream &OS) {
switch (Expr.getKind()) {
case MCExpr::Target:
- return cast<MCTargetExpr>(&Expr)->PrintImpl(OS);
+ return cast<MCTargetExpr>(&Expr)->printImpl(OS, MAI);
case MCExpr::Constant:
OS << cast<MCConstantExpr>(Expr).getValue();
return;
@@ -2131,7 +2168,7 @@ void NVPTXAsmPrinter::printMCExpr(const MCExpr &Expr, raw_ostream &OS) {
case MCExpr::SymbolRef: {
const MCSymbolRefExpr &SRE = cast<MCSymbolRefExpr>(Expr);
const MCSymbol &Sym = SRE.getSymbol();
- OS << Sym;
+ Sym.print(OS, MAI);
return;
}
@@ -2256,11 +2293,11 @@ void NVPTXAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
break;
case MachineOperand::MO_GlobalAddress:
- O << *getSymbol(MO.getGlobal());
+ getSymbol(MO.getGlobal())->print(O, MAI);
break;
case MachineOperand::MO_MachineBasicBlock:
- O << *MO.getMBB()->getSymbol();
+ MO.getMBB()->getSymbol()->print(O, MAI);
return;
default:
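
Among the NVPTXAsmPrinter.cpp changes above, the Convert*ToBytes helpers replace direct pointer reinterpretation (ptr = (unsigned char *)&int16), which produced host-order bytes. A standalone copy of the integer helper, shown only to illustrate the behavior, serializes least-significant byte first on any host, matching PTX's little-endian initializers:

#include <cstdint>
#include <cstdio>

// Copy of ConvertIntToBytes for this sketch only.
template <typename T> static void ConvertIntToBytesDemo(unsigned char *p, T val) {
  int64_t vp = (int64_t)val;
  for (unsigned i = 0; i < sizeof(T); ++i) {
    p[i] = (unsigned char)vp; // emit low byte first
    vp >>= 8;
  }
}

int main() {
  unsigned char buf[2];
  ConvertIntToBytesDemo<short>(buf, 0x1234);
  std::printf("%02x %02x\n", buf[0], buf[1]); // prints "34 12" on any host
}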
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h b/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h
index 301c686..f6f7685 100644
--- a/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h
@@ -165,10 +165,10 @@ class LLVM_LIBRARY_VISIBILITY NVPTXAsmPrinter : public AsmPrinter {
}
if (EmitGeneric && !isa<Function>(v) && !IsNonGenericPointer) {
O << "generic(";
- O << *Name;
+ Name->print(O, AP.MAI);
O << ")";
} else {
- O << *Name;
+ Name->print(O, AP.MAI);
}
} else if (const ConstantExpr *CExpr = dyn_cast<ConstantExpr>(v0)) {
const MCExpr *Expr =
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXFavorNonGenericAddrSpaces.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXFavorNonGenericAddrSpaces.cpp
index ae63cae..cfff001 100644
--- a/contrib/llvm/lib/Target/NVPTX/NVPTXFavorNonGenericAddrSpaces.cpp
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXFavorNonGenericAddrSpaces.cpp
@@ -10,34 +10,54 @@
// When a load/store accesses the generic address space, this pass checks
// whether the address is cast from a non-generic address space. If so, it
// removes the addrspacecast, because accessing non-generic address spaces is typically
-// faster. Besides seeking addrspacecasts, this optimization also traces into
-// the base pointer of a GEP.
+// faster. Besides removing addrspacecasts directly used by loads/stores, this
+// optimization also recursively traces into a GEP's pointer operand and a
+// bitcast's source to find more eliminable addrspacecasts.
//
// For instance, the code below loads a float from an array allocated in
// addrspace(3).
//
-// %0 = addrspacecast [10 x float] addrspace(3)* @a to [10 x float]*
-// %1 = gep [10 x float]* %0, i64 0, i64 %i
-// %2 = load float* %1 ; emits ld.f32
+// %0 = addrspacecast [10 x float] addrspace(3)* @a to [10 x float]*
+// %1 = gep [10 x float]* %0, i64 0, i64 %i
+// %2 = bitcast float* %1 to i32*
+// %3 = load i32* %2 ; emits ld.u32
//
-// First, function hoistAddrSpaceCastFromGEP reorders the addrspacecast
-// and the GEP to expose more optimization opportunities to function
+// First, function hoistAddrSpaceCastFrom reorders the addrspacecast, the GEP,
+// and the bitcast to expose more optimization opportunities to function
// optimizeMemoryInst. The intermediate code looks like:
//
-// %0 = gep [10 x float] addrspace(3)* @a, i64 0, i64 %i
-// %1 = addrspacecast float addrspace(3)* %0 to float*
-// %2 = load float* %1 ; still emits ld.f32, but will be optimized shortly
+// %0 = gep [10 x float] addrspace(3)* @a, i64 0, i64 %i
+// %1 = bitcast float addrspace(3)* %0 to i32 addrspace(3)*
+// %2 = addrspacecast i32 addrspace(3)* %1 to i32*
+// %3 = load i32* %2 ; still emits ld.u32, but will be optimized shortly
//
// Then, function optimizeMemoryInstruction detects a load from addrspacecast'ed
// generic pointers, and folds the load and the addrspacecast into a load from
// the original address space. The final code looks like:
//
-// %0 = gep [10 x float] addrspace(3)* @a, i64 0, i64 %i
-// %2 = load float addrspace(3)* %0 ; emits ld.shared.f32
+// %0 = gep [10 x float] addrspace(3)* @a, i64 0, i64 %i
+// %1 = bitcast float addrspace(3)* %0 to i32 addrspace(3)*
+// %3 = load i32 addrspace(3)* %1 ; emits ld.shared.u32
//
// This pass may remove an addrspacecast in a different BB. Therefore, we
// implement it as a FunctionPass.
//
+// TODO:
+// The current implementation doesn't handle PHINodes. Eliminating
+// addrspacecasts used by PHINodes is trickier because PHINodes can introduce
+// loops in data flow. For example,
+//
+// %generic.input = addrspacecast float addrspace(3)* %input to float*
+// loop:
+// %y = phi [ %generic.input, %y2 ]
+// %y2 = getelementptr %y, 1
+// %v = load %y2
+// br ..., label %loop, ...
+//
+// Marking %y2 shared depends on marking %y shared, but %y also data-flow
+// depends on %y2. We probably need an iterative fixed-point algorithm to
+// handle this case.
+//
//===----------------------------------------------------------------------===//
#include "NVPTX.h"
@@ -62,17 +82,31 @@ class NVPTXFavorNonGenericAddrSpaces : public FunctionPass {
public:
static char ID;
NVPTXFavorNonGenericAddrSpaces() : FunctionPass(ID) {}
-
bool runOnFunction(Function &F) override;
+private:
/// Optimizes load/store instructions. Idx is the index of the pointer operand
/// (0 for load, and 1 for store). Returns true if it changes anything.
bool optimizeMemoryInstruction(Instruction *I, unsigned Idx);
+ /// Recursively traces into a GEP's pointer operand or a bitcast's source to
+ /// find an eliminable addrspacecast, and hoists that addrspacecast to the
+ /// outermost level. For example, this function transforms
+ /// bitcast(gep(gep(addrspacecast(X))))
+ /// to
+ /// addrspacecast(bitcast(gep(gep(X)))).
+ ///
+ /// This reordering exposes to optimizeMemoryInstruction more
+ /// optimization opportunities on loads and stores.
+ ///
+ /// Returns true if this function successfully hoists an eliminable
+ /// addrspacecast or V is already such an addrspacecast.
/// Transforms "gep (addrspacecast X), indices" into "addrspacecast (gep X,
- /// indices)". This reordering exposes to optimizeMemoryInstruction more
- /// optimization opportunities on loads and stores. Returns true if it changes
- /// the program.
- bool hoistAddrSpaceCastFromGEP(GEPOperator *GEP);
+ /// indices)".
+ bool hoistAddrSpaceCastFrom(Value *V, int Depth = 0);
+ /// Helper function for GEPs.
+ bool hoistAddrSpaceCastFromGEP(GEPOperator *GEP, int Depth);
+ /// Helper function for bitcasts.
+ bool hoistAddrSpaceCastFromBitCast(BitCastOperator *BC, int Depth);
};
}
@@ -85,11 +119,12 @@ INITIALIZE_PASS(NVPTXFavorNonGenericAddrSpaces, "nvptx-favor-non-generic",
"Remove unnecessary non-generic-to-generic addrspacecasts",
false, false)
-// Decides whether removing Cast is valid and beneficial. Cast can be an
-// instruction or a constant expression.
-static bool IsEliminableAddrSpaceCast(Operator *Cast) {
- // Returns false if not even an addrspacecast.
- if (Cast->getOpcode() != Instruction::AddrSpaceCast)
+// Decides whether V is an addrspacecast and shortcutting V in load/store is
+// valid and beneficial.
+static bool isEliminableAddrSpaceCast(Value *V) {
+ // Returns false if V is not even an addrspacecast.
+ Operator *Cast = dyn_cast<Operator>(V);
+ if (Cast == nullptr || Cast->getOpcode() != Instruction::AddrSpaceCast)
return false;
Value *Src = Cast->getOperand(0);
@@ -108,67 +143,119 @@ static bool IsEliminableAddrSpaceCast(Operator *Cast) {
DestTy->getAddressSpace() == AddressSpace::ADDRESS_SPACE_GENERIC);
}
-bool NVPTXFavorNonGenericAddrSpaces::hoistAddrSpaceCastFromGEP(
- GEPOperator *GEP) {
- Operator *Cast = dyn_cast<Operator>(GEP->getPointerOperand());
- if (!Cast)
+bool NVPTXFavorNonGenericAddrSpaces::hoistAddrSpaceCastFromGEP(GEPOperator *GEP,
+ int Depth) {
+ if (!hoistAddrSpaceCastFrom(GEP->getPointerOperand(), Depth + 1))
return false;
- if (!IsEliminableAddrSpaceCast(Cast))
- return false;
+ // That hoistAddrSpaceCastFrom succeeds implies GEP's pointer operand is now
+ // an eliminable addrspacecast.
+ assert(isEliminableAddrSpaceCast(GEP->getPointerOperand()));
+ Operator *Cast = cast<Operator>(GEP->getPointerOperand());
SmallVector<Value *, 8> Indices(GEP->idx_begin(), GEP->idx_end());
if (Instruction *GEPI = dyn_cast<Instruction>(GEP)) {
- // %1 = gep (addrspacecast X), indices
+ // GEP = gep (addrspacecast X), indices
// =>
- // %0 = gep X, indices
- // %1 = addrspacecast %0
- GetElementPtrInst *NewGEPI = GetElementPtrInst::Create(
+ // NewGEP = gep X, indices
+ // NewASC = addrspacecast NewGEP
+ GetElementPtrInst *NewGEP = GetElementPtrInst::Create(
GEP->getSourceElementType(), Cast->getOperand(0), Indices,
- GEP->getName(), GEPI);
- NewGEPI->setIsInBounds(GEP->isInBounds());
- GEP->replaceAllUsesWith(
- new AddrSpaceCastInst(NewGEPI, GEP->getType(), "", GEPI));
+ "", GEPI);
+ NewGEP->setIsInBounds(GEP->isInBounds());
+ Value *NewASC = new AddrSpaceCastInst(NewGEP, GEP->getType(), "", GEPI);
+ NewASC->takeName(GEP);
+ GEP->replaceAllUsesWith(NewASC);
} else {
// GEP is a constant expression.
- Constant *NewGEPCE = ConstantExpr::getGetElementPtr(
+ Constant *NewGEP = ConstantExpr::getGetElementPtr(
GEP->getSourceElementType(), cast<Constant>(Cast->getOperand(0)),
Indices, GEP->isInBounds());
GEP->replaceAllUsesWith(
- ConstantExpr::getAddrSpaceCast(NewGEPCE, GEP->getType()));
+ ConstantExpr::getAddrSpaceCast(NewGEP, GEP->getType()));
}
return true;
}
-bool NVPTXFavorNonGenericAddrSpaces::optimizeMemoryInstruction(Instruction *MI,
- unsigned Idx) {
- // If the pointer operand is a GEP, hoist the addrspacecast if any from the
- // GEP to expose more optimization opportunites.
- if (GEPOperator *GEP = dyn_cast<GEPOperator>(MI->getOperand(Idx))) {
- hoistAddrSpaceCastFromGEP(GEP);
- }
+bool NVPTXFavorNonGenericAddrSpaces::hoistAddrSpaceCastFromBitCast(
+ BitCastOperator *BC, int Depth) {
+ if (!hoistAddrSpaceCastFrom(BC->getOperand(0), Depth + 1))
+ return false;
- // load/store (addrspacecast X) => load/store X if shortcutting the
- // addrspacecast is valid and can improve performance.
- //
- // e.g.,
- // %1 = addrspacecast float addrspace(3)* %0 to float*
- // %2 = load float* %1
- // ->
- // %2 = load float addrspace(3)* %0
- //
- // Note: the addrspacecast can also be a constant expression.
- if (Operator *Cast = dyn_cast<Operator>(MI->getOperand(Idx))) {
- if (IsEliminableAddrSpaceCast(Cast)) {
- MI->setOperand(Idx, Cast->getOperand(0));
- return true;
- }
+ // That hoistAddrSpaceCastFrom succeeds implies BC's source operand is now
+ // an eliminable addrspacecast.
+ assert(isEliminableAddrSpaceCast(BC->getOperand(0)));
+ Operator *Cast = cast<Operator>(BC->getOperand(0));
+
+ // Cast = addrspacecast Src
+ // BC = bitcast Cast
+ // =>
+ // Cast' = bitcast Src
+ // BC' = addrspacecast Cast'
+ Value *Src = Cast->getOperand(0);
+ Type *TypeOfNewCast =
+ PointerType::get(BC->getType()->getPointerElementType(),
+ Src->getType()->getPointerAddressSpace());
+ if (BitCastInst *BCI = dyn_cast<BitCastInst>(BC)) {
+ Value *NewCast = new BitCastInst(Src, TypeOfNewCast, "", BCI);
+ Value *NewBC = new AddrSpaceCastInst(NewCast, BC->getType(), "", BCI);
+ NewBC->takeName(BC);
+ BC->replaceAllUsesWith(NewBC);
+ } else {
+ // BC is a constant expression.
+ Constant *NewCast =
+ ConstantExpr::getBitCast(cast<Constant>(Src), TypeOfNewCast);
+ Constant *NewBC = ConstantExpr::getAddrSpaceCast(NewCast, BC->getType());
+ BC->replaceAllUsesWith(NewBC);
}
+ return true;
+}
+
+bool NVPTXFavorNonGenericAddrSpaces::hoistAddrSpaceCastFrom(Value *V,
+ int Depth) {
+ // Returns true if V is already an eliminable addrspacecast.
+ if (isEliminableAddrSpaceCast(V))
+ return true;
+
+ // Limit the depth to prevent this recursive function from running too long.
+ const int MaxDepth = 20;
+ if (Depth >= MaxDepth)
+ return false;
+
+ // If V is a GEP or bitcast, hoist the addrspacecast if any from its pointer
+ // operand. This enables optimizeMemoryInstruction to shortcut addrspacecasts
+ // that are not directly used by the load/store.
+ if (GEPOperator *GEP = dyn_cast<GEPOperator>(V))
+ return hoistAddrSpaceCastFromGEP(GEP, Depth);
+
+ if (BitCastOperator *BC = dyn_cast<BitCastOperator>(V))
+ return hoistAddrSpaceCastFromBitCast(BC, Depth);
return false;
}
+bool NVPTXFavorNonGenericAddrSpaces::optimizeMemoryInstruction(Instruction *MI,
+ unsigned Idx) {
+ if (hoistAddrSpaceCastFrom(MI->getOperand(Idx))) {
+ // load/store (addrspacecast X) => load/store X if shortcutting the
+ // addrspacecast is valid and can improve performance.
+ //
+ // e.g.,
+ // %1 = addrspacecast float addrspace(3)* %0 to float*
+ // %2 = load float* %1
+ // ->
+ // %2 = load float addrspace(3)* %0
+ //
+ // Note: the addrspacecast can also be a constant expression.
+ assert(isEliminableAddrSpaceCast(MI->getOperand(Idx)));
+ Operator *ASC = dyn_cast<Operator>(MI->getOperand(Idx));
+ MI->setOperand(Idx, ASC->getOperand(0));
+ return true;
+ }
+ return false;
+}
+
bool NVPTXFavorNonGenericAddrSpaces::runOnFunction(Function &F) {
if (DisableFavorNonGeneric)
return false;
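
For context, a hypothetical standalone driver for the rewritten pass; in-tree it is scheduled from NVPTXTargetMachine::addIRPasses (see the NVPTXTargetMachine.cpp hunk below), and it remains a FunctionPass because it may erase an addrspacecast in a different basic block:

#include "NVPTX.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"

static void runFavorNonGeneric(llvm::Module &M) {
  llvm::legacy::PassManager PM;
  PM.add(llvm::createNVPTXFavorNonGenericAddrSpacesPass());
  PM.run(M);
}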
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index fa38a68..232a611 100644
--- a/contrib/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -613,6 +613,10 @@ SDNode *NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
Opc =
TM.is64Bit() ? NVPTX::cvta_to_local_yes_64 : NVPTX::cvta_to_local_yes;
break;
+ case ADDRESS_SPACE_PARAM:
+ Opc = TM.is64Bit() ? NVPTX::nvvm_ptr_gen_to_param_64
+ : NVPTX::nvvm_ptr_gen_to_param;
+ break;
}
return CurDAG->getMachineNode(Opc, SDLoc(N), N->getValueType(0), Src);
}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 805847a..b5af72a 100644
--- a/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -3725,7 +3725,8 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
/// (LoopStrengthReduce.cpp) and memory optimization for address mode
/// (CodeGenPrepare.cpp)
bool NVPTXTargetLowering::isLegalAddressingMode(const AddrMode &AM,
- Type *Ty) const {
+ Type *Ty,
+ unsigned AS) const {
// AddrMode - This represents an addressing mode of:
// BaseGV + BaseOffs + BaseReg + Scale*ScaleReg
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index 5142ae3..ed94775 100644
--- a/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -456,7 +456,8 @@ public:
/// Used to guide target specific optimizations, like loop strength
/// reduction (LoopStrengthReduce.cpp) and memory optimization for
/// address mode (CodeGenPrepare.cpp)
- bool isLegalAddressingMode(const AddrMode &AM, Type *Ty) const override;
+ bool isLegalAddressingMode(const AddrMode &AM, Type *Ty,
+ unsigned AS) const override;
/// getFunctionAlignment - Return the Log2 alignment of this function.
unsigned getFunctionAlignment(const Function *F) const;
@@ -497,12 +498,6 @@ public:
std::vector<SDValue> &Ops,
SelectionDAG &DAG) const override;
- unsigned getInlineAsmMemConstraint(
- const std::string &ConstraintCode) const override {
- // FIXME: Map different constraints differently.
- return InlineAsm::Constraint_m;
- }
-
const NVPTXTargetMachine *nvTM;
// PTX always uses 32-bit shift amounts
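
A hedged sketch of a caller after the isLegalAddressingMode change: the address space of the prospective access now travels with the query, so a target can accept different addressing modes per address space. The reg+imm mode below is illustrative only:

#include "llvm/Target/TargetLowering.h"

static bool regPlusImmIsLegal(const llvm::TargetLowering &TLI, llvm::Type *Ty,
                              unsigned AddrSpace) {
  llvm::TargetLowering::AddrMode AM;
  AM.HasBaseReg = true; // base register ...
  AM.BaseOffs = 16;     // ... plus a small immediate offset
  return TLI.isLegalAddressingMode(AM, Ty, AddrSpace);
}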
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXLowerKernelArgs.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXLowerKernelArgs.cpp
new file mode 100644
index 0000000..24dcb12
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXLowerKernelArgs.cpp
@@ -0,0 +1,170 @@
+//===-- NVPTXLowerKernelArgs.cpp - Lower kernel arguments -----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Pointer arguments to kernel functions need to be lowered specially.
+//
+// 1. Copy byval struct args to local memory. This is a preparation for handling
+// cases like
+//
+// kernel void foo(struct A arg, ...)
+// {
+// struct A *p = &arg;
+// ...
+// ... = p->filed1 ... (this is no generic address for .param)
+//        ... = p->field1 ...   (there is no generic address for .param)
+//        p->field2 = ...       (no write access to .param)
+//
+// 2. Convert non-byval pointer arguments of CUDA kernels to pointers in the
+// global address space. This allows later optimizations to emit
+// ld.global.*/st.global.* for accessing these pointer arguments. For
+// example,
+//
+// define void @foo(float* %input) {
+// %v = load float, float* %input, align 4
+// ...
+// }
+//
+// becomes
+//
+// define void @foo(float* %input) {
+// %input2 = addrspacecast float* %input to float addrspace(1)*
+// %input3 = addrspacecast float addrspace(1)* %input2 to float*
+// %v = load float, float* %input3, align 4
+// ...
+// }
+//
+// Later, NVPTXFavorNonGenericAddrSpaces will optimize it to
+//
+// define void @foo(float* %input) {
+// %input2 = addrspacecast float* %input to float addrspace(1)*
+// %v = load float, float addrspace(1)* %input2, align 4
+// ...
+// }
+//
+// TODO: merge this pass with NVPTXFavorNonGenericAddrSpaces so that other passes
+// don't cancel the addrspacecast pair this pass emits.
+//===----------------------------------------------------------------------===//
+
+#include "NVPTX.h"
+#include "NVPTXUtilities.h"
+#include "NVPTXTargetMachine.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Pass.h"
+
+using namespace llvm;
+
+namespace llvm {
+void initializeNVPTXLowerKernelArgsPass(PassRegistry &);
+}
+
+namespace {
+class NVPTXLowerKernelArgs : public FunctionPass {
+ bool runOnFunction(Function &F) override;
+
+ // handle byval parameters
+ void handleByValParam(Argument *);
+ // handle non-byval pointer parameters
+ void handlePointerParam(Argument *);
+
+public:
+ static char ID; // Pass identification, replacement for typeid
+ NVPTXLowerKernelArgs(const NVPTXTargetMachine *TM = nullptr)
+ : FunctionPass(ID), TM(TM) {}
+ const char *getPassName() const override {
+ return "Lower pointer arguments of CUDA kernels";
+ }
+
+private:
+ const NVPTXTargetMachine *TM;
+};
+} // namespace
+
+char NVPTXLowerKernelArgs::ID = 1;
+
+INITIALIZE_PASS(NVPTXLowerKernelArgs, "nvptx-lower-kernel-args",
+ "Lower kernel arguments (NVPTX)", false, false)
+
+// =============================================================================
+// If the function had a byval struct ptr arg, say foo(%struct.x *byval %d),
+// then add the following instructions to the first basic block:
+//
+// %temp = alloca %struct.x, align 8
+// %tempd = addrspacecast %struct.x* %d to %struct.x addrspace(101)*
+// %tv = load %struct.x addrspace(101)* %tempd
+// store %struct.x %tv, %struct.x* %temp, align 8
+//
+// The above code allocates some space in the stack and copies the incoming
+// struct from param space to local space.
+// Then replace all occurrences of %d by %temp.
+// =============================================================================
+void NVPTXLowerKernelArgs::handleByValParam(Argument *Arg) {
+ Function *Func = Arg->getParent();
+ Instruction *FirstInst = &(Func->getEntryBlock().front());
+ PointerType *PType = dyn_cast<PointerType>(Arg->getType());
+
+ assert(PType && "Expecting pointer type in handleByValParam");
+
+ Type *StructType = PType->getElementType();
+ AllocaInst *AllocA = new AllocaInst(StructType, Arg->getName(), FirstInst);
+ // Set the alignment to the alignment of the byval parameter, because later
+ // loads/stores assume that alignment and we are going to replace all uses
+ // of the byval parameter with this alloca instruction.
+ AllocA->setAlignment(Func->getParamAlignment(Arg->getArgNo() + 1));
+ Arg->replaceAllUsesWith(AllocA);
+
+ Value *ArgInParam = new AddrSpaceCastInst(
+ Arg, PointerType::get(StructType, ADDRESS_SPACE_PARAM), Arg->getName(),
+ FirstInst);
+ LoadInst *LI = new LoadInst(ArgInParam, Arg->getName(), FirstInst);
+ new StoreInst(LI, AllocA, FirstInst);
+}
+
+void NVPTXLowerKernelArgs::handlePointerParam(Argument *Arg) {
+ assert(!Arg->hasByValAttr() &&
+ "byval params should be handled by handleByValParam");
+
+ Instruction *FirstInst = Arg->getParent()->getEntryBlock().begin();
+ Instruction *ArgInGlobal = new AddrSpaceCastInst(
+ Arg, PointerType::get(Arg->getType()->getPointerElementType(),
+ ADDRESS_SPACE_GLOBAL),
+ Arg->getName(), FirstInst);
+ Value *ArgInGeneric = new AddrSpaceCastInst(ArgInGlobal, Arg->getType(),
+ Arg->getName(), FirstInst);
+ // Replace all uses of Arg with ArgInGeneric, except the use in ArgInGlobal.
+ Arg->replaceAllUsesWith(ArgInGeneric);
+ ArgInGlobal->setOperand(0, Arg);
+}
+
+
+// =============================================================================
+// Main function for this pass.
+// =============================================================================
+bool NVPTXLowerKernelArgs::runOnFunction(Function &F) {
+ // Skip non-kernels. See the comments at the top of this file.
+ if (!isKernelFunction(F))
+ return false;
+
+ for (Argument &Arg : F.args()) {
+ if (Arg.getType()->isPointerTy()) {
+ if (Arg.hasByValAttr())
+ handleByValParam(&Arg);
+ else if (TM && TM->getDrvInterface() == NVPTX::CUDA)
+ handlePointerParam(&Arg);
+ }
+ }
+ return true;
+}
+
+FunctionPass *
+llvm::createNVPTXLowerKernelArgsPass(const NVPTXTargetMachine *TM) {
+ return new NVPTXLowerKernelArgs(TM);
+}
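
A hypothetical driver for the new pass, only to make the TM parameter's role concrete; in-tree the pass is added by NVPTXTargetMachine::addIRPasses. With a null TM the pass only copies byval structs; with a CUDA-flavored target machine it also marks pointer arguments global:

#include "NVPTX.h"
#include "NVPTXTargetMachine.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"

static void lowerKernelArgs(llvm::Module &M, const llvm::NVPTXTargetMachine *TM) {
  llvm::legacy::PassManager PM;
  PM.add(llvm::createNVPTXLowerKernelArgsPass(TM)); // TM may be null
  PM.run(M);
}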
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXLowerStructArgs.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXLowerStructArgs.cpp
deleted file mode 100644
index 68dfbb7..0000000
--- a/contrib/llvm/lib/Target/NVPTX/NVPTXLowerStructArgs.cpp
+++ /dev/null
@@ -1,136 +0,0 @@
-//===-- NVPTXLowerStructArgs.cpp - Copy struct args to local memory =====--===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// Copy struct args to local memory. This is needed for kernel functions only.
-// This is a preparation for handling cases like
-//
-// kernel void foo(struct A arg, ...)
-// {
-// struct A *p = &arg;
-// ...
-// ... = p->filed1 ... (this is no generic address for .param)
-// p->filed2 = ... (this is no write access to .param)
-// }
-//
-//===----------------------------------------------------------------------===//
-
-#include "NVPTX.h"
-#include "NVPTXUtilities.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Type.h"
-#include "llvm/Pass.h"
-
-using namespace llvm;
-
-namespace llvm {
-void initializeNVPTXLowerStructArgsPass(PassRegistry &);
-}
-
-namespace {
-class NVPTXLowerStructArgs : public FunctionPass {
- bool runOnFunction(Function &F) override;
-
- void handleStructPtrArgs(Function &);
- void handleParam(Argument *);
-
-public:
- static char ID; // Pass identification, replacement for typeid
- NVPTXLowerStructArgs() : FunctionPass(ID) {}
- const char *getPassName() const override {
- return "Copy structure (byval *) arguments to stack";
- }
-};
-} // namespace
-
-char NVPTXLowerStructArgs::ID = 1;
-
-INITIALIZE_PASS(NVPTXLowerStructArgs, "nvptx-lower-struct-args",
- "Lower structure arguments (NVPTX)", false, false)
-
-void NVPTXLowerStructArgs::handleParam(Argument *Arg) {
- Function *Func = Arg->getParent();
- Instruction *FirstInst = &(Func->getEntryBlock().front());
- PointerType *PType = dyn_cast<PointerType>(Arg->getType());
-
- assert(PType && "Expecting pointer type in handleParam");
-
- Type *StructType = PType->getElementType();
- AllocaInst *AllocA = new AllocaInst(StructType, Arg->getName(), FirstInst);
-
- /* Set the alignment to alignment of the byval parameter. This is because,
- * later load/stores assume that alignment, and we are going to replace
- * the use of the byval parameter with this alloca instruction.
- */
- AllocA->setAlignment(Func->getParamAlignment(Arg->getArgNo() + 1));
-
- Arg->replaceAllUsesWith(AllocA);
-
- // Get the cvt.gen.to.param intrinsic
- Type *CvtTypes[] = {
- Type::getInt8PtrTy(Func->getParent()->getContext(), ADDRESS_SPACE_PARAM),
- Type::getInt8PtrTy(Func->getParent()->getContext(),
- ADDRESS_SPACE_GENERIC)};
- Function *CvtFunc = Intrinsic::getDeclaration(
- Func->getParent(), Intrinsic::nvvm_ptr_gen_to_param, CvtTypes);
-
- Value *BitcastArgs[] = {
- new BitCastInst(Arg, Type::getInt8PtrTy(Func->getParent()->getContext(),
- ADDRESS_SPACE_GENERIC),
- Arg->getName(), FirstInst)};
- CallInst *CallCVT =
- CallInst::Create(CvtFunc, BitcastArgs, "cvt_to_param", FirstInst);
-
- BitCastInst *BitCast = new BitCastInst(
- CallCVT, PointerType::get(StructType, ADDRESS_SPACE_PARAM),
- Arg->getName(), FirstInst);
- LoadInst *LI = new LoadInst(BitCast, Arg->getName(), FirstInst);
- new StoreInst(LI, AllocA, FirstInst);
-}
-
-// =============================================================================
-// If the function had a struct ptr arg, say foo(%struct.x *byval %d), then
-// add the following instructions to the first basic block :
-//
-// %temp = alloca %struct.x, align 8
-// %tt1 = bitcast %struct.x * %d to i8 *
-// %tt2 = llvm.nvvm.cvt.gen.to.param %tt2
-// %tempd = bitcast i8 addrspace(101) * to %struct.x addrspace(101) *
-// %tv = load %struct.x addrspace(101) * %tempd
-// store %struct.x %tv, %struct.x * %temp, align 8
-//
-// The above code allocates some space in the stack and copies the incoming
-// struct from param space to local space.
-// Then replace all occurences of %d by %temp.
-// =============================================================================
-void NVPTXLowerStructArgs::handleStructPtrArgs(Function &F) {
- for (Argument &Arg : F.args()) {
- if (Arg.getType()->isPointerTy() && Arg.hasByValAttr()) {
- handleParam(&Arg);
- }
- }
-}
-
-// =============================================================================
-// Main function for this pass.
-// =============================================================================
-bool NVPTXLowerStructArgs::runOnFunction(Function &F) {
- // Skip non-kernels. See the comments at the top of this file.
- if (!isKernelFunction(F))
- return false;
-
- handleStructPtrArgs(F);
- return true;
-}
-
-FunctionPass *llvm::createNVPTXLowerStructArgsPass() {
- return new NVPTXLowerStructArgs();
-}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXMCExpr.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXMCExpr.cpp
index 779b65e..3c98b9f 100644
--- a/contrib/llvm/lib/Target/NVPTX/NVPTXMCExpr.cpp
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXMCExpr.cpp
@@ -16,11 +16,11 @@ using namespace llvm;
#define DEBUG_TYPE "nvptx-mcexpr"
const NVPTXFloatMCExpr*
-NVPTXFloatMCExpr::Create(VariantKind Kind, APFloat Flt, MCContext &Ctx) {
+NVPTXFloatMCExpr::create(VariantKind Kind, APFloat Flt, MCContext &Ctx) {
return new (Ctx) NVPTXFloatMCExpr(Kind, Flt);
}
-void NVPTXFloatMCExpr::PrintImpl(raw_ostream &OS) const {
+void NVPTXFloatMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
bool Ignored;
unsigned NumHex;
APFloat APF = getAPFloat();
@@ -47,11 +47,14 @@ void NVPTXFloatMCExpr::PrintImpl(raw_ostream &OS) const {
}
const NVPTXGenericMCSymbolRefExpr*
-NVPTXGenericMCSymbolRefExpr::Create(const MCSymbolRefExpr *SymExpr,
+NVPTXGenericMCSymbolRefExpr::create(const MCSymbolRefExpr *SymExpr,
MCContext &Ctx) {
return new (Ctx) NVPTXGenericMCSymbolRefExpr(SymExpr);
}
-void NVPTXGenericMCSymbolRefExpr::PrintImpl(raw_ostream &OS) const {
- OS << "generic(" << *SymExpr << ")";
+void NVPTXGenericMCSymbolRefExpr::printImpl(raw_ostream &OS,
+ const MCAsmInfo *MAI) const {
+ OS << "generic(";
+ SymExpr->print(OS, MAI);
+ OS << ")";
}
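
For reference, PTX spells floating-point immediates as their IEEE bit pattern behind a "0f"/"0d" tag, which is what printImpl emits. A minimal sketch of the single-precision form (the real code additionally zero-pads to 8 or 16 hex digits via utohexstr):

#include "llvm/ADT/APFloat.h"
#include "llvm/Support/raw_ostream.h"

static void printPTXFloatLiteral(float F, llvm::raw_ostream &OS) {
  llvm::APFloat APF(F);
  OS << "0f";
  OS.write_hex(APF.bitcastToAPInt().getZExtValue()); // 1.0f -> 0f3f800000
}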
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXMCExpr.h b/contrib/llvm/lib/Target/NVPTX/NVPTXMCExpr.h
index 8c6b219..46b4b33 100644
--- a/contrib/llvm/lib/Target/NVPTX/NVPTXMCExpr.h
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXMCExpr.h
@@ -36,17 +36,17 @@ public:
/// @name Construction
/// @{
- static const NVPTXFloatMCExpr *Create(VariantKind Kind, APFloat Flt,
+ static const NVPTXFloatMCExpr *create(VariantKind Kind, APFloat Flt,
MCContext &Ctx);
- static const NVPTXFloatMCExpr *CreateConstantFPSingle(APFloat Flt,
+ static const NVPTXFloatMCExpr *createConstantFPSingle(APFloat Flt,
MCContext &Ctx) {
- return Create(VK_NVPTX_SINGLE_PREC_FLOAT, Flt, Ctx);
+ return create(VK_NVPTX_SINGLE_PREC_FLOAT, Flt, Ctx);
}
- static const NVPTXFloatMCExpr *CreateConstantFPDouble(APFloat Flt,
+ static const NVPTXFloatMCExpr *createConstantFPDouble(APFloat Flt,
MCContext &Ctx) {
- return Create(VK_NVPTX_DOUBLE_PREC_FLOAT, Flt, Ctx);
+ return create(VK_NVPTX_DOUBLE_PREC_FLOAT, Flt, Ctx);
}
/// @}
@@ -61,14 +61,14 @@ public:
/// @}
- void PrintImpl(raw_ostream &OS) const override;
- bool EvaluateAsRelocatableImpl(MCValue &Res,
+ void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override;
+ bool evaluateAsRelocatableImpl(MCValue &Res,
const MCAsmLayout *Layout,
const MCFixup *Fixup) const override {
return false;
}
void visitUsedExpr(MCStreamer &Streamer) const override {};
- MCSection *FindAssociatedSection() const override { return nullptr; }
+ MCSection *findAssociatedSection() const override { return nullptr; }
// There are no TLS NVPTXMCExprs at the moment.
void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const override {}
@@ -92,7 +92,7 @@ public:
/// @{
static const NVPTXGenericMCSymbolRefExpr
- *Create(const MCSymbolRefExpr *SymExpr, MCContext &Ctx);
+ *create(const MCSymbolRefExpr *SymExpr, MCContext &Ctx);
/// @}
/// @name Accessors
@@ -103,14 +103,14 @@ public:
/// @}
- void PrintImpl(raw_ostream &OS) const override;
- bool EvaluateAsRelocatableImpl(MCValue &Res,
+ void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override;
+ bool evaluateAsRelocatableImpl(MCValue &Res,
const MCAsmLayout *Layout,
const MCFixup *Fixup) const override {
return false;
}
void visitUsedExpr(MCStreamer &Streamer) const override {};
- MCSection *FindAssociatedSection() const override { return nullptr; }
+ MCSection *findAssociatedSection() const override { return nullptr; }
// There are no TLS NVPTXMCExprs at the moment.
void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const override {}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
index ac27c30..a646668 100644
--- a/contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -53,7 +53,7 @@ void initializeGenericToNVVMPass(PassRegistry&);
void initializeNVPTXAllocaHoistingPass(PassRegistry &);
void initializeNVPTXAssignValidGlobalNamesPass(PassRegistry&);
void initializeNVPTXFavorNonGenericAddrSpacesPass(PassRegistry &);
-void initializeNVPTXLowerStructArgsPass(PassRegistry &);
+void initializeNVPTXLowerKernelArgsPass(PassRegistry &);
}
extern "C" void LLVMInitializeNVPTXTarget() {
@@ -69,7 +69,7 @@ extern "C" void LLVMInitializeNVPTXTarget() {
initializeNVPTXAssignValidGlobalNamesPass(*PassRegistry::getPassRegistry());
initializeNVPTXFavorNonGenericAddrSpacesPass(
*PassRegistry::getPassRegistry());
- initializeNVPTXLowerStructArgsPass(*PassRegistry::getPassRegistry());
+ initializeNVPTXLowerKernelArgsPass(*PassRegistry::getPassRegistry());
}
static std::string computeDataLayout(bool is64Bit) {
@@ -163,7 +163,13 @@ void NVPTXPassConfig::addIRPasses() {
TargetPassConfig::addIRPasses();
addPass(createNVPTXAssignValidGlobalNamesPass());
addPass(createGenericToNVVMPass());
+ addPass(createNVPTXLowerKernelArgsPass(&getNVPTXTargetMachine()));
addPass(createNVPTXFavorNonGenericAddrSpacesPass());
+ // NVPTXLowerKernelArgs emits alloca for byval parameters which can often
+ // be eliminated by SROA. We do not run SROA right after NVPTXLowerKernelArgs
+ // because we plan to merge NVPTXLowerKernelArgs and
+ // NVPTXFavorNonGenericAddrSpaces into one pass.
+ addPass(createSROAPass());
  // FavorNonGenericAddrSpaces shortcuts unnecessary addrspacecasts, and leaves
// them unused. We could remove dead code in an ad-hoc manner, but that
// requires manual work and might be error-prone.
@@ -181,6 +187,9 @@ void NVPTXPassConfig::addIRPasses() {
addPass(createEarlyCSEPass());
// Run NaryReassociate after EarlyCSE/GVN to be more effective.
addPass(createNaryReassociatePass());
+ // NaryReassociate on GEPs creates redundant common expressions, so run
+ // EarlyCSE after it.
+ addPass(createEarlyCSEPass());
}
bool NVPTXPassConfig::addInstSelector() {
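
Recapping the two NVPTXTargetMachine.cpp hunks above, a sketch (not the full addIRPasses body) of the IR pass order they establish; all factory functions are the ones referenced in this diff:

#include "NVPTX.h"
#include "NVPTXTargetMachine.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/Transforms/Scalar.h"

static void addNVPTXIRPassOrder(llvm::legacy::PassManagerBase &PM,
                                llvm::NVPTXTargetMachine &TM) {
  PM.add(llvm::createNVPTXLowerKernelArgsPass(&TM)); // byval allocas + cast pairs
  PM.add(llvm::createNVPTXFavorNonGenericAddrSpacesPass());
  PM.add(llvm::createSROAPass());            // cleans up the byval allocas
  PM.add(llvm::createEarlyCSEPass());
  PM.add(llvm::createNaryReassociatePass());
  PM.add(llvm::createEarlyCSEPass());        // re-CSE after NaryReassociate
}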