1 files changed, 272 insertions, 102 deletions
diff --git a/contrib/llvm/tools/clang/lib/Driver/Driver.cpp b/contrib/llvm/tools/clang/lib/Driver/Driver.cpp
index b9dc35d..180c412 100644
--- a/contrib/llvm/tools/clang/lib/Driver/Driver.cpp
+++ b/contrib/llvm/tools/clang/lib/Driver/Driver.cpp
@@ -174,8 +174,10 @@ phases::ID Driver::getFinalPhase(const DerivedArgList &DAL,
   } else if ((PhaseArg = DAL.getLastArg(options::OPT_S))) {
     FinalPhase = phases::Backend;
 
-    // -c only runs up to the assembler.
-  } else if ((PhaseArg = DAL.getLastArg(options::OPT_c))) {
+    // -c and partial CUDA compilations only run up to the assembler.
+  } else if ((PhaseArg = DAL.getLastArg(options::OPT_c)) ||
+             (PhaseArg = DAL.getLastArg(options::OPT_cuda_device_only)) ||
+             (PhaseArg = DAL.getLastArg(options::OPT_cuda_host_only))) {
     FinalPhase = phases::Assemble;
 
     // Otherwise do everything.
@@ -281,6 +283,83 @@ DerivedArgList *Driver::TranslateInputArgs(const InputArgList &Args) const {
   return DAL;
 }
 
+/// \brief Compute target triple from args.
+///
+/// This routine provides the logic to compute a target triple from various
+/// args passed to the driver and the default triple string.
+static llvm::Triple computeTargetTriple(StringRef DefaultTargetTriple,
+                                        const ArgList &Args,
+                                        StringRef DarwinArchName = "") {
+  // FIXME: Already done in Compilation *Driver::BuildCompilation
+  if (const Arg *A = Args.getLastArg(options::OPT_target))
+    DefaultTargetTriple = A->getValue();
+
+  llvm::Triple Target(llvm::Triple::normalize(DefaultTargetTriple));
+
+  // Handle Apple-specific options available here.
+  if (Target.isOSBinFormatMachO()) {
+    // If an explict Darwin arch name is given, that trumps all.
+    if (!DarwinArchName.empty()) {
+      tools::darwin::setTripleTypeForMachOArchName(Target, DarwinArchName);
+      return Target;
+    }
+
+    // Handle the Darwin '-arch' flag.
+    if (Arg *A = Args.getLastArg(options::OPT_arch)) {
+      StringRef ArchName = A->getValue();
+      tools::darwin::setTripleTypeForMachOArchName(Target, ArchName);
+    }
+  }
+
+  // Handle pseudo-target flags '-mlittle-endian'/'-EL' and
+  // '-mbig-endian'/'-EB'.
+  if (Arg *A = Args.getLastArg(options::OPT_mlittle_endian,
+                               options::OPT_mbig_endian)) {
+    if (A->getOption().matches(options::OPT_mlittle_endian)) {
+      llvm::Triple LE = Target.getLittleEndianArchVariant();
+      if (LE.getArch() != llvm::Triple::UnknownArch)
+        Target = std::move(LE);
+    } else {
+      llvm::Triple BE = Target.getBigEndianArchVariant();
+      if (BE.getArch() != llvm::Triple::UnknownArch)
+        Target = std::move(BE);
+    }
+  }
+
+  // Skip further flag support on OSes which don't support '-m32' or '-m64'.
+  if (Target.getArchName() == "tce" || Target.getOS() == llvm::Triple::Minix)
+    return Target;
+
+  // Handle pseudo-target flags '-m64', '-mx32', '-m32' and '-m16'.
+  if (Arg *A = Args.getLastArg(options::OPT_m64, options::OPT_mx32,
+                               options::OPT_m32, options::OPT_m16)) {
+    llvm::Triple::ArchType AT = llvm::Triple::UnknownArch;
+
+    if (A->getOption().matches(options::OPT_m64)) {
+      AT = Target.get64BitArchVariant().getArch();
+      if (Target.getEnvironment() == llvm::Triple::GNUX32)
+        Target.setEnvironment(llvm::Triple::GNU);
+    } else if (A->getOption().matches(options::OPT_mx32) &&
+               Target.get64BitArchVariant().getArch() == llvm::Triple::x86_64) {
+      AT = llvm::Triple::x86_64;
+      Target.setEnvironment(llvm::Triple::GNUX32);
+    } else if (A->getOption().matches(options::OPT_m32)) {
+      AT = Target.get32BitArchVariant().getArch();
+      if (Target.getEnvironment() == llvm::Triple::GNUX32)
+        Target.setEnvironment(llvm::Triple::GNU);
+    } else if (A->getOption().matches(options::OPT_m16) &&
+               Target.get32BitArchVariant().getArch() == llvm::Triple::x86) {
+      AT = llvm::Triple::x86;
+      Target.setEnvironment(llvm::Triple::CODE16);
+    }
+
+    if (AT != llvm::Triple::UnknownArch && AT != Target.getArch())
+      Target.setArch(AT);
+  }
+
+  return Target;
+}
+
 Compilation *Driver::BuildCompilation(ArrayRef<const char *> ArgList) {
   llvm::PrettyStackTraceString CrashInfo("Compilation construction");
 
@@ -367,7 +446,8 @@ Compilation *Driver::BuildCompilation(ArrayRef<const char *> ArgList) {
   DerivedArgList *TranslatedArgs = TranslateInputArgs(*UArgs);
 
   // Owned by the host.
-  const ToolChain &TC = getToolChain(*UArgs);
+  const ToolChain &TC =
+      getToolChain(*UArgs, computeTargetTriple(DefaultTargetTriple, *UArgs));
 
   // The compilation takes ownership of Args.
   Compilation *C = new Compilation(*this, TC, UArgs.release(), TranslatedArgs);
@@ -398,6 +478,19 @@ Compilation *Driver::BuildCompilation(ArrayRef<const char *> ArgList) {
   return C;
 }
 
+static void printArgList(raw_ostream &OS, const llvm::opt::ArgList &Args) {
+  llvm::opt::ArgStringList ASL;
+  for (const auto *A : Args)
+    A->render(Args, ASL);
+
+  for (auto I = ASL.begin(), E = ASL.end(); I != E; ++I) {
+    if (I != ASL.begin())
+      OS << ' ';
+    Command::printArg(OS, *I, true);
+  }
+  OS << '\n';
+}
+
 // When clang crashes, produce diagnostic information including the fully
 // preprocessed source file(s).  Request that the developer attach the
 // diagnostic information to a bug report.
@@ -546,7 +639,9 @@ void Driver::generateCompilationDiagnostics(Compilation &C,
         << "Error generating run script: " + Script + " " + EC.message();
   } else {
     ScriptOS << "# Crash reproducer for " << getClangFullVersion() << "\n"
-             << "# Original command: ";
+             << "# Driver args: ";
+    printArgList(ScriptOS, C.getInputArgs());
+    ScriptOS << "# Original command: ";
     Cmd.Print(ScriptOS, "\n", /*Quote=*/true);
     Cmd.Print(ScriptOS, "\n", /*Quote=*/true, &CrashInfo);
     Diag(clang::diag::note_drv_command_failed_diag_msg) << Script;
@@ -787,14 +882,6 @@ bool Driver::HandleImmediateArgs(const Compilation &C) {
     }
     return false;
   }
-
-  if (C.getArgs().hasArg(options::OPT_print_multi_os_directory)) {
-    // FIXME: This should print out "lib/../lib", "lib/../lib64", or
-    // "lib/../lib32" as appropriate for the toolchain. For now, print
-    // nothing because it's not supported yet.
-    return false;
-  }
-
   return true;
 }
 
@@ -815,9 +902,20 @@ static unsigned PrintActions1(const Compilation &C, Action *A,
   } else if (BindArchAction *BIA = dyn_cast<BindArchAction>(A)) {
     os << '"' << BIA->getArchName() << '"' << ", {"
        << PrintActions1(C, *BIA->begin(), Ids) << "}";
+  } else if (CudaDeviceAction *CDA = dyn_cast<CudaDeviceAction>(A)) {
+    os << '"' << CDA->getGpuArchName() << '"' << ", {"
+       << PrintActions1(C, *CDA->begin(), Ids) << "}";
   } else {
+    ActionList *AL;
+    if (CudaHostAction *CHA = dyn_cast<CudaHostAction>(A)) {
+      os << "{" << PrintActions1(C, *CHA->begin(), Ids) << "}"
+         << ", gpu binaries ";
+      AL = &CHA->getDeviceActions();
+    } else
+      AL = &A->getInputs();
+
     const char *Prefix = "{";
-    for (Action *PreRequisite : *A) {
+    for (Action *PreRequisite : *AL) {
       os << Prefix << PrintActions1(C, PreRequisite, Ids);
       Prefix = ", ";
     }
@@ -1130,6 +1228,93 @@ void Driver::BuildInputs(const ToolChain &TC, DerivedArgList &Args,
   }
 }
 
+// For each unique --cuda-gpu-arch= argument creates a TY_CUDA_DEVICE input
+// action and then wraps each in CudaDeviceAction paired with appropriate GPU
+// arch name. If we're only building device-side code, each action remains
+// independent. Otherwise we pass device-side actions as inputs to a new
+// CudaHostAction which combines both host and device side actions.
+static std::unique_ptr<Action>
+buildCudaActions(const Driver &D, const ToolChain &TC, DerivedArgList &Args,
+                 const Arg *InputArg, const types::ID InputType,
+                 std::unique_ptr<Action> Current, ActionList &Actions) {
+
+  assert(InputType == types::TY_CUDA &&
+         "CUDA Actions only apply to CUDA inputs.");
+
+  // Collect all cuda_gpu_arch parameters, removing duplicates.
+  SmallVector<const char *, 4> GpuArchList;
+  llvm::StringSet<> GpuArchNames;
+  for (Arg *A : Args) {
+    if (A->getOption().matches(options::OPT_cuda_gpu_arch_EQ)) {
+      A->claim();
+      if (GpuArchNames.insert(A->getValue()).second)
+        GpuArchList.push_back(A->getValue());
+    }
+  }
+
+  // Default to sm_20 which is the lowest common denominator for supported GPUs.
+  // sm_20 code should work correctly, if suboptimally, on all newer GPUs.
+  if (GpuArchList.empty())
+    GpuArchList.push_back("sm_20");
+
+  // Replicate inputs for each GPU architecture.
+  Driver::InputList CudaDeviceInputs;
+  for (unsigned i = 0, e = GpuArchList.size(); i != e; ++i)
+    CudaDeviceInputs.push_back(std::make_pair(types::TY_CUDA_DEVICE, InputArg));
+
+  // Build actions for all device inputs.
+  ActionList CudaDeviceActions;
+  D.BuildActions(TC, Args, CudaDeviceInputs, CudaDeviceActions);
+  assert(GpuArchList.size() == CudaDeviceActions.size() &&
+         "Failed to create actions for all devices");
+
+  // Check whether any of device actions stopped before they could generate PTX.
+  bool PartialCompilation = false;
+  bool DeviceOnlyCompilation = Args.hasArg(options::OPT_cuda_device_only);
+  for (unsigned i = 0, e = GpuArchList.size(); i != e; ++i) {
+    if (CudaDeviceActions[i]->getKind() != Action::BackendJobClass) {
+      PartialCompilation = true;
+      break;
+    }
+  }
+
+  // Figure out what to do with device actions -- pass them as inputs to the
+  // host action or run each of them independently.
+  if (PartialCompilation || DeviceOnlyCompilation) {
+    // In case of partial or device-only compilation results of device actions
+    // are not consumed by the host action device actions have to be added to
+    // top-level actions list with AtTopLevel=true and run independently.
+
+    // -o is ambiguous if we have more than one top-level action.
+    if (Args.hasArg(options::OPT_o) &&
+        (!DeviceOnlyCompilation || GpuArchList.size() > 1)) {
+      D.Diag(clang::diag::err_drv_output_argument_with_multiple_files);
+      return nullptr;
+    }
+
+    for (unsigned i = 0, e = GpuArchList.size(); i != e; ++i)
+      Actions.push_back(
+          new CudaDeviceAction(std::unique_ptr<Action>(CudaDeviceActions[i]),
+                               GpuArchList[i], /* AtTopLevel */ true));
+    // Kill host action in case of device-only compilation.
+    if (DeviceOnlyCompilation)
+      Current.reset(nullptr);
+    return Current;
+  } else {
+    // Outputs of device actions during complete CUDA compilation get created
+    // with AtTopLevel=false and become inputs for the host action.
+    ActionList DeviceActions;
+    for (unsigned i = 0, e = GpuArchList.size(); i != e; ++i)
+      DeviceActions.push_back(
+          new CudaDeviceAction(std::unique_ptr<Action>(CudaDeviceActions[i]),
+                               GpuArchList[i], /* AtTopLevel */ false));
+    // Return a new host action that incorporates original host action and all
+    // device actions.
+    return std::unique_ptr<Action>(
+        new CudaHostAction(std::move(Current), DeviceActions));
+  }
+}
+
 void Driver::BuildActions(const ToolChain &TC, DerivedArgList &Args,
                           const InputList &Inputs, ActionList &Actions) const {
   llvm::PrettyStackTraceString CrashInfo("Building compilation actions");
@@ -1227,6 +1412,25 @@ void Driver::BuildActions(const ToolChain &TC, DerivedArgList &Args,
       continue;
     }
 
+    phases::ID CudaInjectionPhase;
+    if (isSaveTempsEnabled()) {
+      // All phases are done independently, inject GPU blobs during compilation
+      // phase as that's where we generate glue code to init them.
+      CudaInjectionPhase = phases::Compile;
+    } else {
+      // Assumes that clang does everything up until linking phase, so we inject
+      // cuda device actions at the last step before linking. Otherwise CUDA
+      // host action forces preprocessor into a separate invocation.
+      if (FinalPhase == phases::Link) {
+        for (auto i = PL.begin(), e = PL.end(); i != e; ++i) {
+          auto next = i + 1;
+          if (next != e && *next == phases::Link)
+            CudaInjectionPhase = *i;
+        }
+      } else
+        CudaInjectionPhase = FinalPhase;
+    }
+
     // Build the pipeline for this file.
     std::unique_ptr<Action> Current(new InputAction(*InputArg, InputType));
     for (SmallVectorImpl<phases::ID>::iterator i = PL.begin(), e = PL.end();
@@ -1252,6 +1456,15 @@ void Driver::BuildActions(const ToolChain &TC, DerivedArgList &Args,
 
       // Otherwise construct the appropriate action.
       Current = ConstructPhaseAction(TC, Args, Phase, std::move(Current));
+
+      if (InputType == types::TY_CUDA && Phase == CudaInjectionPhase &&
+          !Args.hasArg(options::OPT_cuda_host_only)) {
+        Current = buildCudaActions(*this, TC, Args, InputArg, InputType,
+                                   std::move(Current), Actions);
+        if (!Current)
+          break;
+      }
+
       if (Current->getType() == types::TY_Nothing)
         break;
     }
@@ -1491,7 +1704,13 @@ static const Tool *SelectToolForJob(Compilation &C, bool SaveTemps,
   if (isa<BackendJobAction>(JA)) {
     // Check if the compiler supports emitting LLVM IR.
     assert(Inputs->size() == 1);
-    JobAction *CompileJA = cast<CompileJobAction>(*Inputs->begin());
+    JobAction *CompileJA;
+    // Extract real host action, if it's a CudaHostAction.
+    if (CudaHostAction *CudaHA = dyn_cast<CudaHostAction>(*Inputs->begin()))
+      CompileJA = cast<CompileJobAction>(*CudaHA->begin());
+    else
+      CompileJA = cast<CompileJobAction>(*Inputs->begin());
+
     const Tool *Compiler = TC->SelectTool(*CompileJA);
     if (!Compiler)
       return nullptr;
@@ -1525,6 +1744,20 @@ void Driver::BuildJobsForAction(Compilation &C, const Action *A,
                                 InputInfo &Result) const {
   llvm::PrettyStackTraceString CrashInfo("Building compilation jobs");
 
+  InputInfoList CudaDeviceInputInfos;
+  if (const CudaHostAction *CHA = dyn_cast<CudaHostAction>(A)) {
+    InputInfo II;
+    // Append outputs of device jobs to the input list.
+    for (const Action *DA : CHA->getDeviceActions()) {
+      BuildJobsForAction(C, DA, TC, "", AtTopLevel,
+                         /*MultipleArchs*/ false, LinkingOutput, II);
+      CudaDeviceInputInfos.push_back(II);
+    }
+    // Override current action with a real host compile action and continue
+    // processing it.
+    A = *CHA->begin();
+  }
+
   if (const InputAction *IA = dyn_cast<InputAction>(A)) {
     // FIXME: It would be nice to not claim this here; maybe the old scheme of
     // just using Args was better?
@@ -1544,15 +1777,30 @@ void Driver::BuildJobsForAction(Compilation &C, const Action *A,
     const char *ArchName = BAA->getArchName();
 
     if (ArchName)
-      TC = &getToolChain(C.getArgs(), ArchName);
+      TC = &getToolChain(
+          C.getArgs(),
+          computeTargetTriple(DefaultTargetTriple, C.getArgs(), ArchName));
     else
       TC = &C.getDefaultToolChain();
 
-    BuildJobsForAction(C, *BAA->begin(), TC, BAA->getArchName(), AtTopLevel,
+    BuildJobsForAction(C, *BAA->begin(), TC, ArchName, AtTopLevel,
                        MultipleArchs, LinkingOutput, Result);
     return;
   }
 
+  if (const CudaDeviceAction *CDA = dyn_cast<CudaDeviceAction>(A)) {
+    // Figure out which NVPTX triple to use for device-side compilation based on
+    // whether host is 64-bit.
+    llvm::Triple DeviceTriple(C.getDefaultToolChain().getTriple().isArch64Bit()
+                                  ? "nvptx64-nvidia-cuda"
+                                  : "nvptx-nvidia-cuda");
+    BuildJobsForAction(C, *CDA->begin(),
+                       &getToolChain(C.getArgs(), DeviceTriple),
+                       CDA->getGpuArchName(), CDA->isAtTopLevel(),
+                       /*MultipleArchs*/ true, LinkingOutput, Result);
+    return;
+  }
+
   const ActionList *Inputs = &A->getInputs();
 
   const JobAction *JA = cast<JobAction>(A);
@@ -1584,6 +1832,10 @@ void Driver::BuildJobsForAction(Compilation &C, const Action *A,
   if (JA->getType() == types::TY_dSYM)
     BaseInput = InputInfos[0].getFilename();
 
+  // Append outputs of cuda device jobs to the input list
+  if (CudaDeviceInputInfos.size())
+    InputInfos.append(CudaDeviceInputInfos.begin(), CudaDeviceInputInfos.end());
+
   // Determine the place to write output to, if any.
   if (JA->getType() == types::TY_Nothing)
     Result = InputInfo(A->getType(), BaseInput);
@@ -1899,93 +2151,8 @@ std::string Driver::GetTemporaryPath(StringRef Prefix,
   return Path.str();
 }
 
-/// \brief Compute target triple from args.
-///
-/// This routine provides the logic to compute a target triple from various
-/// args passed to the driver and the default triple string.
-static llvm::Triple computeTargetTriple(StringRef DefaultTargetTriple,
-                                        const ArgList &Args,
-                                        StringRef DarwinArchName) {
-  // FIXME: Already done in Compilation *Driver::BuildCompilation
-  if (const Arg *A = Args.getLastArg(options::OPT_target))
-    DefaultTargetTriple = A->getValue();
-
-  llvm::Triple Target(llvm::Triple::normalize(DefaultTargetTriple));
-
-  // Handle Apple-specific options available here.
-  if (Target.isOSBinFormatMachO()) {
-    // If an explict Darwin arch name is given, that trumps all.
-    if (!DarwinArchName.empty()) {
-      tools::darwin::setTripleTypeForMachOArchName(Target, DarwinArchName);
-      return Target;
-    }
-
-    // Handle the Darwin '-arch' flag.
-    if (Arg *A = Args.getLastArg(options::OPT_arch)) {
-      StringRef ArchName = A->getValue();
-      tools::darwin::setTripleTypeForMachOArchName(Target, ArchName);
-    }
-  }
-
-  // Handle pseudo-target flags '-mlittle-endian'/'-EL' and
-  // '-mbig-endian'/'-EB'.
-  if (Arg *A = Args.getLastArg(options::OPT_mlittle_endian,
-                               options::OPT_mbig_endian)) {
-    if (A->getOption().matches(options::OPT_mlittle_endian)) {
-      if (Target.getArch() == llvm::Triple::mips)
-        Target.setArch(llvm::Triple::mipsel);
-      else if (Target.getArch() == llvm::Triple::mips64)
-        Target.setArch(llvm::Triple::mips64el);
-      else if (Target.getArch() == llvm::Triple::aarch64_be)
-        Target.setArch(llvm::Triple::aarch64);
-    } else {
-      if (Target.getArch() == llvm::Triple::mipsel)
-        Target.setArch(llvm::Triple::mips);
-      else if (Target.getArch() == llvm::Triple::mips64el)
-        Target.setArch(llvm::Triple::mips64);
-      else if (Target.getArch() == llvm::Triple::aarch64)
-        Target.setArch(llvm::Triple::aarch64_be);
-    }
-  }
-
-  // Skip further flag support on OSes which don't support '-m32' or '-m64'.
-  if (Target.getArchName() == "tce" || Target.getOS() == llvm::Triple::Minix)
-    return Target;
-
-  // Handle pseudo-target flags '-m64', '-mx32', '-m32' and '-m16'.
-  if (Arg *A = Args.getLastArg(options::OPT_m64, options::OPT_mx32,
-                               options::OPT_m32, options::OPT_m16)) {
-    llvm::Triple::ArchType AT = llvm::Triple::UnknownArch;
-
-    if (A->getOption().matches(options::OPT_m64)) {
-      AT = Target.get64BitArchVariant().getArch();
-      if (Target.getEnvironment() == llvm::Triple::GNUX32)
-        Target.setEnvironment(llvm::Triple::GNU);
-    } else if (A->getOption().matches(options::OPT_mx32) &&
-               Target.get64BitArchVariant().getArch() == llvm::Triple::x86_64) {
-      AT = llvm::Triple::x86_64;
-      Target.setEnvironment(llvm::Triple::GNUX32);
-    } else if (A->getOption().matches(options::OPT_m32)) {
-      AT = Target.get32BitArchVariant().getArch();
-      if (Target.getEnvironment() == llvm::Triple::GNUX32)
-        Target.setEnvironment(llvm::Triple::GNU);
-    } else if (A->getOption().matches(options::OPT_m16) &&
-               Target.get32BitArchVariant().getArch() == llvm::Triple::x86) {
-      AT = llvm::Triple::x86;
-      Target.setEnvironment(llvm::Triple::CODE16);
-    }
-
-    if (AT != llvm::Triple::UnknownArch && AT != Target.getArch())
-      Target.setArch(AT);
-  }
-
-  return Target;
-}
-
 const ToolChain &Driver::getToolChain(const ArgList &Args,
-                                      StringRef DarwinArchName) const {
-  llvm::Triple Target =
-      computeTargetTriple(DefaultTargetTriple, Args, DarwinArchName);
+                                      const llvm::Triple &Target) const {
 
   ToolChain *&TC = ToolChains[Target.str()];
   if (!TC) {
@@ -2050,6 +2217,9 @@ const ToolChain &Driver::getToolChain(const ArgList &Args,
         break;
       }
       break;
+    case llvm::Triple::CUDA:
+      TC = new toolchains::CudaToolChain(*this, Target, Args);
+      break;
     default:
       // Of these targets, Hexagon is the only one that might have
       // an OS of Linux, in which case it got handled above already.