Diffstat (limited to 'contrib/llvm/tools/clang/lib/Sema/SemaCUDA.cpp')
-rw-r--r-- | contrib/llvm/tools/clang/lib/Sema/SemaCUDA.cpp | 522
1 file changed, 475 insertions, 47 deletions
diff --git a/contrib/llvm/tools/clang/lib/Sema/SemaCUDA.cpp b/contrib/llvm/tools/clang/lib/Sema/SemaCUDA.cpp
index 90af6d5..282633b 100644
--- a/contrib/llvm/tools/clang/lib/Sema/SemaCUDA.cpp
+++ b/contrib/llvm/tools/clang/lib/Sema/SemaCUDA.cpp
@@ -18,11 +18,25 @@
 #include "clang/Sema/Lookup.h"
 #include "clang/Sema/Sema.h"
 #include "clang/Sema/SemaDiagnostic.h"
+#include "clang/Sema/SemaInternal.h"
 #include "clang/Sema/Template.h"
 #include "llvm/ADT/Optional.h"
 #include "llvm/ADT/SmallVector.h"
 using namespace clang;
+void Sema::PushForceCUDAHostDevice() {
+  assert(getLangOpts().CUDA && "Should only be called during CUDA compilation");
+  ForceCUDAHostDeviceDepth++;
+}
+
+bool Sema::PopForceCUDAHostDevice() {
+  assert(getLangOpts().CUDA && "Should only be called during CUDA compilation");
+  if (ForceCUDAHostDeviceDepth == 0)
+    return false;
+  ForceCUDAHostDeviceDepth--;
+  return true;
+}
+
 ExprResult Sema::ActOnCUDAExecConfigExpr(Scope *S, SourceLocation LLLLoc,
                                          MultiExprArg ExecConfig,
                                          SourceLocation GGGLoc) {
@@ -40,21 +54,73 @@ ExprResult Sema::ActOnCUDAExecConfigExpr(Scope *S, SourceLocation LLLLoc,
                        /*IsExecConfig=*/true);
 }
+Sema::CUDAFunctionTarget Sema::IdentifyCUDATarget(const AttributeList *Attr) {
+  bool HasHostAttr = false;
+  bool HasDeviceAttr = false;
+  bool HasGlobalAttr = false;
+  bool HasInvalidTargetAttr = false;
+  while (Attr) {
+    switch(Attr->getKind()){
+    case AttributeList::AT_CUDAGlobal:
+      HasGlobalAttr = true;
+      break;
+    case AttributeList::AT_CUDAHost:
+      HasHostAttr = true;
+      break;
+    case AttributeList::AT_CUDADevice:
+      HasDeviceAttr = true;
+      break;
+    case AttributeList::AT_CUDAInvalidTarget:
+      HasInvalidTargetAttr = true;
+      break;
+    default:
+      break;
+    }
+    Attr = Attr->getNext();
+  }
+  if (HasInvalidTargetAttr)
+    return CFT_InvalidTarget;
+
+  if (HasGlobalAttr)
+    return CFT_Global;
+
+  if (HasHostAttr && HasDeviceAttr)
+    return CFT_HostDevice;
+
+  if (HasDeviceAttr)
+    return CFT_Device;
+
+  return CFT_Host;
+}
+
+template <typename A>
+static bool hasAttr(const FunctionDecl *D, bool IgnoreImplicitAttr) {
+  return D->hasAttrs() && llvm::any_of(D->getAttrs(), [&](Attr *Attribute) {
+           return isa<A>(Attribute) &&
+                  !(IgnoreImplicitAttr && Attribute->isImplicit());
+         });
+}
+
 /// IdentifyCUDATarget - Determine the CUDA compilation target for this function
-Sema::CUDAFunctionTarget Sema::IdentifyCUDATarget(const FunctionDecl *D) {
+Sema::CUDAFunctionTarget Sema::IdentifyCUDATarget(const FunctionDecl *D,
+                                                  bool IgnoreImplicitHDAttr) {
+  // Code that lives outside a function is run on the host.
+  if (D == nullptr)
+    return CFT_Host;
+
   if (D->hasAttr<CUDAInvalidTargetAttr>())
     return CFT_InvalidTarget;
   if (D->hasAttr<CUDAGlobalAttr>())
     return CFT_Global;
-  if (D->hasAttr<CUDADeviceAttr>()) {
-    if (D->hasAttr<CUDAHostAttr>())
+  if (hasAttr<CUDADeviceAttr>(D, IgnoreImplicitHDAttr)) {
+    if (hasAttr<CUDAHostAttr>(D, IgnoreImplicitHDAttr))
       return CFT_HostDevice;
     return CFT_Device;
-  } else if (D->hasAttr<CUDAHostAttr>()) {
+  } else if (hasAttr<CUDAHostAttr>(D, IgnoreImplicitHDAttr)) {
     return CFT_Host;
-  } else if (D->isImplicit()) {
+  } else if (D->isImplicit() && !IgnoreImplicitHDAttr) {
     // Some implicit declarations (like intrinsic functions) are not marked.
     // Set the most lenient target on them for maximal flexibility.
     return CFT_HostDevice;
@@ -95,9 +161,8 @@ Sema::CUDAFunctionPreference
 Sema::IdentifyCUDAPreference(const FunctionDecl *Caller,
                              const FunctionDecl *Callee) {
   assert(Callee && "Callee must be valid.");
+  CUDAFunctionTarget CallerTarget = IdentifyCUDATarget(Caller);
   CUDAFunctionTarget CalleeTarget = IdentifyCUDATarget(Callee);
-  CUDAFunctionTarget CallerTarget =
-      (Caller != nullptr) ? IdentifyCUDATarget(Caller) : Sema::CFT_Host;
   // If one of the targets is invalid, the check always fails, no matter what
   // the other target is.
@@ -107,8 +172,7 @@ Sema::IdentifyCUDAPreference(const FunctionDecl *Caller,
   // (a) Can't call global from some contexts until we support CUDA's
   // dynamic parallelism.
   if (CalleeTarget == CFT_Global &&
-      (CallerTarget == CFT_Global || CallerTarget == CFT_Device ||
-       (CallerTarget == CFT_HostDevice && getLangOpts().CUDAIsDevice)))
+      (CallerTarget == CFT_Global || CallerTarget == CFT_Device))
     return CFP_Never;
   // (b) Calling HostDevice is OK for everyone.
@@ -145,52 +209,27 @@ Sema::IdentifyCUDAPreference(const FunctionDecl *Caller,
   llvm_unreachable("All cases should've been handled by now.");
 }
-template <typename T>
-static void EraseUnwantedCUDAMatchesImpl(
-    Sema &S, const FunctionDecl *Caller, llvm::SmallVectorImpl<T> &Matches,
-    std::function<const FunctionDecl *(const T &)> FetchDecl) {
+void Sema::EraseUnwantedCUDAMatches(
+    const FunctionDecl *Caller,
+    SmallVectorImpl<std::pair<DeclAccessPair, FunctionDecl *>> &Matches) {
   if (Matches.size() <= 1)
     return;
+  using Pair = std::pair<DeclAccessPair, FunctionDecl*>;
+
   // Gets the CUDA function preference for a call from Caller to Match.
-  auto GetCFP = [&](const T &Match) {
-    return S.IdentifyCUDAPreference(Caller, FetchDecl(Match));
+  auto GetCFP = [&](const Pair &Match) {
+    return IdentifyCUDAPreference(Caller, Match.second);
   };
   // Find the best call preference among the functions in Matches.
-  Sema::CUDAFunctionPreference BestCFP = GetCFP(*std::max_element(
+  CUDAFunctionPreference BestCFP = GetCFP(*std::max_element(
       Matches.begin(), Matches.end(),
-      [&](const T &M1, const T &M2) { return GetCFP(M1) < GetCFP(M2); }));
+      [&](const Pair &M1, const Pair &M2) { return GetCFP(M1) < GetCFP(M2); }));
   // Erase all functions with lower priority.
-  Matches.erase(
-      llvm::remove_if(Matches,
-                      [&](const T &Match) { return GetCFP(Match) < BestCFP; }),
-      Matches.end());
-}
-
-void Sema::EraseUnwantedCUDAMatches(const FunctionDecl *Caller,
-                                    SmallVectorImpl<FunctionDecl *> &Matches){
-  EraseUnwantedCUDAMatchesImpl<FunctionDecl *>(
-      *this, Caller, Matches, [](const FunctionDecl *item) { return item; });
-}
-
-void Sema::EraseUnwantedCUDAMatches(const FunctionDecl *Caller,
-                                    SmallVectorImpl<DeclAccessPair> &Matches) {
-  EraseUnwantedCUDAMatchesImpl<DeclAccessPair>(
-      *this, Caller, Matches, [](const DeclAccessPair &item) {
-        return dyn_cast<FunctionDecl>(item.getDecl());
-      });
-}
-
-void Sema::EraseUnwantedCUDAMatches(
-    const FunctionDecl *Caller,
-    SmallVectorImpl<std::pair<DeclAccessPair, FunctionDecl *>> &Matches){
-  EraseUnwantedCUDAMatchesImpl<std::pair<DeclAccessPair, FunctionDecl *>>(
-      *this, Caller, Matches,
-      [](const std::pair<DeclAccessPair, FunctionDecl *> &item) {
-        return dyn_cast<FunctionDecl>(item.second);
-      });
+  llvm::erase_if(Matches,
+                 [&](const Pair &Match) { return GetCFP(Match) < BestCFP; });
 }
 /// When an implicitly-declared special member has to invoke more than one
@@ -441,9 +480,23 @@ bool Sema::isEmptyCudaDestructor(SourceLocation Loc, CXXDestructorDecl *DD) {
 // * a __device__ function with this signature was already declared, in which
 //   case in which case we output an error, unless the __device__ decl is in a
 //   system header, in which case we leave the constexpr function unattributed.
-void Sema::maybeAddCUDAHostDeviceAttrs(Scope *S, FunctionDecl *NewD,
+//
+// In addition, all function decls are treated as __host__ __device__ when
+// ForceCUDAHostDeviceDepth > 0 (corresponding to code within a
+//   #pragma clang force_cuda_host_device_begin/end
+// pair).
+void Sema::maybeAddCUDAHostDeviceAttrs(FunctionDecl *NewD,
                                        const LookupResult &Previous) {
-  assert(getLangOpts().CUDA && "May be called only for CUDA compilations.");
+  assert(getLangOpts().CUDA && "Should only be called during CUDA compilation");
+
+  if (ForceCUDAHostDeviceDepth > 0) {
+    if (!NewD->hasAttr<CUDAHostAttr>())
+      NewD->addAttr(CUDAHostAttr::CreateImplicit(Context));
+    if (!NewD->hasAttr<CUDADeviceAttr>())
+      NewD->addAttr(CUDADeviceAttr::CreateImplicit(Context));
+    return;
+  }
+
   if (!getLangOpts().CUDAHostDeviceConstexpr || !NewD->isConstexpr() ||
       NewD->isVariadic() || NewD->hasAttr<CUDAHostAttr>() ||
       NewD->hasAttr<CUDADeviceAttr>() || NewD->hasAttr<CUDAGlobalAttr>())
@@ -480,3 +533,378 @@ void Sema::maybeAddCUDAHostDeviceAttrs(Scope *S, FunctionDecl *NewD,
   NewD->addAttr(CUDAHostAttr::CreateImplicit(Context));
   NewD->addAttr(CUDADeviceAttr::CreateImplicit(Context));
 }
+
+// In CUDA, there are some constructs which may appear in semantically-valid
+// code, but trigger errors if we ever generate code for the function in which
+// they appear. Essentially every construct you're not allowed to use on the
+// device falls into this category, because you are allowed to use these
+// constructs in a __host__ __device__ function, but only if that function is
+// never codegen'ed on the device.
+//
+// To handle semantic checking for these constructs, we keep track of the set of
+// functions we know will be emitted, either because we could tell a priori that
+// they would be emitted, or because they were transitively called by a
+// known-emitted function.
+//
+// We also keep a partial call graph of which not-known-emitted functions call
+// which other not-known-emitted functions.
+//
+// When we see something which is illegal if the current function is emitted
+// (usually by way of CUDADiagIfDeviceCode, CUDADiagIfHostCode, or
+// CheckCUDACall), we first check if the current function is known-emitted. If
+// so, we immediately output the diagnostic.
+//
+// Otherwise, we "defer" the diagnostic. It sits in Sema::CUDADeferredDiags
+// until we discover that the function is known-emitted, at which point we take
+// it out of this map and emit the diagnostic.
+
+Sema::CUDADiagBuilder::CUDADiagBuilder(Kind K, SourceLocation Loc,
+                                       unsigned DiagID, FunctionDecl *Fn,
+                                       Sema &S)
+    : S(S), Loc(Loc), DiagID(DiagID), Fn(Fn),
+      ShowCallStack(K == K_ImmediateWithCallStack || K == K_Deferred) {
+  switch (K) {
+  case K_Nop:
+    break;
+  case K_Immediate:
+  case K_ImmediateWithCallStack:
+    ImmediateDiag.emplace(S.Diag(Loc, DiagID));
+    break;
+  case K_Deferred:
+    assert(Fn && "Must have a function to attach the deferred diag to.");
+    PartialDiag.emplace(S.PDiag(DiagID));
+    break;
+  }
+}
+
+// Print notes showing how we can reach FD starting from an a priori
+// known-callable function.
+static void EmitCallStackNotes(Sema &S, FunctionDecl *FD) {
+  auto FnIt = S.CUDAKnownEmittedFns.find(FD);
+  while (FnIt != S.CUDAKnownEmittedFns.end()) {
+    DiagnosticBuilder Builder(
+        S.Diags.Report(FnIt->second.Loc, diag::note_called_by));
+    Builder << FnIt->second.FD;
+    Builder.setForceEmit();
+
+    FnIt = S.CUDAKnownEmittedFns.find(FnIt->second.FD);
+  }
+}
+
+Sema::CUDADiagBuilder::~CUDADiagBuilder() {
+  if (ImmediateDiag) {
+    // Emit our diagnostic and, if it was a warning or error, output a callstack
+    // if Fn isn't a priori known-emitted.
+    bool IsWarningOrError = S.getDiagnostics().getDiagnosticLevel(
+        DiagID, Loc) >= DiagnosticsEngine::Warning;
+    ImmediateDiag.reset(); // Emit the immediate diag.
+    if (IsWarningOrError && ShowCallStack)
+      EmitCallStackNotes(S, Fn);
+  } else if (PartialDiag) {
+    assert(ShowCallStack && "Must always show call stack for deferred diags.");
+    S.CUDADeferredDiags[Fn].push_back({Loc, std::move(*PartialDiag)});
+  }
+}
+
+// Do we know that we will eventually codegen the given function?
+static bool IsKnownEmitted(Sema &S, FunctionDecl *FD) {
+  // Templates are emitted when they're instantiated.
+  if (FD->isDependentContext())
+    return false;
+
+  // When compiling for device, host functions are never emitted. Similarly,
+  // when compiling for host, device and global functions are never emitted.
+  // (Technically, we do emit a host-side stub for global functions, but this
+  // doesn't count for our purposes here.)
+  Sema::CUDAFunctionTarget T = S.IdentifyCUDATarget(FD);
+  if (S.getLangOpts().CUDAIsDevice && T == Sema::CFT_Host)
+    return false;
+  if (!S.getLangOpts().CUDAIsDevice &&
+      (T == Sema::CFT_Device || T == Sema::CFT_Global))
+    return false;
+
+  // Check whether this function is externally visible -- if so, it's
+  // known-emitted.
+  //
+  // We have to check the GVA linkage of the function's *definition* -- if we
+  // only have a declaration, we don't know whether or not the function will be
+  // emitted, because (say) the definition could include "inline".
+  FunctionDecl *Def = FD->getDefinition();
+
+  // We may currently be parsing the body of FD, in which case
+  // FD->getDefinition() will be null, but we still want to treat FD as though
+  // it's a definition.
+  if (!Def && FD->willHaveBody())
+    Def = FD;
+
+  if (Def &&
+      !isDiscardableGVALinkage(S.getASTContext().GetGVALinkageForFunction(Def)))
+    return true;
+
+  // Otherwise, the function is known-emitted if it's in our set of
+  // known-emitted functions.
+  return S.CUDAKnownEmittedFns.count(FD) > 0;
+}
+
+Sema::CUDADiagBuilder Sema::CUDADiagIfDeviceCode(SourceLocation Loc,
+                                                 unsigned DiagID) {
+  assert(getLangOpts().CUDA && "Should only be called during CUDA compilation");
+  CUDADiagBuilder::Kind DiagKind = [&] {
+    switch (CurrentCUDATarget()) {
+    case CFT_Global:
+    case CFT_Device:
+      return CUDADiagBuilder::K_Immediate;
+    case CFT_HostDevice:
+      // An HD function counts as host code if we're compiling for host, and
+      // device code if we're compiling for device. Defer any errors in device
+      // mode until the function is known-emitted.
+      if (getLangOpts().CUDAIsDevice) {
+        return IsKnownEmitted(*this, dyn_cast<FunctionDecl>(CurContext))
+                   ? CUDADiagBuilder::K_ImmediateWithCallStack
+                   : CUDADiagBuilder::K_Deferred;
+      }
+      return CUDADiagBuilder::K_Nop;
+
+    default:
+      return CUDADiagBuilder::K_Nop;
+    }
+  }();
+  return CUDADiagBuilder(DiagKind, Loc, DiagID,
+                         dyn_cast<FunctionDecl>(CurContext), *this);
+}
+
+Sema::CUDADiagBuilder Sema::CUDADiagIfHostCode(SourceLocation Loc,
+                                               unsigned DiagID) {
+  assert(getLangOpts().CUDA && "Should only be called during CUDA compilation");
+  CUDADiagBuilder::Kind DiagKind = [&] {
+    switch (CurrentCUDATarget()) {
+    case CFT_Host:
+      return CUDADiagBuilder::K_Immediate;
+    case CFT_HostDevice:
+      // An HD function counts as host code if we're compiling for host, and
+      // device code if we're compiling for device. Defer any errors in device
+      // mode until the function is known-emitted.
+      if (getLangOpts().CUDAIsDevice)
+        return CUDADiagBuilder::K_Nop;
+
+      return IsKnownEmitted(*this, dyn_cast<FunctionDecl>(CurContext))
+                 ? CUDADiagBuilder::K_ImmediateWithCallStack
+                 : CUDADiagBuilder::K_Deferred;
+    default:
+      return CUDADiagBuilder::K_Nop;
+    }
+  }();
+  return CUDADiagBuilder(DiagKind, Loc, DiagID,
+                         dyn_cast<FunctionDecl>(CurContext), *this);
+}
+
+// Emit any deferred diagnostics for FD and erase them from the map in which
+// they're stored.
+static void EmitDeferredDiags(Sema &S, FunctionDecl *FD) {
+  auto It = S.CUDADeferredDiags.find(FD);
+  if (It == S.CUDADeferredDiags.end())
+    return;
+  bool HasWarningOrError = false;
+  for (PartialDiagnosticAt &PDAt : It->second) {
+    const SourceLocation &Loc = PDAt.first;
+    const PartialDiagnostic &PD = PDAt.second;
+    HasWarningOrError |= S.getDiagnostics().getDiagnosticLevel(
+        PD.getDiagID(), Loc) >= DiagnosticsEngine::Warning;
+    DiagnosticBuilder Builder(S.Diags.Report(Loc, PD.getDiagID()));
+    Builder.setForceEmit();
+    PD.Emit(Builder);
+  }
+  S.CUDADeferredDiags.erase(It);
+
+  // FIXME: Should this be called after every warning/error emitted in the loop
+  // above, instead of just once per function? That would be consistent with
+  // how we handle immediate errors, but it also seems like a bit much.
+  if (HasWarningOrError)
+    EmitCallStackNotes(S, FD);
+}
+
+// Indicate that this function (and thus everything it transtively calls) will
+// be codegen'ed, and emit any deferred diagnostics on this function and its
+// (transitive) callees.
+static void MarkKnownEmitted(Sema &S, FunctionDecl *OrigCaller,
+                             FunctionDecl *OrigCallee, SourceLocation OrigLoc) {
+  // Nothing to do if we already know that FD is emitted.
+  if (IsKnownEmitted(S, OrigCallee)) {
+    assert(!S.CUDACallGraph.count(OrigCallee));
+    return;
+  }
+
+  // We've just discovered that OrigCallee is known-emitted. Walk our call
+  // graph to see what else we can now discover also must be emitted.
+
+  struct CallInfo {
+    FunctionDecl *Caller;
+    FunctionDecl *Callee;
+    SourceLocation Loc;
+  };
+  llvm::SmallVector<CallInfo, 4> Worklist = {{OrigCaller, OrigCallee, OrigLoc}};
+  llvm::SmallSet<CanonicalDeclPtr<FunctionDecl>, 4> Seen;
+  Seen.insert(OrigCallee);
+  while (!Worklist.empty()) {
+    CallInfo C = Worklist.pop_back_val();
+    assert(!IsKnownEmitted(S, C.Callee) &&
+           "Worklist should not contain known-emitted functions.");
+    S.CUDAKnownEmittedFns[C.Callee] = {C.Caller, C.Loc};
+    EmitDeferredDiags(S, C.Callee);
+
+    // If this is a template instantiation, explore its callgraph as well:
+    // Non-dependent calls are part of the template's callgraph, while dependent
+    // calls are part of to the instantiation's call graph.
+    if (auto *Templ = C.Callee->getPrimaryTemplate()) {
+      FunctionDecl *TemplFD = Templ->getAsFunction();
+      if (!Seen.count(TemplFD) && !S.CUDAKnownEmittedFns.count(TemplFD)) {
+        Seen.insert(TemplFD);
+        Worklist.push_back(
+            {/* Caller = */ C.Caller, /* Callee = */ TemplFD, C.Loc});
+      }
+    }
+
+    // Add all functions called by Callee to our worklist.
+    auto CGIt = S.CUDACallGraph.find(C.Callee);
+    if (CGIt == S.CUDACallGraph.end())
+      continue;
+
+    for (std::pair<CanonicalDeclPtr<FunctionDecl>, SourceLocation> FDLoc :
+         CGIt->second) {
+      FunctionDecl *NewCallee = FDLoc.first;
+      SourceLocation CallLoc = FDLoc.second;
+      if (Seen.count(NewCallee) || IsKnownEmitted(S, NewCallee))
+        continue;
+      Seen.insert(NewCallee);
+      Worklist.push_back(
+          {/* Caller = */ C.Callee, /* Callee = */ NewCallee, CallLoc});
+    }
+
+    // C.Callee is now known-emitted, so we no longer need to maintain its list
+    // of callees in CUDACallGraph.
+    S.CUDACallGraph.erase(CGIt);
+  }
+}
+
+bool Sema::CheckCUDACall(SourceLocation Loc, FunctionDecl *Callee) {
+  assert(getLangOpts().CUDA && "Should only be called during CUDA compilation");
+  assert(Callee && "Callee may not be null.");
+  // FIXME: Is bailing out early correct here? Should we instead assume that
+  // the caller is a global initializer?
+  FunctionDecl *Caller = dyn_cast<FunctionDecl>(CurContext);
+  if (!Caller)
+    return true;
+
+  // If the caller is known-emitted, mark the callee as known-emitted.
+  // Otherwise, mark the call in our call graph so we can traverse it later.
+  bool CallerKnownEmitted = IsKnownEmitted(*this, Caller);
+  if (CallerKnownEmitted)
+    MarkKnownEmitted(*this, Caller, Callee, Loc);
+  else {
+    // If we have
+    //   host fn calls kernel fn calls host+device,
+    // the HD function does not get instantiated on the host. We model this by
+    // omitting at the call to the kernel from the callgraph. This ensures
+    // that, when compiling for host, only HD functions actually called from the
+    // host get marked as known-emitted.
+    if (getLangOpts().CUDAIsDevice || IdentifyCUDATarget(Callee) != CFT_Global)
+      CUDACallGraph[Caller].insert({Callee, Loc});
+  }
+
+  CUDADiagBuilder::Kind DiagKind = [&] {
+    switch (IdentifyCUDAPreference(Caller, Callee)) {
+    case CFP_Never:
+      return CUDADiagBuilder::K_Immediate;
+    case CFP_WrongSide:
+      assert(Caller && "WrongSide calls require a non-null caller");
+      // If we know the caller will be emitted, we know this wrong-side call
+      // will be emitted, so it's an immediate error. Otherwise, defer the
+      // error until we know the caller is emitted.
+      return CallerKnownEmitted ? CUDADiagBuilder::K_ImmediateWithCallStack
+                                : CUDADiagBuilder::K_Deferred;
+    default:
+      return CUDADiagBuilder::K_Nop;
+    }
+  }();
+
+  if (DiagKind == CUDADiagBuilder::K_Nop)
+    return true;
+
+  // Avoid emitting this error twice for the same location. Using a hashtable
+  // like this is unfortunate, but because we must continue parsing as normal
+  // after encountering a deferred error, it's otherwise very tricky for us to
+  // ensure that we only emit this deferred error once.
+  if (!LocsWithCUDACallDiags.insert({Caller, Loc}).second)
+    return true;
+
+  CUDADiagBuilder(DiagKind, Loc, diag::err_ref_bad_target, Caller, *this)
+      << IdentifyCUDATarget(Callee) << Callee << IdentifyCUDATarget(Caller);
+  CUDADiagBuilder(DiagKind, Callee->getLocation(), diag::note_previous_decl,
+                  Caller, *this)
+      << Callee;
+  return DiagKind != CUDADiagBuilder::K_Immediate &&
+         DiagKind != CUDADiagBuilder::K_ImmediateWithCallStack;
+}
+
+void Sema::CUDASetLambdaAttrs(CXXMethodDecl *Method) {
+  assert(getLangOpts().CUDA && "Should only be called during CUDA compilation");
+  if (Method->hasAttr<CUDAHostAttr>() || Method->hasAttr<CUDADeviceAttr>())
+    return;
+  FunctionDecl *CurFn = dyn_cast<FunctionDecl>(CurContext);
+  if (!CurFn)
+    return;
+  CUDAFunctionTarget Target = IdentifyCUDATarget(CurFn);
+  if (Target == CFT_Global || Target == CFT_Device) {
+    Method->addAttr(CUDADeviceAttr::CreateImplicit(Context));
+  } else if (Target == CFT_HostDevice) {
+    Method->addAttr(CUDADeviceAttr::CreateImplicit(Context));
+    Method->addAttr(CUDAHostAttr::CreateImplicit(Context));
+  }
+}
+
+void Sema::checkCUDATargetOverload(FunctionDecl *NewFD,
+                                   const LookupResult &Previous) {
+  assert(getLangOpts().CUDA && "Should only be called during CUDA compilation");
+  CUDAFunctionTarget NewTarget = IdentifyCUDATarget(NewFD);
+  for (NamedDecl *OldND : Previous) {
+    FunctionDecl *OldFD = OldND->getAsFunction();
+    if (!OldFD)
+      continue;
+
+    CUDAFunctionTarget OldTarget = IdentifyCUDATarget(OldFD);
+    // Don't allow HD and global functions to overload other functions with the
+    // same signature. We allow overloading based on CUDA attributes so that
+    // functions can have different implementations on the host and device, but
+    // HD/global functions "exist" in some sense on both the host and device, so
+    // should have the same implementation on both sides.
+    if (NewTarget != OldTarget &&
+        ((NewTarget == CFT_HostDevice) || (OldTarget == CFT_HostDevice) ||
+         (NewTarget == CFT_Global) || (OldTarget == CFT_Global)) &&
+        !IsOverload(NewFD, OldFD, /* UseMemberUsingDeclRules = */ false,
+                    /* ConsiderCudaAttrs = */ false)) {
+      Diag(NewFD->getLocation(), diag::err_cuda_ovl_target)
+          << NewTarget << NewFD->getDeclName() << OldTarget << OldFD;
+      Diag(OldFD->getLocation(), diag::note_previous_declaration);
+      NewFD->setInvalidDecl();
+      break;
+    }
+  }
+}
+
+template <typename AttrTy>
+static void copyAttrIfPresent(Sema &S, FunctionDecl *FD,
+                              const FunctionDecl &TemplateFD) {
+  if (AttrTy *Attribute = TemplateFD.getAttr<AttrTy>()) {
+    AttrTy *Clone = Attribute->clone(S.Context);
+    Clone->setInherited(true);
+    FD->addAttr(Clone);
+  }
+}
+
+void Sema::inheritCUDATargetAttrs(FunctionDecl *FD,
+                                  const FunctionTemplateDecl &TD) {
+  const FunctionDecl &TemplateFD = *TD.getTemplatedDecl();
+  copyAttrIfPresent<CUDAGlobalAttr>(*this, FD, TemplateFD);
+  copyAttrIfPresent<CUDAHostAttr>(*this, FD, TemplateFD);
+  copyAttrIfPresent<CUDADeviceAttr>(*this, FD, TemplateFD);
+}
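A few illustrative notes follow on what these changes mean for CUDA user code; none of this is part of the commit itself. First, maybeAddCUDAHostDeviceAttrs now treats every function declared while ForceCUDAHostDeviceDepth > 0 as implicitly __host__ __device__. The comment in the patch refers to a "#pragma clang force_cuda_host_device_begin/end" pair; the user-level spelling assumed in the sketch below ("#pragma clang force_cuda_host_device begin" / "end") is an assumption that should be checked against the matching Parser change. Function names are illustrative.

// Sketch only: pragma spelling and names are assumptions, see note above.
#pragma clang force_cuda_host_device begin
// Declared with no CUDA attributes, but ForceCUDAHostDeviceDepth > 0 here,
// so Sema adds implicit __host__ __device__ attributes.
inline int clamp255(int x) { return x < 0 ? 0 : (x > 255 ? 255 : x); }
#pragma clang force_cuda_host_device end

__global__ void saturate(int *data, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
    data[i] = clamp255(data[i]); // OK: clamp255 is implicitly host+device
}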
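Second, the deferred-diagnostic machinery (CUDADiagBuilder, IsKnownEmitted, MarkKnownEmitted, CheckCUDACall) changes when wrong-side calls are reported. A minimal sketch of the user-visible behavior, with illustrative names: compiled for the device, it should be accepted until kernel() forces maybe_bad() to be emitted, at which point the deferred err_ref_bad_target for the call to host_only() is reported along with "called by" notes.

void host_only();                   // no attributes: implicitly __host__

// Inline HD function: its linkage is discardable, so it is not known-emitted
// while its body is parsed and the wrong-side call below is deferred rather
// than diagnosed immediately during device compilation.
inline __host__ __device__ void maybe_bad() {
  host_only();                      // wrong-side call when compiling for device
}

__device__ void unrelated() {}      // never calls maybe_bad(): no diagnostic

__global__ void kernel() {
  maybe_bad();                      // kernel() is known-emitted on the device,
                                    // so maybe_bad() becomes known-emitted too
                                    // and its deferred diagnostic is emitted,
                                    // with a note_called_by pointing here
}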
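Third, checkCUDATargetOverload rejects a __host__ __device__ or __global__ function that shares a signature with a function of a different target, while still allowing plain host/device overloading. A sketch with illustrative names; the second pair is expected to be rejected with err_cuda_ovl_target.

__host__ int transform(int x) { return x + 1; }
__device__ int transform(int x) { return x + 2; }   // OK: host and device
                                                    // overloads may differ

__host__ __device__ int scale(int x) { return 2 * x; }
__device__ int scale(int x) { return 3 * x; }       // rejected: an HD function
                                                    // "exists" on both sides,
                                                    // so a same-signature
                                                    // __device__ overload is
                                                    // not allowed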
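Finally, CUDASetLambdaAttrs gives a lambda with no explicit target attributes the target of its enclosing function: __device__ inside __device__ and __global__ functions, and both __host__ and __device__ inside HD functions. A sketch with illustrative names:

__global__ void kernel(int *out) {
  auto square = [](int v) { return v * v; };     // implicitly __device__
  out[threadIdx.x] = square(threadIdx.x);
}

__host__ __device__ int twice(int v) {
  auto add = [](int a, int b) { return a + b; }; // implicitly __host__ __device__
  return add(v, v);
}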