summaryrefslogtreecommitdiffstats
path: root/contrib/llvm/tools/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'contrib/llvm/tools/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp')
-rw-r--r--contrib/llvm/tools/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp396
1 files changed, 396 insertions, 0 deletions
diff --git a/contrib/llvm/tools/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp b/contrib/llvm/tools/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
new file mode 100644
index 0000000..d64f6df
--- /dev/null
+++ b/contrib/llvm/tools/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
@@ -0,0 +1,396 @@
+//===---- CGOpenMPRuntimeNVPTX.cpp - Interface to OpenMP NVPTX Runtimes ---===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This provides a class for OpenMP runtime code generation specialized to NVPTX
+// targets.
+//
+//===----------------------------------------------------------------------===//
+
+#include "CGOpenMPRuntimeNVPTX.h"
+#include "clang/AST/DeclOpenMP.h"
+#include "CodeGenFunction.h"
+#include "clang/AST/StmtOpenMP.h"
+
+using namespace clang;
+using namespace CodeGen;
+
+/// \brief Get the GPU warp size.
+llvm::Value *CGOpenMPRuntimeNVPTX::getNVPTXWarpSize(CodeGenFunction &CGF) {
+ CGBuilderTy &Bld = CGF.Builder;
+ return Bld.CreateCall(
+ llvm::Intrinsic::getDeclaration(
+ &CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_warpsize),
+ llvm::None, "nvptx_warp_size");
+}
+
+/// \brief Get the id of the current thread on the GPU.
+llvm::Value *CGOpenMPRuntimeNVPTX::getNVPTXThreadID(CodeGenFunction &CGF) {
+ CGBuilderTy &Bld = CGF.Builder;
+ return Bld.CreateCall(
+ llvm::Intrinsic::getDeclaration(
+ &CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x),
+ llvm::None, "nvptx_tid");
+}
+
+// \brief Get the maximum number of threads in a block of the GPU.
+llvm::Value *CGOpenMPRuntimeNVPTX::getNVPTXNumThreads(CodeGenFunction &CGF) {
+ CGBuilderTy &Bld = CGF.Builder;
+ return Bld.CreateCall(
+ llvm::Intrinsic::getDeclaration(
+ &CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_x),
+ llvm::None, "nvptx_num_threads");
+}
+
+/// \brief Get barrier to synchronize all threads in a block.
+void CGOpenMPRuntimeNVPTX::getNVPTXCTABarrier(CodeGenFunction &CGF) {
+ CGBuilderTy &Bld = CGF.Builder;
+ Bld.CreateCall(llvm::Intrinsic::getDeclaration(
+ &CGM.getModule(), llvm::Intrinsic::nvvm_barrier0));
+}
+
+// \brief Synchronize all GPU threads in a block.
+void CGOpenMPRuntimeNVPTX::syncCTAThreads(CodeGenFunction &CGF) {
+ getNVPTXCTABarrier(CGF);
+}
+
+/// \brief Get the thread id of the OMP master thread.
+/// The master thread id is the first thread (lane) of the last warp in the
+/// GPU block. Warp size is assumed to be some power of 2.
+/// Thread id is 0 indexed.
+/// E.g: If NumThreads is 33, master id is 32.
+/// If NumThreads is 64, master id is 32.
+/// If NumThreads is 1024, master id is 992.
+llvm::Value *CGOpenMPRuntimeNVPTX::getMasterThreadID(CodeGenFunction &CGF) {
+ CGBuilderTy &Bld = CGF.Builder;
+ llvm::Value *NumThreads = getNVPTXNumThreads(CGF);
+
+ // We assume that the warp size is a power of 2.
+ llvm::Value *Mask = Bld.CreateSub(getNVPTXWarpSize(CGF), Bld.getInt32(1));
+
+ return Bld.CreateAnd(Bld.CreateSub(NumThreads, Bld.getInt32(1)),
+ Bld.CreateNot(Mask), "master_tid");
+}
+
+namespace {
+enum OpenMPRTLFunctionNVPTX {
+ /// \brief Call to void __kmpc_kernel_init(kmp_int32 omp_handle,
+ /// kmp_int32 thread_limit);
+ OMPRTL_NVPTX__kmpc_kernel_init,
+};
+
+// NVPTX Address space
+enum ADDRESS_SPACE {
+ ADDRESS_SPACE_SHARED = 3,
+};
+} // namespace
+
+CGOpenMPRuntimeNVPTX::WorkerFunctionState::WorkerFunctionState(
+ CodeGenModule &CGM)
+ : WorkerFn(nullptr), CGFI(nullptr) {
+ createWorkerFunction(CGM);
+}
+
+void CGOpenMPRuntimeNVPTX::WorkerFunctionState::createWorkerFunction(
+ CodeGenModule &CGM) {
+ // Create an worker function with no arguments.
+ CGFI = &CGM.getTypes().arrangeNullaryFunction();
+
+ WorkerFn = llvm::Function::Create(
+ CGM.getTypes().GetFunctionType(*CGFI), llvm::GlobalValue::InternalLinkage,
+ /* placeholder */ "_worker", &CGM.getModule());
+ CGM.SetInternalFunctionAttributes(/*D=*/nullptr, WorkerFn, *CGFI);
+ WorkerFn->setLinkage(llvm::GlobalValue::InternalLinkage);
+ WorkerFn->addFnAttr(llvm::Attribute::NoInline);
+}
+
+void CGOpenMPRuntimeNVPTX::initializeEnvironment() {
+ //
+ // Initialize master-worker control state in shared memory.
+ //
+
+ auto DL = CGM.getDataLayout();
+ ActiveWorkers = new llvm::GlobalVariable(
+ CGM.getModule(), CGM.Int32Ty, /*isConstant=*/false,
+ llvm::GlobalValue::CommonLinkage,
+ llvm::Constant::getNullValue(CGM.Int32Ty), "__omp_num_threads", 0,
+ llvm::GlobalVariable::NotThreadLocal, ADDRESS_SPACE_SHARED);
+ ActiveWorkers->setAlignment(DL.getPrefTypeAlignment(CGM.Int32Ty));
+
+ WorkID = new llvm::GlobalVariable(
+ CGM.getModule(), CGM.Int64Ty, /*isConstant=*/false,
+ llvm::GlobalValue::CommonLinkage,
+ llvm::Constant::getNullValue(CGM.Int64Ty), "__tgt_work_id", 0,
+ llvm::GlobalVariable::NotThreadLocal, ADDRESS_SPACE_SHARED);
+ WorkID->setAlignment(DL.getPrefTypeAlignment(CGM.Int64Ty));
+}
+
+void CGOpenMPRuntimeNVPTX::emitWorkerFunction(WorkerFunctionState &WST) {
+ auto &Ctx = CGM.getContext();
+
+ CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
+ CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, WST.WorkerFn, *WST.CGFI, {});
+ emitWorkerLoop(CGF, WST);
+ CGF.FinishFunction();
+}
+
+void CGOpenMPRuntimeNVPTX::emitWorkerLoop(CodeGenFunction &CGF,
+ WorkerFunctionState &WST) {
+ //
+ // The workers enter this loop and wait for parallel work from the master.
+ // When the master encounters a parallel region it sets up the work + variable
+ // arguments, and wakes up the workers. The workers first check to see if
+ // they are required for the parallel region, i.e., within the # of requested
+ // parallel threads. The activated workers load the variable arguments and
+ // execute the parallel work.
+ //
+
+ CGBuilderTy &Bld = CGF.Builder;
+
+ llvm::BasicBlock *AwaitBB = CGF.createBasicBlock(".await.work");
+ llvm::BasicBlock *SelectWorkersBB = CGF.createBasicBlock(".select.workers");
+ llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute.parallel");
+ llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".terminate.parallel");
+ llvm::BasicBlock *BarrierBB = CGF.createBasicBlock(".barrier.parallel");
+ llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
+
+ CGF.EmitBranch(AwaitBB);
+
+ // Workers wait for work from master.
+ CGF.EmitBlock(AwaitBB);
+ // Wait for parallel work
+ syncCTAThreads(CGF);
+ // On termination condition (workid == 0), exit loop.
+ llvm::Value *ShouldTerminate = Bld.CreateICmpEQ(
+ Bld.CreateAlignedLoad(WorkID, WorkID->getAlignment()),
+ llvm::Constant::getNullValue(WorkID->getType()->getElementType()),
+ "should_terminate");
+ Bld.CreateCondBr(ShouldTerminate, ExitBB, SelectWorkersBB);
+
+ // Activate requested workers.
+ CGF.EmitBlock(SelectWorkersBB);
+ llvm::Value *ThreadID = getNVPTXThreadID(CGF);
+ llvm::Value *ActiveThread = Bld.CreateICmpSLT(
+ ThreadID,
+ Bld.CreateAlignedLoad(ActiveWorkers, ActiveWorkers->getAlignment()),
+ "active_thread");
+ Bld.CreateCondBr(ActiveThread, ExecuteBB, BarrierBB);
+
+ // Signal start of parallel region.
+ CGF.EmitBlock(ExecuteBB);
+ // TODO: Add parallel work.
+
+ // Signal end of parallel region.
+ CGF.EmitBlock(TerminateBB);
+ CGF.EmitBranch(BarrierBB);
+
+ // All active and inactive workers wait at a barrier after parallel region.
+ CGF.EmitBlock(BarrierBB);
+ // Barrier after parallel region.
+ syncCTAThreads(CGF);
+ CGF.EmitBranch(AwaitBB);
+
+ // Exit target region.
+ CGF.EmitBlock(ExitBB);
+}
+
+// Setup NVPTX threads for master-worker OpenMP scheme.
+void CGOpenMPRuntimeNVPTX::emitEntryHeader(CodeGenFunction &CGF,
+ EntryFunctionState &EST,
+ WorkerFunctionState &WST) {
+ CGBuilderTy &Bld = CGF.Builder;
+
+ // Get the master thread id.
+ llvm::Value *MasterID = getMasterThreadID(CGF);
+ // Current thread's identifier.
+ llvm::Value *ThreadID = getNVPTXThreadID(CGF);
+
+ // Setup BBs in entry function.
+ llvm::BasicBlock *WorkerCheckBB = CGF.createBasicBlock(".check.for.worker");
+ llvm::BasicBlock *WorkerBB = CGF.createBasicBlock(".worker");
+ llvm::BasicBlock *MasterBB = CGF.createBasicBlock(".master");
+ EST.ExitBB = CGF.createBasicBlock(".exit");
+
+ // The head (master thread) marches on while its body of companion threads in
+ // the warp go to sleep.
+ llvm::Value *ShouldDie =
+ Bld.CreateICmpUGT(ThreadID, MasterID, "excess_in_master_warp");
+ Bld.CreateCondBr(ShouldDie, EST.ExitBB, WorkerCheckBB);
+
+ // Select worker threads...
+ CGF.EmitBlock(WorkerCheckBB);
+ llvm::Value *IsWorker = Bld.CreateICmpULT(ThreadID, MasterID, "is_worker");
+ Bld.CreateCondBr(IsWorker, WorkerBB, MasterBB);
+
+ // ... and send to worker loop, awaiting parallel invocation.
+ CGF.EmitBlock(WorkerBB);
+ CGF.EmitCallOrInvoke(WST.WorkerFn, llvm::None);
+ CGF.EmitBranch(EST.ExitBB);
+
+ // Only master thread executes subsequent serial code.
+ CGF.EmitBlock(MasterBB);
+
+ // First action in sequential region:
+ // Initialize the state of the OpenMP runtime library on the GPU.
+ llvm::Value *Args[] = {Bld.getInt32(/*OmpHandle=*/0), getNVPTXThreadID(CGF)};
+ CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_init),
+ Args);
+}
+
+void CGOpenMPRuntimeNVPTX::emitEntryFooter(CodeGenFunction &CGF,
+ EntryFunctionState &EST) {
+ CGBuilderTy &Bld = CGF.Builder;
+ llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".termination.notifier");
+ CGF.EmitBranch(TerminateBB);
+
+ CGF.EmitBlock(TerminateBB);
+ // Signal termination condition.
+ Bld.CreateAlignedStore(
+ llvm::Constant::getNullValue(WorkID->getType()->getElementType()), WorkID,
+ WorkID->getAlignment());
+ // Barrier to terminate worker threads.
+ syncCTAThreads(CGF);
+ // Master thread jumps to exit point.
+ CGF.EmitBranch(EST.ExitBB);
+
+ CGF.EmitBlock(EST.ExitBB);
+}
+
+/// \brief Returns specified OpenMP runtime function for the current OpenMP
+/// implementation. Specialized for the NVPTX device.
+/// \param Function OpenMP runtime function.
+/// \return Specified function.
+llvm::Constant *
+CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) {
+ llvm::Constant *RTLFn = nullptr;
+ switch (static_cast<OpenMPRTLFunctionNVPTX>(Function)) {
+ case OMPRTL_NVPTX__kmpc_kernel_init: {
+ // Build void __kmpc_kernel_init(kmp_int32 omp_handle,
+ // kmp_int32 thread_limit);
+ llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int32Ty};
+ llvm::FunctionType *FnTy =
+ llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
+ RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_init");
+ break;
+ }
+ }
+ return RTLFn;
+}
+
+void CGOpenMPRuntimeNVPTX::createOffloadEntry(llvm::Constant *ID,
+ llvm::Constant *Addr,
+ uint64_t Size) {
+ auto *F = dyn_cast<llvm::Function>(Addr);
+ // TODO: Add support for global variables on the device after declare target
+ // support.
+ if (!F)
+ return;
+ llvm::Module *M = F->getParent();
+ llvm::LLVMContext &Ctx = M->getContext();
+
+ // Get "nvvm.annotations" metadata node
+ llvm::NamedMDNode *MD = M->getOrInsertNamedMetadata("nvvm.annotations");
+
+ llvm::Metadata *MDVals[] = {
+ llvm::ConstantAsMetadata::get(F), llvm::MDString::get(Ctx, "kernel"),
+ llvm::ConstantAsMetadata::get(
+ llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))};
+ // Append metadata to nvvm.annotations
+ MD->addOperand(llvm::MDNode::get(Ctx, MDVals));
+}
+
+void CGOpenMPRuntimeNVPTX::emitTargetOutlinedFunction(
+ const OMPExecutableDirective &D, StringRef ParentName,
+ llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
+ bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
+ if (!IsOffloadEntry) // Nothing to do.
+ return;
+
+ assert(!ParentName.empty() && "Invalid target region parent name!");
+
+ EntryFunctionState EST;
+ WorkerFunctionState WST(CGM);
+
+ // Emit target region as a standalone region.
+ class NVPTXPrePostActionTy : public PrePostActionTy {
+ CGOpenMPRuntimeNVPTX &RT;
+ CGOpenMPRuntimeNVPTX::EntryFunctionState &EST;
+ CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST;
+
+ public:
+ NVPTXPrePostActionTy(CGOpenMPRuntimeNVPTX &RT,
+ CGOpenMPRuntimeNVPTX::EntryFunctionState &EST,
+ CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST)
+ : RT(RT), EST(EST), WST(WST) {}
+ void Enter(CodeGenFunction &CGF) override {
+ RT.emitEntryHeader(CGF, EST, WST);
+ }
+ void Exit(CodeGenFunction &CGF) override { RT.emitEntryFooter(CGF, EST); }
+ } Action(*this, EST, WST);
+ CodeGen.setAction(Action);
+ emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
+ IsOffloadEntry, CodeGen);
+
+ // Create the worker function
+ emitWorkerFunction(WST);
+
+ // Now change the name of the worker function to correspond to this target
+ // region's entry function.
+ WST.WorkerFn->setName(OutlinedFn->getName() + "_worker");
+}
+
+CGOpenMPRuntimeNVPTX::CGOpenMPRuntimeNVPTX(CodeGenModule &CGM)
+ : CGOpenMPRuntime(CGM), ActiveWorkers(nullptr), WorkID(nullptr) {
+ if (!CGM.getLangOpts().OpenMPIsDevice)
+ llvm_unreachable("OpenMP NVPTX can only handle device code.");
+
+ // Called once per module during initialization.
+ initializeEnvironment();
+}
+
+void CGOpenMPRuntimeNVPTX::emitNumTeamsClause(CodeGenFunction &CGF,
+ const Expr *NumTeams,
+ const Expr *ThreadLimit,
+ SourceLocation Loc) {}
+
+llvm::Value *CGOpenMPRuntimeNVPTX::emitParallelOrTeamsOutlinedFunction(
+ const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
+ OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
+
+ llvm::Function *OutlinedFun = nullptr;
+ if (isa<OMPTeamsDirective>(D)) {
+ llvm::Value *OutlinedFunVal =
+ CGOpenMPRuntime::emitParallelOrTeamsOutlinedFunction(
+ D, ThreadIDVar, InnermostKind, CodeGen);
+ OutlinedFun = cast<llvm::Function>(OutlinedFunVal);
+ OutlinedFun->addFnAttr(llvm::Attribute::AlwaysInline);
+ } else
+ llvm_unreachable("parallel directive is not yet supported for nvptx "
+ "backend.");
+
+ return OutlinedFun;
+}
+
+void CGOpenMPRuntimeNVPTX::emitTeamsCall(CodeGenFunction &CGF,
+ const OMPExecutableDirective &D,
+ SourceLocation Loc,
+ llvm::Value *OutlinedFn,
+ ArrayRef<llvm::Value *> CapturedVars) {
+ if (!CGF.HaveInsertPoint())
+ return;
+
+ Address ZeroAddr =
+ CGF.CreateTempAlloca(CGF.Int32Ty, CharUnits::fromQuantity(4),
+ /*Name*/ ".zero.addr");
+ CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
+ llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
+ OutlinedFnArgs.push_back(ZeroAddr.getPointer());
+ OutlinedFnArgs.push_back(ZeroAddr.getPointer());
+ OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
+ CGF.EmitCallOrInvoke(OutlinedFn, OutlinedFnArgs);
+}
OpenPOWER on IntegriCloud