Diffstat (limited to 'contrib/llvm/tools/clang/lib/CodeGen/CGOpenMPRuntime.cpp')
-rw-r--r-- | contrib/llvm/tools/clang/lib/CodeGen/CGOpenMPRuntime.cpp | 1305
1 file changed, 1132 insertions, 173 deletions
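The diff body follows. In brief, it adds code generation for OpenMP task reductions: two new runtime entry points, __kmpc_task_reduction_init and __kmpc_task_reduction_get_th_data, a ReductionCodeGen helper class, and a new "reductions" field in the taskloop form of the kmp_task_t record. It also splits emitParallelOrTeamsOutlinedFunction into emitParallelOutlinedFunction and emitTeamsOutlinedFunction, and replaces the old num_teams/thread_limit helpers with emitNumTeamsForTargetDirective and emitNumThreadsForTargetDirective so that clauses on combined target constructs are honored. Purely as orientation (this snippet is not part of the patch, and the exact clause set depends on the OpenMP version being implemented), the kind of source the task-reduction path is meant to lower looks like this:

    // Illustrative sketch only. The patched codegen registers the reduction
    // item with __kmpc_task_reduction_init, and each generated task fetches
    // its private copy through __kmpc_task_reduction_get_th_data.
    int sum(const int *a, int n) {
      int s = 0;
    #pragma omp parallel
    #pragma omp single
    #pragma omp taskloop reduction(+ : s)
      for (int i = 0; i < n; ++i)
        s += a[i];
      return s;
    }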
diff --git a/contrib/llvm/tools/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/contrib/llvm/tools/clang/lib/CodeGen/CGOpenMPRuntime.cpp index 4025217..d488bd4 100644 --- a/contrib/llvm/tools/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/contrib/llvm/tools/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -15,7 +15,7 @@ #include "CGCleanup.h" #include "CGOpenMPRuntime.h" #include "CodeGenFunction.h" -#include "ConstantBuilder.h" +#include "clang/CodeGen/ConstantInitBuilder.h" #include "clang/AST/Decl.h" #include "clang/AST/StmtOpenMP.h" #include "llvm/ADT/ArrayRef.h" @@ -643,6 +643,12 @@ enum OpenMPRTLFunction { // Call to void __kmpc_doacross_wait(ident_t *loc, kmp_int32 gtid, kmp_int64 // *vec); OMPRTL__kmpc_doacross_wait, + // Call to void *__kmpc_task_reduction_init(int gtid, int num_data, void + // *data); + OMPRTL__kmpc_task_reduction_init, + // Call to void *__kmpc_task_reduction_get_th_data(int gtid, void *tg, void + // *d); + OMPRTL__kmpc_task_reduction_get_th_data, // // Offloading related calls @@ -697,6 +703,414 @@ void RegionCodeGenTy::operator()(CodeGenFunction &CGF) const { } } +/// Check if the combiner is a call to UDR combiner and if it is so return the +/// UDR decl used for reduction. +static const OMPDeclareReductionDecl * +getReductionInit(const Expr *ReductionOp) { + if (auto *CE = dyn_cast<CallExpr>(ReductionOp)) + if (auto *OVE = dyn_cast<OpaqueValueExpr>(CE->getCallee())) + if (auto *DRE = + dyn_cast<DeclRefExpr>(OVE->getSourceExpr()->IgnoreImpCasts())) + if (auto *DRD = dyn_cast<OMPDeclareReductionDecl>(DRE->getDecl())) + return DRD; + return nullptr; +} + +static void emitInitWithReductionInitializer(CodeGenFunction &CGF, + const OMPDeclareReductionDecl *DRD, + const Expr *InitOp, + Address Private, Address Original, + QualType Ty) { + if (DRD->getInitializer()) { + std::pair<llvm::Function *, llvm::Function *> Reduction = + CGF.CGM.getOpenMPRuntime().getUserDefinedReduction(DRD); + auto *CE = cast<CallExpr>(InitOp); + auto *OVE = cast<OpaqueValueExpr>(CE->getCallee()); + const Expr *LHS = CE->getArg(/*Arg=*/0)->IgnoreParenImpCasts(); + const Expr *RHS = CE->getArg(/*Arg=*/1)->IgnoreParenImpCasts(); + auto *LHSDRE = cast<DeclRefExpr>(cast<UnaryOperator>(LHS)->getSubExpr()); + auto *RHSDRE = cast<DeclRefExpr>(cast<UnaryOperator>(RHS)->getSubExpr()); + CodeGenFunction::OMPPrivateScope PrivateScope(CGF); + PrivateScope.addPrivate(cast<VarDecl>(LHSDRE->getDecl()), + [=]() -> Address { return Private; }); + PrivateScope.addPrivate(cast<VarDecl>(RHSDRE->getDecl()), + [=]() -> Address { return Original; }); + (void)PrivateScope.Privatize(); + RValue Func = RValue::get(Reduction.second); + CodeGenFunction::OpaqueValueMapping Map(CGF, OVE, Func); + CGF.EmitIgnoredExpr(InitOp); + } else { + llvm::Constant *Init = CGF.CGM.EmitNullConstant(Ty); + auto *GV = new llvm::GlobalVariable( + CGF.CGM.getModule(), Init->getType(), /*isConstant=*/true, + llvm::GlobalValue::PrivateLinkage, Init, ".init"); + LValue LV = CGF.MakeNaturalAlignAddrLValue(GV, Ty); + RValue InitRVal; + switch (CGF.getEvaluationKind(Ty)) { + case TEK_Scalar: + InitRVal = CGF.EmitLoadOfLValue(LV, SourceLocation()); + break; + case TEK_Complex: + InitRVal = + RValue::getComplex(CGF.EmitLoadOfComplex(LV, SourceLocation())); + break; + case TEK_Aggregate: + InitRVal = RValue::getAggregate(LV.getAddress()); + break; + } + OpaqueValueExpr OVE(SourceLocation(), Ty, VK_RValue); + CodeGenFunction::OpaqueValueMapping OpaqueMap(CGF, &OVE, InitRVal); + CGF.EmitAnyExprToMem(&OVE, Private, Ty.getQualifiers(), + /*IsInitializer=*/false); + } 
+} + +/// \brief Emit initialization of arrays of complex types. +/// \param DestAddr Address of the array. +/// \param Type Type of array. +/// \param Init Initial expression of array. +/// \param SrcAddr Address of the original array. +static void EmitOMPAggregateInit(CodeGenFunction &CGF, Address DestAddr, + QualType Type, const Expr *Init, + const OMPDeclareReductionDecl *DRD, + Address SrcAddr = Address::invalid()) { + // Perform element-by-element initialization. + QualType ElementTy; + + // Drill down to the base element type on both arrays. + auto ArrayTy = Type->getAsArrayTypeUnsafe(); + auto NumElements = CGF.emitArrayLength(ArrayTy, ElementTy, DestAddr); + DestAddr = + CGF.Builder.CreateElementBitCast(DestAddr, DestAddr.getElementType()); + if (DRD) + SrcAddr = + CGF.Builder.CreateElementBitCast(SrcAddr, DestAddr.getElementType()); + + llvm::Value *SrcBegin = nullptr; + if (DRD) + SrcBegin = SrcAddr.getPointer(); + auto DestBegin = DestAddr.getPointer(); + // Cast from pointer to array type to pointer to single element. + auto DestEnd = CGF.Builder.CreateGEP(DestBegin, NumElements); + // The basic structure here is a while-do loop. + auto BodyBB = CGF.createBasicBlock("omp.arrayinit.body"); + auto DoneBB = CGF.createBasicBlock("omp.arrayinit.done"); + auto IsEmpty = + CGF.Builder.CreateICmpEQ(DestBegin, DestEnd, "omp.arrayinit.isempty"); + CGF.Builder.CreateCondBr(IsEmpty, DoneBB, BodyBB); + + // Enter the loop body, making that address the current address. + auto EntryBB = CGF.Builder.GetInsertBlock(); + CGF.EmitBlock(BodyBB); + + CharUnits ElementSize = CGF.getContext().getTypeSizeInChars(ElementTy); + + llvm::PHINode *SrcElementPHI = nullptr; + Address SrcElementCurrent = Address::invalid(); + if (DRD) { + SrcElementPHI = CGF.Builder.CreatePHI(SrcBegin->getType(), 2, + "omp.arraycpy.srcElementPast"); + SrcElementPHI->addIncoming(SrcBegin, EntryBB); + SrcElementCurrent = + Address(SrcElementPHI, + SrcAddr.getAlignment().alignmentOfArrayElement(ElementSize)); + } + llvm::PHINode *DestElementPHI = CGF.Builder.CreatePHI( + DestBegin->getType(), 2, "omp.arraycpy.destElementPast"); + DestElementPHI->addIncoming(DestBegin, EntryBB); + Address DestElementCurrent = + Address(DestElementPHI, + DestAddr.getAlignment().alignmentOfArrayElement(ElementSize)); + + // Emit copy. + { + CodeGenFunction::RunCleanupsScope InitScope(CGF); + if (DRD && (DRD->getInitializer() || !Init)) { + emitInitWithReductionInitializer(CGF, DRD, Init, DestElementCurrent, + SrcElementCurrent, ElementTy); + } else + CGF.EmitAnyExprToMem(Init, DestElementCurrent, ElementTy.getQualifiers(), + /*IsInitializer=*/false); + } + + if (DRD) { + // Shift the address forward by one element. + auto SrcElementNext = CGF.Builder.CreateConstGEP1_32( + SrcElementPHI, /*Idx0=*/1, "omp.arraycpy.dest.element"); + SrcElementPHI->addIncoming(SrcElementNext, CGF.Builder.GetInsertBlock()); + } + + // Shift the address forward by one element. + auto DestElementNext = CGF.Builder.CreateConstGEP1_32( + DestElementPHI, /*Idx0=*/1, "omp.arraycpy.dest.element"); + // Check whether we've reached the end. + auto Done = + CGF.Builder.CreateICmpEQ(DestElementNext, DestEnd, "omp.arraycpy.done"); + CGF.Builder.CreateCondBr(Done, DoneBB, BodyBB); + DestElementPHI->addIncoming(DestElementNext, CGF.Builder.GetInsertBlock()); + + // Done. 
+ CGF.EmitBlock(DoneBB, /*IsFinished=*/true); +} + +LValue ReductionCodeGen::emitSharedLValue(CodeGenFunction &CGF, const Expr *E) { + if (const auto *OASE = dyn_cast<OMPArraySectionExpr>(E)) + return CGF.EmitOMPArraySectionExpr(OASE); + if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(E)) + return CGF.EmitLValue(ASE); + auto *OrigVD = cast<VarDecl>(cast<DeclRefExpr>(E)->getDecl()); + DeclRefExpr DRE(const_cast<VarDecl *>(OrigVD), + CGF.CapturedStmtInfo && + CGF.CapturedStmtInfo->lookup(OrigVD) != nullptr, + E->getType(), VK_LValue, E->getExprLoc()); + // Store the address of the original variable associated with the LHS + // implicit variable. + return CGF.EmitLValue(&DRE); +} + +LValue ReductionCodeGen::emitSharedLValueUB(CodeGenFunction &CGF, + const Expr *E) { + if (const auto *OASE = dyn_cast<OMPArraySectionExpr>(E)) + return CGF.EmitOMPArraySectionExpr(OASE, /*IsLowerBound=*/false); + return LValue(); +} + +void ReductionCodeGen::emitAggregateInitialization( + CodeGenFunction &CGF, unsigned N, Address PrivateAddr, LValue SharedLVal, + const OMPDeclareReductionDecl *DRD) { + // Emit VarDecl with copy init for arrays. + // Get the address of the original variable captured in current + // captured region. + auto *PrivateVD = + cast<VarDecl>(cast<DeclRefExpr>(ClausesData[N].Private)->getDecl()); + EmitOMPAggregateInit(CGF, PrivateAddr, PrivateVD->getType(), + DRD ? ClausesData[N].ReductionOp : PrivateVD->getInit(), + DRD, SharedLVal.getAddress()); +} + +ReductionCodeGen::ReductionCodeGen(ArrayRef<const Expr *> Shareds, + ArrayRef<const Expr *> Privates, + ArrayRef<const Expr *> ReductionOps) { + ClausesData.reserve(Shareds.size()); + SharedAddresses.reserve(Shareds.size()); + Sizes.reserve(Shareds.size()); + BaseDecls.reserve(Shareds.size()); + auto IPriv = Privates.begin(); + auto IRed = ReductionOps.begin(); + for (const auto *Ref : Shareds) { + ClausesData.emplace_back(Ref, *IPriv, *IRed); + std::advance(IPriv, 1); + std::advance(IRed, 1); + } +} + +void ReductionCodeGen::emitSharedLValue(CodeGenFunction &CGF, unsigned N) { + assert(SharedAddresses.size() == N && + "Number of generated lvalues must be exactly N."); + SharedAddresses.emplace_back(emitSharedLValue(CGF, ClausesData[N].Ref), + emitSharedLValueUB(CGF, ClausesData[N].Ref)); +} + +void ReductionCodeGen::emitAggregateType(CodeGenFunction &CGF, unsigned N) { + auto *PrivateVD = + cast<VarDecl>(cast<DeclRefExpr>(ClausesData[N].Private)->getDecl()); + QualType PrivateType = PrivateVD->getType(); + bool AsArraySection = isa<OMPArraySectionExpr>(ClausesData[N].Ref); + if (!AsArraySection && !PrivateType->isVariablyModifiedType()) { + Sizes.emplace_back( + CGF.getTypeSize( + SharedAddresses[N].first.getType().getNonReferenceType()), + nullptr); + return; + } + llvm::Value *Size; + llvm::Value *SizeInChars; + llvm::Type *ElemType = + cast<llvm::PointerType>(SharedAddresses[N].first.getPointer()->getType()) + ->getElementType(); + auto *ElemSizeOf = llvm::ConstantExpr::getSizeOf(ElemType); + if (AsArraySection) { + Size = CGF.Builder.CreatePtrDiff(SharedAddresses[N].second.getPointer(), + SharedAddresses[N].first.getPointer()); + Size = CGF.Builder.CreateNUWAdd( + Size, llvm::ConstantInt::get(Size->getType(), /*V=*/1)); + SizeInChars = CGF.Builder.CreateNUWMul(Size, ElemSizeOf); + } else { + SizeInChars = CGF.getTypeSize( + SharedAddresses[N].first.getType().getNonReferenceType()); + Size = CGF.Builder.CreateExactUDiv(SizeInChars, ElemSizeOf); + } + Sizes.emplace_back(SizeInChars, Size); + CodeGenFunction::OpaqueValueMapping 
OpaqueMap( + CGF, + cast<OpaqueValueExpr>( + CGF.getContext().getAsVariableArrayType(PrivateType)->getSizeExpr()), + RValue::get(Size)); + CGF.EmitVariablyModifiedType(PrivateType); +} + +void ReductionCodeGen::emitAggregateType(CodeGenFunction &CGF, unsigned N, + llvm::Value *Size) { + auto *PrivateVD = + cast<VarDecl>(cast<DeclRefExpr>(ClausesData[N].Private)->getDecl()); + QualType PrivateType = PrivateVD->getType(); + bool AsArraySection = isa<OMPArraySectionExpr>(ClausesData[N].Ref); + if (!AsArraySection && !PrivateType->isVariablyModifiedType()) { + assert(!Size && !Sizes[N].second && + "Size should be nullptr for non-variably modified redution " + "items."); + return; + } + CodeGenFunction::OpaqueValueMapping OpaqueMap( + CGF, + cast<OpaqueValueExpr>( + CGF.getContext().getAsVariableArrayType(PrivateType)->getSizeExpr()), + RValue::get(Size)); + CGF.EmitVariablyModifiedType(PrivateType); +} + +void ReductionCodeGen::emitInitialization( + CodeGenFunction &CGF, unsigned N, Address PrivateAddr, LValue SharedLVal, + llvm::function_ref<bool(CodeGenFunction &)> DefaultInit) { + assert(SharedAddresses.size() > N && "No variable was generated"); + auto *PrivateVD = + cast<VarDecl>(cast<DeclRefExpr>(ClausesData[N].Private)->getDecl()); + auto *DRD = getReductionInit(ClausesData[N].ReductionOp); + QualType PrivateType = PrivateVD->getType(); + PrivateAddr = CGF.Builder.CreateElementBitCast( + PrivateAddr, CGF.ConvertTypeForMem(PrivateType)); + QualType SharedType = SharedAddresses[N].first.getType(); + SharedLVal = CGF.MakeAddrLValue( + CGF.Builder.CreateElementBitCast(SharedLVal.getAddress(), + CGF.ConvertTypeForMem(SharedType)), + SharedType, SharedAddresses[N].first.getBaseInfo()); + if (isa<OMPArraySectionExpr>(ClausesData[N].Ref) || + CGF.getContext().getAsArrayType(PrivateVD->getType())) { + emitAggregateInitialization(CGF, N, PrivateAddr, SharedLVal, DRD); + } else if (DRD && (DRD->getInitializer() || !PrivateVD->hasInit())) { + emitInitWithReductionInitializer(CGF, DRD, ClausesData[N].ReductionOp, + PrivateAddr, SharedLVal.getAddress(), + SharedLVal.getType()); + } else if (!DefaultInit(CGF) && PrivateVD->hasInit() && + !CGF.isTrivialInitializer(PrivateVD->getInit())) { + CGF.EmitAnyExprToMem(PrivateVD->getInit(), PrivateAddr, + PrivateVD->getType().getQualifiers(), + /*IsInitializer=*/false); + } +} + +bool ReductionCodeGen::needCleanups(unsigned N) { + auto *PrivateVD = + cast<VarDecl>(cast<DeclRefExpr>(ClausesData[N].Private)->getDecl()); + QualType PrivateType = PrivateVD->getType(); + QualType::DestructionKind DTorKind = PrivateType.isDestructedType(); + return DTorKind != QualType::DK_none; +} + +void ReductionCodeGen::emitCleanups(CodeGenFunction &CGF, unsigned N, + Address PrivateAddr) { + auto *PrivateVD = + cast<VarDecl>(cast<DeclRefExpr>(ClausesData[N].Private)->getDecl()); + QualType PrivateType = PrivateVD->getType(); + QualType::DestructionKind DTorKind = PrivateType.isDestructedType(); + if (needCleanups(N)) { + PrivateAddr = CGF.Builder.CreateElementBitCast( + PrivateAddr, CGF.ConvertTypeForMem(PrivateType)); + CGF.pushDestroy(DTorKind, PrivateAddr, PrivateType); + } +} + +static LValue loadToBegin(CodeGenFunction &CGF, QualType BaseTy, QualType ElTy, + LValue BaseLV) { + BaseTy = BaseTy.getNonReferenceType(); + while ((BaseTy->isPointerType() || BaseTy->isReferenceType()) && + !CGF.getContext().hasSameType(BaseTy, ElTy)) { + if (auto *PtrTy = BaseTy->getAs<PointerType>()) + BaseLV = CGF.EmitLoadOfPointerLValue(BaseLV.getAddress(), PtrTy); + else { + BaseLV = 
CGF.EmitLoadOfReferenceLValue(BaseLV.getAddress(), + BaseTy->castAs<ReferenceType>()); + } + BaseTy = BaseTy->getPointeeType(); + } + return CGF.MakeAddrLValue( + CGF.Builder.CreateElementBitCast(BaseLV.getAddress(), + CGF.ConvertTypeForMem(ElTy)), + BaseLV.getType(), BaseLV.getBaseInfo()); +} + +static Address castToBase(CodeGenFunction &CGF, QualType BaseTy, QualType ElTy, + llvm::Type *BaseLVType, CharUnits BaseLVAlignment, + llvm::Value *Addr) { + Address Tmp = Address::invalid(); + Address TopTmp = Address::invalid(); + Address MostTopTmp = Address::invalid(); + BaseTy = BaseTy.getNonReferenceType(); + while ((BaseTy->isPointerType() || BaseTy->isReferenceType()) && + !CGF.getContext().hasSameType(BaseTy, ElTy)) { + Tmp = CGF.CreateMemTemp(BaseTy); + if (TopTmp.isValid()) + CGF.Builder.CreateStore(Tmp.getPointer(), TopTmp); + else + MostTopTmp = Tmp; + TopTmp = Tmp; + BaseTy = BaseTy->getPointeeType(); + } + llvm::Type *Ty = BaseLVType; + if (Tmp.isValid()) + Ty = Tmp.getElementType(); + Addr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(Addr, Ty); + if (Tmp.isValid()) { + CGF.Builder.CreateStore(Addr, Tmp); + return MostTopTmp; + } + return Address(Addr, BaseLVAlignment); +} + +Address ReductionCodeGen::adjustPrivateAddress(CodeGenFunction &CGF, unsigned N, + Address PrivateAddr) { + const DeclRefExpr *DE; + const VarDecl *OrigVD = nullptr; + if (auto *OASE = dyn_cast<OMPArraySectionExpr>(ClausesData[N].Ref)) { + auto *Base = OASE->getBase()->IgnoreParenImpCasts(); + while (auto *TempOASE = dyn_cast<OMPArraySectionExpr>(Base)) + Base = TempOASE->getBase()->IgnoreParenImpCasts(); + while (auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base)) + Base = TempASE->getBase()->IgnoreParenImpCasts(); + DE = cast<DeclRefExpr>(Base); + OrigVD = cast<VarDecl>(DE->getDecl()); + } else if (auto *ASE = dyn_cast<ArraySubscriptExpr>(ClausesData[N].Ref)) { + auto *Base = ASE->getBase()->IgnoreParenImpCasts(); + while (auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base)) + Base = TempASE->getBase()->IgnoreParenImpCasts(); + DE = cast<DeclRefExpr>(Base); + OrigVD = cast<VarDecl>(DE->getDecl()); + } + if (OrigVD) { + BaseDecls.emplace_back(OrigVD); + auto OriginalBaseLValue = CGF.EmitLValue(DE); + LValue BaseLValue = + loadToBegin(CGF, OrigVD->getType(), SharedAddresses[N].first.getType(), + OriginalBaseLValue); + llvm::Value *Adjustment = CGF.Builder.CreatePtrDiff( + BaseLValue.getPointer(), SharedAddresses[N].first.getPointer()); + llvm::Value *Ptr = + CGF.Builder.CreateGEP(PrivateAddr.getPointer(), Adjustment); + return castToBase(CGF, OrigVD->getType(), + SharedAddresses[N].first.getType(), + OriginalBaseLValue.getPointer()->getType(), + OriginalBaseLValue.getAlignment(), Ptr); + } + BaseDecls.emplace_back( + cast<VarDecl>(cast<DeclRefExpr>(ClausesData[N].Ref)->getDecl())); + return PrivateAddr; +} + +bool ReductionCodeGen::usesReductionInitializer(unsigned N) const { + auto *DRD = getReductionInit(ClausesData[N].ReductionOp); + return DRD && DRD->getInitializer(); +} + LValue CGOpenMPRegionInfo::getThreadIDVariableLValue(CodeGenFunction &CGF) { return CGF.EmitLoadOfPointerLValue( CGF.GetAddrOfLocalVar(getThreadIDVariable()), @@ -720,7 +1134,7 @@ LValue CGOpenMPTaskOutlinedRegionInfo::getThreadIDVariableLValue( CodeGenFunction &CGF) { return CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(getThreadIDVariable()), getThreadIDVariable()->getType(), - AlignmentSource::Decl); + LValueBaseInfo(AlignmentSource::Decl, false)); } CGOpenMPRuntime::CGOpenMPRuntime(CodeGenModule &CGM) @@ -728,7 +1142,7 @@ 
CGOpenMPRuntime::CGOpenMPRuntime(CodeGenModule &CGM) IdentTy = llvm::StructType::create( "ident_t", CGM.Int32Ty /* reserved_1 */, CGM.Int32Ty /* flags */, CGM.Int32Ty /* reserved_2 */, CGM.Int32Ty /* reserved_3 */, - CGM.Int8PtrTy /* psource */, nullptr); + CGM.Int8PtrTy /* psource */); KmpCriticalNameTy = llvm::ArrayType::get(CGM.Int32Ty, /*NumElements*/ 8); loadOffloadInfoMetadata(); @@ -747,9 +1161,9 @@ emitCombinerOrInitializer(CodeGenModule &CGM, QualType Ty, QualType PtrTy = C.getPointerType(Ty).withRestrict(); FunctionArgList Args; ImplicitParamDecl OmpOutParm(C, /*DC=*/nullptr, Out->getLocation(), - /*Id=*/nullptr, PtrTy); + /*Id=*/nullptr, PtrTy, ImplicitParamDecl::Other); ImplicitParamDecl OmpInParm(C, /*DC=*/nullptr, In->getLocation(), - /*Id=*/nullptr, PtrTy); + /*Id=*/nullptr, PtrTy, ImplicitParamDecl::Other); Args.push_back(&OmpOutParm); Args.push_back(&OmpInParm); auto &FnInfo = @@ -760,6 +1174,7 @@ emitCombinerOrInitializer(CodeGenModule &CGM, QualType Ty, IsCombiner ? ".omp_combiner." : ".omp_initializer.", &CGM.getModule()); CGM.SetInternalFunctionAttributes(/*D=*/nullptr, Fn, FnInfo); Fn->removeFnAttr(llvm::Attribute::NoInline); + Fn->removeFnAttr(llvm::Attribute::OptimizeNone); Fn->addFnAttr(llvm::Attribute::AlwaysInline); CodeGenFunction CGF(CGM); // Map "T omp_in;" variable to "*omp_in_parm" value in all expressions. @@ -842,12 +1257,12 @@ static Address createIdentFieldGEP(CodeGenFunction &CGF, Address Addr, return CGF.Builder.CreateStructGEP(Addr, Field, Offset, Name); } -llvm::Value *CGOpenMPRuntime::emitParallelOrTeamsOutlinedFunction( - const OMPExecutableDirective &D, const VarDecl *ThreadIDVar, - OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) { +static llvm::Value *emitParallelOrTeamsOutlinedFunction( + CodeGenModule &CGM, const OMPExecutableDirective &D, const CapturedStmt *CS, + const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind, + const StringRef OutlinedHelperName, const RegionCodeGenTy &CodeGen) { assert(ThreadIDVar->getType()->isPointerType() && "thread id variable must be of type kmp_int32 *"); - const CapturedStmt *CS = cast<CapturedStmt>(D.getAssociatedStmt()); CodeGenFunction CGF(CGM, true); bool HasCancel = false; if (auto *OPD = dyn_cast<OMPParallelDirective>(&D)) @@ -857,11 +1272,27 @@ llvm::Value *CGOpenMPRuntime::emitParallelOrTeamsOutlinedFunction( else if (auto *OPFD = dyn_cast<OMPParallelForDirective>(&D)) HasCancel = OPFD->hasCancel(); CGOpenMPOutlinedRegionInfo CGInfo(*CS, ThreadIDVar, CodeGen, InnermostKind, - HasCancel, getOutlinedHelperName()); + HasCancel, OutlinedHelperName); CodeGenFunction::CGCapturedStmtRAII CapInfoRAII(CGF, &CGInfo); return CGF.GenerateOpenMPCapturedStmtFunction(*CS); } +llvm::Value *CGOpenMPRuntime::emitParallelOutlinedFunction( + const OMPExecutableDirective &D, const VarDecl *ThreadIDVar, + OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) { + const CapturedStmt *CS = D.getCapturedStmt(OMPD_parallel); + return emitParallelOrTeamsOutlinedFunction( + CGM, D, CS, ThreadIDVar, InnermostKind, getOutlinedHelperName(), CodeGen); +} + +llvm::Value *CGOpenMPRuntime::emitTeamsOutlinedFunction( + const OMPExecutableDirective &D, const VarDecl *ThreadIDVar, + OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) { + const CapturedStmt *CS = D.getCapturedStmt(OMPD_teams); + return emitParallelOrTeamsOutlinedFunction( + CGM, D, CS, ThreadIDVar, InnermostKind, getOutlinedHelperName(), CodeGen); +} + llvm::Value *CGOpenMPRuntime::emitTaskOutlinedFunction( const 
OMPExecutableDirective &D, const VarDecl *ThreadIDVar, const VarDecl *PartIDVar, const VarDecl *TaskTVar, @@ -1537,6 +1968,26 @@ CGOpenMPRuntime::createRuntimeFunction(unsigned Function) { RTLFn = CGM.CreateRuntimeFunction(FnTy, /*Name=*/"__kmpc_doacross_wait"); break; } + case OMPRTL__kmpc_task_reduction_init: { + // Build void *__kmpc_task_reduction_init(int gtid, int num_data, void + // *data); + llvm::Type *TypeParams[] = {CGM.IntTy, CGM.IntTy, CGM.VoidPtrTy}; + llvm::FunctionType *FnTy = + llvm::FunctionType::get(CGM.VoidPtrTy, TypeParams, /*isVarArg=*/false); + RTLFn = + CGM.CreateRuntimeFunction(FnTy, /*Name=*/"__kmpc_task_reduction_init"); + break; + } + case OMPRTL__kmpc_task_reduction_get_th_data: { + // Build void *__kmpc_task_reduction_get_th_data(int gtid, void *tg, void + // *d); + llvm::Type *TypeParams[] = {CGM.IntTy, CGM.VoidPtrTy, CGM.VoidPtrTy}; + llvm::FunctionType *FnTy = + llvm::FunctionType::get(CGM.VoidPtrTy, TypeParams, /*isVarArg=*/false); + RTLFn = CGM.CreateRuntimeFunction( + FnTy, /*Name=*/"__kmpc_task_reduction_get_th_data"); + break; + } case OMPRTL__tgt_target: { // Build int32_t __tgt_target(int32_t device_id, void *host_ptr, int32_t // arg_num, void** args_base, void **args, size_t *arg_sizes, int32_t @@ -1791,8 +2242,8 @@ llvm::Function *CGOpenMPRuntime::emitThreadPrivateVarDefinition( // threadprivate copy of the variable VD CodeGenFunction CtorCGF(CGM); FunctionArgList Args; - ImplicitParamDecl Dst(CGM.getContext(), /*DC=*/nullptr, SourceLocation(), - /*Id=*/nullptr, CGM.getContext().VoidPtrTy); + ImplicitParamDecl Dst(CGM.getContext(), CGM.getContext().VoidPtrTy, + ImplicitParamDecl::Other); Args.push_back(&Dst); auto &FI = CGM.getTypes().arrangeBuiltinFunctionDeclaration( @@ -1822,8 +2273,8 @@ llvm::Function *CGOpenMPRuntime::emitThreadPrivateVarDefinition( // of the variable VD CodeGenFunction DtorCGF(CGM); FunctionArgList Args; - ImplicitParamDecl Dst(CGM.getContext(), /*DC=*/nullptr, SourceLocation(), - /*Id=*/nullptr, CGM.getContext().VoidPtrTy); + ImplicitParamDecl Dst(CGM.getContext(), CGM.getContext().VoidPtrTy, + ImplicitParamDecl::Other); Args.push_back(&Dst); auto &FI = CGM.getTypes().arrangeBuiltinFunctionDeclaration( @@ -1887,6 +2338,27 @@ llvm::Function *CGOpenMPRuntime::emitThreadPrivateVarDefinition( return nullptr; } +Address CGOpenMPRuntime::getAddrOfArtificialThreadPrivate(CodeGenFunction &CGF, + QualType VarType, + StringRef Name) { + llvm::Twine VarName(Name, ".artificial."); + llvm::Type *VarLVType = CGF.ConvertTypeForMem(VarType); + llvm::Value *GAddr = getOrCreateInternalVariable(VarLVType, VarName); + llvm::Value *Args[] = { + emitUpdateLocation(CGF, SourceLocation()), + getThreadID(CGF, SourceLocation()), + CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(GAddr, CGM.VoidPtrTy), + CGF.Builder.CreateIntCast(CGF.getTypeSize(VarType), CGM.SizeTy, + /*IsSigned=*/false), + getOrCreateInternalVariable(CGM.VoidPtrPtrTy, VarName + ".cache.")}; + return Address( + CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( + CGF.EmitRuntimeCall( + createRuntimeFunction(OMPRTL__kmpc_threadprivate_cached), Args), + VarLVType->getPointerTo(/*AddrSpace=*/0)), + CGM.getPointerAlign()); +} + /// \brief Emits code for OpenMP 'if' clause using specified \a CodeGen /// function. 
Here is the logic: /// if (Cond) { @@ -2174,10 +2646,8 @@ static llvm::Value *emitCopyprivateCopyFunction( auto &C = CGM.getContext(); // void copy_func(void *LHSArg, void *RHSArg); FunctionArgList Args; - ImplicitParamDecl LHSArg(C, /*DC=*/nullptr, SourceLocation(), /*Id=*/nullptr, - C.VoidPtrTy); - ImplicitParamDecl RHSArg(C, /*DC=*/nullptr, SourceLocation(), /*Id=*/nullptr, - C.VoidPtrTy); + ImplicitParamDecl LHSArg(C, C.VoidPtrTy, ImplicitParamDecl::Other); + ImplicitParamDecl RHSArg(C, C.VoidPtrTy, ImplicitParamDecl::Other); Args.push_back(&LHSArg); Args.push_back(&RHSArg); auto &CGFI = CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args); @@ -2450,16 +2920,14 @@ static int addMonoNonMonoModifier(OpenMPSchedType Schedule, return Schedule | Modifier; } -void CGOpenMPRuntime::emitForDispatchInit(CodeGenFunction &CGF, - SourceLocation Loc, - const OpenMPScheduleTy &ScheduleKind, - unsigned IVSize, bool IVSigned, - bool Ordered, llvm::Value *UB, - llvm::Value *Chunk) { +void CGOpenMPRuntime::emitForDispatchInit( + CodeGenFunction &CGF, SourceLocation Loc, + const OpenMPScheduleTy &ScheduleKind, unsigned IVSize, bool IVSigned, + bool Ordered, const DispatchRTInput &DispatchValues) { if (!CGF.HaveInsertPoint()) return; - OpenMPSchedType Schedule = - getRuntimeSchedule(ScheduleKind.Schedule, Chunk != nullptr, Ordered); + OpenMPSchedType Schedule = getRuntimeSchedule( + ScheduleKind.Schedule, DispatchValues.Chunk != nullptr, Ordered); assert(Ordered || (Schedule != OMP_sch_static && Schedule != OMP_sch_static_chunked && Schedule != OMP_ord_static && Schedule != OMP_ord_static_chunked && @@ -2470,14 +2938,14 @@ void CGOpenMPRuntime::emitForDispatchInit(CodeGenFunction &CGF, // kmp_int[32|64] stride, kmp_int[32|64] chunk); // If the Chunk was not specified in the clause - use default value 1. - if (Chunk == nullptr) - Chunk = CGF.Builder.getIntN(IVSize, 1); + llvm::Value *Chunk = DispatchValues.Chunk ? DispatchValues.Chunk + : CGF.Builder.getIntN(IVSize, 1); llvm::Value *Args[] = { emitUpdateLocation(CGF, Loc), getThreadID(CGF, Loc), CGF.Builder.getInt32(addMonoNonMonoModifier( Schedule, ScheduleKind.M1, ScheduleKind.M2)), // Schedule type - CGF.Builder.getIntN(IVSize, 0), // Lower - UB, // Upper + DispatchValues.LB, // Lower + DispatchValues.UB, // Upper CGF.Builder.getIntN(IVSize, 1), // Stride Chunk // Chunk }; @@ -2686,6 +3154,8 @@ enum KmpTaskTFields { KmpTaskTStride, /// (Taskloops only) Is last iteration flag. KmpTaskTLastIter, + /// (Taskloops only) Reduction data. + KmpTaskTReductions, }; } // anonymous namespace @@ -2770,8 +3240,7 @@ createOffloadingBinaryDescriptorFunction(CodeGenModule &CGM, StringRef Name, const RegionCodeGenTy &Codegen) { auto &C = CGM.getContext(); FunctionArgList Args; - ImplicitParamDecl DummyPtr(C, /*DC=*/nullptr, SourceLocation(), - /*Id=*/nullptr, C.VoidPtrTy); + ImplicitParamDecl DummyPtr(C, C.VoidPtrTy, ImplicitParamDecl::Other); Args.push_back(&DummyPtr); CodeGenFunction CGF(CGM); @@ -2874,7 +3343,7 @@ CGOpenMPRuntime::createOffloadingBinaryDescriptorRegistration() { // descriptor, so we can reuse the logic that emits Ctors and Dtors. 
auto *IdentInfo = &C.Idents.get(".omp_offloading.reg_unreg_var"); ImplicitParamDecl RegUnregVar(C, C.getTranslationUnitDecl(), SourceLocation(), - IdentInfo, C.CharTy); + IdentInfo, C.CharTy, ImplicitParamDecl::Other); auto *UnRegFn = createOffloadingBinaryDescriptorFunction( CGM, ".omp_offloading.descriptor_unreg", @@ -2889,6 +3358,19 @@ CGOpenMPRuntime::createOffloadingBinaryDescriptorRegistration() { Desc); CGM.getCXXABI().registerGlobalDtor(CGF, RegUnregVar, UnRegFn, Desc); }); + if (CGM.supportsCOMDAT()) { + // It is sufficient to call registration function only once, so create a + // COMDAT group for registration/unregistration functions and associated + // data. That would reduce startup time and code size. Registration + // function serves as a COMDAT group key. + auto ComdatKey = M.getOrInsertComdat(RegFn->getName()); + RegFn->setLinkage(llvm::GlobalValue::LinkOnceAnyLinkage); + RegFn->setVisibility(llvm::GlobalValue::HiddenVisibility); + RegFn->setComdat(ComdatKey); + UnRegFn->setComdat(ComdatKey); + DeviceImages->setComdat(ComdatKey); + Desc->setComdat(ComdatKey); + } return RegFn; } @@ -2958,7 +3440,7 @@ void CGOpenMPRuntime::createOffloadEntriesAndInfoMetadata() { // Create the offloading info metadata node. llvm::NamedMDNode *MD = M.getOrInsertNamedMetadata("omp_offload.info"); - // Auxiliar methods to create metadata values and strings. + // Auxiliary methods to create metadata values and strings. auto getMDInt = [&](unsigned v) { return llvm::ConstantAsMetadata::get( llvm::ConstantInt::get(llvm::Type::getInt32Ty(C), v)); @@ -3225,6 +3707,7 @@ createKmpTaskTRecordDecl(CodeGenModule &CGM, OpenMPDirectiveKind Kind, // kmp_uint64 ub; // kmp_int64 st; // kmp_int32 liter; + // void * reductions; // }; auto *UD = C.buildImplicitRecord("kmp_cmplrdata_t", TTK_Union); UD->startDefinition(); @@ -3248,6 +3731,7 @@ createKmpTaskTRecordDecl(CodeGenModule &CGM, OpenMPDirectiveKind Kind, addFieldToRecordDecl(C, RD, KmpUInt64Ty); addFieldToRecordDecl(C, RD, KmpInt64Ty); addFieldToRecordDecl(C, RD, KmpInt32Ty); + addFieldToRecordDecl(C, RD, C.VoidPtrTy); } RD->completeDefinition(); return RD; @@ -3278,7 +3762,7 @@ createKmpTaskTWithPrivatesRecordDecl(CodeGenModule &CGM, QualType KmpTaskTQTy, /// TaskFunction(gtid, tt->part_id, &tt->privates, task_privates_map, tt, /// For taskloops: /// tt->task_data.lb, tt->task_data.ub, tt->task_data.st, tt->task_data.liter, -/// tt->shareds); +/// tt->reductions, tt->shareds); /// return 0; /// } /// \endcode @@ -3291,10 +3775,11 @@ emitProxyTaskFunction(CodeGenModule &CGM, SourceLocation Loc, llvm::Value *TaskPrivatesMap) { auto &C = CGM.getContext(); FunctionArgList Args; - ImplicitParamDecl GtidArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, KmpInt32Ty); - ImplicitParamDecl TaskTypeArg(C, /*DC=*/nullptr, Loc, - /*Id=*/nullptr, - KmpTaskTWithPrivatesPtrQTy.withRestrict()); + ImplicitParamDecl GtidArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, KmpInt32Ty, + ImplicitParamDecl::Other); + ImplicitParamDecl TaskTypeArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, + KmpTaskTWithPrivatesPtrQTy.withRestrict(), + ImplicitParamDecl::Other); Args.push_back(&GtidArg); Args.push_back(&TaskTypeArg); auto &TaskEntryFnInfo = @@ -3363,10 +3848,14 @@ emitProxyTaskFunction(CodeGenModule &CGM, SourceLocation Loc, auto LIFI = std::next(KmpTaskTQTyRD->field_begin(), KmpTaskTLastIter); auto LILVal = CGF.EmitLValueForField(Base, *LIFI); auto *LIParam = CGF.EmitLoadOfLValue(LILVal, Loc).getScalarVal(); + auto RFI = std::next(KmpTaskTQTyRD->field_begin(), KmpTaskTReductions); + auto RLVal = 
CGF.EmitLValueForField(Base, *RFI); + auto *RParam = CGF.EmitLoadOfLValue(RLVal, Loc).getScalarVal(); CallArgs.push_back(LBParam); CallArgs.push_back(UBParam); CallArgs.push_back(StParam); CallArgs.push_back(LIParam); + CallArgs.push_back(RParam); } CallArgs.push_back(SharedsParam); @@ -3385,10 +3874,11 @@ static llvm::Value *emitDestructorsFunction(CodeGenModule &CGM, QualType KmpTaskTWithPrivatesQTy) { auto &C = CGM.getContext(); FunctionArgList Args; - ImplicitParamDecl GtidArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, KmpInt32Ty); - ImplicitParamDecl TaskTypeArg(C, /*DC=*/nullptr, Loc, - /*Id=*/nullptr, - KmpTaskTWithPrivatesPtrQTy.withRestrict()); + ImplicitParamDecl GtidArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, KmpInt32Ty, + ImplicitParamDecl::Other); + ImplicitParamDecl TaskTypeArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, + KmpTaskTWithPrivatesPtrQTy.withRestrict(), + ImplicitParamDecl::Other); Args.push_back(&GtidArg); Args.push_back(&TaskTypeArg); FunctionType::ExtInfo Info; @@ -3444,36 +3934,40 @@ emitTaskPrivateMappingFunction(CodeGenModule &CGM, SourceLocation Loc, FunctionArgList Args; ImplicitParamDecl TaskPrivatesArg( C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, - C.getPointerType(PrivatesQTy).withConst().withRestrict()); + C.getPointerType(PrivatesQTy).withConst().withRestrict(), + ImplicitParamDecl::Other); Args.push_back(&TaskPrivatesArg); llvm::DenseMap<const VarDecl *, unsigned> PrivateVarsPos; unsigned Counter = 1; for (auto *E: PrivateVars) { Args.push_back(ImplicitParamDecl::Create( - C, /*DC=*/nullptr, Loc, - /*Id=*/nullptr, C.getPointerType(C.getPointerType(E->getType())) - .withConst() - .withRestrict())); + C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, + C.getPointerType(C.getPointerType(E->getType())) + .withConst() + .withRestrict(), + ImplicitParamDecl::Other)); auto *VD = cast<VarDecl>(cast<DeclRefExpr>(E)->getDecl()); PrivateVarsPos[VD] = Counter; ++Counter; } for (auto *E : FirstprivateVars) { Args.push_back(ImplicitParamDecl::Create( - C, /*DC=*/nullptr, Loc, - /*Id=*/nullptr, C.getPointerType(C.getPointerType(E->getType())) - .withConst() - .withRestrict())); + C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, + C.getPointerType(C.getPointerType(E->getType())) + .withConst() + .withRestrict(), + ImplicitParamDecl::Other)); auto *VD = cast<VarDecl>(cast<DeclRefExpr>(E)->getDecl()); PrivateVarsPos[VD] = Counter; ++Counter; } for (auto *E: LastprivateVars) { Args.push_back(ImplicitParamDecl::Create( - C, /*DC=*/nullptr, Loc, - /*Id=*/nullptr, C.getPointerType(C.getPointerType(E->getType())) - .withConst() - .withRestrict())); + C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, + C.getPointerType(C.getPointerType(E->getType())) + .withConst() + .withRestrict(), + ImplicitParamDecl::Other)); auto *VD = cast<VarDecl>(cast<DeclRefExpr>(E)->getDecl()); PrivateVarsPos[VD] = Counter; ++Counter; @@ -3488,6 +3982,7 @@ emitTaskPrivateMappingFunction(CodeGenModule &CGM, SourceLocation Loc, CGM.SetInternalFunctionAttributes(/*D=*/nullptr, TaskPrivatesMap, TaskPrivatesMapFnInfo); TaskPrivatesMap->removeFnAttr(llvm::Attribute::NoInline); + TaskPrivatesMap->removeFnAttr(llvm::Attribute::OptimizeNone); TaskPrivatesMap->addFnAttr(llvm::Attribute::AlwaysInline); CodeGenFunction CGF(CGM); CGF.disableDebugInfo(); @@ -3551,7 +4046,9 @@ static void emitPrivatesInit(CodeGenFunction &CGF, auto SharedRefLValue = CGF.EmitLValueForField(SrcBase, SharedField); SharedRefLValue = CGF.MakeAddrLValue( Address(SharedRefLValue.getPointer(), C.getDeclAlign(OriginalVD)), - SharedRefLValue.getType(), AlignmentSource::Decl); + 
SharedRefLValue.getType(), + LValueBaseInfo(AlignmentSource::Decl, + SharedRefLValue.getBaseInfo().getMayAlias())); QualType Type = OriginalVD->getType(); if (Type->isArrayType()) { // Initialize firstprivate array. @@ -3561,7 +4058,7 @@ static void emitPrivatesInit(CodeGenFunction &CGF, SharedRefLValue.getAddress(), Type); } else { // Initialize firstprivate array using element-by-element - // intialization. + // initialization. CGF.EmitOMPAggregateAssign( PrivateLValue.getAddress(), SharedRefLValue.getAddress(), Type, [&CGF, Elem, Init, &CapturesInfo](Address DestElement, @@ -3630,12 +4127,14 @@ emitTaskDupFunction(CodeGenModule &CGM, SourceLocation Loc, ArrayRef<PrivateDataTy> Privates, bool WithLastIter) { auto &C = CGM.getContext(); FunctionArgList Args; - ImplicitParamDecl DstArg(C, /*DC=*/nullptr, Loc, - /*Id=*/nullptr, KmpTaskTWithPrivatesPtrQTy); - ImplicitParamDecl SrcArg(C, /*DC=*/nullptr, Loc, - /*Id=*/nullptr, KmpTaskTWithPrivatesPtrQTy); - ImplicitParamDecl LastprivArg(C, /*DC=*/nullptr, Loc, - /*Id=*/nullptr, C.IntTy); + ImplicitParamDecl DstArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, + KmpTaskTWithPrivatesPtrQTy, + ImplicitParamDecl::Other); + ImplicitParamDecl SrcArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, + KmpTaskTWithPrivatesPtrQTy, + ImplicitParamDecl::Other); + ImplicitParamDecl LastprivArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy, + ImplicitParamDecl::Other); Args.push_back(&DstArg); Args.push_back(&SrcArg); Args.push_back(&LastprivArg); @@ -3764,9 +4263,7 @@ CGOpenMPRuntime::emitTaskInit(CodeGenFunction &CGF, SourceLocation Loc, // Emit initial values for private copies (if any). llvm::Value *TaskPrivatesMap = nullptr; auto *TaskPrivatesMapTy = - std::next(cast<llvm::Function>(TaskFunction)->getArgumentList().begin(), - 3) - ->getType(); + std::next(cast<llvm::Function>(TaskFunction)->arg_begin(), 3)->getType(); if (!Privates.empty()) { auto FI = std::next(KmpTaskTWithPrivatesQTyRD->field_begin()); TaskPrivatesMap = emitTaskPrivateMappingFunction( @@ -4006,8 +4503,8 @@ void CGOpenMPRuntime::emitTaskCall(CodeGenFunction &CGF, SourceLocation Loc, DepTaskArgs[5] = CGF.Builder.getInt32(0); DepTaskArgs[6] = llvm::ConstantPointerNull::get(CGF.VoidPtrTy); } - auto &&ThenCodeGen = [this, Loc, &Data, TDBase, KmpTaskTQTyRD, - NumDependencies, &TaskArgs, + auto &&ThenCodeGen = [this, &Data, TDBase, KmpTaskTQTyRD, NumDependencies, + &TaskArgs, &DepTaskArgs](CodeGenFunction &CGF, PrePostActionTy &) { if (!Data.Tied) { auto PartIdFI = std::next(KmpTaskTQTyRD->field_begin(), KmpTaskTPartId); @@ -4121,11 +4618,27 @@ void CGOpenMPRuntime::emitTaskLoopCall(CodeGenFunction &CGF, SourceLocation Loc, cast<VarDecl>(cast<DeclRefExpr>(D.getStrideVariable())->getDecl()); CGF.EmitAnyExprToMem(StVar->getInit(), StLVal.getAddress(), StLVal.getQuals(), /*IsInitializer=*/true); + // Store reductions address. + LValue RedLVal = CGF.EmitLValueForField( + Result.TDBase, + *std::next(Result.KmpTaskTQTyRD->field_begin(), KmpTaskTReductions)); + if (Data.Reductions) + CGF.EmitStoreOfScalar(Data.Reductions, RedLVal); + else { + CGF.EmitNullInitialization(RedLVal.getAddress(), + CGF.getContext().VoidPtrTy); + } enum { NoSchedule = 0, Grainsize = 1, NumTasks = 2 }; llvm::Value *TaskArgs[] = { - UpLoc, ThreadID, Result.NewTask, IfVal, LBLVal.getPointer(), - UBLVal.getPointer(), CGF.EmitLoadOfScalar(StLVal, SourceLocation()), - llvm::ConstantInt::getSigned(CGF.IntTy, Data.Nogroup ? 
1 : 0), + UpLoc, + ThreadID, + Result.NewTask, + IfVal, + LBLVal.getPointer(), + UBLVal.getPointer(), + CGF.EmitLoadOfScalar(StLVal, SourceLocation()), + llvm::ConstantInt::getNullValue( + CGF.IntTy), // Always 0 because taskgroup emitted by the compiler llvm::ConstantInt::getSigned( CGF.IntTy, Data.Schedule.getPointer() ? Data.Schedule.getInt() ? NumTasks : Grainsize @@ -4134,10 +4647,9 @@ void CGOpenMPRuntime::emitTaskLoopCall(CodeGenFunction &CGF, SourceLocation Loc, ? CGF.Builder.CreateIntCast(Data.Schedule.getPointer(), CGF.Int64Ty, /*isSigned=*/false) : llvm::ConstantInt::get(CGF.Int64Ty, /*V=*/0), - Result.TaskDupFn - ? CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(Result.TaskDupFn, - CGF.VoidPtrTy) - : llvm::ConstantPointerNull::get(CGF.VoidPtrTy)}; + Result.TaskDupFn ? CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( + Result.TaskDupFn, CGF.VoidPtrTy) + : llvm::ConstantPointerNull::get(CGF.VoidPtrTy)}; CGF.EmitRuntimeCall(createRuntimeFunction(OMPRTL__kmpc_taskloop), TaskArgs); } @@ -4241,20 +4753,16 @@ static void emitReductionCombiner(CodeGenFunction &CGF, CGF.EmitIgnoredExpr(ReductionOp); } -static llvm::Value *emitReductionFunction(CodeGenModule &CGM, - llvm::Type *ArgsType, - ArrayRef<const Expr *> Privates, - ArrayRef<const Expr *> LHSExprs, - ArrayRef<const Expr *> RHSExprs, - ArrayRef<const Expr *> ReductionOps) { +llvm::Value *CGOpenMPRuntime::emitReductionFunction( + CodeGenModule &CGM, llvm::Type *ArgsType, ArrayRef<const Expr *> Privates, + ArrayRef<const Expr *> LHSExprs, ArrayRef<const Expr *> RHSExprs, + ArrayRef<const Expr *> ReductionOps) { auto &C = CGM.getContext(); // void reduction_func(void *LHSArg, void *RHSArg); FunctionArgList Args; - ImplicitParamDecl LHSArg(C, /*DC=*/nullptr, SourceLocation(), /*Id=*/nullptr, - C.VoidPtrTy); - ImplicitParamDecl RHSArg(C, /*DC=*/nullptr, SourceLocation(), /*Id=*/nullptr, - C.VoidPtrTy); + ImplicitParamDecl LHSArg(C, C.VoidPtrTy, ImplicitParamDecl::Other); + ImplicitParamDecl RHSArg(C, C.VoidPtrTy, ImplicitParamDecl::Other); Args.push_back(&LHSArg); Args.push_back(&RHSArg); auto &CGFI = CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args); @@ -4329,11 +4837,11 @@ static llvm::Value *emitReductionFunction(CodeGenModule &CGM, return Fn; } -static void emitSingleReductionCombiner(CodeGenFunction &CGF, - const Expr *ReductionOp, - const Expr *PrivateRef, - const DeclRefExpr *LHS, - const DeclRefExpr *RHS) { +void CGOpenMPRuntime::emitSingleReductionCombiner(CodeGenFunction &CGF, + const Expr *ReductionOp, + const Expr *PrivateRef, + const DeclRefExpr *LHS, + const DeclRefExpr *RHS) { if (PrivateRef->getType()->isArrayType()) { // Emit reduction for array section. 
auto *LHSVar = cast<VarDecl>(LHS->getDecl()); @@ -4353,9 +4861,13 @@ void CGOpenMPRuntime::emitReduction(CodeGenFunction &CGF, SourceLocation Loc, ArrayRef<const Expr *> LHSExprs, ArrayRef<const Expr *> RHSExprs, ArrayRef<const Expr *> ReductionOps, - bool WithNowait, bool SimpleReduction) { + ReductionOptionsTy Options) { if (!CGF.HaveInsertPoint()) return; + + bool WithNowait = Options.WithNowait; + bool SimpleReduction = Options.SimpleReduction; + // Next code should be emitted for reduction: // // static kmp_critical_name lock = { 0 }; @@ -4497,12 +5009,13 @@ void CGOpenMPRuntime::emitReduction(CodeGenFunction &CGF, SourceLocation Loc, }; auto &&CodeGen = [&Privates, &LHSExprs, &RHSExprs, &ReductionOps]( CodeGenFunction &CGF, PrePostActionTy &Action) { + auto &RT = CGF.CGM.getOpenMPRuntime(); auto IPriv = Privates.begin(); auto ILHS = LHSExprs.begin(); auto IRHS = RHSExprs.begin(); for (auto *E : ReductionOps) { - emitSingleReductionCombiner(CGF, E, *IPriv, cast<DeclRefExpr>(*ILHS), - cast<DeclRefExpr>(*IRHS)); + RT.emitSingleReductionCombiner(CGF, E, *IPriv, cast<DeclRefExpr>(*ILHS), + cast<DeclRefExpr>(*IRHS)); ++IPriv; ++ILHS; ++IRHS; @@ -4562,7 +5075,7 @@ void CGOpenMPRuntime::emitReduction(CodeGenFunction &CGF, SourceLocation Loc, } if (XExpr) { auto *VD = cast<VarDecl>(cast<DeclRefExpr>(*ILHS)->getDecl()); - auto &&AtomicRedGen = [BO, VD, IPriv, + auto &&AtomicRedGen = [BO, VD, Loc](CodeGenFunction &CGF, const Expr *XExpr, const Expr *EExpr, const Expr *UpExpr) { LValue X = CGF.EmitLValue(XExpr); @@ -4572,7 +5085,7 @@ void CGOpenMPRuntime::emitReduction(CodeGenFunction &CGF, SourceLocation Loc, CGF.EmitOMPAtomicSimpleUpdateExpr( X, E, BO, /*IsXLHSInRHSPart=*/true, llvm::AtomicOrdering::Monotonic, Loc, - [&CGF, UpExpr, VD, IPriv, Loc](RValue XRValue) { + [&CGF, UpExpr, VD, Loc](RValue XRValue) { CodeGenFunction::OMPPrivateScope PrivateScope(CGF); PrivateScope.addPrivate( VD, [&CGF, VD, XRValue, Loc]() -> Address { @@ -4640,6 +5153,353 @@ void CGOpenMPRuntime::emitReduction(CodeGenFunction &CGF, SourceLocation Loc, CGF.EmitBlock(DefaultBB, /*IsFinished=*/true); } +/// Generates unique name for artificial threadprivate variables. +/// Format is: <Prefix> "." <Loc_raw_encoding> "_" <N> +static std::string generateUniqueName(StringRef Prefix, SourceLocation Loc, + unsigned N) { + SmallString<256> Buffer; + llvm::raw_svector_ostream Out(Buffer); + Out << Prefix << "." 
<< Loc.getRawEncoding() << "_" << N; + return Out.str(); +} + +/// Emits reduction initializer function: +/// \code +/// void @.red_init(void* %arg) { +/// %0 = bitcast void* %arg to <type>* +/// store <type> <init>, <type>* %0 +/// ret void +/// } +/// \endcode +static llvm::Value *emitReduceInitFunction(CodeGenModule &CGM, + SourceLocation Loc, + ReductionCodeGen &RCG, unsigned N) { + auto &C = CGM.getContext(); + FunctionArgList Args; + ImplicitParamDecl Param(C, C.VoidPtrTy, ImplicitParamDecl::Other); + Args.emplace_back(&Param); + auto &FnInfo = + CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args); + auto *FnTy = CGM.getTypes().GetFunctionType(FnInfo); + auto *Fn = llvm::Function::Create(FnTy, llvm::GlobalValue::InternalLinkage, + ".red_init.", &CGM.getModule()); + CGM.SetInternalFunctionAttributes(/*D=*/nullptr, Fn, FnInfo); + CodeGenFunction CGF(CGM); + CGF.disableDebugInfo(); + CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, FnInfo, Args); + Address PrivateAddr = CGF.EmitLoadOfPointer( + CGF.GetAddrOfLocalVar(&Param), + C.getPointerType(C.VoidPtrTy).castAs<PointerType>()); + llvm::Value *Size = nullptr; + // If the size of the reduction item is non-constant, load it from global + // threadprivate variable. + if (RCG.getSizes(N).second) { + Address SizeAddr = CGM.getOpenMPRuntime().getAddrOfArtificialThreadPrivate( + CGF, CGM.getContext().getSizeType(), + generateUniqueName("reduction_size", Loc, N)); + Size = + CGF.EmitLoadOfScalar(SizeAddr, /*Volatile=*/false, + CGM.getContext().getSizeType(), SourceLocation()); + } + RCG.emitAggregateType(CGF, N, Size); + LValue SharedLVal; + // If initializer uses initializer from declare reduction construct, emit a + // pointer to the address of the original reduction item (reuired by reduction + // initializer) + if (RCG.usesReductionInitializer(N)) { + Address SharedAddr = + CGM.getOpenMPRuntime().getAddrOfArtificialThreadPrivate( + CGF, CGM.getContext().VoidPtrTy, + generateUniqueName("reduction", Loc, N)); + SharedLVal = CGF.MakeAddrLValue(SharedAddr, CGM.getContext().VoidPtrTy); + } else { + SharedLVal = CGF.MakeNaturalAlignAddrLValue( + llvm::ConstantPointerNull::get(CGM.VoidPtrTy), + CGM.getContext().VoidPtrTy); + } + // Emit the initializer: + // %0 = bitcast void* %arg to <type>* + // store <type> <init>, <type>* %0 + RCG.emitInitialization(CGF, N, PrivateAddr, SharedLVal, + [](CodeGenFunction &) { return false; }); + CGF.FinishFunction(); + return Fn; +} + +/// Emits reduction combiner function: +/// \code +/// void @.red_comb(void* %arg0, void* %arg1) { +/// %lhs = bitcast void* %arg0 to <type>* +/// %rhs = bitcast void* %arg1 to <type>* +/// %2 = <ReductionOp>(<type>* %lhs, <type>* %rhs) +/// store <type> %2, <type>* %lhs +/// ret void +/// } +/// \endcode +static llvm::Value *emitReduceCombFunction(CodeGenModule &CGM, + SourceLocation Loc, + ReductionCodeGen &RCG, unsigned N, + const Expr *ReductionOp, + const Expr *LHS, const Expr *RHS, + const Expr *PrivateRef) { + auto &C = CGM.getContext(); + auto *LHSVD = cast<VarDecl>(cast<DeclRefExpr>(LHS)->getDecl()); + auto *RHSVD = cast<VarDecl>(cast<DeclRefExpr>(RHS)->getDecl()); + FunctionArgList Args; + ImplicitParamDecl ParamInOut(C, C.VoidPtrTy, ImplicitParamDecl::Other); + ImplicitParamDecl ParamIn(C, C.VoidPtrTy, ImplicitParamDecl::Other); + Args.emplace_back(&ParamInOut); + Args.emplace_back(&ParamIn); + auto &FnInfo = + CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args); + auto *FnTy = CGM.getTypes().GetFunctionType(FnInfo); + auto *Fn = 
llvm::Function::Create(FnTy, llvm::GlobalValue::InternalLinkage, + ".red_comb.", &CGM.getModule()); + CGM.SetInternalFunctionAttributes(/*D=*/nullptr, Fn, FnInfo); + CodeGenFunction CGF(CGM); + CGF.disableDebugInfo(); + CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, FnInfo, Args); + llvm::Value *Size = nullptr; + // If the size of the reduction item is non-constant, load it from global + // threadprivate variable. + if (RCG.getSizes(N).second) { + Address SizeAddr = CGM.getOpenMPRuntime().getAddrOfArtificialThreadPrivate( + CGF, CGM.getContext().getSizeType(), + generateUniqueName("reduction_size", Loc, N)); + Size = + CGF.EmitLoadOfScalar(SizeAddr, /*Volatile=*/false, + CGM.getContext().getSizeType(), SourceLocation()); + } + RCG.emitAggregateType(CGF, N, Size); + // Remap lhs and rhs variables to the addresses of the function arguments. + // %lhs = bitcast void* %arg0 to <type>* + // %rhs = bitcast void* %arg1 to <type>* + CodeGenFunction::OMPPrivateScope PrivateScope(CGF); + PrivateScope.addPrivate(LHSVD, [&C, &CGF, &ParamInOut, LHSVD]() -> Address { + // Pull out the pointer to the variable. + Address PtrAddr = CGF.EmitLoadOfPointer( + CGF.GetAddrOfLocalVar(&ParamInOut), + C.getPointerType(C.VoidPtrTy).castAs<PointerType>()); + return CGF.Builder.CreateElementBitCast( + PtrAddr, CGF.ConvertTypeForMem(LHSVD->getType())); + }); + PrivateScope.addPrivate(RHSVD, [&C, &CGF, &ParamIn, RHSVD]() -> Address { + // Pull out the pointer to the variable. + Address PtrAddr = CGF.EmitLoadOfPointer( + CGF.GetAddrOfLocalVar(&ParamIn), + C.getPointerType(C.VoidPtrTy).castAs<PointerType>()); + return CGF.Builder.CreateElementBitCast( + PtrAddr, CGF.ConvertTypeForMem(RHSVD->getType())); + }); + PrivateScope.Privatize(); + // Emit the combiner body: + // %2 = <ReductionOp>(<type> *%lhs, <type> *%rhs) + // store <type> %2, <type>* %lhs + CGM.getOpenMPRuntime().emitSingleReductionCombiner( + CGF, ReductionOp, PrivateRef, cast<DeclRefExpr>(LHS), + cast<DeclRefExpr>(RHS)); + CGF.FinishFunction(); + return Fn; +} + +/// Emits reduction finalizer function: +/// \code +/// void @.red_fini(void* %arg) { +/// %0 = bitcast void* %arg to <type>* +/// <destroy>(<type>* %0) +/// ret void +/// } +/// \endcode +static llvm::Value *emitReduceFiniFunction(CodeGenModule &CGM, + SourceLocation Loc, + ReductionCodeGen &RCG, unsigned N) { + if (!RCG.needCleanups(N)) + return nullptr; + auto &C = CGM.getContext(); + FunctionArgList Args; + ImplicitParamDecl Param(C, C.VoidPtrTy, ImplicitParamDecl::Other); + Args.emplace_back(&Param); + auto &FnInfo = + CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args); + auto *FnTy = CGM.getTypes().GetFunctionType(FnInfo); + auto *Fn = llvm::Function::Create(FnTy, llvm::GlobalValue::InternalLinkage, + ".red_fini.", &CGM.getModule()); + CGM.SetInternalFunctionAttributes(/*D=*/nullptr, Fn, FnInfo); + CodeGenFunction CGF(CGM); + CGF.disableDebugInfo(); + CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, FnInfo, Args); + Address PrivateAddr = CGF.EmitLoadOfPointer( + CGF.GetAddrOfLocalVar(&Param), + C.getPointerType(C.VoidPtrTy).castAs<PointerType>()); + llvm::Value *Size = nullptr; + // If the size of the reduction item is non-constant, load it from global + // threadprivate variable. 
+ if (RCG.getSizes(N).second) { + Address SizeAddr = CGM.getOpenMPRuntime().getAddrOfArtificialThreadPrivate( + CGF, CGM.getContext().getSizeType(), + generateUniqueName("reduction_size", Loc, N)); + Size = + CGF.EmitLoadOfScalar(SizeAddr, /*Volatile=*/false, + CGM.getContext().getSizeType(), SourceLocation()); + } + RCG.emitAggregateType(CGF, N, Size); + // Emit the finalizer body: + // <destroy>(<type>* %0) + RCG.emitCleanups(CGF, N, PrivateAddr); + CGF.FinishFunction(); + return Fn; +} + +llvm::Value *CGOpenMPRuntime::emitTaskReductionInit( + CodeGenFunction &CGF, SourceLocation Loc, ArrayRef<const Expr *> LHSExprs, + ArrayRef<const Expr *> RHSExprs, const OMPTaskDataTy &Data) { + if (!CGF.HaveInsertPoint() || Data.ReductionVars.empty()) + return nullptr; + + // Build typedef struct: + // kmp_task_red_input { + // void *reduce_shar; // shared reduction item + // size_t reduce_size; // size of data item + // void *reduce_init; // data initialization routine + // void *reduce_fini; // data finalization routine + // void *reduce_comb; // data combiner routine + // kmp_task_red_flags_t flags; // flags for additional info from compiler + // } kmp_task_red_input_t; + ASTContext &C = CGM.getContext(); + auto *RD = C.buildImplicitRecord("kmp_task_red_input_t"); + RD->startDefinition(); + const FieldDecl *SharedFD = addFieldToRecordDecl(C, RD, C.VoidPtrTy); + const FieldDecl *SizeFD = addFieldToRecordDecl(C, RD, C.getSizeType()); + const FieldDecl *InitFD = addFieldToRecordDecl(C, RD, C.VoidPtrTy); + const FieldDecl *FiniFD = addFieldToRecordDecl(C, RD, C.VoidPtrTy); + const FieldDecl *CombFD = addFieldToRecordDecl(C, RD, C.VoidPtrTy); + const FieldDecl *FlagsFD = addFieldToRecordDecl( + C, RD, C.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/false)); + RD->completeDefinition(); + QualType RDType = C.getRecordType(RD); + unsigned Size = Data.ReductionVars.size(); + llvm::APInt ArraySize(/*numBits=*/64, Size); + QualType ArrayRDType = C.getConstantArrayType( + RDType, ArraySize, ArrayType::Normal, /*IndexTypeQuals=*/0); + // kmp_task_red_input_t .rd_input.[Size]; + Address TaskRedInput = CGF.CreateMemTemp(ArrayRDType, ".rd_input."); + ReductionCodeGen RCG(Data.ReductionVars, Data.ReductionCopies, + Data.ReductionOps); + for (unsigned Cnt = 0; Cnt < Size; ++Cnt) { + // kmp_task_red_input_t &ElemLVal = .rd_input.[Cnt]; + llvm::Value *Idxs[] = {llvm::ConstantInt::get(CGM.SizeTy, /*V=*/0), + llvm::ConstantInt::get(CGM.SizeTy, Cnt)}; + llvm::Value *GEP = CGF.EmitCheckedInBoundsGEP( + TaskRedInput.getPointer(), Idxs, + /*SignedIndices=*/false, /*IsSubtraction=*/false, Loc, + ".rd_input.gep."); + LValue ElemLVal = CGF.MakeNaturalAlignAddrLValue(GEP, RDType); + // ElemLVal.reduce_shar = &Shareds[Cnt]; + LValue SharedLVal = CGF.EmitLValueForField(ElemLVal, SharedFD); + RCG.emitSharedLValue(CGF, Cnt); + llvm::Value *CastedShared = + CGF.EmitCastToVoidPtr(RCG.getSharedLValue(Cnt).getPointer()); + CGF.EmitStoreOfScalar(CastedShared, SharedLVal); + RCG.emitAggregateType(CGF, Cnt); + llvm::Value *SizeValInChars; + llvm::Value *SizeVal; + std::tie(SizeValInChars, SizeVal) = RCG.getSizes(Cnt); + // We use delayed creation/initialization for VLAs, array sections and + // custom reduction initializations. It is required because runtime does not + // provide the way to pass the sizes of VLAs/array sections to + // initializer/combiner/finalizer functions and does not pass the pointer to + // original reduction item to the initializer. 
Instead threadprivate global + // variables are used to store these values and use them in the functions. + bool DelayedCreation = !!SizeVal; + SizeValInChars = CGF.Builder.CreateIntCast(SizeValInChars, CGM.SizeTy, + /*isSigned=*/false); + LValue SizeLVal = CGF.EmitLValueForField(ElemLVal, SizeFD); + CGF.EmitStoreOfScalar(SizeValInChars, SizeLVal); + // ElemLVal.reduce_init = init; + LValue InitLVal = CGF.EmitLValueForField(ElemLVal, InitFD); + llvm::Value *InitAddr = + CGF.EmitCastToVoidPtr(emitReduceInitFunction(CGM, Loc, RCG, Cnt)); + CGF.EmitStoreOfScalar(InitAddr, InitLVal); + DelayedCreation = DelayedCreation || RCG.usesReductionInitializer(Cnt); + // ElemLVal.reduce_fini = fini; + LValue FiniLVal = CGF.EmitLValueForField(ElemLVal, FiniFD); + llvm::Value *Fini = emitReduceFiniFunction(CGM, Loc, RCG, Cnt); + llvm::Value *FiniAddr = Fini + ? CGF.EmitCastToVoidPtr(Fini) + : llvm::ConstantPointerNull::get(CGM.VoidPtrTy); + CGF.EmitStoreOfScalar(FiniAddr, FiniLVal); + // ElemLVal.reduce_comb = comb; + LValue CombLVal = CGF.EmitLValueForField(ElemLVal, CombFD); + llvm::Value *CombAddr = CGF.EmitCastToVoidPtr(emitReduceCombFunction( + CGM, Loc, RCG, Cnt, Data.ReductionOps[Cnt], LHSExprs[Cnt], + RHSExprs[Cnt], Data.ReductionCopies[Cnt])); + CGF.EmitStoreOfScalar(CombAddr, CombLVal); + // ElemLVal.flags = 0; + LValue FlagsLVal = CGF.EmitLValueForField(ElemLVal, FlagsFD); + if (DelayedCreation) { + CGF.EmitStoreOfScalar( + llvm::ConstantInt::get(CGM.Int32Ty, /*V=*/1, /*IsSigned=*/true), + FlagsLVal); + } else + CGF.EmitNullInitialization(FlagsLVal.getAddress(), FlagsLVal.getType()); + } + // Build call void *__kmpc_task_reduction_init(int gtid, int num_data, void + // *data); + llvm::Value *Args[] = { + CGF.Builder.CreateIntCast(getThreadID(CGF, Loc), CGM.IntTy, + /*isSigned=*/true), + llvm::ConstantInt::get(CGM.IntTy, Size, /*isSigned=*/true), + CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(TaskRedInput.getPointer(), + CGM.VoidPtrTy)}; + return CGF.EmitRuntimeCall( + createRuntimeFunction(OMPRTL__kmpc_task_reduction_init), Args); +} + +void CGOpenMPRuntime::emitTaskReductionFixups(CodeGenFunction &CGF, + SourceLocation Loc, + ReductionCodeGen &RCG, + unsigned N) { + auto Sizes = RCG.getSizes(N); + // Emit threadprivate global variable if the type is non-constant + // (Sizes.second = nullptr). + if (Sizes.second) { + llvm::Value *SizeVal = CGF.Builder.CreateIntCast(Sizes.second, CGM.SizeTy, + /*isSigned=*/false); + Address SizeAddr = getAddrOfArtificialThreadPrivate( + CGF, CGM.getContext().getSizeType(), + generateUniqueName("reduction_size", Loc, N)); + CGF.Builder.CreateStore(SizeVal, SizeAddr, /*IsVolatile=*/false); + } + // Store address of the original reduction item if custom initializer is used. 
+ if (RCG.usesReductionInitializer(N)) { + Address SharedAddr = getAddrOfArtificialThreadPrivate( + CGF, CGM.getContext().VoidPtrTy, + generateUniqueName("reduction", Loc, N)); + CGF.Builder.CreateStore( + CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( + RCG.getSharedLValue(N).getPointer(), CGM.VoidPtrTy), + SharedAddr, /*IsVolatile=*/false); + } +} + +Address CGOpenMPRuntime::getTaskReductionItem(CodeGenFunction &CGF, + SourceLocation Loc, + llvm::Value *ReductionsPtr, + LValue SharedLVal) { + // Build call void *__kmpc_task_reduction_get_th_data(int gtid, void *tg, void + // *d); + llvm::Value *Args[] = { + CGF.Builder.CreateIntCast(getThreadID(CGF, Loc), CGM.IntTy, + /*isSigned=*/true), + ReductionsPtr, + CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(SharedLVal.getPointer(), + CGM.VoidPtrTy)}; + return Address( + CGF.EmitRuntimeCall( + createRuntimeFunction(OMPRTL__kmpc_task_reduction_get_th_data), Args), + SharedLVal.getAlignment()); +} + void CGOpenMPRuntime::emitTaskwaitCall(CodeGenFunction &CGF, SourceLocation Loc) { if (!CGF.HaveInsertPoint()) @@ -4874,25 +5734,45 @@ static const Stmt *ignoreCompoundStmts(const Stmt *Body) { return Body; } -/// \brief Emit the num_teams clause of an enclosed teams directive at the -/// target region scope. If there is no teams directive associated with the -/// target directive, or if there is no num_teams clause associated with the -/// enclosed teams directive, return nullptr. +/// Emit the number of teams for a target directive. Inspect the num_teams +/// clause associated with a teams construct combined or closely nested +/// with the target directive. +/// +/// Emit a team of size one for directives such as 'target parallel' that +/// have no associated teams construct. +/// +/// Otherwise, return nullptr. static llvm::Value * -emitNumTeamsClauseForTargetDirective(CGOpenMPRuntime &OMPRuntime, - CodeGenFunction &CGF, - const OMPExecutableDirective &D) { +emitNumTeamsForTargetDirective(CGOpenMPRuntime &OMPRuntime, + CodeGenFunction &CGF, + const OMPExecutableDirective &D) { assert(!CGF.getLangOpts().OpenMPIsDevice && "Clauses associated with the " "teams directive expected to be " "emitted only for the host!"); - // FIXME: For the moment we do not support combined directives with target and - // teams, so we do not expect to get any num_teams clause in the provided - // directive. Once we support that, this assertion can be replaced by the - // actual emission of the clause expression. - assert(D.getSingleClause<OMPNumTeamsClause>() == nullptr && - "Not expecting clause in directive."); + auto &Bld = CGF.Builder; + + // If the target directive is combined with a teams directive: + // Return the value in the num_teams clause, if any. + // Otherwise, return 0 to denote the runtime default. + if (isOpenMPTeamsDirective(D.getDirectiveKind())) { + if (const auto *NumTeamsClause = D.getSingleClause<OMPNumTeamsClause>()) { + CodeGenFunction::RunCleanupsScope NumTeamsScope(CGF); + auto NumTeams = CGF.EmitScalarExpr(NumTeamsClause->getNumTeams(), + /*IgnoreResultAssign*/ true); + return Bld.CreateIntCast(NumTeams, CGF.Int32Ty, + /*IsSigned=*/true); + } + + // The default value is 0. + return Bld.getInt32(0); + } + + // If the target directive is combined with a parallel directive but not a + // teams directive, start one team. + if (isOpenMPParallelDirective(D.getDirectiveKind())) + return Bld.getInt32(1); // If the current target region has a teams region enclosed, we need to get // the number of teams to pass to the runtime function call. 
@@ -4910,38 +5790,92 @@ emitNumTeamsClauseForTargetDirective(CGOpenMPRuntime &OMPRuntime,
     CGOpenMPInnerExprInfo CGInfo(CGF, CS);
     CodeGenFunction::CGCapturedStmtRAII CapInfoRAII(CGF, &CGInfo);
     llvm::Value *NumTeams = CGF.EmitScalarExpr(NTE->getNumTeams());
-    return CGF.Builder.CreateIntCast(NumTeams, CGF.Int32Ty,
-                                     /*IsSigned=*/true);
+    return Bld.CreateIntCast(NumTeams, CGF.Int32Ty,
+                             /*IsSigned=*/true);
   }

   // If we have an enclosed teams directive but no num_teams clause we use
   // the default value 0.
-  return CGF.Builder.getInt32(0);
+  return Bld.getInt32(0);

   // No teams associated with the directive.
   return nullptr;
 }

-/// \brief Emit the thread_limit clause of an enclosed teams directive at the
-/// target region scope. If there is no teams directive associated with the
-/// target directive, or if there is no thread_limit clause associated with the
-/// enclosed teams directive, return nullptr.
+/// Emit the number of threads for a target directive. Inspect the
+/// thread_limit clause associated with a teams construct combined or closely
+/// nested with the target directive.
+///
+/// Emit the num_threads clause for directives such as 'target parallel' that
+/// have no associated teams construct.
+///
+/// Otherwise, return nullptr.
 static llvm::Value *
-emitThreadLimitClauseForTargetDirective(CGOpenMPRuntime &OMPRuntime,
-                                        CodeGenFunction &CGF,
-                                        const OMPExecutableDirective &D) {
+emitNumThreadsForTargetDirective(CGOpenMPRuntime &OMPRuntime,
+                                 CodeGenFunction &CGF,
+                                 const OMPExecutableDirective &D) {
   assert(!CGF.getLangOpts().OpenMPIsDevice && "Clauses associated with the "
                                               "teams directive expected to be "
                                               "emitted only for the host!");
-  // FIXME: For the moment we do not support combined directives with target and
-  // teams, so we do not expect to get any thread_limit clause in the provided
-  // directive. Once we support that, this assertion can be replaced by the
-  // actual emission of the clause expression.
-  assert(D.getSingleClause<OMPThreadLimitClause>() == nullptr &&
-         "Not expecting clause in directive.");
+  auto &Bld = CGF.Builder;
+
+  //
+  // If the target directive is combined with a teams directive:
+  //   Return the value in the thread_limit clause, if any.
+  //
+  // If the target directive is combined with a parallel directive:
+  //   Return the value in the num_threads clause, if any.
+  //
+  // If both clauses are set, select the minimum of the two.
+  //
+  // If neither the teams nor the parallel combined directive sets the number
+  // of threads in a team, return 0 to denote the runtime default.
+  //
+  // If this is neither a teams nor a parallel directive, return nullptr.
+
+  if (isOpenMPTeamsDirective(D.getDirectiveKind()) ||
+      isOpenMPParallelDirective(D.getDirectiveKind())) {
+    llvm::Value *DefaultThreadLimitVal = Bld.getInt32(0);
+    llvm::Value *NumThreadsVal = nullptr;
+    llvm::Value *ThreadLimitVal = nullptr;
+
+    if (const auto *ThreadLimitClause =
+            D.getSingleClause<OMPThreadLimitClause>()) {
+      CodeGenFunction::RunCleanupsScope ThreadLimitScope(CGF);
+      auto ThreadLimit = CGF.EmitScalarExpr(ThreadLimitClause->getThreadLimit(),
+                                            /*IgnoreResultAssign*/ true);
+      ThreadLimitVal = Bld.CreateIntCast(ThreadLimit, CGF.Int32Ty,
+                                         /*IsSigned=*/true);
+    }
+
+    if (const auto *NumThreadsClause =
+            D.getSingleClause<OMPNumThreadsClause>()) {
+      CodeGenFunction::RunCleanupsScope NumThreadsScope(CGF);
+      llvm::Value *NumThreads =
+          CGF.EmitScalarExpr(NumThreadsClause->getNumThreads(),
+                             /*IgnoreResultAssign*/ true);
+      NumThreadsVal =
+          Bld.CreateIntCast(NumThreads, CGF.Int32Ty, /*IsSigned=*/true);
+    }
+
+    // Select the lesser of thread_limit and num_threads.
+    if (NumThreadsVal)
+      ThreadLimitVal = ThreadLimitVal
+                           ? Bld.CreateSelect(Bld.CreateICmpSLT(NumThreadsVal,
+                                                                ThreadLimitVal),
+                                              NumThreadsVal, ThreadLimitVal)
+                           : NumThreadsVal;
+
+    // Set default value passed to the runtime if either teams or a target
+    // parallel type directive is found but no clause is specified.
+    if (!ThreadLimitVal)
+      ThreadLimitVal = DefaultThreadLimitVal;
+
+    return ThreadLimitVal;
+  }

   // If the current target region has a teams region enclosed, we need to get
   // the thread limit to pass to the runtime function call. This is done
@@ -5494,7 +6428,7 @@ public:
     // We have to process the component lists that relate with the same
     // declaration in a single chunk so that we can generate the map flags
     // correctly. Therefore, we organize all lists in a map.
-    llvm::DenseMap<const ValueDecl *, SmallVector<MapInfo, 8>> Info;
+    llvm::MapVector<const ValueDecl *, SmallVector<MapInfo, 8>> Info;

     // Helper function to fill the information map for the different supported
     // clauses.
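To make the team/thread clause selection implemented above concrete, the sketch below lists a few combined target constructs together with the values the host side would pass to the offload runtime under that logic. The function, array, and clause values are invented for illustration; the final counts remain up to the device runtime.

void scale(float *a, int n) {
  // Teams and parallel are both combined with target: NumTeams = 8 from
  // num_teams, and the thread value is min(thread_limit, num_threads),
  // i.e. min(64, 32) = 32.
#pragma omp target teams distribute parallel for num_teams(8) \
    thread_limit(64) num_threads(32) map(tofrom : a[0 : n])
  for (int i = 0; i < n; ++i)
    a[i] *= 2.0f;

  // Only parallel is combined with target: a single team is started
  // (NumTeams = 1) and num_threads supplies the thread value (32).
#pragma omp target parallel for num_threads(32) map(tofrom : a[0 : n])
  for (int i = 0; i < n; ++i)
    a[i] += 1.0f;

  // No clauses: both values are emitted as 0, leaving the choice to the
  // offload runtime.
#pragma omp target teams distribute parallel for map(tofrom : a[0 : n])
  for (int i = 0; i < n; ++i)
    a[i] -= 3.0f;
}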
@@ -5818,16 +6752,11 @@ emitOffloadingArrays(CodeGenFunction &CGF,
   for (unsigned i = 0; i < Info.NumberOfPtrs; ++i) {
     llvm::Value *BPVal = *BasePointers[i];
-    if (BPVal->getType()->isPointerTy())
-      BPVal = CGF.Builder.CreateBitCast(BPVal, CGM.VoidPtrTy);
-    else {
-      assert(BPVal->getType()->isIntegerTy() &&
-             "If not a pointer, the value type must be an integer.");
-      BPVal = CGF.Builder.CreateIntToPtr(BPVal, CGM.VoidPtrTy);
-    }
     llvm::Value *BP = CGF.Builder.CreateConstInBoundsGEP2_32(
         llvm::ArrayType::get(CGM.VoidPtrTy, Info.NumberOfPtrs),
         Info.BasePointersArray, 0, i);
+    BP = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
+        BP, BPVal->getType()->getPointerTo(/*AddrSpace=*/0));
     Address BPAddr(BP, Ctx.getTypeAlignInChars(Ctx.VoidPtrTy));
     CGF.Builder.CreateStore(BPVal, BPAddr);
@@ -5836,16 +6765,11 @@ emitOffloadingArrays(CodeGenFunction &CGF,
       Info.CaptureDeviceAddrMap.insert(std::make_pair(DevVD, BPAddr));

     llvm::Value *PVal = Pointers[i];
-    if (PVal->getType()->isPointerTy())
-      PVal = CGF.Builder.CreateBitCast(PVal, CGM.VoidPtrTy);
-    else {
-      assert(PVal->getType()->isIntegerTy() &&
-             "If not a pointer, the value type must be an integer.");
-      PVal = CGF.Builder.CreateIntToPtr(PVal, CGM.VoidPtrTy);
-    }
     llvm::Value *P = CGF.Builder.CreateConstInBoundsGEP2_32(
         llvm::ArrayType::get(CGM.VoidPtrTy, Info.NumberOfPtrs),
         Info.PointersArray, 0, i);
+    P = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
+        P, PVal->getType()->getPointerTo(/*AddrSpace=*/0));
     Address PAddr(P, Ctx.getTypeAlignInChars(Ctx.VoidPtrTy));
     CGF.Builder.CreateStore(PVal, PAddr);
@@ -5984,8 +6908,8 @@ void CGOpenMPRuntime::emitTargetCall(CodeGenFunction &CGF,
                                          OffloadError);

   // Fill up the pointer arrays and transfer execution to the device.
-  auto &&ThenGen = [&Ctx, &BasePointers, &Pointers, &Sizes, &MapTypes, Device,
-                    OutlinedFnID, OffloadError, OffloadErrorQType,
+  auto &&ThenGen = [&BasePointers, &Pointers, &Sizes, &MapTypes, Device,
+                    OutlinedFnID, OffloadError,
                     &D](CodeGenFunction &CGF, PrePostActionTy &) {
     auto &RT = CGF.CGM.getOpenMPRuntime();
     // Emit the offloading arrays.
@@ -6021,24 +6945,50 @@ void CGOpenMPRuntime::emitTargetCall(CodeGenFunction &CGF,
     // Return value of the runtime offloading call.
     llvm::Value *Return;

-    auto *NumTeams = emitNumTeamsClauseForTargetDirective(RT, CGF, D);
-    auto *ThreadLimit = emitThreadLimitClauseForTargetDirective(RT, CGF, D);
+    auto *NumTeams = emitNumTeamsForTargetDirective(RT, CGF, D);
+    auto *NumThreads = emitNumThreadsForTargetDirective(RT, CGF, D);

-    // If we have NumTeams defined this means that we have an enclosed teams
-    // region. Therefore we also expect to have ThreadLimit defined. These two
-    // values should be defined in the presence of a teams directive, regardless
-    // of having any clauses associated. If the user is using teams but no
-    // clauses, these two values will be the default that should be passed to
-    // the runtime library - a 32-bit integer with the value zero.
+    // The target region is an outlined function launched by the runtime
+    // via calls __tgt_target() or __tgt_target_teams().
+    //
+    // __tgt_target() launches a target region with one team and one thread,
+    // executing a serial region. This master thread may in turn launch
+    // more threads within its team upon encountering a parallel region;
+    // however, no additional teams can be launched on the device.
+    //
+    // __tgt_target_teams() launches a target region with one or more teams,
+    // each with one or more threads. This call is required for target
+    // constructs such as:
+    //   'target teams'
+    //   'target' / 'teams'
+    //   'target teams distribute parallel for'
+    //   'target parallel'
+    // and so on.
+    //
+    // Note that on the host and CPU targets, the runtime implementation of
+    // these calls simply calls the outlined function without forking threads.
+    // The outlined functions themselves have runtime calls to
+    // __kmpc_fork_teams() and __kmpc_fork() for this purpose, codegen'd by
+    // the compiler in emitTeamsCall() and emitParallelCall().
+    //
+    // In contrast, on the NVPTX target, the implementation of
+    // __tgt_target_teams() launches a GPU kernel with the requested number
+    // of teams and threads, so no additional calls to the runtime are required.
     if (NumTeams) {
-      assert(ThreadLimit && "Thread limit expression should be available along "
-                            "with number of teams.");
+      // If we have NumTeams defined this means that we have an enclosed teams
+      // region. Therefore we also expect to have NumThreads defined. These two
+      // values should be defined in the presence of a teams directive,
+      // regardless of having any clauses associated. If the user is using teams
+      // but no clauses, these two values will be the default that should be
+      // passed to the runtime library - a 32-bit integer with the value zero.
+      assert(NumThreads && "Thread limit expression should be available along "
+                           "with number of teams.");
       llvm::Value *OffloadingArgs[] = {
           DeviceID,               OutlinedFnID,       PointerNum,
           Info.BasePointersArray, Info.PointersArray, Info.SizesArray,
           Info.MapTypesArray,     NumTeams,
-          ThreadLimit};
+          NumThreads};
       Return = CGF.EmitRuntimeCall(
           RT.createRuntimeFunction(OMPRTL__tgt_target_teams), OffloadingArgs);
     } else {
@@ -6095,17 +7045,18 @@ void CGOpenMPRuntime::scanForTargetRegionsFunctions(const Stmt *S,
   if (!S)
     return;

-  // If we find a OMP target directive, codegen the outline function and
-  // register the result.
-  // FIXME: Add other directives with target when they become supported.
-  bool isTargetDirective = isa<OMPTargetDirective>(S);
+  // Codegen OMP target directives that offload compute to the device.
+  bool requiresDeviceCodegen =
+      isa<OMPExecutableDirective>(S) &&
+      isOpenMPTargetExecutionDirective(
+          cast<OMPExecutableDirective>(S)->getDirectiveKind());

-  if (isTargetDirective) {
-    auto *E = cast<OMPExecutableDirective>(S);
+  if (requiresDeviceCodegen) {
+    auto &E = *cast<OMPExecutableDirective>(S);
     unsigned DeviceID;
     unsigned FileID;
     unsigned Line;
-    getTargetEntryUniqueInfo(CGM.getContext(), E->getLocStart(), DeviceID,
+    getTargetEntryUniqueInfo(CGM.getContext(), E.getLocStart(), DeviceID,
                              FileID, Line);

     // Is this a target region that should not be emitted as an entry point? If
@@ -6114,13 +7065,22 @@ void CGOpenMPRuntime::scanForTargetRegionsFunctions(const Stmt *S,
                                                             ParentName, Line))
       return;

-    llvm::Function *Fn;
-    llvm::Constant *Addr;
-    std::tie(Fn, Addr) =
-        CodeGenFunction::EmitOMPTargetDirectiveOutlinedFunction(
-            CGM, cast<OMPTargetDirective>(*E), ParentName,
-            /*isOffloadEntry=*/true);
-    assert(Fn && Addr && "Target region emission failed.");
+    switch (S->getStmtClass()) {
+    case Stmt::OMPTargetDirectiveClass:
+      CodeGenFunction::EmitOMPTargetDeviceFunction(
+          CGM, ParentName, cast<OMPTargetDirective>(*S));
+      break;
+    case Stmt::OMPTargetParallelDirectiveClass:
+      CodeGenFunction::EmitOMPTargetParallelDeviceFunction(
+          CGM, ParentName, cast<OMPTargetParallelDirective>(*S));
+      break;
+    case Stmt::OMPTargetTeamsDirectiveClass:
+      CodeGenFunction::EmitOMPTargetTeamsDeviceFunction(
+          CGM, ParentName, cast<OMPTargetTeamsDirective>(*S));
+      break;
+    default:
+      llvm_unreachable("Unknown target directive for OpenMP device codegen.");
+    }
     return;
   }
@@ -6182,7 +7142,7 @@ bool CGOpenMPRuntime::emitTargetGlobalVariable(GlobalDecl GD) {
     }
   }

-  // If we are in target mode we do not emit any global (declare target is not
+  // If we are in target mode, we do not emit any global (declare target is not
   // implemented yet). Therefore we signal that GD was processed in this case.
   return true;
 }
@@ -6271,8 +7231,8 @@ void CGOpenMPRuntime::emitTargetDataCalls(
   // Generate the code for the opening of the data environment. Capture all the
   // arguments of the runtime call by reference because they are used in the
   // closing of the region.
-  auto &&BeginThenGen = [&D, &CGF, Device, &Info, &CodeGen, &NoPrivAction](
-      CodeGenFunction &CGF, PrePostActionTy &) {
+  auto &&BeginThenGen = [&D, Device, &Info, &CodeGen](CodeGenFunction &CGF,
+                                                      PrePostActionTy &) {
     // Fill up the arrays with all the mapped variables.
     MappableExprsHandler::MapBaseValuesArrayTy BasePointers;
     MappableExprsHandler::MapValuesArrayTy Pointers;
@@ -6318,8 +7278,7 @@ void CGOpenMPRuntime::emitTargetDataCalls(
   };

   // Generate code for the closing of the data region.
-  auto &&EndThenGen = [&CGF, Device, &Info](CodeGenFunction &CGF,
-                                            PrePostActionTy &) {
+  auto &&EndThenGen = [Device, &Info](CodeGenFunction &CGF, PrePostActionTy &) {
     assert(Info.isValid() && "Invalid data environment closing arguments.");

     llvm::Value *BasePointersArrayArg = nullptr;
@@ -6397,7 +7356,7 @@ void CGOpenMPRuntime::emitTargetDataStandAloneCall(
          "Expecting either target enter, exit data, or update directives.");

   // Generate the code for the opening of the data environment.
-  auto &&ThenGen = [&D, &CGF, Device](CodeGenFunction &CGF, PrePostActionTy &) {
+  auto &&ThenGen = [&D, Device](CodeGenFunction &CGF, PrePostActionTy &) {
     // Fill up the arrays with all the mapped variables.
     MappableExprsHandler::MapBaseValuesArrayTy BasePointers;
     MappableExprsHandler::MapValuesArrayTy Pointers;
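As a closing illustration of the launch-interface comment above (__tgt_target vs. __tgt_target_teams), the sketch below shows which entry point each source form would go through under that scheme. The function and data are invented for the example, and actual device behavior depends on the libomptarget plugin in use.

void offload(double *x, int n) {
  // Plain 'target' with no teams construct: lowered through __tgt_target(),
  // i.e. one initial team with one thread; the nested parallel region can
  // still fork threads inside that team on the device.
#pragma omp target map(tofrom : x[0 : n])
#pragma omp parallel for
  for (int i = 0; i < n; ++i)
    x[i] *= 0.5;

  // Teams combined with target: lowered through __tgt_target_teams(), so
  // several teams can be requested up front.
#pragma omp target teams distribute parallel for num_teams(4) \
    map(tofrom : x[0 : n])
  for (int i = 0; i < n; ++i)
    x[i] += 1.0;
}

On host fallback both entry points simply invoke the outlined function, per the note above about __kmpc_fork_teams() and __kmpc_fork().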