Diffstat (limited to 'contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp')
-rw-r--r-- | contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp | 271
1 file changed, 180 insertions(+), 91 deletions(-)
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
index c98d25e2..c68e586 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
@@ -13,8 +13,12 @@
 //===----------------------------------------------------------------------===//
 
 #include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
 #include "llvm/ADT/Triple.h"
+#include "llvm/Analysis/CallGraphSCCPass.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/IR/Constants.h"
+#include "llvm/IR/InstIterator.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Module.h"
 
@@ -24,31 +28,34 @@ using namespace llvm;
 
 namespace {
-class AMDGPUAnnotateKernelFeatures : public ModulePass {
+class AMDGPUAnnotateKernelFeatures : public CallGraphSCCPass {
 private:
-  static bool hasAddrSpaceCast(const Function &F);
+  const TargetMachine *TM = nullptr;
+  AMDGPUAS AS;
 
-  void addAttrToCallers(Function *Intrin, StringRef AttrName);
-  bool addAttrsForIntrinsics(Module &M, ArrayRef<StringRef[2]>);
+  bool addFeatureAttributes(Function &F);
 
 public:
   static char ID;
 
-  AMDGPUAnnotateKernelFeatures() : ModulePass(ID) { }
-  bool runOnModule(Module &M) override;
+  AMDGPUAnnotateKernelFeatures() : CallGraphSCCPass(ID) {}
+
+  bool doInitialization(CallGraph &CG) override;
+  bool runOnSCC(CallGraphSCC &SCC) override;
 
   StringRef getPassName() const override {
     return "AMDGPU Annotate Kernel Features";
   }
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesAll();
-    ModulePass::getAnalysisUsage(AU);
+    CallGraphSCCPass::getAnalysisUsage(AU);
   }
 
-  static bool visitConstantExpr(const ConstantExpr *CE);
+  static bool visitConstantExpr(const ConstantExpr *CE, AMDGPUAS AS);
   static bool visitConstantExprsRecursively(
     const Constant *EntryC,
-    SmallPtrSet<const Constant *, 8> &ConstantExprVisited);
+    SmallPtrSet<const Constant *, 8> &ConstantExprVisited,
+    AMDGPUAS AS);
 };
 
 }
 
@@ -62,18 +69,20 @@ INITIALIZE_PASS(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE,
 
 // The queue ptr is only needed when casting to flat, not from it.
-static bool castRequiresQueuePtr(unsigned SrcAS) {
-  return SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS;
+static bool castRequiresQueuePtr(unsigned SrcAS, const AMDGPUAS &AS) {
+  return SrcAS == AS.LOCAL_ADDRESS || SrcAS == AS.PRIVATE_ADDRESS;
 }
 
-static bool castRequiresQueuePtr(const AddrSpaceCastInst *ASC) {
-  return castRequiresQueuePtr(ASC->getSrcAddressSpace());
+static bool castRequiresQueuePtr(const AddrSpaceCastInst *ASC,
+                                 const AMDGPUAS &AS) {
+  return castRequiresQueuePtr(ASC->getSrcAddressSpace(), AS);
 }
 
-bool AMDGPUAnnotateKernelFeatures::visitConstantExpr(const ConstantExpr *CE) {
+bool AMDGPUAnnotateKernelFeatures::visitConstantExpr(const ConstantExpr *CE,
+                                                     AMDGPUAS AS) {
   if (CE->getOpcode() == Instruction::AddrSpaceCast) {
     unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace();
-    return castRequiresQueuePtr(SrcAS);
+    return castRequiresQueuePtr(SrcAS, AS);
   }
 
   return false;
@@ -81,7 +90,8 @@ bool AMDGPUAnnotateKernelFeatures::visitConstantExpr(const ConstantExpr *CE) {
 
 bool AMDGPUAnnotateKernelFeatures::visitConstantExprsRecursively(
   const Constant *EntryC,
-  SmallPtrSet<const Constant *, 8> &ConstantExprVisited) {
+  SmallPtrSet<const Constant *, 8> &ConstantExprVisited,
+  AMDGPUAS AS) {
 
   if (!ConstantExprVisited.insert(EntryC).second)
     return false;
@@ -94,7 +104,7 @@ bool AMDGPUAnnotateKernelFeatures::visitConstantExprsRecursively(
 
     // Check this constant expression.
     if (const auto *CE = dyn_cast<ConstantExpr>(C)) {
-      if (visitConstantExpr(CE))
+      if (visitConstantExpr(CE, AS))
         return true;
     }
 
@@ -114,15 +124,130 @@ bool AMDGPUAnnotateKernelFeatures::visitConstantExprsRecursively(
   return false;
 }
 
-// Return true if an addrspacecast is used that requires the queue ptr.
-bool AMDGPUAnnotateKernelFeatures::hasAddrSpaceCast(const Function &F) {
+// We do not need to note the x workitem or workgroup id because they are always
+// initialized.
+//
+// TODO: We should not add the attributes if the known compile time workgroup
+// size is 1 for y/z.
+static StringRef intrinsicToAttrName(Intrinsic::ID ID,
+                                     bool &NonKernelOnly,
+                                     bool &IsQueuePtr) {
+  switch (ID) {
+  case Intrinsic::amdgcn_workitem_id_x:
+    NonKernelOnly = true;
+    return "amdgpu-work-item-id-x";
+  case Intrinsic::amdgcn_workgroup_id_x:
+    NonKernelOnly = true;
+    return "amdgpu-work-group-id-x";
+  case Intrinsic::amdgcn_workitem_id_y:
+  case Intrinsic::r600_read_tidig_y:
+    return "amdgpu-work-item-id-y";
+  case Intrinsic::amdgcn_workitem_id_z:
+  case Intrinsic::r600_read_tidig_z:
+    return "amdgpu-work-item-id-z";
+  case Intrinsic::amdgcn_workgroup_id_y:
+  case Intrinsic::r600_read_tgid_y:
+    return "amdgpu-work-group-id-y";
+  case Intrinsic::amdgcn_workgroup_id_z:
+  case Intrinsic::r600_read_tgid_z:
+    return "amdgpu-work-group-id-z";
+  case Intrinsic::amdgcn_dispatch_ptr:
+    return "amdgpu-dispatch-ptr";
+  case Intrinsic::amdgcn_dispatch_id:
+    return "amdgpu-dispatch-id";
+  case Intrinsic::amdgcn_kernarg_segment_ptr:
+  case Intrinsic::amdgcn_implicitarg_ptr:
+    return "amdgpu-kernarg-segment-ptr";
+  case Intrinsic::amdgcn_queue_ptr:
+  case Intrinsic::trap:
+  case Intrinsic::debugtrap:
+    IsQueuePtr = true;
+    return "amdgpu-queue-ptr";
+  default:
+    return "";
+  }
+}
+
+static bool handleAttr(Function &Parent, const Function &Callee,
+                       StringRef Name) {
+  if (Callee.hasFnAttribute(Name)) {
+    Parent.addFnAttr(Name);
+    return true;
+  }
+
+  return false;
+}
+
+static void copyFeaturesToFunction(Function &Parent, const Function &Callee,
+                                   bool &NeedQueuePtr) {
+  // X ids unnecessarily propagated to kernels.
+  static const StringRef AttrNames[] = {
+    { "amdgpu-work-item-id-x" },
+    { "amdgpu-work-item-id-y" },
+    { "amdgpu-work-item-id-z" },
+    { "amdgpu-work-group-id-x" },
+    { "amdgpu-work-group-id-y" },
+    { "amdgpu-work-group-id-z" },
+    { "amdgpu-dispatch-ptr" },
+    { "amdgpu-dispatch-id" },
+    { "amdgpu-kernarg-segment-ptr" }
+  };
+
+  if (handleAttr(Parent, Callee, "amdgpu-queue-ptr"))
+    NeedQueuePtr = true;
+
+  for (StringRef AttrName : AttrNames)
+    handleAttr(Parent, Callee, AttrName);
+}
+
+bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
+  const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F);
+  bool HasFlat = ST.hasFlatAddressSpace();
+  bool HasApertureRegs = ST.hasApertureRegs();
   SmallPtrSet<const Constant *, 8> ConstantExprVisited;
 
-  for (const BasicBlock &BB : F) {
-    for (const Instruction &I : BB) {
+  bool Changed = false;
+  bool NeedQueuePtr = false;
+  bool HaveCall = false;
+  bool IsFunc = !AMDGPU::isEntryFunctionCC(F.getCallingConv());
+
+  for (BasicBlock &BB : F) {
+    for (Instruction &I : BB) {
+      CallSite CS(&I);
+      if (CS) {
+        Function *Callee = CS.getCalledFunction();
+
+        // TODO: Do something with indirect calls.
+        if (!Callee) {
+          if (!CS.isInlineAsm())
+            HaveCall = true;
+          continue;
+        }
+
+        Intrinsic::ID IID = Callee->getIntrinsicID();
+        if (IID == Intrinsic::not_intrinsic) {
+          HaveCall = true;
+          copyFeaturesToFunction(F, *Callee, NeedQueuePtr);
+          Changed = true;
+        } else {
+          bool NonKernelOnly = false;
+          StringRef AttrName = intrinsicToAttrName(IID,
+                                                   NonKernelOnly, NeedQueuePtr);
+          if (!AttrName.empty() && (IsFunc || !NonKernelOnly)) {
+            F.addFnAttr(AttrName);
+            Changed = true;
+          }
+        }
+      }
+
+      if (NeedQueuePtr || HasApertureRegs)
+        continue;
 
       if (const AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(&I)) {
-        if (castRequiresQueuePtr(ASC))
-          return true;
+        if (castRequiresQueuePtr(ASC, AS)) {
+          NeedQueuePtr = true;
+          continue;
+        }
       }
 
       for (const Use &U : I.operands()) {
@@ -130,93 +255,57 @@ bool AMDGPUAnnotateKernelFeatures::hasAddrSpaceCast(const Function &F) {
         if (!OpC)
           continue;
 
-        if (visitConstantExprsRecursively(OpC, ConstantExprVisited))
-          return true;
+        if (visitConstantExprsRecursively(OpC, ConstantExprVisited, AS)) {
+          NeedQueuePtr = true;
+          break;
+        }
       }
     }
   }
 
-  return false;
-}
-
-void AMDGPUAnnotateKernelFeatures::addAttrToCallers(Function *Intrin,
-                                                    StringRef AttrName) {
-  SmallPtrSet<Function *, 4> SeenFuncs;
-
-  for (User *U : Intrin->users()) {
-    // CallInst is the only valid user for an intrinsic.
-    CallInst *CI = cast<CallInst>(U);
-
-    Function *CallingFunction = CI->getParent()->getParent();
-    if (SeenFuncs.insert(CallingFunction).second)
-      CallingFunction->addFnAttr(AttrName);
+  if (NeedQueuePtr) {
+    F.addFnAttr("amdgpu-queue-ptr");
+    Changed = true;
   }
-}
-
-bool AMDGPUAnnotateKernelFeatures::addAttrsForIntrinsics(
-  Module &M,
-  ArrayRef<StringRef[2]> IntrinsicToAttr) {
-  bool Changed = false;
 
-  for (const StringRef *Arr : IntrinsicToAttr) {
-    if (Function *Fn = M.getFunction(Arr[0])) {
-      addAttrToCallers(Fn, Arr[1]);
-      Changed = true;
-    }
+  // TODO: We could refine this to captured pointers that could possibly be
+  // accessed by flat instructions. For now this is mostly a poor way of
+  // estimating whether there are calls before argument lowering.
+  if (HasFlat && !IsFunc && HaveCall) {
+    F.addFnAttr("amdgpu-flat-scratch");
+    Changed = true;
   }
 
   return Changed;
 }
 
-bool AMDGPUAnnotateKernelFeatures::runOnModule(Module &M) {
+bool AMDGPUAnnotateKernelFeatures::runOnSCC(CallGraphSCC &SCC) {
+  Module &M = SCC.getCallGraph().getModule();
   Triple TT(M.getTargetTriple());
 
-  static const StringRef IntrinsicToAttr[][2] = {
-    // .x omitted
-    { "llvm.amdgcn.workitem.id.y", "amdgpu-work-item-id-y" },
-    { "llvm.amdgcn.workitem.id.z", "amdgpu-work-item-id-z" },
-
-    { "llvm.amdgcn.workgroup.id.y", "amdgpu-work-group-id-y" },
-    { "llvm.amdgcn.workgroup.id.z", "amdgpu-work-group-id-z" },
-
-    { "llvm.r600.read.tgid.y", "amdgpu-work-group-id-y" },
-    { "llvm.r600.read.tgid.z", "amdgpu-work-group-id-z" },
-
-    // .x omitted
-    { "llvm.r600.read.tidig.y", "amdgpu-work-item-id-y" },
-    { "llvm.r600.read.tidig.z", "amdgpu-work-item-id-z" }
-  };
-
-  static const StringRef HSAIntrinsicToAttr[][2] = {
-    { "llvm.amdgcn.dispatch.ptr", "amdgpu-dispatch-ptr" },
-    { "llvm.amdgcn.queue.ptr", "amdgpu-queue-ptr" },
-    { "llvm.amdgcn.dispatch.id", "amdgpu-dispatch-id" }
-  };
-
-  // TODO: We should not add the attributes if the known compile time workgroup
-  // size is 1 for y/z.
-
-  // TODO: Intrinsics that require queue ptr.
+  bool Changed = false;
+  for (CallGraphNode *I : SCC) {
+    Function *F = I->getFunction();
+    if (!F || F->isDeclaration())
+      continue;
 
-  // We do not need to note the x workitem or workgroup id because they are
-  // always initialized.
+    Changed |= addFeatureAttributes(*F);
+  }
 
-  bool Changed = addAttrsForIntrinsics(M, IntrinsicToAttr);
-  if (TT.getOS() == Triple::AMDHSA || TT.getOS() == Triple::Mesa3D) {
-    Changed |= addAttrsForIntrinsics(M, HSAIntrinsicToAttr);
-    for (Function &F : M) {
-      if (F.hasFnAttribute("amdgpu-queue-ptr"))
-        continue;
+  return Changed;
+}
 
-      if (hasAddrSpaceCast(F))
-        F.addFnAttr("amdgpu-queue-ptr");
-    }
-  }
+bool AMDGPUAnnotateKernelFeatures::doInitialization(CallGraph &CG) {
+  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
+  if (!TPC)
+    report_fatal_error("TargetMachine is required");
 
-  return Changed;
+  AS = AMDGPU::getAMDGPUAS(CG.getModule());
+  TM = &TPC->getTM<TargetMachine>();
+  return false;
 }
 
-ModulePass *llvm::createAMDGPUAnnotateKernelFeaturesPass() {
+Pass *llvm::createAMDGPUAnnotateKernelFeaturesPass() {
  return new AMDGPUAnnotateKernelFeatures();
 }
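
The core of this change is visible in runOnSCC and copyFeaturesToFunction: because a CallGraphSCCPass visits strongly connected components bottom-up, a callee's attribute set is final by the time any of its callers is processed, so each feature only needs to be copied one call-graph edge upward per visit. A minimal standalone sketch of that idea (plain C++ with toy types; Function, propagate, and the call-graph layout here are illustrative, not the LLVM API):

#include <iostream>
#include <set>
#include <string>
#include <vector>

// Toy stand-ins for llvm::Function and its call edges; illustrative only.
struct Function {
  std::string Name;
  std::set<std::string> Attrs;   // e.g. "amdgpu-queue-ptr"
  std::vector<Function *> Callees;
};

// Copy every feature attribute each callee carries up to the caller,
// mirroring what copyFeaturesToFunction does one edge at a time.
static bool propagate(Function &Caller) {
  bool Changed = false;
  for (Function *Callee : Caller.Callees)
    for (const std::string &A : Callee->Attrs)
      Changed |= Caller.Attrs.insert(A).second;
  return Changed;
}

int main() {
  Function Leaf{"leaf", {"amdgpu-work-item-id-y"}, {}};
  Function Mid{"mid", {}, {&Leaf}};
  Function Kernel{"kernel", {}, {&Mid}};

  // Callee-before-caller order, as the SCC walk guarantees:
  for (Function *F : {&Leaf, &Mid, &Kernel})
    propagate(*F);

  // "kernel" now carries the leaf's requirement transitively.
  std::cout << Kernel.Attrs.count("amdgpu-work-item-id-y") << '\n'; // prints 1
}

With a single bottom-up pass, a requirement discovered in a leaf reaches every transitive caller, which is why the old whole-module intrinsic scan could be dropped.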
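The other technique worth noting is visitConstantExprsRecursively's ConstantExprVisited set, which keeps the walk over constant-expression operands linear even when subexpressions are heavily shared. A recursive variant of the same idea in miniature (the in-tree code uses an explicit worklist; Expr and needsQueuePtr are hypothetical stand-ins, not LLVM classes):

#include <iostream>
#include <unordered_set>
#include <vector>

// Hypothetical stand-in for a constant-expression operand DAG.
struct Expr {
  bool IsAddrSpaceCastFromLocalOrPrivate = false;
  std::vector<const Expr *> Operands;
};

// Depth-first search for a cast that needs the queue pointer; the visited
// set ensures each node is examined once even when it has many uses.
static bool needsQueuePtr(const Expr *E,
                          std::unordered_set<const Expr *> &Visited) {
  if (!Visited.insert(E).second)
    return false; // already examined via another use
  if (E->IsAddrSpaceCastFromLocalOrPrivate)
    return true;
  for (const Expr *Op : E->Operands)
    if (needsQueuePtr(Op, Visited))
      return true;
  return false;
}

int main() {
  Expr Cast{true, {}};
  Expr Shared{false, {&Cast}};
  Expr Root{false, {&Shared, &Shared}}; // Shared is reused twice
  std::unordered_set<const Expr *> Visited;
  std::cout << needsQueuePtr(&Root, Visited) << '\n'; // prints 1
}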