author     dim <dim@FreeBSD.org>    2015-12-30 13:13:10 +0000
committer  dim <dim@FreeBSD.org>    2015-12-30 13:13:10 +0000
commit     9b5bf5c4f53d65d6a48722d7410ed7cb15f5ba3a (patch)
tree       b466a4817f79516eb1df8eae92bccf62ecc84003 /contrib/llvm/lib/Target/AMDGPU
parent     f09a28d1de99fda4f5517fb12670fc36552f4927 (diff)
parent     e194cd6d03d91631334d9d5e55b506036f423cc8 (diff)
Update llvm to trunk r256633.
Diffstat (limited to 'contrib/llvm/lib/Target/AMDGPU')
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/AMDGPU.h | 16
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/AMDGPU.td | 10
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp | 126
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp | 84
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 200
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h | 4
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/AMDGPUDiagnosticInfoUnsupported.cpp | 26
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/AMDGPUDiagnosticInfoUnsupported.h | 48
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp | 10
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.h | 11
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 479
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 195
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h | 15
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp | 20
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h | 6
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td | 2
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td | 4
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsics.td | 4
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp | 18
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp | 11
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h | 5
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/AMDGPUOpenCLImageTypeLoweringPass.cpp | 373
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 11
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.h | 4
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp | 25
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h | 26
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 77
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h | 4
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp | 87
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h | 51
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp | 102
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h | 5
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp | 43
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 266
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/CIInstructions.td | 340
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/CaymanInstructions.td | 4
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/EvergreenInstructions.td | 11
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp | 16
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h | 2
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp | 27
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp | 26
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h | 40
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUFixupKinds.h | 3
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp | 14
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.h | 3
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp | 11
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp | 51
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h | 21
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp | 6
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp | 15
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/Processors.td | 4
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp | 4
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp | 21
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp | 6
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.h | 6
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/R600Instructions.td | 2
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp | 2
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/R600Packetizer.cpp | 18
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.h | 2
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp | 5
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/SIDefines.h | 3
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/SIFixControlFlowLiveIntervals.cpp | 6
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp | 229
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/SIFixSGPRLiveRanges.cpp | 133
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 204
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp | 243
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.h | 34
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 660
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/SIISelLowering.h | 12
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp | 88
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/SIInstrFormats.td | 44
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 1332
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.h | 154
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.td | 837
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/SIInstructions.td | 485
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp | 36
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp | 1
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp | 109
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h | 232
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/SIPrepareScratchRegs.cpp | 193
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 271
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.h | 64
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.td | 117
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/SISchedule.td | 18
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp | 61
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/SITypeRewriter.cpp | 10
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 97
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 26
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/VIInstructions.td | 61
89 files changed, 6128 insertions, 2660 deletions
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPU.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPU.h
index 0a05d25..8c3cb56 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -44,15 +44,21 @@ FunctionPass *createSIShrinkInstructionsPass();
FunctionPass *createSILoadStoreOptimizerPass(TargetMachine &tm);
FunctionPass *createSILowerControlFlowPass(TargetMachine &tm);
FunctionPass *createSIFixControlFlowLiveIntervalsPass();
-FunctionPass *createSIFixSGPRCopiesPass(TargetMachine &tm);
+FunctionPass *createSIFixSGPRCopiesPass();
FunctionPass *createSIFixSGPRLiveRangesPass();
FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS);
FunctionPass *createSIInsertWaits(TargetMachine &tm);
-FunctionPass *createSIPrepareScratchRegs();
+
+ModulePass *createAMDGPUAnnotateKernelFeaturesPass();
+void initializeAMDGPUAnnotateKernelFeaturesPass(PassRegistry &);
+extern char &AMDGPUAnnotateKernelFeaturesID;
void initializeSIFoldOperandsPass(PassRegistry &);
extern char &SIFoldOperandsID;
+void initializeSIFixSGPRCopiesPass(PassRegistry &);
+extern char &SIFixSGPRCopiesID;
+
void initializeSILowerI1CopiesPass(PassRegistry &);
extern char &SILowerI1CopiesID;
@@ -64,6 +70,8 @@ FunctionPass *createAMDGPUPromoteAlloca(const AMDGPUSubtarget &ST);
Pass *createAMDGPUStructurizeCFGPass();
FunctionPass *createAMDGPUISelDag(TargetMachine &tm);
ModulePass *createAMDGPUAlwaysInlinePass();
+ModulePass *createAMDGPUOpenCLImageTypeLoweringPass();
+FunctionPass *createAMDGPUAnnotateUniformValues();
void initializeSIFixControlFlowLiveIntervalsPass(PassRegistry&);
extern char &SIFixControlFlowLiveIntervalsID;
@@ -71,6 +79,8 @@ extern char &SIFixControlFlowLiveIntervalsID;
void initializeSIFixSGPRLiveRangesPass(PassRegistry&);
extern char &SIFixSGPRLiveRangesID;
+void initializeAMDGPUAnnotateUniformValuesPass(PassRegistry&);
+extern char &AMDGPUAnnotateUniformValuesPassID;
extern Target TheAMDGPUTarget;
extern Target TheGCNTarget;
@@ -85,8 +95,6 @@ enum TargetIndex {
};
}
-#define END_OF_TEXT_LABEL_NAME "EndOfTextLabel"
-
} // End namespace llvm
namespace ShaderType {
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPU.td b/contrib/llvm/lib/Target/AMDGPU/AMDGPU.td
index 68b5050..d4af8d2 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -108,6 +108,11 @@ def FeatureEnableUnsafeDSOffsetFolding : SubtargetFeature <"unsafe-ds-offset-fol
"true",
"Force using DS instruction immediate offsets on SI">;
+def FeatureFlatForGlobal : SubtargetFeature<"flat-for-global",
+ "FlatForGlobal",
+ "true",
+ "Force to generate flat instruction for global">;
+
def FeatureFlatAddressSpace : SubtargetFeature<"flat-address-space",
"FlatAddressSpace",
"true",
@@ -272,9 +277,14 @@ def isSICI : Predicate<
"Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS"
>, AssemblerPredicate<"FeatureGCN1Encoding">;
+def isVI : Predicate <
+ "Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS">,
+ AssemblerPredicate<"FeatureGCN3Encoding">;
+
class PredicateControl {
Predicate SubtargetPredicate;
Predicate SIAssemblerPredicate = isSICI;
+ Predicate VIAssemblerPredicate = isVI;
list<Predicate> AssemblerPredicates = [];
Predicate AssemblerPredicate = TruePredicate;
list<Predicate> OtherPredicates = [];
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
new file mode 100644
index 0000000..3781839
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
@@ -0,0 +1,126 @@
+//===-- AMDGPUAnnotateKernelFeaturesPass.cpp ------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This pass adds target attributes to functions which use intrinsics
+/// which will impact calling convention lowering.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+
+#define DEBUG_TYPE "amdgpu-annotate-kernel-features"
+
+using namespace llvm;
+
+namespace {
+
+class AMDGPUAnnotateKernelFeatures : public ModulePass {
+private:
+ void addAttrToCallers(Function *Intrin, StringRef AttrName);
+ bool addAttrsForIntrinsics(Module &M, ArrayRef<StringRef[2]>);
+
+public:
+ static char ID;
+
+ AMDGPUAnnotateKernelFeatures() : ModulePass(ID) { }
+ bool runOnModule(Module &M) override;
+ const char *getPassName() const override {
+ return "AMDGPU Annotate Kernel Features";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ ModulePass::getAnalysisUsage(AU);
+ }
+};
+
+}
+
+char AMDGPUAnnotateKernelFeatures::ID = 0;
+
+char &llvm::AMDGPUAnnotateKernelFeaturesID = AMDGPUAnnotateKernelFeatures::ID;
+
+
+INITIALIZE_PASS_BEGIN(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE,
+ "Add AMDGPU function attributes", false, false)
+INITIALIZE_PASS_END(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE,
+ "Add AMDGPU function attributes", false, false)
+
+
+void AMDGPUAnnotateKernelFeatures::addAttrToCallers(Function *Intrin,
+ StringRef AttrName) {
+ SmallPtrSet<Function *, 4> SeenFuncs;
+
+ for (User *U : Intrin->users()) {
+ // CallInst is the only valid user for an intrinsic.
+ CallInst *CI = cast<CallInst>(U);
+
+ Function *CallingFunction = CI->getParent()->getParent();
+ if (SeenFuncs.insert(CallingFunction).second)
+ CallingFunction->addFnAttr(AttrName);
+ }
+}
+
+bool AMDGPUAnnotateKernelFeatures::addAttrsForIntrinsics(
+ Module &M,
+ ArrayRef<StringRef[2]> IntrinsicToAttr) {
+ bool Changed = false;
+
+ for (const StringRef *Arr : IntrinsicToAttr) {
+ if (Function *Fn = M.getFunction(Arr[0])) {
+ addAttrToCallers(Fn, Arr[1]);
+ Changed = true;
+ }
+ }
+
+ return Changed;
+}
+
+bool AMDGPUAnnotateKernelFeatures::runOnModule(Module &M) {
+ Triple TT(M.getTargetTriple());
+
+ static const StringRef IntrinsicToAttr[][2] = {
+ // .x omitted
+ { "llvm.r600.read.tgid.y", "amdgpu-work-group-id-y" },
+ { "llvm.r600.read.tgid.z", "amdgpu-work-group-id-z" },
+
+ // .x omitted
+ { "llvm.r600.read.tidig.y", "amdgpu-work-item-id-y" },
+ { "llvm.r600.read.tidig.z", "amdgpu-work-item-id-z" }
+
+ };
+
+ static const StringRef HSAIntrinsicToAttr[][2] = {
+ { "llvm.r600.read.local.size.x", "amdgpu-dispatch-ptr" },
+ { "llvm.r600.read.local.size.y", "amdgpu-dispatch-ptr" },
+ { "llvm.r600.read.local.size.z", "amdgpu-dispatch-ptr" },
+
+ { "llvm.r600.read.global.size.x", "amdgpu-dispatch-ptr" },
+ { "llvm.r600.read.global.size.y", "amdgpu-dispatch-ptr" },
+ { "llvm.r600.read.global.size.z", "amdgpu-dispatch-ptr" },
+ { "llvm.amdgcn.dispatch.ptr", "amdgpu-dispatch-ptr" }
+ };
+
+ // TODO: Intrinsics that require queue ptr.
+
+ // We do not need to note the x workitem or workgroup id because they are
+ // always initialized.
+
+ bool Changed = addAttrsForIntrinsics(M, IntrinsicToAttr);
+ if (TT.getOS() == Triple::AMDHSA)
+ Changed |= addAttrsForIntrinsics(M, HSAIntrinsicToAttr);
+
+ return Changed;
+}
+
+ModulePass *llvm::createAMDGPUAnnotateKernelFeaturesPass() {
+ return new AMDGPUAnnotateKernelFeatures();
+}
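
As a rough standalone sketch (not taken from this commit; all names below are hypothetical), the table-driven idea behind the new AMDGPUAnnotateKernelFeatures pass — map intrinsic names to attribute strings and tag each calling function at most once — looks like this in plain C++:

// Hypothetical sketch of the pass's core idea; plain C++, no LLVM types.
#include <iostream>
#include <map>
#include <set>
#include <string>
#include <utility>
#include <vector>

int main() {
  // Intrinsic name -> attribute, mirroring the IntrinsicToAttr table above.
  const std::vector<std::pair<std::string, std::string>> IntrinsicToAttr = {
      {"llvm.r600.read.tgid.y", "amdgpu-work-group-id-y"},
      {"llvm.r600.read.tgid.z", "amdgpu-work-group-id-z"},
  };

  // Stand-in for the module: which (made-up) kernels call which intrinsics.
  const std::map<std::string, std::set<std::string>> Callers = {
      {"kernel_a", {"llvm.r600.read.tgid.y", "llvm.r600.read.tgid.z"}},
      {"kernel_b", {}},
  };

  // addAttrToCallers in miniature: each caller gets an attribute only once.
  for (const auto &Kernel : Callers) {
    std::set<std::string> Added;
    for (const auto &Entry : IntrinsicToAttr)
      if (Kernel.second.count(Entry.first) && Added.insert(Entry.second).second)
        std::cout << Kernel.first << ": +fn-attr " << Entry.second << '\n';
  }
}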
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
new file mode 100644
index 0000000..dfddc34
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
@@ -0,0 +1,84 @@
+//===-- AMDGPUAnnotateUniformValues.cpp - ---------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass adds amdgpu.uniform metadata to IR values so this information
+/// can be used during instruction selection.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUIntrinsicInfo.h"
+#include "llvm/Analysis/DivergenceAnalysis.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+#define DEBUG_TYPE "amdgpu-annotate-uniform"
+
+using namespace llvm;
+
+namespace {
+
+class AMDGPUAnnotateUniformValues : public FunctionPass,
+ public InstVisitor<AMDGPUAnnotateUniformValues> {
+ DivergenceAnalysis *DA;
+
+public:
+ static char ID;
+ AMDGPUAnnotateUniformValues() :
+ FunctionPass(ID) { }
+ bool doInitialization(Module &M) override;
+ bool runOnFunction(Function &F) override;
+ const char *getPassName() const override { return "AMDGPU Annotate Uniform Values"; }
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<DivergenceAnalysis>();
+ AU.setPreservesAll();
+ }
+
+ void visitLoadInst(LoadInst &I);
+
+};
+
+} // End anonymous namespace
+
+INITIALIZE_PASS_BEGIN(AMDGPUAnnotateUniformValues, DEBUG_TYPE,
+ "Add AMDGPU uniform metadata", false, false)
+INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
+INITIALIZE_PASS_END(AMDGPUAnnotateUniformValues, DEBUG_TYPE,
+ "Add AMDGPU uniform metadata", false, false)
+
+char AMDGPUAnnotateUniformValues::ID = 0;
+
+void AMDGPUAnnotateUniformValues::visitLoadInst(LoadInst &I) {
+ Value *Ptr = I.getPointerOperand();
+ if (!DA->isUniform(Ptr))
+ return;
+
+ if (Instruction *PtrI = dyn_cast<Instruction>(Ptr))
+ PtrI->setMetadata("amdgpu.uniform", MDNode::get(I.getContext(), {}));
+
+}
+
+bool AMDGPUAnnotateUniformValues::doInitialization(Module &M) {
+ return false;
+}
+
+bool AMDGPUAnnotateUniformValues::runOnFunction(Function &F) {
+ DA = &getAnalysis<DivergenceAnalysis>();
+ visit(F);
+
+ return true;
+}
+
+FunctionPass *
+llvm::createAMDGPUAnnotateUniformValues() {
+ return new AMDGPUAnnotateUniformValues();
+}
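
In the same hedged spirit, a self-contained sketch of what AMDGPUAnnotateUniformValues does — visit loads, ask a divergence oracle whether the address is uniform, and mark the pointer so later selection can prefer scalar memory operations (the real pass attaches amdgpu.uniform metadata via DivergenceAnalysis):

// Hypothetical sketch; the oracle stands in for DivergenceAnalysis and the
// Tagged flag stands in for the amdgpu.uniform metadata set on the pointer.
#include <functional>
#include <iostream>
#include <string>
#include <vector>

struct Load {
  std::string PointerName;
  bool Tagged;
};

static void annotateUniformLoads(
    std::vector<Load> &Loads,
    const std::function<bool(const std::string &)> &IsUniform) {
  for (Load &L : Loads)
    if (IsUniform(L.PointerName))
      L.Tagged = true;  // visitLoadInst + setMetadata in the real pass.
}

int main() {
  std::vector<Load> Loads = {{"kernarg.segment.ptr", false},
                             {"per.workitem.ptr", false}};
  // Assume only the kernarg pointer is uniform across the wavefront.
  annotateUniformLoads(Loads, [](const std::string &Name) {
    return Name == "kernarg.segment.ptr";
  });
  for (const Load &L : Loads)
    std::cout << L.PointerName << (L.Tagged ? " -> amdgpu.uniform" : "") << '\n';
}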
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 0a5309b..ba71dc0 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -100,14 +100,63 @@ void AMDGPUAsmPrinter::EmitFunctionBodyStart() {
}
}
-void AMDGPUAsmPrinter::EmitEndOfAsmFile(Module &M) {
-
- // This label is used to mark the end of the .text section.
- const TargetLoweringObjectFile &TLOF = getObjFileLowering();
- OutStreamer->SwitchSection(TLOF.getTextSection());
- MCSymbol *EndOfTextLabel =
- OutContext.getOrCreateSymbol(StringRef(END_OF_TEXT_LABEL_NAME));
- OutStreamer->EmitLabel(EndOfTextLabel);
+void AMDGPUAsmPrinter::EmitFunctionEntryLabel() {
+ const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
+ const AMDGPUSubtarget &STM = MF->getSubtarget<AMDGPUSubtarget>();
+ if (MFI->isKernel() && STM.isAmdHsaOS()) {
+ AMDGPUTargetStreamer *TS =
+ static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer());
+ TS->EmitAMDGPUSymbolType(CurrentFnSym->getName(),
+ ELF::STT_AMDGPU_HSA_KERNEL);
+ }
+
+ AsmPrinter::EmitFunctionEntryLabel();
+}
+
+static bool isModuleLinkage(const GlobalValue *GV) {
+ switch (GV->getLinkage()) {
+ case GlobalValue::InternalLinkage:
+ case GlobalValue::CommonLinkage:
+ return true;
+ case GlobalValue::ExternalLinkage:
+ return false;
+ default: llvm_unreachable("unknown linkage type");
+ }
+}
+
+void AMDGPUAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
+
+ if (TM.getTargetTriple().getOS() != Triple::AMDHSA) {
+ AsmPrinter::EmitGlobalVariable(GV);
+ return;
+ }
+
+ if (GV->isDeclaration() || GV->getLinkage() == GlobalValue::PrivateLinkage) {
+ AsmPrinter::EmitGlobalVariable(GV);
+ return;
+ }
+
+ // Group segment variables aren't emitted in HSA.
+ if (AMDGPU::isGroupSegment(GV))
+ return;
+
+ AMDGPUTargetStreamer *TS =
+ static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer());
+ if (isModuleLinkage(GV)) {
+ TS->EmitAMDGPUHsaModuleScopeGlobal(GV->getName());
+ } else {
+ TS->EmitAMDGPUHsaProgramScopeGlobal(GV->getName());
+ }
+
+ const DataLayout &DL = getDataLayout();
+ OutStreamer->PushSection();
+ OutStreamer->SwitchSection(
+ getObjFileLowering().SectionForGlobal(GV, *Mang, TM));
+ MCSymbol *GVSym = getSymbol(GV);
+ const Constant *C = GV->getInitializer();
+ OutStreamer->EmitLabel(GVSym);
+ EmitGlobalConstant(DL, C);
+ OutStreamer->PopSection();
}
bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
@@ -125,8 +174,8 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
const AMDGPUSubtarget &STM = MF.getSubtarget<AMDGPUSubtarget>();
SIProgramInfo KernelInfo;
if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
+ getSIProgramInfo(KernelInfo, MF);
if (!STM.isAmdHsaOS()) {
- getSIProgramInfo(KernelInfo, MF);
EmitProgramInfoSI(MF, KernelInfo);
}
// Emit directives
@@ -165,6 +214,23 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
false);
OutStreamer->emitRawComment(" ScratchSize: " + Twine(KernelInfo.ScratchSize),
false);
+
+ OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:USER_SGPR: " +
+ Twine(G_00B84C_USER_SGPR(KernelInfo.ComputePGMRSrc2)),
+ false);
+ OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_X_EN: " +
+ Twine(G_00B84C_TGID_X_EN(KernelInfo.ComputePGMRSrc2)),
+ false);
+ OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Y_EN: " +
+ Twine(G_00B84C_TGID_Y_EN(KernelInfo.ComputePGMRSrc2)),
+ false);
+ OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Z_EN: " +
+ Twine(G_00B84C_TGID_Z_EN(KernelInfo.ComputePGMRSrc2)),
+ false);
+ OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " +
+ Twine(G_00B84C_TIDIG_COMP_CNT(KernelInfo.ComputePGMRSrc2)),
+ false);
+
} else {
R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
OutStreamer->emitRawComment(
@@ -278,27 +344,30 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
unsigned width = 0;
bool isSGPR = false;
- if (!MO.isReg()) {
+ if (!MO.isReg())
continue;
- }
+
unsigned reg = MO.getReg();
- if (reg == AMDGPU::VCC || reg == AMDGPU::VCC_LO ||
- reg == AMDGPU::VCC_HI) {
+ switch (reg) {
+ case AMDGPU::EXEC:
+ case AMDGPU::SCC:
+ case AMDGPU::M0:
+ continue;
+
+ case AMDGPU::VCC:
+ case AMDGPU::VCC_LO:
+ case AMDGPU::VCC_HI:
VCCUsed = true;
continue;
- } else if (reg == AMDGPU::FLAT_SCR ||
- reg == AMDGPU::FLAT_SCR_LO ||
- reg == AMDGPU::FLAT_SCR_HI) {
+
+ case AMDGPU::FLAT_SCR:
+ case AMDGPU::FLAT_SCR_LO:
+ case AMDGPU::FLAT_SCR_HI:
FlatUsed = true;
continue;
- }
- switch (reg) {
- default: break;
- case AMDGPU::SCC:
- case AMDGPU::EXEC:
- case AMDGPU::M0:
- continue;
+ default:
+ break;
}
if (AMDGPU::SReg_32RegClass.contains(reg)) {
@@ -348,11 +417,15 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
}
}
- if (VCCUsed)
+ if (VCCUsed || FlatUsed)
MaxSGPR += 2;
- if (FlatUsed)
+ if (FlatUsed) {
MaxSGPR += 2;
+ // 2 additional for VI+.
+ if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
+ MaxSGPR += 2;
+ }
// We found the maximum register index. They start at 0, so add one to get the
// number of registers.
@@ -368,6 +441,11 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
ProgInfo.NumSGPR = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG;
}
+ if (MFI->NumUserSGPRs > STM.getMaxNumUserSGPRs()) {
+ LLVMContext &Ctx = MF.getFunction()->getContext();
+ Ctx.emitError("too many user SGPRs used");
+ }
+
ProgInfo.VGPRBlocks = (ProgInfo.NumVGPR - 1) / 4;
ProgInfo.SGPRBlocks = (ProgInfo.NumSGPR - 1) / 8;
// Set the value to initialize FP_ROUND and FP_DENORM parts of the mode
@@ -419,18 +497,27 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
S_00B848_FLOAT_MODE(ProgInfo.FloatMode) |
S_00B848_PRIV(ProgInfo.Priv) |
S_00B848_DX10_CLAMP(ProgInfo.DX10Clamp) |
- S_00B848_IEEE_MODE(ProgInfo.DebugMode) |
+ S_00B848_DEBUG_MODE(ProgInfo.DebugMode) |
S_00B848_IEEE_MODE(ProgInfo.IEEEMode);
+ // 0 = X, 1 = XY, 2 = XYZ
+ unsigned TIDIGCompCnt = 0;
+ if (MFI->hasWorkItemIDZ())
+ TIDIGCompCnt = 2;
+ else if (MFI->hasWorkItemIDY())
+ TIDIGCompCnt = 1;
+
ProgInfo.ComputePGMRSrc2 =
S_00B84C_SCRATCH_EN(ProgInfo.ScratchBlocks > 0) |
- S_00B84C_USER_SGPR(MFI->NumUserSGPRs) |
- S_00B84C_TGID_X_EN(1) |
- S_00B84C_TGID_Y_EN(1) |
- S_00B84C_TGID_Z_EN(1) |
- S_00B84C_TG_SIZE_EN(1) |
- S_00B84C_TIDIG_COMP_CNT(2) |
- S_00B84C_LDS_SIZE(ProgInfo.LDSBlocks);
+ S_00B84C_USER_SGPR(MFI->getNumUserSGPRs()) |
+ S_00B84C_TGID_X_EN(MFI->hasWorkGroupIDX()) |
+ S_00B84C_TGID_Y_EN(MFI->hasWorkGroupIDY()) |
+ S_00B84C_TGID_Z_EN(MFI->hasWorkGroupIDZ()) |
+ S_00B84C_TG_SIZE_EN(MFI->hasWorkGroupInfo()) |
+ S_00B84C_TIDIG_COMP_CNT(TIDIGCompCnt) |
+ S_00B84C_EXCP_EN_MSB(0) |
+ S_00B84C_LDS_SIZE(ProgInfo.LDSBlocks) |
+ S_00B84C_EXCP_EN(0);
}
static unsigned getRsrcReg(unsigned ShaderType) {
@@ -491,14 +578,53 @@ void AMDGPUAsmPrinter::EmitAmdKernelCodeT(const MachineFunction &MF,
header.compute_pgm_resource_registers =
KernelInfo.ComputePGMRSrc1 |
(KernelInfo.ComputePGMRSrc2 << 32);
- header.code_properties =
- AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR |
- AMD_CODE_PROPERTY_IS_PTR64;
+ header.code_properties = AMD_CODE_PROPERTY_IS_PTR64;
+
+ if (MFI->hasPrivateSegmentBuffer()) {
+ header.code_properties |=
+ AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
+ }
+
+ if (MFI->hasDispatchPtr())
+ header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
+
+ if (MFI->hasQueuePtr())
+ header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;
+
+ if (MFI->hasKernargSegmentPtr())
+ header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;
+
+ if (MFI->hasDispatchID())
+ header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;
+
+ if (MFI->hasFlatScratchInit())
+ header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
+
+ // TODO: Private segment size
+
+ if (MFI->hasGridWorkgroupCountX()) {
+ header.code_properties |=
+ AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X;
+ }
+
+ if (MFI->hasGridWorkgroupCountY()) {
+ header.code_properties |=
+ AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y;
+ }
+
+ if (MFI->hasGridWorkgroupCountZ()) {
+ header.code_properties |=
+ AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z;
+ }
+
+ if (MFI->hasDispatchPtr())
+ header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
header.kernarg_segment_byte_size = MFI->ABIArgOffset;
header.wavefront_sgpr_count = KernelInfo.NumSGPR;
header.workitem_vgpr_count = KernelInfo.NumVGPR;
-
+ header.workitem_private_segment_byte_size = KernelInfo.ScratchSize;
+ header.workgroup_group_segment_byte_size = KernelInfo.LDSSize;
AMDGPUTargetStreamer *TS =
static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer());
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
index 345af9b..817cbfc 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
@@ -99,7 +99,9 @@ public:
void EmitFunctionBodyStart() override;
- void EmitEndOfAsmFile(Module &M) override;
+ void EmitFunctionEntryLabel() override;
+
+ void EmitGlobalVariable(const GlobalVariable *GV) override;
bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
unsigned AsmVariant, const char *ExtraCode,
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUDiagnosticInfoUnsupported.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUDiagnosticInfoUnsupported.cpp
new file mode 100644
index 0000000..2f6b302
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUDiagnosticInfoUnsupported.cpp
@@ -0,0 +1,26 @@
+//===-- AMDGPUDiagnosticInfoUnsupported.cpp -------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUDiagnosticInfoUnsupported.h"
+
+using namespace llvm;
+
+DiagnosticInfoUnsupported::DiagnosticInfoUnsupported(
+ const Function &Fn,
+ const Twine &Desc,
+ DiagnosticSeverity Severity)
+ : DiagnosticInfo(getKindID(), Severity),
+ Description(Desc),
+ Fn(Fn) { }
+
+int DiagnosticInfoUnsupported::KindID = 0;
+
+void DiagnosticInfoUnsupported::print(DiagnosticPrinter &DP) const {
+ DP << "unsupported " << getDescription() << " in " << Fn.getName();
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUDiagnosticInfoUnsupported.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUDiagnosticInfoUnsupported.h
new file mode 100644
index 0000000..0fd37e1
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUDiagnosticInfoUnsupported.h
@@ -0,0 +1,48 @@
+//===-- AMDGPUDiagnosticInfoUnsupported.h - Error reporting -----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUDIAGNOSTICINFOUNSUPPORTED_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUDIAGNOSTICINFOUNSUPPORTED_H
+
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/DiagnosticPrinter.h"
+
+namespace llvm {
+
+/// Diagnostic information for unimplemented or unsupported feature reporting.
+class DiagnosticInfoUnsupported : public DiagnosticInfo {
+private:
+ const Twine &Description;
+ const Function &Fn;
+
+ static int KindID;
+
+ static int getKindID() {
+ if (KindID == 0)
+ KindID = llvm::getNextAvailablePluginDiagnosticKind();
+ return KindID;
+ }
+
+public:
+ DiagnosticInfoUnsupported(const Function &Fn, const Twine &Desc,
+ DiagnosticSeverity Severity = DS_Error);
+
+ const Function &getFunction() const { return Fn; }
+ const Twine &getDescription() const { return Description; }
+
+ void print(DiagnosticPrinter &DP) const override;
+
+ static bool classof(const DiagnosticInfo *DI) {
+ return DI->getKind() == getKindID();
+ }
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp
index 8175786..4d84d28 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp
@@ -71,9 +71,15 @@ unsigned AMDGPUFrameLowering::getStackWidth(const MachineFunction &MF) const {
}
/// \returns The number of registers allocated for \p FI.
-int AMDGPUFrameLowering::getFrameIndexOffset(const MachineFunction &MF,
- int FI) const {
+int AMDGPUFrameLowering::getFrameIndexReference(const MachineFunction &MF,
+ int FI,
+ unsigned &FrameReg) const {
const MachineFrameInfo *MFI = MF.getFrameInfo();
+ const TargetRegisterInfo *RI = MF.getSubtarget().getRegisterInfo();
+
+ // Fill in FrameReg output argument.
+ FrameReg = RI->getFrameRegister(MF);
+
// Start the offset at 2 so we don't overwrite work group information.
// XXX: We should only do this when the shader actually uses this
// information.
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.h
index 9f31be1..257a3da 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.h
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.h
@@ -8,14 +8,12 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief Interface to describe a layout of a stack frame on a AMDIL target
-/// machine.
+/// \brief Interface to describe a layout of a stack frame on an AMDGPU target.
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_LIB_TARGET_R600_AMDGPUFRAMELOWERING_H
-#define LLVM_LIB_TARGET_R600_AMDGPUFRAMELOWERING_H
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUFRAMELOWERING_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUFRAMELOWERING_H
-#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/Target/TargetFrameLowering.h"
namespace llvm {
@@ -34,7 +32,8 @@ public:
/// \returns The number of 32-bit sub-registers that are used when storing
/// values to the stack.
unsigned getStackWidth(const MachineFunction &MF) const;
- int getFrameIndexOffset(const MachineFunction &MF, int FI) const override;
+ int getFrameIndexReference(const MachineFunction &MF, int FI,
+ unsigned &FrameReg) const override;
const SpillSlot *
getCalleeSavedSpillSlots(unsigned &NumEntries) const override;
void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 64c54cc..b33040b 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -11,6 +11,8 @@
/// \brief Defines an instruction selector for the AMDGPU target.
//
//===----------------------------------------------------------------------===//
+
+#include "AMDGPUDiagnosticInfoUnsupported.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUISelLowering.h" // For AMDGPUISD
#include "AMDGPURegisterInfo.h"
@@ -20,9 +22,9 @@
#include "SIISelLowering.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
-#include "llvm/CodeGen/PseudoSourceValue.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/IR/Function.h"
@@ -40,12 +42,14 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
// Subtarget - Keep a pointer to the AMDGPU Subtarget around so that we can
// make the right decision when generating code for different targets.
const AMDGPUSubtarget *Subtarget;
+
public:
AMDGPUDAGToDAGISel(TargetMachine &TM);
virtual ~AMDGPUDAGToDAGISel();
bool runOnMachineFunction(MachineFunction &MF) override;
SDNode *Select(SDNode *N) override;
const char *getPassName() const override;
+ void PreprocessISelDAG() override;
void PostprocessISelDAG() override;
private:
@@ -91,7 +95,7 @@ private:
bool SelectDS1Addr1Offset(SDValue Ptr, SDValue &Base, SDValue &Offset) const;
bool SelectDS64Bit4ByteAligned(SDValue Ptr, SDValue &Base, SDValue &Offset0,
SDValue &Offset1) const;
- void SelectMUBUF(SDValue Addr, SDValue &SRsrc, SDValue &VAddr,
+ bool SelectMUBUF(SDValue Addr, SDValue &SRsrc, SDValue &VAddr,
SDValue &SOffset, SDValue &Offset, SDValue &Offen,
SDValue &Idxen, SDValue &Addr64, SDValue &GLC, SDValue &SLC,
SDValue &TFE) const;
@@ -108,6 +112,16 @@ private:
SDValue &TFE) const;
bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset,
SDValue &Offset, SDValue &GLC) const;
+ bool SelectSMRDOffset(SDValue ByteOffsetNode, SDValue &Offset,
+ bool &Imm) const;
+ bool SelectSMRD(SDValue Addr, SDValue &SBase, SDValue &Offset,
+ bool &Imm) const;
+ bool SelectSMRDImm(SDValue Addr, SDValue &SBase, SDValue &Offset) const;
+ bool SelectSMRDImm32(SDValue Addr, SDValue &SBase, SDValue &Offset) const;
+ bool SelectSMRDSgpr(SDValue Addr, SDValue &SBase, SDValue &Offset) const;
+ bool SelectSMRDBufferImm(SDValue Addr, SDValue &Offset) const;
+ bool SelectSMRDBufferImm32(SDValue Addr, SDValue &Offset) const;
+ bool SelectSMRDBufferSgpr(SDValue Addr, SDValue &Offset) const;
SDNode *SelectAddrSpaceCast(SDNode *N);
bool SelectVOP3Mods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
bool SelectVOP3NoMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
@@ -273,6 +287,23 @@ SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N) const {
return N;
}
+static unsigned selectSGPRVectorRegClassID(unsigned NumVectorElts) {
+ switch (NumVectorElts) {
+ case 1:
+ return AMDGPU::SReg_32RegClassID;
+ case 2:
+ return AMDGPU::SReg_64RegClassID;
+ case 4:
+ return AMDGPU::SReg_128RegClassID;
+ case 8:
+ return AMDGPU::SReg_256RegClassID;
+ case 16:
+ return AMDGPU::SReg_512RegClassID;
+ }
+
+ llvm_unreachable("invalid vector size");
+}
+
SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
unsigned int Opc = N->getOpcode();
if (N->isMachineOpcode()) {
@@ -306,38 +337,7 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
EVT EltVT = VT.getVectorElementType();
assert(EltVT.bitsEq(MVT::i32));
if (Subtarget->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
- bool UseVReg = true;
- for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
- U != E; ++U) {
- if (!U->isMachineOpcode()) {
- continue;
- }
- const TargetRegisterClass *RC = getOperandRegClass(*U, U.getOperandNo());
- if (!RC) {
- continue;
- }
- if (static_cast<const SIRegisterInfo *>(TRI)->isSGPRClass(RC)) {
- UseVReg = false;
- }
- }
- switch(NumVectorElts) {
- case 1: RegClassID = UseVReg ? AMDGPU::VGPR_32RegClassID :
- AMDGPU::SReg_32RegClassID;
- break;
- case 2: RegClassID = UseVReg ? AMDGPU::VReg_64RegClassID :
- AMDGPU::SReg_64RegClassID;
- break;
- case 4: RegClassID = UseVReg ? AMDGPU::VReg_128RegClassID :
- AMDGPU::SReg_128RegClassID;
- break;
- case 8: RegClassID = UseVReg ? AMDGPU::VReg_256RegClassID :
- AMDGPU::SReg_256RegClassID;
- break;
- case 16: RegClassID = UseVReg ? AMDGPU::VReg_512RegClassID :
- AMDGPU::SReg_512RegClassID;
- break;
- default: llvm_unreachable("Do not know how to lower this BUILD_VECTOR");
- }
+ RegClassID = selectSGPRVectorRegClassID(NumVectorElts);
} else {
// BUILD_VECTOR was lowered into an IMPLICIT_DEF + 4 INSERT_SUBREG
// that adds a 128 bits reg copy when going through TwoAddressInstructions
@@ -455,98 +455,12 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL,
N->getValueType(0), Ops);
}
-
- case ISD::LOAD: {
- LoadSDNode *LD = cast<LoadSDNode>(N);
- SDLoc SL(N);
- EVT VT = N->getValueType(0);
-
- if (VT != MVT::i64 || LD->getExtensionType() != ISD::NON_EXTLOAD) {
- N = glueCopyToM0(N);
- break;
- }
-
- // To simplify the TableGen patters, we replace all i64 loads with
- // v2i32 loads. Alternatively, we could promote i64 loads to v2i32
- // during DAG legalization, however, so places (ExpandUnalignedLoad)
- // in the DAG legalizer assume that if i64 is legal, so doing this
- // promotion early can cause problems.
-
- SDValue NewLoad = CurDAG->getLoad(MVT::v2i32, SDLoc(N), LD->getChain(),
- LD->getBasePtr(), LD->getMemOperand());
- SDValue BitCast = CurDAG->getNode(ISD::BITCAST, SL,
- MVT::i64, NewLoad);
- CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 1), NewLoad.getValue(1));
- CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), BitCast);
- SDNode *Load = glueCopyToM0(NewLoad.getNode());
- SelectCode(Load);
- N = BitCast.getNode();
- break;
- }
-
+ case ISD::LOAD:
case ISD::STORE: {
- // Handle i64 stores here for the same reason mentioned above for loads.
- StoreSDNode *ST = cast<StoreSDNode>(N);
- SDValue Value = ST->getValue();
- if (Value.getValueType() == MVT::i64 && !ST->isTruncatingStore()) {
-
- SDValue NewValue = CurDAG->getNode(ISD::BITCAST, SDLoc(N),
- MVT::v2i32, Value);
- SDValue NewStore = CurDAG->getStore(ST->getChain(), SDLoc(N), NewValue,
- ST->getBasePtr(), ST->getMemOperand());
-
- CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewStore);
-
- if (NewValue.getOpcode() == ISD::BITCAST) {
- Select(NewStore.getNode());
- return SelectCode(NewValue.getNode());
- }
-
- // getNode() may fold the bitcast if its input was another bitcast. If that
- // happens we should only select the new store.
- N = NewStore.getNode();
- }
-
N = glueCopyToM0(N);
break;
}
- case AMDGPUISD::REGISTER_LOAD: {
- if (Subtarget->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS)
- break;
- SDValue Addr, Offset;
-
- SDLoc DL(N);
- SelectADDRIndirect(N->getOperand(1), Addr, Offset);
- const SDValue Ops[] = {
- Addr,
- Offset,
- CurDAG->getTargetConstant(0, DL, MVT::i32),
- N->getOperand(0),
- };
- return CurDAG->getMachineNode(AMDGPU::SI_RegisterLoad, DL,
- CurDAG->getVTList(MVT::i32, MVT::i64,
- MVT::Other),
- Ops);
- }
- case AMDGPUISD::REGISTER_STORE: {
- if (Subtarget->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS)
- break;
- SDValue Addr, Offset;
- SelectADDRIndirect(N->getOperand(2), Addr, Offset);
- SDLoc DL(N);
- const SDValue Ops[] = {
- N->getOperand(1),
- Addr,
- Offset,
- CurDAG->getTargetConstant(0, DL, MVT::i32),
- N->getOperand(0),
- };
- return CurDAG->getMachineNode(AMDGPU::SI_RegisterStorePseudo, DL,
- CurDAG->getVTList(MVT::Other),
- Ops);
- }
-
case AMDGPUISD::BFE_I32:
case AMDGPUISD::BFE_U32: {
if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS)
@@ -575,7 +489,6 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
return getS_BFE(Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32, SDLoc(N),
N->getOperand(0), OffsetVal, WidthVal);
-
}
case AMDGPUISD::DIV_SCALE: {
return SelectDIV_SCALE(N);
@@ -601,7 +514,6 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
return SelectCode(N);
}
-
bool AMDGPUDAGToDAGISel::checkType(const Value *Ptr, unsigned AS) {
assert(AS != 0 && "Use checkPrivateAddress instead.");
if (!Ptr)
@@ -681,7 +593,7 @@ bool AMDGPUDAGToDAGISel::isCPLoad(const LoadSDNode *N) const {
if (checkPrivateAddress(N->getMemOperand())) {
if (MMO) {
const PseudoSourceValue *PSV = MMO->getPseudoValue();
- if (PSV && PSV == PseudoSourceValue::getConstantPool()) {
+ if (PSV && PSV->isConstantPool()) {
return true;
}
}
@@ -847,7 +759,8 @@ SDNode *AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
unsigned Opc
= (VT == MVT::f64) ? AMDGPU::V_DIV_SCALE_F64 : AMDGPU::V_DIV_SCALE_F32;
- // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp, omod
+ // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp,
+ // omod
SDValue Ops[8];
SelectVOP3Mods0(N->getOperand(0), Ops[1], Ops[0], Ops[6], Ops[7]);
@@ -883,15 +796,39 @@ bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
Offset = N1;
return true;
}
- }
+ } else if (Addr.getOpcode() == ISD::SUB) {
+ // sub C, x -> add (sub 0, x), C
+ if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
+ int64_t ByteOffset = C->getSExtValue();
+ if (isUInt<16>(ByteOffset)) {
+ SDLoc DL(Addr);
+ SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
+
+ // XXX - This is kind of hacky. Create a dummy sub node so we can check
+ // the known bits in isDSOffsetLegal. We need to emit the selected node
+ // here, so this is thrown away.
+ SDValue Sub = CurDAG->getNode(ISD::SUB, DL, MVT::i32,
+ Zero, Addr.getOperand(1));
+
+ if (isDSOffsetLegal(Sub, ByteOffset, 16)) {
+ MachineSDNode *MachineSub
+ = CurDAG->getMachineNode(AMDGPU::V_SUB_I32_e32, DL, MVT::i32,
+ Zero, Addr.getOperand(1));
+
+ Base = SDValue(MachineSub, 0);
+ Offset = Addr.getOperand(0);
+ return true;
+ }
+ }
+ }
+ } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
+ // If we have a constant address, prefer to put the constant into the
+ // offset. This can save moves to load the constant address since multiple
+ // operations can share the zero base address register, and enables merging
+ // into read2 / write2 instructions.
- SDLoc DL(Addr);
+ SDLoc DL(Addr);
- // If we have a constant address, prefer to put the constant into the
- // offset. This can save moves to load the constant address since multiple
- // operations can share the zero base address register, and enables merging
- // into read2 / write2 instructions.
- if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
if (isUInt<16>(CAddr->getZExtValue())) {
SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
MachineSDNode *MovZero = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
@@ -904,10 +841,11 @@ bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
// default case
Base = Addr;
- Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
+ Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i16);
return true;
}
+// TODO: If offset is too big, put low 16-bit into offset.
bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base,
SDValue &Offset0,
SDValue &Offset1) const {
@@ -926,9 +864,35 @@ bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base,
Offset1 = CurDAG->getTargetConstant(DWordOffset1, DL, MVT::i8);
return true;
}
- }
-
- if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
+ } else if (Addr.getOpcode() == ISD::SUB) {
+ // sub C, x -> add (sub 0, x), C
+ if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
+ unsigned DWordOffset0 = C->getZExtValue() / 4;
+ unsigned DWordOffset1 = DWordOffset0 + 1;
+
+ if (isUInt<8>(DWordOffset0)) {
+ SDLoc DL(Addr);
+ SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
+
+ // XXX - This is kind of hacky. Create a dummy sub node so we can check
+ // the known bits in isDSOffsetLegal. We need to emit the selected node
+ // here, so this is thrown away.
+ SDValue Sub = CurDAG->getNode(ISD::SUB, DL, MVT::i32,
+ Zero, Addr.getOperand(1));
+
+ if (isDSOffsetLegal(Sub, DWordOffset1, 8)) {
+ MachineSDNode *MachineSub
+ = CurDAG->getMachineNode(AMDGPU::V_SUB_I32_e32, DL, MVT::i32,
+ Zero, Addr.getOperand(1));
+
+ Base = SDValue(MachineSub, 0);
+ Offset0 = CurDAG->getTargetConstant(DWordOffset0, DL, MVT::i8);
+ Offset1 = CurDAG->getTargetConstant(DWordOffset1, DL, MVT::i8);
+ return true;
+ }
+ }
+ }
+ } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
unsigned DWordOffset0 = CAddr->getZExtValue() / 4;
unsigned DWordOffset1 = DWordOffset0 + 1;
assert(4 * DWordOffset0 == CAddr->getZExtValue());
@@ -956,12 +920,16 @@ static bool isLegalMUBUFImmOffset(const ConstantSDNode *Imm) {
return isUInt<12>(Imm->getZExtValue());
}
-void AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr,
+bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr,
SDValue &VAddr, SDValue &SOffset,
SDValue &Offset, SDValue &Offen,
SDValue &Idxen, SDValue &Addr64,
SDValue &GLC, SDValue &SLC,
SDValue &TFE) const {
+ // Subtarget prefers to use flat instruction
+ if (Subtarget->useFlatForGlobal())
+ return false;
+
SDLoc DL(Addr);
GLC = CurDAG->getTargetConstant(0, DL, MVT::i1);
@@ -994,14 +962,14 @@ void AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr,
if (isLegalMUBUFImmOffset(C1)) {
Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
- return;
+ return true;
} else if (isUInt<32>(C1->getZExtValue())) {
// Illegal offset, store it in soffset.
Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
SOffset = SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32)),
0);
- return;
+ return true;
}
}
@@ -1013,7 +981,7 @@ void AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr,
Ptr = N0;
VAddr = N1;
Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
- return;
+ return true;
}
// default case -> offset
@@ -1021,6 +989,7 @@ void AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr,
Ptr = Addr;
Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
+ return true;
}
bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
@@ -1033,8 +1002,9 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
return false;
- SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64,
- GLC, SLC, TFE);
+ if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64,
+ GLC, SLC, TFE))
+ return false;
ConstantSDNode *C = cast<ConstantSDNode>(Addr64);
if (C->getSExtValue()) {
@@ -1052,8 +1022,8 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
SDValue &VAddr, SDValue &SOffset,
- SDValue &Offset,
- SDValue &SLC) const {
+ SDValue &Offset,
+ SDValue &SLC) const {
SLC = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i1);
SDValue GLC, TFE;
@@ -1066,36 +1036,10 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc,
SDLoc DL(Addr);
MachineFunction &MF = CurDAG->getMachineFunction();
- const SIRegisterInfo *TRI =
- static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
- MachineRegisterInfo &MRI = MF.getRegInfo();
- const SITargetLowering& Lowering =
- *static_cast<const SITargetLowering*>(getTargetLowering());
-
- unsigned ScratchOffsetReg =
- TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_WAVE_OFFSET);
- Lowering.CreateLiveInRegister(*CurDAG, &AMDGPU::SReg_32RegClass,
- ScratchOffsetReg, MVT::i32);
- SDValue Sym0 = CurDAG->getExternalSymbol("SCRATCH_RSRC_DWORD0", MVT::i32);
- SDValue ScratchRsrcDword0 =
- SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, Sym0), 0);
+ const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
- SDValue Sym1 = CurDAG->getExternalSymbol("SCRATCH_RSRC_DWORD1", MVT::i32);
- SDValue ScratchRsrcDword1 =
- SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, Sym1), 0);
-
- const SDValue RsrcOps[] = {
- CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
- ScratchRsrcDword0,
- CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
- ScratchRsrcDword1,
- CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
- };
- SDValue ScratchPtr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
- MVT::v2i32, RsrcOps), 0);
- Rsrc = SDValue(Lowering.buildScratchRSRC(*CurDAG, DL, ScratchPtr), 0);
- SOffset = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL,
- MRI.getLiveInVirtReg(ScratchOffsetReg), MVT::i32);
+ Rsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
+ SOffset = CurDAG->getRegister(Info->getScratchWaveOffsetReg(), MVT::i32);
// (add n0, c1)
if (CurDAG->isBaseWithConstantOffset(Addr)) {
@@ -1126,8 +1070,9 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
const SIInstrInfo *TII =
static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
- SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64,
- GLC, SLC, TFE);
+ if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64,
+ GLC, SLC, TFE))
+ return false;
if (!cast<ConstantSDNode>(Offen)->getSExtValue() &&
!cast<ConstantSDNode>(Idxen)->getSExtValue() &&
@@ -1153,18 +1098,134 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE);
}
+///
+/// \param EncodedOffset This is the immediate value that will be encoded
+/// directly into the instruction. On SI/CI the \p EncodedOffset
+/// will be in units of dwords and on VI+ it will be units of bytes.
+static bool isLegalSMRDImmOffset(const AMDGPUSubtarget *ST,
+ int64_t EncodedOffset) {
+ return ST->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS ?
+ isUInt<8>(EncodedOffset) : isUInt<20>(EncodedOffset);
+}
+
+bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
+ SDValue &Offset, bool &Imm) const {
+
+ // FIXME: Handle non-constant offsets.
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(ByteOffsetNode);
+ if (!C)
+ return false;
+
+ SDLoc SL(ByteOffsetNode);
+ AMDGPUSubtarget::Generation Gen = Subtarget->getGeneration();
+ int64_t ByteOffset = C->getSExtValue();
+ int64_t EncodedOffset = Gen < AMDGPUSubtarget::VOLCANIC_ISLANDS ?
+ ByteOffset >> 2 : ByteOffset;
+
+ if (isLegalSMRDImmOffset(Subtarget, EncodedOffset)) {
+ Offset = CurDAG->getTargetConstant(EncodedOffset, SL, MVT::i32);
+ Imm = true;
+ return true;
+ }
+
+ if (!isUInt<32>(EncodedOffset) || !isUInt<32>(ByteOffset))
+ return false;
+
+ if (Gen == AMDGPUSubtarget::SEA_ISLANDS && isUInt<32>(EncodedOffset)) {
+ // 32-bit Immediates are supported on Sea Islands.
+ Offset = CurDAG->getTargetConstant(EncodedOffset, SL, MVT::i32);
+ } else {
+ SDValue C32Bit = CurDAG->getTargetConstant(ByteOffset, SL, MVT::i32);
+ Offset = SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32,
+ C32Bit), 0);
+ }
+ Imm = false;
+ return true;
+}
+
+bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase,
+ SDValue &Offset, bool &Imm) const {
+
+ SDLoc SL(Addr);
+ if (CurDAG->isBaseWithConstantOffset(Addr)) {
+ SDValue N0 = Addr.getOperand(0);
+ SDValue N1 = Addr.getOperand(1);
+
+ if (SelectSMRDOffset(N1, Offset, Imm)) {
+ SBase = N0;
+ return true;
+ }
+ }
+ SBase = Addr;
+ Offset = CurDAG->getTargetConstant(0, SL, MVT::i32);
+ Imm = true;
+ return true;
+}
+
+bool AMDGPUDAGToDAGISel::SelectSMRDImm(SDValue Addr, SDValue &SBase,
+ SDValue &Offset) const {
+ bool Imm;
+ return SelectSMRD(Addr, SBase, Offset, Imm) && Imm;
+}
+
+bool AMDGPUDAGToDAGISel::SelectSMRDImm32(SDValue Addr, SDValue &SBase,
+ SDValue &Offset) const {
+
+ if (Subtarget->getGeneration() != AMDGPUSubtarget::SEA_ISLANDS)
+ return false;
+
+ bool Imm;
+ if (!SelectSMRD(Addr, SBase, Offset, Imm))
+ return false;
+
+ return !Imm && isa<ConstantSDNode>(Offset);
+}
+
+bool AMDGPUDAGToDAGISel::SelectSMRDSgpr(SDValue Addr, SDValue &SBase,
+ SDValue &Offset) const {
+ bool Imm;
+ return SelectSMRD(Addr, SBase, Offset, Imm) && !Imm &&
+ !isa<ConstantSDNode>(Offset);
+}
+
+bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm(SDValue Addr,
+ SDValue &Offset) const {
+ bool Imm;
+ return SelectSMRDOffset(Addr, Offset, Imm) && Imm;
+}
+
+bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm32(SDValue Addr,
+ SDValue &Offset) const {
+ if (Subtarget->getGeneration() != AMDGPUSubtarget::SEA_ISLANDS)
+ return false;
+
+ bool Imm;
+ if (!SelectSMRDOffset(Addr, Offset, Imm))
+ return false;
+
+ return !Imm && isa<ConstantSDNode>(Offset);
+}
+
+bool AMDGPUDAGToDAGISel::SelectSMRDBufferSgpr(SDValue Addr,
+ SDValue &Offset) const {
+ bool Imm;
+ return SelectSMRDOffset(Addr, Offset, Imm) && !Imm &&
+ !isa<ConstantSDNode>(Offset);
+}
+
// FIXME: This is incorrect and only enough to be able to compile.
SDNode *AMDGPUDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
AddrSpaceCastSDNode *ASC = cast<AddrSpaceCastSDNode>(N);
SDLoc DL(N);
+ const MachineFunction &MF = CurDAG->getMachineFunction();
+ DiagnosticInfoUnsupported NotImplemented(*MF.getFunction(),
+ "addrspacecast not implemented");
+ CurDAG->getContext()->diagnose(NotImplemented);
+
assert(Subtarget->hasFlatAddressSpace() &&
"addrspacecast only supported with flat address space!");
- assert((ASC->getSrcAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS &&
- ASC->getDestAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS) &&
- "Cannot cast address space to / from constant address!");
-
assert((ASC->getSrcAddressSpace() == AMDGPUAS::FLAT_ADDRESS ||
ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS) &&
"Can only cast to / from flat address space!");
@@ -1190,7 +1251,6 @@ SDNode *AMDGPUDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32));
}
-
if (DestSize > SrcSize) {
assert(SrcSize == 32 && DestSize == 64);
@@ -1371,6 +1431,65 @@ bool AMDGPUDAGToDAGISel::SelectVOP3Mods0Clamp0OMod(SDValue In, SDValue &Src,
return SelectVOP3Mods(In, Src, SrcMods);
}
+void AMDGPUDAGToDAGISel::PreprocessISelDAG() {
+ bool Modified = false;
+
+ // XXX - Other targets seem to be able to do this without a worklist.
+ SmallVector<LoadSDNode *, 8> LoadsToReplace;
+ SmallVector<StoreSDNode *, 8> StoresToReplace;
+
+ for (SDNode &Node : CurDAG->allnodes()) {
+ if (LoadSDNode *LD = dyn_cast<LoadSDNode>(&Node)) {
+ EVT VT = LD->getValueType(0);
+ if (VT != MVT::i64 || LD->getExtensionType() != ISD::NON_EXTLOAD)
+ continue;
+
+ // To simplify the TableGen patters, we replace all i64 loads with v2i32
+ // loads. Alternatively, we could promote i64 loads to v2i32 during DAG
+ // legalization, however, so places (ExpandUnalignedLoad) in the DAG
+ // legalizer assume that if i64 is legal, so doing this promotion early
+ // can cause problems.
+ LoadsToReplace.push_back(LD);
+ } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(&Node)) {
+ // Handle i64 stores here for the same reason mentioned above for loads.
+ SDValue Value = ST->getValue();
+ if (Value.getValueType() != MVT::i64 || ST->isTruncatingStore())
+ continue;
+ StoresToReplace.push_back(ST);
+ }
+ }
+
+ for (LoadSDNode *LD : LoadsToReplace) {
+ SDLoc SL(LD);
+
+ SDValue NewLoad = CurDAG->getLoad(MVT::v2i32, SL, LD->getChain(),
+ LD->getBasePtr(), LD->getMemOperand());
+ SDValue BitCast = CurDAG->getNode(ISD::BITCAST, SL,
+ MVT::i64, NewLoad);
+ CurDAG->ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLoad.getValue(1));
+ CurDAG->ReplaceAllUsesOfValueWith(SDValue(LD, 0), BitCast);
+ Modified = true;
+ }
+
+ for (StoreSDNode *ST : StoresToReplace) {
+ SDValue NewValue = CurDAG->getNode(ISD::BITCAST, SDLoc(ST),
+ MVT::v2i32, ST->getValue());
+ const SDValue StoreOps[] = {
+ ST->getChain(),
+ NewValue,
+ ST->getBasePtr(),
+ ST->getOffset()
+ };
+
+ CurDAG->UpdateNodeOperands(ST, StoreOps);
+ Modified = true;
+ }
+
+ // XXX - Is this necessary?
+ if (Modified)
+ CurDAG->RemoveDeadNodes();
+}
+
void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
const AMDGPUTargetLowering& Lowering =
*static_cast<const AMDGPUTargetLowering*>(getTargetLowering());
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 3a65f3b..222f631 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -15,6 +15,7 @@
#include "AMDGPUISelLowering.h"
#include "AMDGPU.h"
+#include "AMDGPUDiagnosticInfoUnsupported.h"
#include "AMDGPUFrameLowering.h"
#include "AMDGPUIntrinsicInfo.h"
#include "AMDGPURegisterInfo.h"
@@ -27,50 +28,9 @@
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DiagnosticInfo.h"
-#include "llvm/IR/DiagnosticPrinter.h"
using namespace llvm;
-namespace {
-
-/// Diagnostic information for unimplemented or unsupported feature reporting.
-class DiagnosticInfoUnsupported : public DiagnosticInfo {
-private:
- const Twine &Description;
- const Function &Fn;
-
- static int KindID;
-
- static int getKindID() {
- if (KindID == 0)
- KindID = llvm::getNextAvailablePluginDiagnosticKind();
- return KindID;
- }
-
-public:
- DiagnosticInfoUnsupported(const Function &Fn, const Twine &Desc,
- DiagnosticSeverity Severity = DS_Error)
- : DiagnosticInfo(getKindID(), Severity),
- Description(Desc),
- Fn(Fn) { }
-
- const Function &getFunction() const { return Fn; }
- const Twine &getDescription() const { return Description; }
-
- void print(DiagnosticPrinter &DP) const override {
- DP << "unsupported " << getDescription() << " in " << Fn.getName();
- }
-
- static bool classof(const DiagnosticInfo *DI) {
- return DI->getKind() == getKindID();
- }
-};
-
-int DiagnosticInfoUnsupported::KindID = 0;
-}
-
-
static bool allocateStack(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo,
ISD::ArgFlagsTy ArgFlags, CCState &State) {
@@ -113,6 +73,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM,
setOperationAction(ISD::BR_JT, MVT::Other, Expand);
setOperationAction(ISD::BRIND, MVT::Other, Expand);
+  // This is totally unsupported; just custom lower it to produce an error.
+ setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
+
// We need to custom lower some of the intrinsics
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
@@ -352,7 +315,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM,
setOperationAction(ISD::SMUL_LOHI, VT, Expand);
setOperationAction(ISD::UMUL_LOHI, VT, Expand);
setOperationAction(ISD::SDIVREM, VT, Custom);
- setOperationAction(ISD::UDIVREM, VT, Custom);
+ setOperationAction(ISD::UDIVREM, VT, Expand);
setOperationAction(ISD::ADDC, VT, Expand);
setOperationAction(ISD::SUBC, VT, Expand);
setOperationAction(ISD::ADDE, VT, Expand);
@@ -429,12 +392,18 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM,
setSelectIsExpensive(false);
PredictableSelectIsExpensive = false;
- // There are no integer divide instructions, and these expand to a pretty
- // large sequence of instructions.
- setIntDivIsCheap(false);
- setPow2SDivIsCheap(false);
setFsqrtIsCheap(true);
+ // We want to find all load dependencies for long chains of stores to enable
+  // merging into very wide vectors. The problem is with vectors of more than 4
+  // elements: MergeConsecutiveStores will attempt to merge these because x8/x16
+  // vectors are legal types, even though we usually have to split the loads.
+  // When we can more precisely specify load legality per address
+ // space, we should be able to make FindBetterChain/MergeConsecutiveStores
+ // smarter so that they can figure out what to do in 2 iterations without all
+ // N > 4 stores on the same chain.
+ GatherAllAliasesMaxDepth = 16;
+
// FIXME: Need to really handle these.
MaxStoresPerMemcpy = 4096;
MaxStoresPerMemmove = 4096;
@@ -534,6 +503,18 @@ bool AMDGPUTargetLowering:: storeOfVectorConstantIsCheap(EVT MemVT,
return true;
}
+bool AMDGPUTargetLowering::aggressivelyPreferBuildVectorSources(EVT VecVT) const {
+ // There are few operations which truly have vector input operands. Any vector
+ // operation is going to involve operations on each component, and a
+ // build_vector will be a copy per element, so it always makes sense to use a
+ // build_vector input in place of the extracted element to avoid a copy into a
+ // super register.
+ //
+  // We should probably only do this if all users are extracts, but this
+ // should be the common case.
+ return true;
+}
+
bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const {
// Truncate is just accessing a subregister.
return Dest.bitsLT(Source) && (Dest.getSizeInBits() % 32 == 0);
@@ -617,6 +598,15 @@ SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
return SDValue();
}
+SDValue AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
+ SelectionDAG &DAG) const {
+ const Function &Fn = *DAG.getMachineFunction().getFunction();
+
+ DiagnosticInfoUnsupported NoDynamicAlloca(Fn, "dynamic alloca");
+ DAG.getContext()->diagnose(NoDynamicAlloca);
+ return SDValue();
+}
+
SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
SelectionDAG &DAG) const {
switch (Op.getOpcode()) {
@@ -643,6 +633,7 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG);
case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG);
+ case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
}
return Op;
}
@@ -892,7 +883,9 @@ SDValue AMDGPUTargetLowering::LowerFrameIndex(SDValue Op,
FrameIndexSDNode *FIN = cast<FrameIndexSDNode>(Op);
unsigned FrameIndex = FIN->getIndex();
- unsigned Offset = TFL->getFrameIndexOffset(MF, FrameIndex);
+ unsigned IgnoredFrameReg;
+ unsigned Offset =
+ TFL->getFrameIndexReference(MF, FrameIndex, IgnoredFrameReg);
return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF), SDLoc(Op),
Op.getValueType());
}
@@ -1043,9 +1036,6 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
Op.getOperand(1),
Op.getOperand(2));
- case AMDGPUIntrinsic::AMDGPU_brev:
- return DAG.getNode(AMDGPUISD::BREV, DL, VT, Op.getOperand(1));
-
case Intrinsic::AMDGPU_class:
return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT,
Op.getOperand(1), Op.getOperand(2));
@@ -1057,6 +1047,8 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return DAG.getNode(ISD::FRINT, DL, VT, Op.getOperand(1));
case AMDGPUIntrinsic::AMDGPU_trunc: // Legacy name.
return DAG.getNode(ISD::FTRUNC, DL, VT, Op.getOperand(1));
+ case AMDGPUIntrinsic::AMDGPU_brev: // Legacy name
+ return DAG.getNode(ISD::BITREVERSE, DL, VT, Op.getOperand(1));
}
}
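The legacy AMDGPU_brev intrinsic is now lowered to the generic ISD::BITREVERSE node rather than a target-specific BREV node. As a reference for the semantics being requested, a standalone 32-bit bit reversal:

    #include <cassert>
    #include <cstdint>

    // Bit i of the result is bit (31 - i) of the input, i.e. what
    // ISD::BITREVERSE computes for a 32-bit operand.
    static uint32_t bitReverse32(uint32_t V) {
      uint32_t R = 0;
      for (int I = 0; I < 32; ++I)
        R |= ((V >> I) & 1u) << (31 - I);
      return R;
    }

    int main() {
      assert(bitReverse32(0x00000001u) == 0x80000000u);
      assert(bitReverse32(0x0000FFFFu) == 0xFFFF0000u);
      assert(bitReverse32(bitReverse32(0x12345678u)) == 0x12345678u); // involution
      return 0;
    }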
@@ -1077,6 +1069,7 @@ SDValue AMDGPUTargetLowering::LowerIntrinsicLRP(SDValue Op,
SelectionDAG &DAG) const {
SDLoc DL(Op);
EVT VT = Op.getValueType();
+ // TODO: Should this propagate fast-math-flags?
SDValue OneSubA = DAG.getNode(ISD::FSUB, DL, VT,
DAG.getConstantFP(1.0f, DL, MVT::f32),
Op.getOperand(1));
@@ -1167,45 +1160,6 @@ SDValue AMDGPUTargetLowering::CombineFMinMaxLegacy(SDLoc DL,
return SDValue();
}
-// FIXME: Remove this when combines added to DAGCombiner.
-SDValue AMDGPUTargetLowering::CombineIMinMax(SDLoc DL,
- EVT VT,
- SDValue LHS,
- SDValue RHS,
- SDValue True,
- SDValue False,
- SDValue CC,
- SelectionDAG &DAG) const {
- if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True))
- return SDValue();
-
- ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
- switch (CCOpcode) {
- case ISD::SETULE:
- case ISD::SETULT: {
- unsigned Opc = (LHS == True) ? ISD::UMIN : ISD::UMAX;
- return DAG.getNode(Opc, DL, VT, LHS, RHS);
- }
- case ISD::SETLE:
- case ISD::SETLT: {
- unsigned Opc = (LHS == True) ? ISD::SMIN : ISD::SMAX;
- return DAG.getNode(Opc, DL, VT, LHS, RHS);
- }
- case ISD::SETGT:
- case ISD::SETGE: {
- unsigned Opc = (LHS == True) ? ISD::SMAX : ISD::SMIN;
- return DAG.getNode(Opc, DL, VT, LHS, RHS);
- }
- case ISD::SETUGE:
- case ISD::SETUGT: {
- unsigned Opc = (LHS == True) ? ISD::UMAX : ISD::UMIN;
- return DAG.getNode(Opc, DL, VT, LHS, RHS);
- }
- default:
- return SDValue();
- }
-}
-
SDValue AMDGPUTargetLowering::ScalarizeVectorLoad(const SDValue Op,
SelectionDAG &DAG) const {
LoadSDNode *Load = cast<LoadSDNode>(Op);
@@ -1260,7 +1214,8 @@ SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
EVT PtrVT = BasePtr.getValueType();
EVT MemVT = Load->getMemoryVT();
SDLoc SL(Op);
- MachinePointerInfo SrcValue(Load->getMemOperand()->getValue());
+
+ const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
EVT LoVT, HiVT;
EVT LoMemVT, HiMemVT;
@@ -1269,23 +1224,27 @@ SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemVT);
std::tie(Lo, Hi) = DAG.SplitVector(Op, SL, LoVT, HiVT);
+
+ unsigned Size = LoMemVT.getStoreSize();
+ unsigned BaseAlign = Load->getAlignment();
+ unsigned HiAlign = MinAlign(BaseAlign, Size);
+
SDValue LoLoad
= DAG.getExtLoad(Load->getExtensionType(), SL, LoVT,
Load->getChain(), BasePtr,
SrcValue,
LoMemVT, Load->isVolatile(), Load->isNonTemporal(),
- Load->isInvariant(), Load->getAlignment());
+ Load->isInvariant(), BaseAlign);
SDValue HiPtr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr,
- DAG.getConstant(LoMemVT.getStoreSize(), SL,
- PtrVT));
+ DAG.getConstant(Size, SL, PtrVT));
SDValue HiLoad
= DAG.getExtLoad(Load->getExtensionType(), SL, HiVT,
Load->getChain(), HiPtr,
SrcValue.getWithOffset(LoMemVT.getStoreSize()),
HiMemVT, Load->isVolatile(), Load->isNonTemporal(),
- Load->isInvariant(), Load->getAlignment());
+ Load->isInvariant(), HiAlign);
SDValue Ops[] = {
DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad),
@@ -1415,7 +1374,11 @@ SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
DAG.getConstant(LoMemVT.getStoreSize(), SL,
PtrVT));
- MachinePointerInfo SrcValue(Store->getMemOperand()->getValue());
+ const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo();
+ unsigned BaseAlign = Store->getAlignment();
+ unsigned Size = LoMemVT.getStoreSize();
+ unsigned HiAlign = MinAlign(BaseAlign, Size);
+
SDValue LoStore
= DAG.getTruncStore(Chain, SL, Lo,
BasePtr,
@@ -1423,15 +1386,15 @@ SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
LoMemVT,
Store->isNonTemporal(),
Store->isVolatile(),
- Store->getAlignment());
+ BaseAlign);
SDValue HiStore
= DAG.getTruncStore(Chain, SL, Hi,
HiPtr,
- SrcValue.getWithOffset(LoMemVT.getStoreSize()),
+ SrcValue.getWithOffset(Size),
HiMemVT,
Store->isNonTemporal(),
Store->isVolatile(),
- Store->getAlignment());
+ HiAlign);
return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore);
}
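Both split halves previously inherited the base alignment; the high half now uses MinAlign(BaseAlign, Size) because it starts Size bytes past the base pointer. A standalone sketch of that arithmetic (the helper mirrors what llvm::MinAlign computes, the largest power of two dividing both inputs):

    #include <cassert>
    #include <cstdint>

    // Largest power of two dividing both A and B: the lowest set bit of A | B.
    static uint64_t minAlign(uint64_t A, uint64_t B) {
      return (A | B) & (~(A | B) + 1);
    }

    int main() {
      // 32-byte access aligned to 16, split into two 16-byte halves: the high
      // half starts at offset 16, so it keeps 16-byte alignment.
      assert(minAlign(16, 16) == 16);
      // 16-byte access aligned to 16, split into two 8-byte halves: the high
      // half starts at offset 8 and may only be assumed 8-byte aligned.
      assert(minAlign(16, 8) == 8);
      // If the base pointer is only 4-byte aligned, so is the high half.
      assert(minAlign(4, 8) == 4);
      return 0;
    }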
@@ -1529,7 +1492,7 @@ SDValue AMDGPUTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
if ((Store->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) &&
Store->getValue().getValueType().isVector()) {
- return ScalarizeVectorStore(Op, DAG);
+ return SplitVectorStore(Op, DAG);
}
EVT MemVT = Store->getMemoryVT();
@@ -1630,6 +1593,7 @@ SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool
// float fb = (float)ib;
SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib);
+ // TODO: Should this propagate fast-math-flags?
// float fq = native_divide(fa, fb);
SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT,
fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb));
@@ -1940,6 +1904,8 @@ SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const {
SDValue X = Op.getOperand(0);
SDValue Y = Op.getOperand(1);
+ // TODO: Should this propagate fast-math-flags?
+
SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y);
SDValue Floor = DAG.getNode(ISD::FTRUNC, SL, VT, Div);
SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, Floor, Y);
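LowerFREM expands the remainder as x - trunc(x / y) * y, which matches fmod's truncate-toward-zero behavior. A scalar C++ check of the identity:

    #include <cassert>
    #include <cmath>

    static float fremExpansion(float X, float Y) {
      float Div = X / Y;
      float Trunc = std::trunc(Div); // the FTRUNC of the quotient above
      return X - Trunc * Y;          // FSUB(X, FMUL(Trunc, Y))
    }

    int main() {
      assert(fremExpansion(5.5f, 2.0f) == std::fmod(5.5f, 2.0f));   //  1.5
      assert(fremExpansion(-5.5f, 2.0f) == std::fmod(-5.5f, 2.0f)); // -1.5
      return 0;
    }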
@@ -1968,6 +1934,7 @@ SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero);
+ // TODO: Should this propagate fast-math-flags?
return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
}
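LowerFCEIL builds ceil for f64 from trunc: add 1.0 exactly when the input is positive and not already an integer. The same select, written as scalar C++:

    #include <cassert>
    #include <cmath>

    static double ceilExpansion(double X) {
      double Trunc = std::trunc(X);
      // SELECT(AND(X > 0, X != Trunc), 1.0, 0.0), then FADD with the trunc.
      double Add = (X > 0.0 && X != Trunc) ? 1.0 : 0.0;
      return Trunc + Add;
    }

    int main() {
      assert(ceilExpansion(2.25) == 3.0);
      assert(ceilExpansion(-2.25) == -2.0);
      assert(ceilExpansion(4.0) == 4.0);
      return 0;
    }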
@@ -2045,6 +2012,8 @@ SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64);
SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src);
+ // TODO: Should this propagate fast-math-flags?
+
SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign);
SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign);
@@ -2074,6 +2043,8 @@ SDValue AMDGPUTargetLowering::LowerFROUND32(SDValue Op, SelectionDAG &DAG) const
SDValue T = DAG.getNode(ISD::FTRUNC, SL, MVT::f32, X);
+ // TODO: Should this propagate fast-math-flags?
+
SDValue Diff = DAG.getNode(ISD::FSUB, SL, MVT::f32, X, T);
SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, MVT::f32, Diff);
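LowerFROUND32 rounds to nearest with halfway cases away from zero: keep the truncated value and add copysign(1.0, x) when the fractional magnitude reaches 0.5. A scalar equivalent:

    #include <cassert>
    #include <cmath>

    static float roundExpansion(float X) {
      float T = std::trunc(X);
      float Diff = X - T;
      // SELECT(|Diff| >= 0.5, FCOPYSIGN(1.0, X), 0.0), then FADD with T.
      float Sel = (std::fabs(Diff) >= 0.5f) ? std::copysign(1.0f, X) : 0.0f;
      return T + Sel;
    }

    int main() {
      assert(roundExpansion(2.5f) == 3.0f);
      assert(roundExpansion(-2.5f) == -3.0f);
      assert(roundExpansion(2.25f) == 2.0f);
      return 0;
    }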
@@ -2184,6 +2155,7 @@ SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero);
+ // TODO: Should this propagate fast-math-flags?
return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
}
@@ -2206,7 +2178,7 @@ SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG,
SDValue LdExp = DAG.getNode(AMDGPUISD::LDEXP, SL, MVT::f64, CvtHi,
DAG.getConstant(32, SL, MVT::i32));
-
+ // TODO: Should this propagate fast-math-flags?
return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo);
}
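LowerINT_TO_FP64 converts the two 32-bit halves separately and recombines them as ldexp(hi, 32) + lo. For the unsigned case, a host-side check of the same recipe:

    #include <cassert>
    #include <cmath>
    #include <cstdint>

    static double u64ToF64(uint64_t V) {
      uint32_t Lo = static_cast<uint32_t>(V);
      uint32_t Hi = static_cast<uint32_t>(V >> 32);
      // CVT each half, then LDEXP(CvtHi, 32) + CvtLo as in the lowering above.
      return std::ldexp(static_cast<double>(Hi), 32) + static_cast<double>(Lo);
    }

    int main() {
      assert(u64ToF64(0) == 0.0);
      assert(u64ToF64(UINT64_C(0x100000000)) == 4294967296.0);
      assert(u64ToF64(UINT64_C(0x123456789ABCDEF0)) ==
             static_cast<double>(UINT64_C(0x123456789ABCDEF0)));
      return 0;
    }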
@@ -2231,6 +2203,7 @@ SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, S0,
DAG.getConstant(1, DL, MVT::i32));
SDValue FloatHi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, Hi);
+ // TODO: Should this propagate fast-math-flags?
FloatHi = DAG.getNode(ISD::FMUL, DL, MVT::f32, FloatHi,
DAG.getConstantFP(4294967296.0f, DL, MVT::f32)); // 2^32
return DAG.getNode(ISD::FADD, DL, MVT::f32, FloatLo, FloatHi);
@@ -2257,7 +2230,7 @@ SDValue AMDGPUTargetLowering::LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG,
MVT::f64);
SDValue K1 = DAG.getConstantFP(BitsToDouble(UINT64_C(0xc1f0000000000000)), SL,
MVT::f64);
-
+ // TODO: Should this propagate fast-math-flags?
SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, Trunc, K0);
SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, MVT::f64, Mul);
@@ -2511,12 +2484,6 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
if (VT == MVT::f32)
return CombineFMinMaxLegacy(DL, VT, LHS, RHS, True, False, CC, DCI);
-
- // TODO: Implement min / max Evergreen instructions.
- if (VT == MVT::i32 &&
- Subtarget->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
- return CombineIMinMax(DL, VT, LHS, RHS, True, False, CC, DAG);
- }
}
break;
@@ -2652,20 +2619,14 @@ bool AMDGPUTargetLowering::isHWTrueValue(SDValue Op) const {
if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) {
return CFP->isExactlyValue(1.0);
}
- if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
- return C->isAllOnesValue();
- }
- return false;
+ return isAllOnesConstant(Op);
}
bool AMDGPUTargetLowering::isHWFalseValue(SDValue Op) const {
if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) {
return CFP->getValueAPF().isZero();
}
- if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
- return C->isNullValue();
- }
- return false;
+ return isNullConstant(Op);
}
SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
@@ -2738,7 +2699,6 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(BFE_I32)
NODE_NAME_CASE(BFI)
NODE_NAME_CASE(BFM)
- NODE_NAME_CASE(BREV)
NODE_NAME_CASE(MUL_U24)
NODE_NAME_CASE(MUL_I24)
NODE_NAME_CASE(MAD_U24)
@@ -2893,8 +2853,7 @@ unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
return 1;
unsigned SignBits = 32 - Width->getZExtValue() + 1;
- ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(Op.getOperand(1));
- if (!Offset || !Offset->isNullValue())
+ if (!isNullConstant(Op.getOperand(1)))
return SignBits;
// TODO: Could probably figure something out with non-0 offsets.
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 478b203..7314cc0 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -138,6 +138,7 @@ public:
bool storeOfVectorConstantIsCheap(EVT MemVT,
unsigned NumElem,
unsigned AS) const override;
+ bool aggressivelyPreferBuildVectorSources(EVT VecVT) const override;
bool isCheapToSpeculateCttz() const override;
bool isCheapToSpeculateCtlz() const override;
@@ -149,6 +150,9 @@ public:
SDValue LowerCall(CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const override;
+ SDValue LowerDYNAMIC_STACKALLOC(SDValue Op,
+ SelectionDAG &DAG) const;
+
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
void ReplaceNodeResults(SDNode * N,
@@ -165,14 +169,6 @@ public:
SDValue False,
SDValue CC,
DAGCombinerInfo &DCI) const;
- SDValue CombineIMinMax(SDLoc DL,
- EVT VT,
- SDValue LHS,
- SDValue RHS,
- SDValue True,
- SDValue False,
- SDValue CC,
- SelectionDAG &DAG) const;
const char* getTargetNodeName(unsigned Opcode) const override;
@@ -216,7 +212,7 @@ public:
/// \brief Helper function that returns the byte offset of the given
/// type of implicit parameter.
- unsigned getImplicitParameterOffset(const AMDGPUMachineFunction *MFI,
+ uint32_t getImplicitParameterOffset(const AMDGPUMachineFunction *MFI,
const ImplicitParameter Param) const;
};
@@ -267,7 +263,6 @@ enum NodeType : unsigned {
BFE_I32, // Extract range of bits with sign extension to 32-bits.
BFI, // (src0 & src1) | (~src0 & src2)
BFM, // Insert a range of bits into a 32-bit word.
- BREV, // Reverse bits.
MUL_U24,
MUL_I24,
MAD_U24,
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
index 15a3d54..a266e71 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
@@ -164,11 +164,6 @@ MachineInstr *AMDGPUInstrInfo::foldMemoryOperandImpl(
// TODO: Implement this function
return nullptr;
}
-bool AMDGPUInstrInfo::canFoldMemoryOperand(const MachineInstr *MI,
- ArrayRef<unsigned> Ops) const {
- // TODO: Implement this function
- return false;
-}
bool
AMDGPUInstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI,
unsigned Reg, bool UnfoldLoad,
@@ -312,7 +307,9 @@ int AMDGPUInstrInfo::getIndirectIndexEnd(const MachineFunction &MF) const {
return -1;
}
- Offset = MF.getSubtarget().getFrameLowering()->getFrameIndexOffset(MF, -1);
+ unsigned IgnoredFrameReg;
+ Offset = MF.getSubtarget().getFrameLowering()->getFrameIndexReference(
+ MF, -1, IgnoredFrameReg);
return getIndirectIndexBegin(MF) + Offset;
}
@@ -367,3 +364,14 @@ int AMDGPUInstrInfo::pseudoToMCOpcode(int Opcode) const {
return MCOp;
}
+
+ArrayRef<std::pair<int, const char *>>
+AMDGPUInstrInfo::getSerializableTargetIndices() const {
+ static const std::pair<int, const char *> TargetIndices[] = {
+ {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"},
+ {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"},
+ {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"},
+ {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"},
+ {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}};
+ return makeArrayRef(TargetIndices);
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h
index 86d3962..53e8b23 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h
@@ -103,8 +103,6 @@ public:
/// read or write or -1 if indirect addressing is not used by this program.
int getIndirectIndexEnd(const MachineFunction &MF) const;
- bool canFoldMemoryOperand(const MachineInstr *MI,
- ArrayRef<unsigned> Ops) const override;
bool unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI,
unsigned Reg, bool UnfoldLoad, bool UnfoldStore,
SmallVectorImpl<MachineInstr *> &NewMIs) const override;
@@ -147,6 +145,9 @@ public:
return get(pseudoToMCOpcode(Opcode));
}
+ ArrayRef<std::pair<int, const char *>>
+ getSerializableTargetIndices() const override;
+
//===---------------------------------------------------------------------===//
// Pure virtual functions to be implemented by sub-classes.
//===---------------------------------------------------------------------===//
@@ -195,6 +196,7 @@ public:
};
namespace AMDGPU {
+ LLVM_READONLY
int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIndex);
} // End namespace AMDGPU
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
index b413897..70e589c 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -191,8 +191,6 @@ def AMDGPUbfe_i32 : SDNode<"AMDGPUISD::BFE_I32", AMDGPUDTIntTernaryOp>;
def AMDGPUbfi : SDNode<"AMDGPUISD::BFI", AMDGPUDTIntTernaryOp>;
def AMDGPUbfm : SDNode<"AMDGPUISD::BFM", SDTIntBinOp>;
-def AMDGPUbrev : SDNode<"AMDGPUISD::BREV", SDTIntUnaryOp>;
-
// Signed and unsigned 24-bit multiply. The highest 8 bits are ignored when
// performing the multiply. The result is a 32-bit value.
def AMDGPUmul_u24 : SDNode<"AMDGPUISD::MUL_U24", SDTIntBinOp,
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
index 72cab39..11f6139 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -514,7 +514,7 @@ class POW_Common <AMDGPUInst log_ieee, AMDGPUInst exp_ieee, AMDGPUInst mul>
class Extract_Element <ValueType sub_type, ValueType vec_type, int sub_idx,
SubRegIndex sub_reg>
: Pat<
- (sub_type (vector_extract vec_type:$src, sub_idx)),
+ (sub_type (extractelt vec_type:$src, sub_idx)),
(EXTRACT_SUBREG $src, sub_reg)
>;
@@ -522,7 +522,7 @@ class Extract_Element <ValueType sub_type, ValueType vec_type, int sub_idx,
class Insert_Element <ValueType elem_type, ValueType vec_type,
int sub_idx, SubRegIndex sub_reg>
: Pat <
- (vector_insert vec_type:$vec, elem_type:$elem, sub_idx),
+ (insertelt vec_type:$vec, elem_type:$elem, sub_idx),
(INSERT_SUBREG $vec, $elem, sub_reg)
>;
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsics.td b/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsics.td
index ab489cd..1de3546 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsics.td
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsics.td
@@ -69,8 +69,8 @@ let TargetPrefix = "AMDGPU", isTarget = 1 in {
def int_AMDGPU_bfm : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
def int_AMDGPU_brev : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>;
def int_AMDGPU_flbit_i32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>;
- def int_AMDGPU_barrier_local : Intrinsic<[], [], []>;
- def int_AMDGPU_barrier_global : Intrinsic<[], [], []>;
+ def int_AMDGPU_barrier_local : Intrinsic<[], [], [IntrConvergent]>;
+ def int_AMDGPU_barrier_global : Intrinsic<[], [], [IntrConvergent]>;
}
// Legacy names for compatibility.
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
index 2083146..dfc652f 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
@@ -61,7 +61,7 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
MCOp = MCOperand::createImm(MO.getImm());
break;
case MachineOperand::MO_Register:
- MCOp = MCOperand::createReg(MO.getReg());
+ MCOp = MCOperand::createReg(AMDGPU::getMCReg(MO.getReg(), ST));
break;
case MachineOperand::MO_MachineBasicBlock:
MCOp = MCOperand::createExpr(MCSymbolRefExpr::create(
@@ -73,13 +73,6 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
MCOp = MCOperand::createExpr(MCSymbolRefExpr::create(Sym, Ctx));
break;
}
- case MachineOperand::MO_TargetIndex: {
- assert(MO.getIndex() == AMDGPU::TI_CONSTDATA_START);
- MCSymbol *Sym = Ctx.getOrCreateSymbol(StringRef(END_OF_TEXT_LABEL_NAME));
- const MCSymbolRefExpr *Expr = MCSymbolRefExpr::create(Sym, Ctx);
- MCOp = MCOperand::createExpr(Expr);
- break;
- }
case MachineOperand::MO_ExternalSymbol: {
MCSymbol *Sym = Ctx.getOrCreateSymbol(StringRef(MO.getSymbolName()));
const MCSymbolRefExpr *Expr = MCSymbolRefExpr::create(Sym, Ctx);
@@ -104,10 +97,9 @@ void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) {
#endif
if (MI->isBundle()) {
const MachineBasicBlock *MBB = MI->getParent();
- MachineBasicBlock::const_instr_iterator I = MI;
- ++I;
- while (I != MBB->end() && I->isInsideBundle()) {
- EmitInstruction(I);
+ MachineBasicBlock::const_instr_iterator I = ++MI->getIterator();
+ while (I != MBB->instr_end() && I->isInsideBundle()) {
+ EmitInstruction(&*I);
++I;
}
} else {
@@ -136,8 +128,6 @@ void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) {
MCCodeEmitter &InstEmitter = ObjStreamer.getAssembler().getEmitter();
InstEmitter.encodeInstruction(TmpInst, CodeStream, Fixups,
MF->getSubtarget<MCSubtargetInfo>());
- CodeStream.flush();
-
HexLines.resize(HexLines.size() + 1);
std::string &HexLine = HexLines.back();
raw_string_ostream HexStream(HexLine);
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
index 21c7da6..5413717 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
@@ -1,11 +1,10 @@
#include "AMDGPUMachineFunction.h"
#include "AMDGPU.h"
+#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Function.h"
using namespace llvm;
-static const char *const ShaderTypeAttribute = "ShaderType";
-
// Pin the vtable to this file.
void AMDGPUMachineFunction::anchor() {}
@@ -13,13 +12,9 @@ AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) :
MachineFunctionInfo(),
ShaderType(ShaderType::COMPUTE),
LDSSize(0),
+ ABIArgOffset(0),
ScratchSize(0),
IsKernel(true) {
- Attribute A = MF.getFunction()->getFnAttribute(ShaderTypeAttribute);
- if (A.isStringAttribute()) {
- StringRef Str = A.getValueAsString();
- if (Str.getAsInteger(0, ShaderType))
- llvm_unreachable("Can't parse shader type!");
- }
+ ShaderType = AMDGPU::getShaderType(*MF.getFunction());
}
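The inline attribute parsing is replaced by a shared AMDGPU::getShaderType() helper in Utils/AMDGPUBaseInfo, which is not part of this hunk. Based on the code removed above, a rough sketch of what such a helper presumably does (the name and exact signature here are assumptions):

    #include "llvm/IR/Attributes.h"
    #include "llvm/IR/Function.h"
    #include "llvm/Support/ErrorHandling.h"

    // Hypothetical reconstruction of the helper: read the "ShaderType" string
    // attribute and parse it as an integer, falling back to the compute type.
    // The llvm_unreachable mirrors the behavior of the removed code.
    static unsigned getShaderTypeSketch(const llvm::Function &F,
                                        unsigned DefaultType /* ShaderType::COMPUTE */) {
      llvm::Attribute A = F.getFnAttribute("ShaderType");
      if (!A.isStringAttribute())
        return DefaultType;

      unsigned Type = DefaultType;
      if (A.getValueAsString().getAsInteger(0, Type))
        llvm_unreachable("Can't parse shader type!");
      return Type;
    }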
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
index f5e4694..46fcee8 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
@@ -37,6 +37,11 @@ public:
return ShaderType;
}
+ bool isKernel() const {
+ // FIXME: Assume everything is a kernel until function calls are supported.
+ return true;
+ }
+
unsigned ScratchSize;
bool IsKernel;
};
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUOpenCLImageTypeLoweringPass.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUOpenCLImageTypeLoweringPass.cpp
new file mode 100644
index 0000000..554bf1d
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUOpenCLImageTypeLoweringPass.cpp
@@ -0,0 +1,373 @@
+//===-- AMDGPUOpenCLImageTypeLoweringPass.cpp -----------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass resolves calls to OpenCL image attribute, image resource ID and
+/// sampler resource ID getter functions.
+///
+/// Image attributes (size and format) are expected to be passed to the kernel
+/// as kernel arguments immediately following the image argument itself,
+/// therefore this pass adds image size and format arguments to the kernel
+/// functions in the module. The kernel functions with image arguments are
+/// re-created using the new signature. The new arguments are added to the
+/// kernel metadata with kernel_arg_type set to "image_size" or "image_format".
+/// Note: this pass may invalidate pointers to functions.
+///
+/// Resource IDs of read-only images, write-only images and samplers are
+/// defined to be their index among the kernel arguments of the same
+/// type and access qualifier.
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+
+using namespace llvm;
+
+namespace {
+
+StringRef GetImageSizeFunc = "llvm.OpenCL.image.get.size";
+StringRef GetImageFormatFunc = "llvm.OpenCL.image.get.format";
+StringRef GetImageResourceIDFunc = "llvm.OpenCL.image.get.resource.id";
+StringRef GetSamplerResourceIDFunc = "llvm.OpenCL.sampler.get.resource.id";
+
+StringRef ImageSizeArgMDType = "__llvm_image_size";
+StringRef ImageFormatArgMDType = "__llvm_image_format";
+
+StringRef KernelsMDNodeName = "opencl.kernels";
+StringRef KernelArgMDNodeNames[] = {
+ "kernel_arg_addr_space",
+ "kernel_arg_access_qual",
+ "kernel_arg_type",
+ "kernel_arg_base_type",
+ "kernel_arg_type_qual"};
+const unsigned NumKernelArgMDNodes = 5;
+
+typedef SmallVector<Metadata *, 8> MDVector;
+struct KernelArgMD {
+ MDVector ArgVector[NumKernelArgMDNodes];
+};
+
+} // end anonymous namespace
+
+static inline bool
+IsImageType(StringRef TypeString) {
+ return TypeString == "image2d_t" || TypeString == "image3d_t";
+}
+
+static inline bool
+IsSamplerType(StringRef TypeString) {
+ return TypeString == "sampler_t";
+}
+
+static Function *
+GetFunctionFromMDNode(MDNode *Node) {
+ if (!Node)
+ return nullptr;
+
+ size_t NumOps = Node->getNumOperands();
+ if (NumOps != NumKernelArgMDNodes + 1)
+ return nullptr;
+
+ auto F = mdconst::dyn_extract<Function>(Node->getOperand(0));
+ if (!F)
+ return nullptr;
+
+ // Sanity checks.
+ size_t ExpectNumArgNodeOps = F->arg_size() + 1;
+ for (size_t i = 0; i < NumKernelArgMDNodes; ++i) {
+ MDNode *ArgNode = dyn_cast_or_null<MDNode>(Node->getOperand(i + 1));
+ if (ArgNode->getNumOperands() != ExpectNumArgNodeOps)
+ return nullptr;
+ if (!ArgNode->getOperand(0))
+ return nullptr;
+
+    // FIXME: It should be possible to do image lowering when some metadata
+    // args are missing or not in the expected order.
+ MDString *StringNode = dyn_cast<MDString>(ArgNode->getOperand(0));
+ if (!StringNode || StringNode->getString() != KernelArgMDNodeNames[i])
+ return nullptr;
+ }
+
+ return F;
+}
+
+static StringRef
+AccessQualFromMD(MDNode *KernelMDNode, unsigned ArgIdx) {
+ MDNode *ArgAQNode = cast<MDNode>(KernelMDNode->getOperand(2));
+ return cast<MDString>(ArgAQNode->getOperand(ArgIdx + 1))->getString();
+}
+
+static StringRef
+ArgTypeFromMD(MDNode *KernelMDNode, unsigned ArgIdx) {
+ MDNode *ArgTypeNode = cast<MDNode>(KernelMDNode->getOperand(3));
+ return cast<MDString>(ArgTypeNode->getOperand(ArgIdx + 1))->getString();
+}
+
+static MDVector
+GetArgMD(MDNode *KernelMDNode, unsigned OpIdx) {
+ MDVector Res;
+ for (unsigned i = 0; i < NumKernelArgMDNodes; ++i) {
+ MDNode *Node = cast<MDNode>(KernelMDNode->getOperand(i + 1));
+ Res.push_back(Node->getOperand(OpIdx));
+ }
+ return Res;
+}
+
+static void
+PushArgMD(KernelArgMD &MD, const MDVector &V) {
+ assert(V.size() == NumKernelArgMDNodes);
+ for (unsigned i = 0; i < NumKernelArgMDNodes; ++i) {
+ MD.ArgVector[i].push_back(V[i]);
+ }
+}
+
+namespace {
+
+class AMDGPUOpenCLImageTypeLoweringPass : public ModulePass {
+ static char ID;
+
+ LLVMContext *Context;
+ Type *Int32Type;
+ Type *ImageSizeType;
+ Type *ImageFormatType;
+ SmallVector<Instruction *, 4> InstsToErase;
+
+ bool replaceImageUses(Argument &ImageArg, uint32_t ResourceID,
+ Argument &ImageSizeArg,
+ Argument &ImageFormatArg) {
+ bool Modified = false;
+
+ for (auto &Use : ImageArg.uses()) {
+ auto Inst = dyn_cast<CallInst>(Use.getUser());
+ if (!Inst) {
+ continue;
+ }
+
+ Function *F = Inst->getCalledFunction();
+ if (!F)
+ continue;
+
+ Value *Replacement = nullptr;
+ StringRef Name = F->getName();
+ if (Name.startswith(GetImageResourceIDFunc)) {
+ Replacement = ConstantInt::get(Int32Type, ResourceID);
+ } else if (Name.startswith(GetImageSizeFunc)) {
+ Replacement = &ImageSizeArg;
+ } else if (Name.startswith(GetImageFormatFunc)) {
+ Replacement = &ImageFormatArg;
+ } else {
+ continue;
+ }
+
+ Inst->replaceAllUsesWith(Replacement);
+ InstsToErase.push_back(Inst);
+ Modified = true;
+ }
+
+ return Modified;
+ }
+
+ bool replaceSamplerUses(Argument &SamplerArg, uint32_t ResourceID) {
+ bool Modified = false;
+
+ for (const auto &Use : SamplerArg.uses()) {
+ auto Inst = dyn_cast<CallInst>(Use.getUser());
+ if (!Inst) {
+ continue;
+ }
+
+ Function *F = Inst->getCalledFunction();
+ if (!F)
+ continue;
+
+ Value *Replacement = nullptr;
+ StringRef Name = F->getName();
+ if (Name == GetSamplerResourceIDFunc) {
+ Replacement = ConstantInt::get(Int32Type, ResourceID);
+ } else {
+ continue;
+ }
+
+ Inst->replaceAllUsesWith(Replacement);
+ InstsToErase.push_back(Inst);
+ Modified = true;
+ }
+
+ return Modified;
+ }
+
+ bool replaceImageAndSamplerUses(Function *F, MDNode *KernelMDNode) {
+ uint32_t NumReadOnlyImageArgs = 0;
+ uint32_t NumWriteOnlyImageArgs = 0;
+ uint32_t NumSamplerArgs = 0;
+
+ bool Modified = false;
+ InstsToErase.clear();
+ for (auto ArgI = F->arg_begin(); ArgI != F->arg_end(); ++ArgI) {
+ Argument &Arg = *ArgI;
+ StringRef Type = ArgTypeFromMD(KernelMDNode, Arg.getArgNo());
+
+ // Handle image types.
+ if (IsImageType(Type)) {
+ StringRef AccessQual = AccessQualFromMD(KernelMDNode, Arg.getArgNo());
+ uint32_t ResourceID;
+ if (AccessQual == "read_only") {
+ ResourceID = NumReadOnlyImageArgs++;
+ } else if (AccessQual == "write_only") {
+ ResourceID = NumWriteOnlyImageArgs++;
+ } else {
+ llvm_unreachable("Wrong image access qualifier.");
+ }
+
+ Argument &SizeArg = *(++ArgI);
+ Argument &FormatArg = *(++ArgI);
+ Modified |= replaceImageUses(Arg, ResourceID, SizeArg, FormatArg);
+
+ // Handle sampler type.
+ } else if (IsSamplerType(Type)) {
+ uint32_t ResourceID = NumSamplerArgs++;
+ Modified |= replaceSamplerUses(Arg, ResourceID);
+ }
+ }
+ for (unsigned i = 0; i < InstsToErase.size(); ++i) {
+ InstsToErase[i]->eraseFromParent();
+ }
+
+ return Modified;
+ }
+
+ std::tuple<Function *, MDNode *>
+ addImplicitArgs(Function *F, MDNode *KernelMDNode) {
+ bool Modified = false;
+
+ FunctionType *FT = F->getFunctionType();
+ SmallVector<Type *, 8> ArgTypes;
+
+ // Metadata operands for new MDNode.
+ KernelArgMD NewArgMDs;
+ PushArgMD(NewArgMDs, GetArgMD(KernelMDNode, 0));
+
+ // Add implicit arguments to the signature.
+ for (unsigned i = 0; i < FT->getNumParams(); ++i) {
+ ArgTypes.push_back(FT->getParamType(i));
+ MDVector ArgMD = GetArgMD(KernelMDNode, i + 1);
+ PushArgMD(NewArgMDs, ArgMD);
+
+ if (!IsImageType(ArgTypeFromMD(KernelMDNode, i)))
+ continue;
+
+ // Add size implicit argument.
+ ArgTypes.push_back(ImageSizeType);
+ ArgMD[2] = ArgMD[3] = MDString::get(*Context, ImageSizeArgMDType);
+ PushArgMD(NewArgMDs, ArgMD);
+
+ // Add format implicit argument.
+ ArgTypes.push_back(ImageFormatType);
+ ArgMD[2] = ArgMD[3] = MDString::get(*Context, ImageFormatArgMDType);
+ PushArgMD(NewArgMDs, ArgMD);
+
+ Modified = true;
+ }
+ if (!Modified) {
+ return std::make_tuple(nullptr, nullptr);
+ }
+
+ // Create function with new signature and clone the old body into it.
+ auto NewFT = FunctionType::get(FT->getReturnType(), ArgTypes, false);
+ auto NewF = Function::Create(NewFT, F->getLinkage(), F->getName());
+ ValueToValueMapTy VMap;
+ auto NewFArgIt = NewF->arg_begin();
+ for (auto &Arg: F->args()) {
+ auto ArgName = Arg.getName();
+ NewFArgIt->setName(ArgName);
+ VMap[&Arg] = &(*NewFArgIt++);
+ if (IsImageType(ArgTypeFromMD(KernelMDNode, Arg.getArgNo()))) {
+ (NewFArgIt++)->setName(Twine("__size_") + ArgName);
+ (NewFArgIt++)->setName(Twine("__format_") + ArgName);
+ }
+ }
+ SmallVector<ReturnInst*, 8> Returns;
+ CloneFunctionInto(NewF, F, VMap, /*ModuleLevelChanges=*/false, Returns);
+
+ // Build new MDNode.
+ SmallVector<llvm::Metadata *, 6> KernelMDArgs;
+ KernelMDArgs.push_back(ConstantAsMetadata::get(NewF));
+ for (unsigned i = 0; i < NumKernelArgMDNodes; ++i)
+ KernelMDArgs.push_back(MDNode::get(*Context, NewArgMDs.ArgVector[i]));
+ MDNode *NewMDNode = MDNode::get(*Context, KernelMDArgs);
+
+ return std::make_tuple(NewF, NewMDNode);
+ }
+
+ bool transformKernels(Module &M) {
+ NamedMDNode *KernelsMDNode = M.getNamedMetadata(KernelsMDNodeName);
+ if (!KernelsMDNode)
+ return false;
+
+ bool Modified = false;
+ for (unsigned i = 0; i < KernelsMDNode->getNumOperands(); ++i) {
+ MDNode *KernelMDNode = KernelsMDNode->getOperand(i);
+ Function *F = GetFunctionFromMDNode(KernelMDNode);
+ if (!F)
+ continue;
+
+ Function *NewF;
+ MDNode *NewMDNode;
+ std::tie(NewF, NewMDNode) = addImplicitArgs(F, KernelMDNode);
+ if (NewF) {
+ // Replace old function and metadata with new ones.
+ F->eraseFromParent();
+ M.getFunctionList().push_back(NewF);
+ M.getOrInsertFunction(NewF->getName(), NewF->getFunctionType(),
+ NewF->getAttributes());
+ KernelsMDNode->setOperand(i, NewMDNode);
+
+ F = NewF;
+ KernelMDNode = NewMDNode;
+ Modified = true;
+ }
+
+ Modified |= replaceImageAndSamplerUses(F, KernelMDNode);
+ }
+
+ return Modified;
+ }
+
+ public:
+ AMDGPUOpenCLImageTypeLoweringPass() : ModulePass(ID) {}
+
+ bool runOnModule(Module &M) override {
+ Context = &M.getContext();
+ Int32Type = Type::getInt32Ty(M.getContext());
+ ImageSizeType = ArrayType::get(Int32Type, 3);
+ ImageFormatType = ArrayType::get(Int32Type, 2);
+
+ return transformKernels(M);
+ }
+
+ const char *getPassName() const override {
+ return "AMDGPU OpenCL Image Type Pass";
+ }
+};
+
+char AMDGPUOpenCLImageTypeLoweringPass::ID = 0;
+
+} // end anonymous namespace
+
+ModulePass *llvm::createAMDGPUOpenCLImageTypeLoweringPass() {
+ return new AMDGPUOpenCLImageTypeLoweringPass();
+}
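As the file comment above notes, resource IDs are purely positional: each read-only image, write-only image, and sampler is numbered by its order among the kernel arguments of the same kind. A standalone sketch of that numbering, with the argument kinds standing in for the metadata strings the pass inspects:

    #include <cassert>
    #include <string>
    #include <vector>

    struct Arg {
      std::string Type;       // "image2d_t", "image3d_t", "sampler_t", ...
      std::string AccessQual; // "read_only", "write_only", or "none"
    };

    // Mirror replaceImageAndSamplerUses(): separate counters for read-only
    // images, write-only images, and samplers; -1 marks non-resource args.
    // (The pass itself rejects any other image access qualifier.)
    static std::vector<int> assignResourceIDs(const std::vector<Arg> &Args) {
      int ReadOnly = 0, WriteOnly = 0, Samplers = 0;
      std::vector<int> IDs;
      for (const Arg &A : Args) {
        if (A.Type == "image2d_t" || A.Type == "image3d_t")
          IDs.push_back(A.AccessQual == "read_only" ? ReadOnly++ : WriteOnly++);
        else if (A.Type == "sampler_t")
          IDs.push_back(Samplers++);
        else
          IDs.push_back(-1);
      }
      return IDs;
    }

    int main() {
      std::vector<Arg> Args = {{"image2d_t", "read_only"},
                               {"sampler_t", "none"},
                               {"image2d_t", "write_only"},
                               {"image3d_t", "read_only"}};
      std::vector<int> IDs = assignResourceIDs(Args);
      assert(IDs[0] == 0); // first read-only image
      assert(IDs[1] == 0); // first sampler
      assert(IDs[2] == 0); // first write-only image
      assert(IDs[3] == 1); // second read-only image
      return 0;
    }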
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 57b7a73..87d50d5 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -54,7 +54,7 @@ bool AMDGPUPromoteAlloca::doInitialization(Module &M) {
bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
- const FunctionType *FTy = F.getFunctionType();
+ FunctionType *FTy = F.getFunctionType();
LocalMemAvailable = ST.getLocalMemorySize();
@@ -63,7 +63,7 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
// possible these arguments require the entire local memory space, so
// we cannot use local memory in the pass.
for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i) {
- const Type *ParamTy = FTy->getParamType(i);
+ Type *ParamTy = FTy->getParamType(i);
if (ParamTy->isPointerTy() &&
ParamTy->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
LocalMemAvailable = 0;
@@ -77,7 +77,7 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
// Check how much local memory is being used by global objects
for (Module::global_iterator I = Mod->global_begin(),
E = Mod->global_end(); I != E; ++I) {
- GlobalVariable *GV = I;
+ GlobalVariable *GV = &*I;
PointerType *GVTy = GV->getType();
if (GVTy->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
continue;
@@ -101,7 +101,7 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
return false;
}
-static VectorType *arrayTypeToVecType(const Type *ArrayTy) {
+static VectorType *arrayTypeToVecType(Type *ArrayTy) {
return VectorType::get(ArrayTy->getArrayElementType(),
ArrayTy->getArrayNumElements());
}
@@ -276,6 +276,9 @@ static bool collectUsesWithPtrTypes(Value *Val, std::vector<Value*> &WorkList) {
}
void AMDGPUPromoteAlloca::visitAlloca(AllocaInst &I) {
+ if (!I.isStaticAlloca())
+ return;
+
IRBuilder<> Builder(&I);
// First try to replace the alloca with a vector
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.h
index cfd800b..0344834 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.h
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.h
@@ -37,10 +37,6 @@ struct AMDGPURegisterInfo : public AMDGPUGenRegisterInfo {
assert(!"Unimplemented"); return BitVector();
}
- virtual const TargetRegisterClass* getCFGStructurizerRegClass(MVT VT) const {
- assert(!"Unimplemented"); return nullptr;
- }
-
virtual unsigned getHWRegIndex(unsigned Reg) const {
assert(!"Unimplemented"); return 0;
}
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 5f32a65..44e0c47 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -16,6 +16,7 @@
#include "R600ISelLowering.h"
#include "R600InstrInfo.h"
#include "R600MachineScheduler.h"
+#include "SIFrameLowering.h"
#include "SIISelLowering.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
@@ -44,6 +45,8 @@ AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT,
// disable it.
SmallString<256> FullFS("+promote-alloca,+fp64-denormals,");
+ if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
+ FullFS += "+flat-for-global,";
FullFS += FS;
if (GPU == "" && TT.getArch() == Triple::amdgcn)
@@ -67,26 +70,36 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
DumpCode(false), R600ALUInst(false), HasVertexCache(false),
TexVTXClauseSize(0), Gen(AMDGPUSubtarget::R600), FP64(false),
FP64Denormals(false), FP32Denormals(false), FastFMAF32(false),
- CaymanISA(false), FlatAddressSpace(false), EnableIRStructurizer(true),
- EnablePromoteAlloca(false), EnableIfCvt(true), EnableLoadStoreOpt(false),
- EnableUnsafeDSOffsetFolding(false),
+ CaymanISA(false), FlatAddressSpace(false), FlatForGlobal(false),
+ EnableIRStructurizer(true), EnablePromoteAlloca(false), EnableIfCvt(true),
+ EnableLoadStoreOpt(false), EnableUnsafeDSOffsetFolding(false),
WavefrontSize(0), CFALUBug(false), LocalMemorySize(0),
EnableVGPRSpilling(false), SGPRInitBug(false), IsGCN(false),
GCN1Encoding(false), GCN3Encoding(false), CIInsts(false), LDSBankCount(0),
IsaVersion(ISAVersion0_0_0), EnableHugeScratchBuffer(false),
- FrameLowering(TargetFrameLowering::StackGrowsUp,
- 64 * 16, // Maximum stack alignment (long16)
- 0),
+ FrameLowering(nullptr),
InstrItins(getInstrItineraryForCPU(GPU)), TargetTriple(TT) {
initializeSubtargetDependencies(TT, GPU, FS);
+ const unsigned MaxStackAlign = 64 * 16; // Maximum stack alignment (long16)
+
if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
InstrInfo.reset(new R600InstrInfo(*this));
TLInfo.reset(new R600TargetLowering(TM, *this));
+
+ // FIXME: Should have R600 specific FrameLowering
+ FrameLowering.reset(new AMDGPUFrameLowering(
+ TargetFrameLowering::StackGrowsUp,
+ MaxStackAlign,
+ 0));
} else {
InstrInfo.reset(new SIInstrInfo(*this));
TLInfo.reset(new SITargetLowering(TM, *this));
+ FrameLowering.reset(new SIFrameLowering(
+ TargetFrameLowering::StackGrowsUp,
+ MaxStackAlign,
+ 0));
}
}
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 735f01d..9c7bb88 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -1,4 +1,4 @@
-//=====-- AMDGPUSubtarget.h - Define Subtarget for the AMDIL ---*- C++ -*-====//
+//=====-- AMDGPUSubtarget.h - Define Subtarget for AMDGPU ------*- C++ -*-====//
//
// The LLVM Compiler Infrastructure
//
@@ -12,17 +12,15 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_LIB_TARGET_R600_AMDGPUSUBTARGET_H
-#define LLVM_LIB_TARGET_R600_AMDGPUSUBTARGET_H
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
+
#include "AMDGPU.h"
#include "AMDGPUFrameLowering.h"
#include "AMDGPUInstrInfo.h"
-#include "AMDGPUIntrinsicInfo.h"
+#include "AMDGPUISelLowering.h"
#include "AMDGPUSubtarget.h"
-#include "R600ISelLowering.h"
-#include "AMDKernelCodeT.h"
#include "Utils/AMDGPUBaseInfo.h"
-#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Target/TargetSubtargetInfo.h"
@@ -72,6 +70,7 @@ private:
bool FastFMAF32;
bool CaymanISA;
bool FlatAddressSpace;
+ bool FlatForGlobal;
bool EnableIRStructurizer;
bool EnablePromoteAlloca;
bool EnableIfCvt;
@@ -88,10 +87,10 @@ private:
bool CIInsts;
bool FeatureDisable;
int LDSBankCount;
- unsigned IsaVersion;
+ unsigned IsaVersion;
bool EnableHugeScratchBuffer;
- AMDGPUFrameLowering FrameLowering;
+ std::unique_ptr<AMDGPUFrameLowering> FrameLowering;
std::unique_ptr<AMDGPUTargetLowering> TLInfo;
std::unique_ptr<AMDGPUInstrInfo> InstrInfo;
InstrItineraryData InstrItins;
@@ -104,7 +103,7 @@ public:
StringRef GPU, StringRef FS);
const AMDGPUFrameLowering *getFrameLowering() const override {
- return &FrameLowering;
+ return FrameLowering.get();
}
const AMDGPUInstrInfo *getInstrInfo() const override {
return InstrInfo.get();
@@ -161,6 +160,10 @@ public:
return FlatAddressSpace;
}
+ bool useFlatForGlobal() const {
+ return FlatForGlobal;
+ }
+
bool hasBFE() const {
return (getGeneration() >= EVERGREEN);
}
@@ -305,6 +308,9 @@ public:
return isAmdHsaOS() ? 0 : 36;
}
+ unsigned getMaxNumUserSGPRs() const {
+ return 16;
+ }
};
} // End namespace llvm
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 2297b52..22f85b3 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -14,6 +14,7 @@
//===----------------------------------------------------------------------===//
#include "AMDGPUTargetMachine.h"
+#include "AMDGPUTargetObjectFile.h"
#include "AMDGPU.h"
#include "AMDGPUTargetTransformInfo.h"
#include "R600ISelLowering.h"
@@ -41,6 +42,23 @@ extern "C" void LLVMInitializeAMDGPUTarget() {
// Register the target
RegisterTargetMachine<R600TargetMachine> X(TheAMDGPUTarget);
RegisterTargetMachine<GCNTargetMachine> Y(TheGCNTarget);
+
+ PassRegistry *PR = PassRegistry::getPassRegistry();
+ initializeSILowerI1CopiesPass(*PR);
+ initializeSIFixSGPRCopiesPass(*PR);
+ initializeSIFoldOperandsPass(*PR);
+ initializeSIFixSGPRLiveRangesPass(*PR);
+ initializeSIFixControlFlowLiveIntervalsPass(*PR);
+ initializeSILoadStoreOptimizerPass(*PR);
+ initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
+ initializeAMDGPUAnnotateUniformValuesPass(*PR);
+}
+
+static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
+ if (TT.getOS() == Triple::AMDHSA)
+ return make_unique<AMDGPUHSATargetObjectFile>();
+
+ return make_unique<AMDGPUTargetObjectFile>();
}
static ScheduleDAGInstrs *createR600MachineScheduler(MachineSchedContext *C) {
@@ -72,15 +90,13 @@ AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
CodeGenOpt::Level OptLevel)
: LLVMTargetMachine(T, computeDataLayout(TT), TT, CPU, FS, Options, RM, CM,
OptLevel),
- TLOF(new TargetLoweringObjectFileELF()), Subtarget(TT, CPU, FS, *this),
+ TLOF(createTLOF(getTargetTriple())), Subtarget(TT, CPU, FS, *this),
IntrinsicInfo() {
setRequiresStructuredCFG(true);
initAsmInfo();
}
-AMDGPUTargetMachine::~AMDGPUTargetMachine() {
- delete TLOF;
-}
+AMDGPUTargetMachine::~AMDGPUTargetMachine() { }
//===----------------------------------------------------------------------===//
// R600 Target Machine (R600 -> Cayman)
@@ -110,7 +126,13 @@ namespace {
class AMDGPUPassConfig : public TargetPassConfig {
public:
AMDGPUPassConfig(TargetMachine *TM, PassManagerBase &PM)
- : TargetPassConfig(TM, PM) {}
+ : TargetPassConfig(TM, PM) {
+
+ // Exceptions and StackMaps are not supported, so these passes will never do
+ // anything.
+ disablePass(&StackMapLivenessID);
+ disablePass(&FuncletLayoutID);
+ }
AMDGPUTargetMachine &getAMDGPUTargetMachine() const {
return getTM<AMDGPUTargetMachine>();
@@ -126,8 +148,9 @@ public:
void addIRPasses() override;
void addCodeGenPrepare() override;
- virtual bool addPreISel() override;
- virtual bool addInstSelector() override;
+ bool addPreISel() override;
+ bool addInstSelector() override;
+ bool addGCPasses() override;
};
class R600PassConfig : public AMDGPUPassConfig {
@@ -147,6 +170,8 @@ public:
: AMDGPUPassConfig(TM, PM) { }
bool addPreISel() override;
bool addInstSelector() override;
+ void addFastRegAlloc(FunctionPass *RegAllocPass) override;
+ void addOptimizedRegAlloc(FunctionPass *RegAllocPass) override;
void addPreRegAlloc() override;
void addPostRegAlloc() override;
void addPreSched2() override;
@@ -156,7 +181,7 @@ public:
} // End of anonymous namespace
TargetIRAnalysis AMDGPUTargetMachine::getTargetIRAnalysis() {
- return TargetIRAnalysis([this](Function &F) {
+ return TargetIRAnalysis([this](const Function &F) {
return TargetTransformInfo(
AMDGPUTTIImpl(this, F.getParent()->getDataLayout()));
});
@@ -172,6 +197,10 @@ void AMDGPUPassConfig::addIRPasses() {
// functions, then we will generate code for the first function
// without ever running any passes on the second.
addPass(createBarrierNoopPass());
+
+ // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
+ addPass(createAMDGPUOpenCLImageTypeLoweringPass());
+
TargetPassConfig::addIRPasses();
}
@@ -198,6 +227,11 @@ bool AMDGPUPassConfig::addInstSelector() {
return false;
}
+bool AMDGPUPassConfig::addGCPasses() {
+ // Do nothing. GC is not supported.
+ return false;
+}
+
//===----------------------------------------------------------------------===//
// R600 Pass Setup
//===----------------------------------------------------------------------===//
@@ -238,16 +272,23 @@ TargetPassConfig *R600TargetMachine::createPassConfig(PassManagerBase &PM) {
bool GCNPassConfig::addPreISel() {
AMDGPUPassConfig::addPreISel();
+
+ // FIXME: We need to run a pass to propagate the attributes when calls are
+ // supported.
+ addPass(&AMDGPUAnnotateKernelFeaturesID);
+
addPass(createSinkingPass());
addPass(createSITypeRewriter());
addPass(createSIAnnotateControlFlowPass());
+ addPass(createAMDGPUAnnotateUniformValues());
+
return false;
}
bool GCNPassConfig::addInstSelector() {
AMDGPUPassConfig::addInstSelector();
addPass(createSILowerI1CopiesPass());
- addPass(createSIFixSGPRCopiesPass(*TM));
+ addPass(&SIFixSGPRCopiesID);
addPass(createSIFoldOperandsPass());
return false;
}
@@ -259,7 +300,6 @@ void GCNPassConfig::addPreRegAlloc() {
// earlier passes might recompute live intervals.
// TODO: handle CodeGenOpt::None; fast RA ignores spill weights set by the pass
if (getOptLevel() > CodeGenOpt::None) {
- initializeSIFixControlFlowLiveIntervalsPass(*PassRegistry::getPassRegistry());
insertPass(&MachineSchedulerID, &SIFixControlFlowLiveIntervalsID);
}
@@ -269,16 +309,27 @@ void GCNPassConfig::addPreRegAlloc() {
// This should be run after scheduling, but before register allocation. It
// also needs extra copies to the address operand to be eliminated.
- initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
insertPass(&MachineSchedulerID, &SILoadStoreOptimizerID);
insertPass(&MachineSchedulerID, &RegisterCoalescerID);
}
addPass(createSIShrinkInstructionsPass(), false);
- addPass(createSIFixSGPRLiveRangesPass(), false);
+}
+
+void GCNPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) {
+ addPass(&SIFixSGPRLiveRangesID);
+ TargetPassConfig::addFastRegAlloc(RegAllocPass);
+}
+
+void GCNPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {
+  // We want to run this after LiveVariables is computed to avoid computing it
+  // twice.
+ // FIXME: We shouldn't disable the verifier here. r249087 introduced a failure
+ // that needs to be fixed.
+ insertPass(&LiveVariablesID, &SIFixSGPRLiveRangesID, /*VerifyAfter=*/false);
+ TargetPassConfig::addOptimizedRegAlloc(RegAllocPass);
}
void GCNPassConfig::addPostRegAlloc() {
- addPass(createSIPrepareScratchRegs(), false);
addPass(createSIShrinkInstructionsPass(), false);
}
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
index 14792e3..236e3f8 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
@@ -32,7 +32,7 @@ class AMDGPUTargetMachine : public LLVMTargetMachine {
private:
protected:
- TargetLoweringObjectFile *TLOF;
+ std::unique_ptr<TargetLoweringObjectFile> TLOF;
AMDGPUSubtarget Subtarget;
AMDGPUIntrinsicInfo IntrinsicInfo;
@@ -52,7 +52,7 @@ public:
TargetIRAnalysis getTargetIRAnalysis() override;
TargetLoweringObjectFile *getObjFileLowering() const override {
- return TLOF;
+ return TLOF.get();
}
};
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp
new file mode 100644
index 0000000..e050f21
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp
@@ -0,0 +1,87 @@
+//===-- AMDGPUTargetObjectFile.cpp - AMDGPU Object Files ------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUTargetObjectFile.h"
+#include "AMDGPU.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/Support/ELF.h"
+
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// Generic Object File
+//===----------------------------------------------------------------------===//
+
+MCSection *AMDGPUTargetObjectFile::SelectSectionForGlobal(const GlobalValue *GV,
+ SectionKind Kind,
+ Mangler &Mang,
+ const TargetMachine &TM) const {
+ if (Kind.isReadOnly() && AMDGPU::isReadOnlySegment(GV))
+ return TextSection;
+
+ return TargetLoweringObjectFileELF::SelectSectionForGlobal(GV, Kind, Mang, TM);
+}
+
+//===----------------------------------------------------------------------===//
+// HSA Object File
+//===----------------------------------------------------------------------===//
+
+
+void AMDGPUHSATargetObjectFile::Initialize(MCContext &Ctx,
+ const TargetMachine &TM){
+ TargetLoweringObjectFileELF::Initialize(Ctx, TM);
+ InitializeELF(TM.Options.UseInitArray);
+
+ TextSection = AMDGPU::getHSATextSection(Ctx);
+
+ DataGlobalAgentSection = AMDGPU::getHSADataGlobalAgentSection(Ctx);
+ DataGlobalProgramSection = AMDGPU::getHSADataGlobalProgramSection(Ctx);
+
+ RodataReadonlyAgentSection = AMDGPU::getHSARodataReadonlyAgentSection(Ctx);
+}
+
+bool AMDGPUHSATargetObjectFile::isAgentAllocationSection(
+ const char *SectionName) const {
+ return cast<MCSectionELF>(DataGlobalAgentSection)
+ ->getSectionName()
+ .equals(SectionName);
+}
+
+bool AMDGPUHSATargetObjectFile::isAgentAllocation(const GlobalValue *GV) const {
+ // Read-only segments can only have agent allocation.
+ return AMDGPU::isReadOnlySegment(GV) ||
+ (AMDGPU::isGlobalSegment(GV) && GV->hasSection() &&
+ isAgentAllocationSection(GV->getSection()));
+}
+
+bool AMDGPUHSATargetObjectFile::isProgramAllocation(
+ const GlobalValue *GV) const {
+ // The default for global segments is program allocation.
+ return AMDGPU::isGlobalSegment(GV) && !isAgentAllocation(GV);
+}
+
+MCSection *AMDGPUHSATargetObjectFile::SelectSectionForGlobal(
+ const GlobalValue *GV, SectionKind Kind,
+ Mangler &Mang,
+ const TargetMachine &TM) const {
+ if (Kind.isText() && !GV->hasComdat())
+ return getTextSection();
+
+ if (AMDGPU::isGlobalSegment(GV)) {
+ if (isAgentAllocation(GV))
+ return DataGlobalAgentSection;
+
+ if (isProgramAllocation(GV))
+ return DataGlobalProgramSection;
+ }
+
+ return AMDGPUTargetObjectFile::SelectSectionForGlobal(GV, Kind, Mang, TM);
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h
new file mode 100644
index 0000000..921341e
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h
@@ -0,0 +1,51 @@
+//===-- AMDGPUTargetObjectFile.h - AMDGPU Object Info ---------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file declares the AMDGPU-specific subclass of
+/// TargetLoweringObjectFile.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETOBJECTFILE_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETOBJECTFILE_H
+
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+
+class AMDGPUTargetObjectFile : public TargetLoweringObjectFileELF {
+ public:
+ MCSection *SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind,
+ Mangler &Mang,
+ const TargetMachine &TM) const override;
+};
+
+class AMDGPUHSATargetObjectFile final : public AMDGPUTargetObjectFile {
+private:
+ MCSection *DataGlobalAgentSection;
+ MCSection *DataGlobalProgramSection;
+ MCSection *RodataReadonlyAgentSection;
+
+ bool isAgentAllocationSection(const char *SectionName) const;
+ bool isAgentAllocation(const GlobalValue *GV) const;
+ bool isProgramAllocation(const GlobalValue *GV) const;
+
+public:
+ void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
+
+ MCSection *SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind,
+ Mangler &Mang,
+ const TargetMachine &TM) const override;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 6dacc74..54a003d 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -74,9 +74,109 @@ unsigned AMDGPUTTIImpl::getNumberOfRegisters(bool Vec) {
return 4 * 128; // XXX - 4 channels. Should these count as vector instead?
}
-unsigned AMDGPUTTIImpl::getRegisterBitWidth(bool) { return 32; }
+unsigned AMDGPUTTIImpl::getRegisterBitWidth(bool Vector) {
+ return Vector ? 0 : 32;
+}
unsigned AMDGPUTTIImpl::getMaxInterleaveFactor(unsigned VF) {
// Semi-arbitrary large amount.
return 64;
}
+
+unsigned AMDGPUTTIImpl::getCFInstrCost(unsigned Opcode) {
+ // XXX - For some reason this isn't called for switch.
+ switch (Opcode) {
+ case Instruction::Br:
+ case Instruction::Ret:
+ return 10;
+ default:
+ return BaseT::getCFInstrCost(Opcode);
+ }
+}
+
+int AMDGPUTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
+ unsigned Index) {
+ switch (Opcode) {
+ case Instruction::ExtractElement:
+ // Dynamic indexing isn't free and is best avoided.
+ return Index == ~0u ? 2 : 0;
+ default:
+ return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
+ }
+}
+
+static bool isIntrinsicSourceOfDivergence(const TargetIntrinsicInfo *TII,
+ const IntrinsicInst *I) {
+ switch (I->getIntrinsicID()) {
+ default:
+ return false;
+ case Intrinsic::not_intrinsic:
+ // This means we have an intrinsic that isn't defined in
+ // IntrinsicsAMDGPU.td
+ break;
+
+ case Intrinsic::amdgcn_interp_p1:
+ case Intrinsic::amdgcn_interp_p2:
+ case Intrinsic::amdgcn_mbcnt_hi:
+ case Intrinsic::amdgcn_mbcnt_lo:
+ case Intrinsic::r600_read_tidig_x:
+ case Intrinsic::r600_read_tidig_y:
+ case Intrinsic::r600_read_tidig_z:
+ return true;
+ }
+
+ StringRef Name = I->getCalledFunction()->getName();
+ switch (TII->lookupName((const char *)Name.bytes_begin(), Name.size())) {
+ default:
+ return false;
+ case AMDGPUIntrinsic::SI_tid:
+ case AMDGPUIntrinsic::SI_fs_interp:
+ return true;
+ }
+}
+
+static bool isArgPassedInSGPR(const Argument *A) {
+ const Function *F = A->getParent();
+ unsigned ShaderType = AMDGPU::getShaderType(*F);
+
+ // Arguments to compute shaders are never a source of divergence.
+ if (ShaderType == ShaderType::COMPUTE)
+ return true;
+
+ // For non-compute shaders, SGPR inputs are marked with either inreg or byval.
+ if (F->getAttributes().hasAttribute(A->getArgNo() + 1, Attribute::InReg) ||
+ F->getAttributes().hasAttribute(A->getArgNo() + 1, Attribute::ByVal))
+ return true;
+
+ // Everything else is in VGPRs.
+ return false;
+}
+
+///
+/// \returns true if the result of the value could differ across
+/// workitems in a wavefront.
+bool AMDGPUTTIImpl::isSourceOfDivergence(const Value *V) const {
+
+ if (const Argument *A = dyn_cast<Argument>(V))
+ return !isArgPassedInSGPR(A);
+
+ // Loads from the private address space are divergent, because threads
+ // can execute the load instruction with the same inputs and get different
+ // results.
+ //
+ // All other loads are not divergent, because if threads issue loads with the
+ // same arguments, they will always get the same result.
+ if (const LoadInst *Load = dyn_cast<LoadInst>(V))
+ return Load->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS;
+
+ if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
+ const TargetMachine &TM = getTLI()->getTargetMachine();
+ return isIntrinsicSourceOfDivergence(TM.getIntrinsicInfo(), Intrinsic);
+ }
+
+ // Assume all function calls are a source of divergence.
+ if (isa<CallInst>(V) || isa<InvokeInst>(V))
+ return true;
+
+ return false;
+}
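As a usage sketch (assuming the generic TargetTransformInfo wrapper; not code from this patch), a divergence analysis seeds its propagation from exactly the values this hook reports:

#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InstIterator.h"

using namespace llvm;

// Collect the values flagged by isSourceOfDivergence(); anything
// data-dependent on these becomes divergent during propagation.
static void collectDivergenceSeeds(const Function &F,
                                   const TargetTransformInfo &TTI,
                                   SmallVectorImpl<const Value *> &Seeds) {
  for (const Argument &Arg : F.args())
    if (TTI.isSourceOfDivergence(&Arg)) // e.g. VGPR-passed shader inputs
      Seeds.push_back(&Arg);

  for (const Instruction &I : instructions(F))
    if (TTI.isSourceOfDivergence(&I))   // private loads, tid intrinsics, calls
      Seeds.push_back(&I);
}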
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index dee0a69..976afb0 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -60,6 +60,11 @@ public:
unsigned getNumberOfRegisters(bool Vector);
unsigned getRegisterBitWidth(bool Vector);
unsigned getMaxInterleaveFactor(unsigned VF);
+
+ unsigned getCFInstrCost(unsigned Opcode);
+
+ int getVectorInstrCost(unsigned Opcode, Type *ValTy, unsigned Index);
+ bool isSourceOfDivergence(const Value *V) const;
};
} // end namespace llvm
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp
index d918ac3..917efd1 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp
@@ -185,7 +185,7 @@ protected:
MachinePostDominatorTree *PDT;
MachineLoopInfo *MLI;
const R600InstrInfo *TII;
- const AMDGPURegisterInfo *TRI;
+ const R600RegisterInfo *TRI;
// PRINT FUNCTIONS
/// Print the ordered Blocks.
@@ -881,7 +881,7 @@ bool AMDGPUCFGStructurizer::run() {
} //while, "one iteration" over the function.
MachineBasicBlock *EntryMBB =
- GraphTraits<MachineFunction *>::nodes_begin(FuncRep);
+ &*GraphTraits<MachineFunction *>::nodes_begin(FuncRep);
if (EntryMBB->succ_size() == 0) {
Finish = true;
DEBUG(
@@ -904,7 +904,7 @@ bool AMDGPUCFGStructurizer::run() {
} while (!Finish && MakeProgress);
// Misc wrap up to maintain the consistency of the Function representation.
- wrapup(GraphTraits<MachineFunction *>::nodes_begin(FuncRep));
+ wrapup(&*GraphTraits<MachineFunction *>::nodes_begin(FuncRep));
// Detach retired Block, release memory.
for (MBBInfoMap::iterator It = BlockInfoMap.begin(), E = BlockInfoMap.end();
@@ -1164,7 +1164,7 @@ int AMDGPUCFGStructurizer::loopcontPatternMatch(MachineLoop *LoopRep,
for (SmallVectorImpl<MachineBasicBlock *>::iterator It = ContMBB.begin(),
E = ContMBB.end(); It != E; ++It) {
- (*It)->removeSuccessor(LoopHeader);
+ (*It)->removeSuccessor(LoopHeader, true);
}
numLoopcontPatternMatch += NumCont;
@@ -1353,7 +1353,7 @@ int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB,
// If MigrateTrue is true, then TrueBB is the block being "branched into"
// and if MigrateFalse is true, then FalseBB is the block being
// "branched into"
- //
+ //
// Here is the pseudo code for how I think the optimization should work:
// 1. Insert MOV GPR0, 0 before the branch instruction in diamond_head.
// 2. Insert MOV GPR0, 1 before the branch instruction in branch_from.
@@ -1372,7 +1372,7 @@ int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB,
// the late machine optimization passes, however if we implement
// bool TargetRegisterInfo::requiresRegisterScavenging(
// const MachineFunction &MF)
- // and have it return true, liveness will be tracked correctly
+ // and have it return true, liveness will be tracked correctly
// by generic optimization passes. We will also need to make sure that
// all of our target-specific passes that run after regalloc and before
// the CFGStructurizer track liveness and we will need to modify this pass
@@ -1487,7 +1487,7 @@ void AMDGPUCFGStructurizer::mergeSerialBlock(MachineBasicBlock *DstMBB,
);
DstMBB->splice(DstMBB->end(), SrcMBB, SrcMBB->begin(), SrcMBB->end());
- DstMBB->removeSuccessor(SrcMBB);
+ DstMBB->removeSuccessor(SrcMBB, true);
cloneSuccessorList(DstMBB, SrcMBB);
removeSuccessor(SrcMBB);
@@ -1537,9 +1537,9 @@ void AMDGPUCFGStructurizer::mergeIfthenelseBlock(MachineInstr *BranchMI,
if (TrueMBB) {
MBB->splice(I, TrueMBB, TrueMBB->begin(), TrueMBB->end());
- MBB->removeSuccessor(TrueMBB);
+ MBB->removeSuccessor(TrueMBB, true);
if (LandMBB && TrueMBB->succ_size()!=0)
- TrueMBB->removeSuccessor(LandMBB);
+ TrueMBB->removeSuccessor(LandMBB, true);
retireBlock(TrueMBB);
MLI->removeBlock(TrueMBB);
}
@@ -1548,9 +1548,9 @@ void AMDGPUCFGStructurizer::mergeIfthenelseBlock(MachineInstr *BranchMI,
insertInstrBefore(I, AMDGPU::ELSE);
MBB->splice(I, FalseMBB, FalseMBB->begin(),
FalseMBB->end());
- MBB->removeSuccessor(FalseMBB);
+ MBB->removeSuccessor(FalseMBB, true);
if (LandMBB && FalseMBB->succ_size() != 0)
- FalseMBB->removeSuccessor(LandMBB);
+ FalseMBB->removeSuccessor(LandMBB, true);
retireBlock(FalseMBB);
MLI->removeBlock(FalseMBB);
}
@@ -1570,8 +1570,7 @@ void AMDGPUCFGStructurizer::mergeLooplandBlock(MachineBasicBlock *DstBlk,
insertInstrBefore(DstBlk, AMDGPU::WHILELOOP, DebugLoc());
insertInstrEnd(DstBlk, AMDGPU::ENDLOOP, DebugLoc());
- DstBlk->addSuccessor(LandMBB);
- DstBlk->removeSuccessor(DstBlk);
+ DstBlk->replaceSuccessor(DstBlk, LandMBB);
}
@@ -1592,7 +1591,7 @@ void AMDGPUCFGStructurizer::mergeLoopbreakBlock(MachineBasicBlock *ExitingMBB,
//now branchInst can be erase safely
BranchMI->eraseFromParent();
//now take care of successors, retire blocks
- ExitingMBB->removeSuccessor(LandMBB);
+ ExitingMBB->removeSuccessor(LandMBB, true);
}
void AMDGPUCFGStructurizer::settleLoopcontBlock(MachineBasicBlock *ContingMBB,
@@ -1666,8 +1665,7 @@ AMDGPUCFGStructurizer::cloneBlockForPredecessor(MachineBasicBlock *MBB,
replaceInstrUseOfBlockWith(PredMBB, MBB, CloneMBB);
//srcBlk, oldBlk, newBlk
- PredMBB->removeSuccessor(MBB);
- PredMBB->addSuccessor(CloneMBB);
+ PredMBB->replaceSuccessor(MBB, CloneMBB);
// add all successor to cloneBlk
cloneSuccessorList(CloneMBB, MBB);
@@ -1695,10 +1693,7 @@ void AMDGPUCFGStructurizer::migrateInstruction(MachineBasicBlock *SrcMBB,
);
SpliceEnd = SrcMBB->end();
} else {
- DEBUG(
- dbgs() << "migrateInstruction see branch instr\n" ;
- BranchMI->dump();
- );
+ DEBUG(dbgs() << "migrateInstruction see branch instr: " << *BranchMI);
SpliceEnd = BranchMI;
}
DEBUG(
@@ -1711,7 +1706,7 @@ void AMDGPUCFGStructurizer::migrateInstruction(MachineBasicBlock *SrcMBB,
DEBUG(
dbgs() << "migrateInstruction after splice dstSize = " << DstMBB->size()
- << "srcSize = " << SrcMBB->size() << "\n";
+ << "srcSize = " << SrcMBB->size() << '\n';
);
}
@@ -1743,7 +1738,7 @@ void AMDGPUCFGStructurizer::removeUnconditionalBranch(MachineBasicBlock *MBB) {
// test_fc_do_while_or.c need to fix the upstream on this to remove the loop.
while ((BranchMI = getLoopendBlockBranchInstr(MBB))
&& isUncondBranch(BranchMI)) {
- DEBUG(dbgs() << "Removing uncond branch instr"; BranchMI->dump(););
+ DEBUG(dbgs() << "Removing uncond branch instr: " << *BranchMI);
BranchMI->eraseFromParent();
}
}
@@ -1759,10 +1754,10 @@ void AMDGPUCFGStructurizer::removeRedundantConditionalBranch(
MachineInstr *BranchMI = getNormalBlockBranchInstr(MBB);
assert(BranchMI && isCondBranch(BranchMI));
- DEBUG(dbgs() << "Removing unneeded cond branch instr"; BranchMI->dump(););
+ DEBUG(dbgs() << "Removing unneeded cond branch instr: " << *BranchMI);
BranchMI->eraseFromParent();
SHOWNEWBLK(MBB1, "Removing redundant successor");
- MBB->removeSuccessor(MBB1);
+ MBB->removeSuccessor(MBB1, true);
}
void AMDGPUCFGStructurizer::addDummyExitBlock(
diff --git a/contrib/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/contrib/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 2018983..d9f753f 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -28,7 +28,9 @@
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbolELF.h"
#include "llvm/MC/MCTargetAsmParser.h"
+#include "llvm/Support/ELF.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
@@ -83,6 +85,7 @@ public:
unsigned RegNo;
int Modifiers;
const MCRegisterInfo *TRI;
+ const MCSubtargetInfo *STI;
bool IsForcedVOP3;
};
@@ -102,7 +105,7 @@ public:
}
void addRegOperands(MCInst &Inst, unsigned N) const {
- Inst.addOperand(MCOperand::createReg(getReg()));
+ Inst.addOperand(MCOperand::createReg(AMDGPU::getMCReg(getReg(), *Reg.STI)));
}
void addRegOrImmOperands(MCInst &Inst, unsigned N) const {
@@ -215,6 +218,10 @@ public:
(isReg() && isRegClass(AMDGPU::SReg_64RegClassID));
}
+ bool isSCSrc64() const {
+ return (isReg() && isRegClass(AMDGPU::SReg_64RegClassID)) || isInlineImm();
+ }
+
bool isVCSrc32() const {
return isInlineImm() || (isReg() && isRegClass(AMDGPU::VS_32RegClassID));
}
@@ -251,7 +258,22 @@ public:
return EndLoc;
}
- void print(raw_ostream &OS) const override { }
+ void print(raw_ostream &OS) const override {
+ switch (Kind) {
+ case Register:
+ OS << "<register " << getReg() << " mods: " << Reg.Modifiers << '>';
+ break;
+ case Immediate:
+ OS << getImm();
+ break;
+ case Token:
+ OS << '\'' << getToken() << '\'';
+ break;
+ case Expression:
+ OS << "<expr " << *Expr << '>';
+ break;
+ }
+ }
static std::unique_ptr<AMDGPUOperand> CreateImm(int64_t Val, SMLoc Loc,
enum ImmTy Type = ImmTyNone,
@@ -278,10 +300,12 @@ public:
static std::unique_ptr<AMDGPUOperand> CreateReg(unsigned RegNo, SMLoc S,
SMLoc E,
const MCRegisterInfo *TRI,
+ const MCSubtargetInfo *STI,
bool ForceVOP3) {
auto Op = llvm::make_unique<AMDGPUOperand>(Register);
Op->Reg.RegNo = RegNo;
Op->Reg.TRI = TRI;
+ Op->Reg.STI = STI;
Op->Reg.Modifiers = -1;
Op->Reg.IsForcedVOP3 = ForceVOP3;
Op->StartLoc = S;
@@ -301,14 +325,32 @@ public:
bool isDSOffset01() const;
bool isSWaitCnt() const;
bool isMubufOffset() const;
+ bool isSMRDOffset() const;
+ bool isSMRDLiteralOffset() const;
};
class AMDGPUAsmParser : public MCTargetAsmParser {
- MCSubtargetInfo &STI;
const MCInstrInfo &MII;
MCAsmParser &Parser;
unsigned ForcedEncodingSize;
+
+ bool isSI() const {
+ return AMDGPU::isSI(getSTI());
+ }
+
+ bool isCI() const {
+ return AMDGPU::isCI(getSTI());
+ }
+
+ bool isVI() const {
+ return AMDGPU::isVI(getSTI());
+ }
+
+ bool hasSGPR102_SGPR103() const {
+ return !isVI();
+ }
+
/// @name Auto-generated Match Functions
/// {
@@ -323,20 +365,34 @@ private:
bool ParseDirectiveHSACodeObjectISA();
bool ParseAMDKernelCodeTValue(StringRef ID, amd_kernel_code_t &Header);
bool ParseDirectiveAMDKernelCodeT();
+ bool ParseSectionDirectiveHSAText();
+ bool subtargetHasRegister(const MCRegisterInfo &MRI, unsigned RegNo) const;
+ bool ParseDirectiveAMDGPUHsaKernel();
+ bool ParseDirectiveAMDGPUHsaModuleGlobal();
+ bool ParseDirectiveAMDGPUHsaProgramGlobal();
+ bool ParseSectionDirectiveHSADataGlobalAgent();
+ bool ParseSectionDirectiveHSADataGlobalProgram();
+ bool ParseSectionDirectiveHSARodataReadonlyAgent();
public:
- AMDGPUAsmParser(MCSubtargetInfo &STI, MCAsmParser &_Parser,
+ enum AMDGPUMatchResultTy {
+ Match_PreferE32 = FIRST_TARGET_MATCH_RESULT_TY
+ };
+
+ AMDGPUAsmParser(const MCSubtargetInfo &STI, MCAsmParser &_Parser,
const MCInstrInfo &MII,
const MCTargetOptions &Options)
- : MCTargetAsmParser(), STI(STI), MII(MII), Parser(_Parser),
- ForcedEncodingSize(0){
+ : MCTargetAsmParser(Options, STI), MII(MII), Parser(_Parser),
+ ForcedEncodingSize(0) {
+ MCAsmParserExtension::Initialize(Parser);
- if (STI.getFeatureBits().none()) {
+ if (getSTI().getFeatureBits().none()) {
// Set default features.
- STI.ToggleFeature("SOUTHERN_ISLANDS");
+ copySTI().ToggleFeature("SOUTHERN_ISLANDS");
}
- setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
+ setAvailableFeatures(ComputeAvailableFeatures(getSTI().getFeatureBits()));
}
AMDGPUTargetStreamer &getTargetStreamer() {
@@ -420,10 +476,10 @@ struct OptionalOperand {
}
-static unsigned getRegClass(bool IsVgpr, unsigned RegWidth) {
+static int getRegClass(bool IsVgpr, unsigned RegWidth) {
if (IsVgpr) {
switch (RegWidth) {
- default: llvm_unreachable("Unknown register width");
+ default: return -1;
case 1: return AMDGPU::VGPR_32RegClassID;
case 2: return AMDGPU::VReg_64RegClassID;
case 3: return AMDGPU::VReg_96RegClassID;
@@ -434,7 +490,7 @@ static unsigned getRegClass(bool IsVgpr, unsigned RegWidth) {
}
switch (RegWidth) {
- default: llvm_unreachable("Unknown register width");
+ default: return -1;
case 1: return AMDGPU::SGPR_32RegClassID;
case 2: return AMDGPU::SGPR_64RegClassID;
case 4: return AMDGPU::SReg_128RegClassID;
@@ -443,16 +499,16 @@ static unsigned getRegClass(bool IsVgpr, unsigned RegWidth) {
}
}
-static unsigned getRegForName(const StringRef &RegName) {
+static unsigned getRegForName(StringRef RegName) {
return StringSwitch<unsigned>(RegName)
.Case("exec", AMDGPU::EXEC)
.Case("vcc", AMDGPU::VCC)
- .Case("flat_scr", AMDGPU::FLAT_SCR)
+ .Case("flat_scratch", AMDGPU::FLAT_SCR)
.Case("m0", AMDGPU::M0)
.Case("scc", AMDGPU::SCC)
- .Case("flat_scr_lo", AMDGPU::FLAT_SCR_LO)
- .Case("flat_scr_hi", AMDGPU::FLAT_SCR_HI)
+ .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
+ .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
.Case("vcc_lo", AMDGPU::VCC_LO)
.Case("vcc_hi", AMDGPU::VCC_HI)
.Case("exec_lo", AMDGPU::EXEC_LO)
@@ -464,12 +520,14 @@ bool AMDGPUAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &End
const AsmToken Tok = Parser.getTok();
StartLoc = Tok.getLoc();
EndLoc = Tok.getEndLoc();
- const StringRef &RegName = Tok.getString();
+ const MCRegisterInfo *TRI = getContext().getRegisterInfo();
+
+ StringRef RegName = Tok.getString();
RegNo = getRegForName(RegName);
if (RegNo) {
Parser.Lex();
- return false;
+ return !subtargetHasRegister(*TRI, RegNo);
}
// Match vgprs and sgprs
@@ -514,16 +572,24 @@ bool AMDGPUAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &End
RegIndexInClass = RegLo;
} else {
// SGPR registers are aligned. Max alignment is 4 dwords.
- RegIndexInClass = RegLo / std::min(RegWidth, 4u);
+ unsigned Size = std::min(RegWidth, 4u);
+ if (RegLo % Size != 0)
+ return true;
+
+ RegIndexInClass = RegLo / Size;
}
}
- const MCRegisterInfo *TRC = getContext().getRegisterInfo();
- unsigned RC = getRegClass(IsVgpr, RegWidth);
- if (RegIndexInClass > TRC->getRegClass(RC).getNumRegs())
+ int RCID = getRegClass(IsVgpr, RegWidth);
+ if (RCID == -1)
return true;
- RegNo = TRC->getRegClass(RC).getRegister(RegIndexInClass);
- return false;
+
+ const MCRegisterClass RC = TRI->getRegClass(RCID);
+ if (RegIndexInClass >= RC.getNumRegs())
+ return true;
+
+ RegNo = RC.getRegister(RegIndexInClass);
+ return !subtargetHasRegister(*TRI, RegNo);
}
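To make the aligned-SGPR arithmetic above concrete, a small standalone check with made-up values for the range s[4:7] (four dwords, hence the SReg_128 class):

#include <algorithm>
#include <cassert>

int main() {
  unsigned RegLo = 4, RegHi = 7;           // s[4:7]
  unsigned RegWidth = RegHi - RegLo + 1;   // 4 dwords -> SReg_128
  unsigned Size = std::min(RegWidth, 4u);  // SGPR tuples align to at most 4 dwords
  assert(RegLo % Size == 0);               // misaligned ranges are now rejected
  unsigned RegIndexInClass = RegLo / Size; // index 1 within the register class
  assert(RegIndexInClass == 1);
  return 0;
}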
unsigned AMDGPUAsmParser::checkTargetMatchPredicate(MCInst &Inst) {
@@ -534,6 +600,11 @@ unsigned AMDGPUAsmParser::checkTargetMatchPredicate(MCInst &Inst) {
(getForcedEncodingSize() == 64 && !(TSFlags & SIInstrFlags::VOP3)))
return Match_InvalidOperand;
+ if ((TSFlags & SIInstrFlags::VOP3) &&
+ (TSFlags & SIInstrFlags::VOPAsmPrefer32Bit) &&
+ getForcedEncodingSize() != 64)
+ return Match_PreferE32;
+
return Match_Success;
}
@@ -549,7 +620,7 @@ bool AMDGPUAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
default: break;
case Match_Success:
Inst.setLoc(IDLoc);
- Out.EmitInstruction(Inst, STI);
+ Out.EmitInstruction(Inst, getSTI());
return false;
case Match_MissingFeature:
return Error(IDLoc, "instruction not supported on this GPU");
@@ -592,6 +663,9 @@ bool AMDGPUAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
}
return Error(ErrorLoc, "invalid operand for instruction");
}
+ case Match_PreferE32:
+ return Error(IDLoc, "internal error: instruction without _e64 suffix "
+ "should be encoded as e32");
}
llvm_unreachable("Implement any new match types added!");
}
@@ -640,7 +714,7 @@ bool AMDGPUAsmParser::ParseDirectiveHSACodeObjectISA() {
// If this directive has no arguments, then use the ISA version for the
// targeted GPU.
if (getLexer().is(AsmToken::EndOfStatement)) {
- AMDGPU::IsaVersion Isa = AMDGPU::getIsaVersion(STI.getFeatureBits());
+ AMDGPU::IsaVersion Isa = AMDGPU::getIsaVersion(getSTI().getFeatureBits());
getTargetStreamer().EmitDirectiveHSACodeObjectISA(Isa.Major, Isa.Minor,
Isa.Stepping,
"AMD", "AMDGPU");
@@ -852,7 +926,7 @@ bool AMDGPUAsmParser::ParseAMDKernelCodeTValue(StringRef ID,
bool AMDGPUAsmParser::ParseDirectiveAMDKernelCodeT() {
amd_kernel_code_t Header;
- AMDGPU::initDefaultAMDKernelCodeT(Header, STI.getFeatureBits());
+ AMDGPU::initDefaultAMDKernelCodeT(Header, getSTI().getFeatureBits());
while (true) {
@@ -882,6 +956,64 @@ bool AMDGPUAsmParser::ParseDirectiveAMDKernelCodeT() {
return false;
}
+bool AMDGPUAsmParser::ParseSectionDirectiveHSAText() {
+ getParser().getStreamer().SwitchSection(
+ AMDGPU::getHSATextSection(getContext()));
+ return false;
+}
+
+bool AMDGPUAsmParser::ParseDirectiveAMDGPUHsaKernel() {
+ if (getLexer().isNot(AsmToken::Identifier))
+ return TokError("expected symbol name");
+
+ StringRef KernelName = Parser.getTok().getString();
+
+ getTargetStreamer().EmitAMDGPUSymbolType(KernelName,
+ ELF::STT_AMDGPU_HSA_KERNEL);
+ Lex();
+ return false;
+}
+
+bool AMDGPUAsmParser::ParseDirectiveAMDGPUHsaModuleGlobal() {
+ if (getLexer().isNot(AsmToken::Identifier))
+ return TokError("expected symbol name");
+
+ StringRef GlobalName = Parser.getTok().getIdentifier();
+
+ getTargetStreamer().EmitAMDGPUHsaModuleScopeGlobal(GlobalName);
+ Lex();
+ return false;
+}
+
+bool AMDGPUAsmParser::ParseDirectiveAMDGPUHsaProgramGlobal() {
+ if (getLexer().isNot(AsmToken::Identifier))
+ return TokError("expected symbol name");
+
+ StringRef GlobalName = Parser.getTok().getIdentifier();
+
+ getTargetStreamer().EmitAMDGPUHsaProgramScopeGlobal(GlobalName);
+ Lex();
+ return false;
+}
+
+bool AMDGPUAsmParser::ParseSectionDirectiveHSADataGlobalAgent() {
+ getParser().getStreamer().SwitchSection(
+ AMDGPU::getHSADataGlobalAgentSection(getContext()));
+ return false;
+}
+
+bool AMDGPUAsmParser::ParseSectionDirectiveHSADataGlobalProgram() {
+ getParser().getStreamer().SwitchSection(
+ AMDGPU::getHSADataGlobalProgramSection(getContext()));
+ return false;
+}
+
+bool AMDGPUAsmParser::ParseSectionDirectiveHSARodataReadonlyAgent() {
+ getParser().getStreamer().SwitchSection(
+ AMDGPU::getHSARodataReadonlyAgentSection(getContext()));
+ return false;
+}
+
bool AMDGPUAsmParser::ParseDirective(AsmToken DirectiveID) {
StringRef IDVal = DirectiveID.getString();
@@ -894,6 +1026,55 @@ bool AMDGPUAsmParser::ParseDirective(AsmToken DirectiveID) {
if (IDVal == ".amd_kernel_code_t")
return ParseDirectiveAMDKernelCodeT();
+ if (IDVal == ".hsatext" || IDVal == ".text")
+ return ParseSectionDirectiveHSAText();
+
+ if (IDVal == ".amdgpu_hsa_kernel")
+ return ParseDirectiveAMDGPUHsaKernel();
+
+ if (IDVal == ".amdgpu_hsa_module_global")
+ return ParseDirectiveAMDGPUHsaModuleGlobal();
+
+ if (IDVal == ".amdgpu_hsa_program_global")
+ return ParseDirectiveAMDGPUHsaProgramGlobal();
+
+ if (IDVal == ".hsadata_global_agent")
+ return ParseSectionDirectiveHSADataGlobalAgent();
+
+ if (IDVal == ".hsadata_global_program")
+ return ParseSectionDirectiveHSADataGlobalProgram();
+
+ if (IDVal == ".hsarodata_readonly_agent")
+ return ParseSectionDirectiveHSARodataReadonlyAgent();
+
+ return true;
+}
+
+bool AMDGPUAsmParser::subtargetHasRegister(const MCRegisterInfo &MRI,
+ unsigned RegNo) const {
+ if (isCI())
+ return true;
+
+ if (isSI()) {
+ // No flat_scr
+ switch (RegNo) {
+ case AMDGPU::FLAT_SCR:
+ case AMDGPU::FLAT_SCR_LO:
+ case AMDGPU::FLAT_SCR_HI:
+ return false;
+ default:
+ return true;
+ }
+ }
+
+ // VI only has 102 SGPRs, so make sure we aren't trying to use the 2 more that
+ // SI/CI have.
+ for (MCRegAliasIterator R(AMDGPU::SGPR102_SGPR103, &MRI, true);
+ R.isValid(); ++R) {
+ if (*R == RegNo)
+ return false;
+ }
+
return true;
}
@@ -943,13 +1124,11 @@ AMDGPUAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) {
int64_t IntVal;
if (getParser().parseAbsoluteExpression(IntVal))
return MatchOperand_ParseFail;
- APInt IntVal32(32, IntVal);
- if (IntVal32.getSExtValue() != IntVal) {
+ if (!isInt<32>(IntVal) && !isUInt<32>(IntVal)) {
Error(S, "invalid immediate: only 32-bit values are legal");
return MatchOperand_ParseFail;
}
- IntVal = IntVal32.getSExtValue();
if (Negate)
IntVal *= -1;
Operands.push_back(AMDGPUOperand::CreateImm(IntVal, S));
@@ -1002,7 +1181,7 @@ AMDGPUAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) {
Operands.push_back(AMDGPUOperand::CreateReg(
- RegNo, S, E, getContext().getRegisterInfo(),
+ RegNo, S, E, getContext().getRegisterInfo(), &getSTI(),
isForcedVOP3()));
if (HasModifiers || Modifiers) {
@@ -1571,6 +1750,23 @@ AMDGPUAsmParser::parseR128(OperandVector &Operands) {
}
//===----------------------------------------------------------------------===//
+// smrd
+//===----------------------------------------------------------------------===//
+
+bool AMDGPUOperand::isSMRDOffset() const {
+
+  // FIXME: Support 20-bit offsets on VI. We need to pass subtarget
+ // information here.
+ return isImm() && isUInt<8>(getImm());
+}
+
+bool AMDGPUOperand::isSMRDLiteralOffset() const {
+ // 32-bit literals are only supported on CI and we only want to use them
+  // when the offset is > 8 bits.
+ return isImm() && !isUInt<8>(getImm()) && isUInt<32>(getImm());
+}
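A quick illustration of the two predicates with example immediates (the values are illustrative, not taken from the patch):

#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>

int main() {
  int64_t Small = 0x40;    // fits the 8-bit SMRD offset field -> isSMRDOffset()
  int64_t Large = 0x12345; // needs the CI 32-bit literal -> isSMRDLiteralOffset()
  assert(llvm::isUInt<8>(Small));
  assert(!llvm::isUInt<8>(Large) && llvm::isUInt<32>(Large));
  return 0;
}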
+
+//===----------------------------------------------------------------------===//
// vop3
//===----------------------------------------------------------------------===//
@@ -1653,8 +1849,12 @@ AMDGPUAsmParser::parseVOP3OptionalOps(OperandVector &Operands) {
}
void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands) {
- ((AMDGPUOperand &)*Operands[1]).addRegOperands(Inst, 1);
- unsigned i = 2;
+
+ unsigned i = 1;
+ const MCInstrDesc &Desc = MII.get(Inst.getOpcode());
+ if (Desc.getNumDefs() > 0) {
+ ((AMDGPUOperand &)*Operands[i++]).addRegOperands(Inst, 1);
+ }
std::map<enum AMDGPUOperand::ImmTy, unsigned> OptionalIdx;
diff --git a/contrib/llvm/lib/Target/AMDGPU/CIInstructions.td b/contrib/llvm/lib/Target/AMDGPU/CIInstructions.td
index 2f5fdbe..88a090d 100644
--- a/contrib/llvm/lib/Target/AMDGPU/CIInstructions.td
+++ b/contrib/llvm/lib/Target/AMDGPU/CIInstructions.td
@@ -8,6 +8,22 @@
//===----------------------------------------------------------------------===//
// Instruction definitions for CI and newer.
//===----------------------------------------------------------------------===//
+// Remaining instructions:
+// S_CBRANCH_CDBGUSER
+// S_CBRANCH_CDBGSYS
+// S_CBRANCH_CDBGSYS_OR_USER
+// S_CBRANCH_CDBGSYS_AND_USER
+// DS_NOP
+// DS_GWS_SEMA_RELEASE_ALL
+// DS_WRAP_RTN_B32
+// DS_CNDXCHG32_RTN_B64
+// DS_WRITE_B96
+// DS_WRITE_B128
+// DS_CONDXCHG32_RTN_B128
+// DS_READ_B96
+// DS_READ_B128
+// BUFFER_LOAD_DWORDX3
+// BUFFER_STORE_DWORDX3
def isCIVI : Predicate <
@@ -23,6 +39,7 @@ def HasFlatAddressSpace : Predicate<"Subtarget->hasFlatAddressSpace()">;
let SubtargetPredicate = isCIVI in {
+let SchedRW = [WriteDoubleAdd] in {
defm V_TRUNC_F64 : VOP1Inst <vop1<0x17>, "v_trunc_f64",
VOP_F64_F64, ftrunc
>;
@@ -35,82 +52,218 @@ defm V_FLOOR_F64 : VOP1Inst <vop1<0x1A>, "v_floor_f64",
defm V_RNDNE_F64 : VOP1Inst <vop1<0x19>, "v_rndne_f64",
VOP_F64_F64, frint
>;
+} // End SchedRW = [WriteDoubleAdd]
+
+let SchedRW = [WriteQuarterRate32] in {
defm V_LOG_LEGACY_F32 : VOP1Inst <vop1<0x45, 0x4c>, "v_log_legacy_f32",
VOP_F32_F32
>;
defm V_EXP_LEGACY_F32 : VOP1Inst <vop1<0x46, 0x4b>, "v_exp_legacy_f32",
VOP_F32_F32
>;
+} // End SchedRW = [WriteQuarterRate32]
+
+//===----------------------------------------------------------------------===//
+// VOP3 Instructions
+//===----------------------------------------------------------------------===//
+
+defm V_QSAD_PK_U16_U8 : VOP3Inst <vop3<0x173>, "v_qsad_pk_u16_u8",
+ VOP_I32_I32_I32
+>;
+defm V_MQSAD_U16_U8 : VOP3Inst <vop3<0x172>, "v_mqsad_u16_u8",
+ VOP_I32_I32_I32
+>;
+defm V_MQSAD_U32_U8 : VOP3Inst <vop3<0x175>, "v_mqsad_u32_u8",
+ VOP_I32_I32_I32
+>;
+
+let isCommutable = 1 in {
+defm V_MAD_U64_U32 : VOP3Inst <vop3<0x176>, "v_mad_u64_u32",
+ VOP_I64_I32_I32_I64
+>;
+
+// XXX - Does this set VCC?
+defm V_MAD_I64_I32 : VOP3Inst <vop3<0x177>, "v_mad_i64_i32",
+ VOP_I64_I32_I32_I64
+>;
+} // End isCommutable = 1
+
+
+//===----------------------------------------------------------------------===//
+// DS Instructions
+//===----------------------------------------------------------------------===//
+defm DS_WRAP_RTN_F32 : DS_1A1D_RET <0x34, "ds_wrap_rtn_f32", VGPR_32, "ds_wrap_f32">;
+
+// DS_CONDXCHG32_RTN_B64
+// DS_CONDXCHG32_RTN_B128
+
+//===----------------------------------------------------------------------===//
+// SMRD Instructions
+//===----------------------------------------------------------------------===//
+
+defm S_DCACHE_INV_VOL : SMRD_Inval <smrd<0x1d, 0x22>,
+ "s_dcache_inv_vol", int_amdgcn_s_dcache_inv_vol>;
+
+//===----------------------------------------------------------------------===//
+// MUBUF Instructions
+//===----------------------------------------------------------------------===//
+
+defm BUFFER_WBINVL1_VOL : MUBUF_Invalidate <mubuf<0x70, 0x3f>,
+ "buffer_wbinvl1_vol", int_amdgcn_buffer_wbinvl1_vol
+>;
//===----------------------------------------------------------------------===//
// Flat Instructions
//===----------------------------------------------------------------------===//
-def FLAT_LOAD_UBYTE : FLAT_Load_Helper <0x8, "flat_load_ubyte", VGPR_32>;
-def FLAT_LOAD_SBYTE : FLAT_Load_Helper <0x9, "flat_load_sbyte", VGPR_32>;
-def FLAT_LOAD_USHORT : FLAT_Load_Helper <0xa, "flat_load_ushort", VGPR_32>;
-def FLAT_LOAD_SSHORT : FLAT_Load_Helper <0xb, "flat_load_sshort", VGPR_32>;
-def FLAT_LOAD_DWORD : FLAT_Load_Helper <0xc, "flat_load_dword", VGPR_32>;
-def FLAT_LOAD_DWORDX2 : FLAT_Load_Helper <0xd, "flat_load_dwordx2", VReg_64>;
-def FLAT_LOAD_DWORDX4 : FLAT_Load_Helper <0xe, "flat_load_dwordx4", VReg_128>;
-def FLAT_LOAD_DWORDX3 : FLAT_Load_Helper <0xf, "flat_load_dwordx3", VReg_96>;
-def FLAT_STORE_BYTE : FLAT_Store_Helper <0x18, "flat_store_byte", VGPR_32>;
-def FLAT_STORE_SHORT : FLAT_Store_Helper <0x1a, "flat_store_short", VGPR_32>;
-def FLAT_STORE_DWORD : FLAT_Store_Helper <0x1c, "flat_store_dword", VGPR_32>;
-def FLAT_STORE_DWORDX2 : FLAT_Store_Helper <
- 0x1d, "flat_store_dwordx2", VReg_64
+defm FLAT_LOAD_UBYTE : FLAT_Load_Helper <
+ flat<0x8, 0x10>, "flat_load_ubyte", VGPR_32
+>;
+defm FLAT_LOAD_SBYTE : FLAT_Load_Helper <
+ flat<0x9, 0x11>, "flat_load_sbyte", VGPR_32
+>;
+defm FLAT_LOAD_USHORT : FLAT_Load_Helper <
+ flat<0xa, 0x12>, "flat_load_ushort", VGPR_32
+>;
+defm FLAT_LOAD_SSHORT : FLAT_Load_Helper <
+  flat<0xb, 0x13>, "flat_load_sshort", VGPR_32
+>;
+defm FLAT_LOAD_DWORD : FLAT_Load_Helper <
+ flat<0xc, 0x14>, "flat_load_dword", VGPR_32
+>;
+defm FLAT_LOAD_DWORDX2 : FLAT_Load_Helper <
+ flat<0xd, 0x15>, "flat_load_dwordx2", VReg_64
+>;
+defm FLAT_LOAD_DWORDX4 : FLAT_Load_Helper <
+ flat<0xe, 0x17>, "flat_load_dwordx4", VReg_128
+>;
+defm FLAT_LOAD_DWORDX3 : FLAT_Load_Helper <
+ flat<0xf, 0x16>, "flat_load_dwordx3", VReg_96
+>;
+defm FLAT_STORE_BYTE : FLAT_Store_Helper <
+ flat<0x18>, "flat_store_byte", VGPR_32
+>;
+defm FLAT_STORE_SHORT : FLAT_Store_Helper <
+ flat <0x1a>, "flat_store_short", VGPR_32
+>;
+defm FLAT_STORE_DWORD : FLAT_Store_Helper <
+ flat<0x1c>, "flat_store_dword", VGPR_32
+>;
+defm FLAT_STORE_DWORDX2 : FLAT_Store_Helper <
+ flat<0x1d>, "flat_store_dwordx2", VReg_64
+>;
+defm FLAT_STORE_DWORDX4 : FLAT_Store_Helper <
+ flat<0x1e, 0x1f>, "flat_store_dwordx4", VReg_128
>;
-def FLAT_STORE_DWORDX4 : FLAT_Store_Helper <
- 0x1e, "flat_store_dwordx4", VReg_128
+defm FLAT_STORE_DWORDX3 : FLAT_Store_Helper <
+ flat<0x1f, 0x1e>, "flat_store_dwordx3", VReg_96
>;
-def FLAT_STORE_DWORDX3 : FLAT_Store_Helper <
- 0x1f, "flat_store_dwordx3", VReg_96
+defm FLAT_ATOMIC_SWAP : FLAT_ATOMIC <
+ flat<0x30, 0x40>, "flat_atomic_swap", VGPR_32
>;
-defm FLAT_ATOMIC_SWAP : FLAT_ATOMIC <0x30, "flat_atomic_swap", VGPR_32>;
defm FLAT_ATOMIC_CMPSWAP : FLAT_ATOMIC <
- 0x31, "flat_atomic_cmpswap", VGPR_32, VReg_64
->;
-defm FLAT_ATOMIC_ADD : FLAT_ATOMIC <0x32, "flat_atomic_add", VGPR_32>;
-defm FLAT_ATOMIC_SUB : FLAT_ATOMIC <0x33, "flat_atomic_sub", VGPR_32>;
-defm FLAT_ATOMIC_RSUB : FLAT_ATOMIC <0x34, "flat_atomic_rsub", VGPR_32>;
-defm FLAT_ATOMIC_SMIN : FLAT_ATOMIC <0x35, "flat_atomic_smin", VGPR_32>;
-defm FLAT_ATOMIC_UMIN : FLAT_ATOMIC <0x36, "flat_atomic_umin", VGPR_32>;
-defm FLAT_ATOMIC_SMAX : FLAT_ATOMIC <0x37, "flat_atomic_smax", VGPR_32>;
-defm FLAT_ATOMIC_UMAX : FLAT_ATOMIC <0x38, "flat_atomic_umax", VGPR_32>;
-defm FLAT_ATOMIC_AND : FLAT_ATOMIC <0x39, "flat_atomic_and", VGPR_32>;
-defm FLAT_ATOMIC_OR : FLAT_ATOMIC <0x3a, "flat_atomic_or", VGPR_32>;
-defm FLAT_ATOMIC_XOR : FLAT_ATOMIC <0x3b, "flat_atomic_xor", VGPR_32>;
-defm FLAT_ATOMIC_INC : FLAT_ATOMIC <0x3c, "flat_atomic_inc", VGPR_32>;
-defm FLAT_ATOMIC_DEC : FLAT_ATOMIC <0x3d, "flat_atomic_dec", VGPR_32>;
-defm FLAT_ATOMIC_FCMPSWAP : FLAT_ATOMIC <
- 0x3e, "flat_atomic_fcmpswap", VGPR_32, VReg_64
+ flat<0x31, 0x41>, "flat_atomic_cmpswap", VGPR_32, VReg_64
+>;
+defm FLAT_ATOMIC_ADD : FLAT_ATOMIC <
+ flat<0x32, 0x42>, "flat_atomic_add", VGPR_32
+>;
+defm FLAT_ATOMIC_SUB : FLAT_ATOMIC <
+ flat<0x33, 0x43>, "flat_atomic_sub", VGPR_32
+>;
+defm FLAT_ATOMIC_SMIN : FLAT_ATOMIC <
+ flat<0x35, 0x44>, "flat_atomic_smin", VGPR_32
+>;
+defm FLAT_ATOMIC_UMIN : FLAT_ATOMIC <
+ flat<0x36, 0x45>, "flat_atomic_umin", VGPR_32
+>;
+defm FLAT_ATOMIC_SMAX : FLAT_ATOMIC <
+ flat<0x37, 0x46>, "flat_atomic_smax", VGPR_32
+>;
+defm FLAT_ATOMIC_UMAX : FLAT_ATOMIC <
+ flat<0x38, 0x47>, "flat_atomic_umax", VGPR_32
+>;
+defm FLAT_ATOMIC_AND : FLAT_ATOMIC <
+ flat<0x39, 0x48>, "flat_atomic_and", VGPR_32
+>;
+defm FLAT_ATOMIC_OR : FLAT_ATOMIC <
+ flat<0x3a, 0x49>, "flat_atomic_or", VGPR_32
+>;
+defm FLAT_ATOMIC_XOR : FLAT_ATOMIC <
+ flat<0x3b, 0x4a>, "flat_atomic_xor", VGPR_32
+>;
+defm FLAT_ATOMIC_INC : FLAT_ATOMIC <
+ flat<0x3c, 0x4b>, "flat_atomic_inc", VGPR_32
+>;
+defm FLAT_ATOMIC_DEC : FLAT_ATOMIC <
+ flat<0x3d, 0x4c>, "flat_atomic_dec", VGPR_32
+>;
+defm FLAT_ATOMIC_SWAP_X2 : FLAT_ATOMIC <
+ flat<0x50, 0x60>, "flat_atomic_swap_x2", VReg_64
>;
-defm FLAT_ATOMIC_FMIN : FLAT_ATOMIC <0x3f, "flat_atomic_fmin", VGPR_32>;
-defm FLAT_ATOMIC_FMAX : FLAT_ATOMIC <0x40, "flat_atomic_fmax", VGPR_32>;
-defm FLAT_ATOMIC_SWAP_X2 : FLAT_ATOMIC <0x50, "flat_atomic_swap_x2", VReg_64>;
defm FLAT_ATOMIC_CMPSWAP_X2 : FLAT_ATOMIC <
- 0x51, "flat_atomic_cmpswap_x2", VReg_64, VReg_128
->;
-defm FLAT_ATOMIC_ADD_X2 : FLAT_ATOMIC <0x52, "flat_atomic_add_x2", VReg_64>;
-defm FLAT_ATOMIC_SUB_X2 : FLAT_ATOMIC <0x53, "flat_atomic_sub_x2", VReg_64>;
-defm FLAT_ATOMIC_RSUB_X2 : FLAT_ATOMIC <0x54, "flat_atomic_rsub_x2", VReg_64>;
-defm FLAT_ATOMIC_SMIN_X2 : FLAT_ATOMIC <0x55, "flat_atomic_smin_x2", VReg_64>;
-defm FLAT_ATOMIC_UMIN_X2 : FLAT_ATOMIC <0x56, "flat_atomic_umin_x2", VReg_64>;
-defm FLAT_ATOMIC_SMAX_X2 : FLAT_ATOMIC <0x57, "flat_atomic_smax_x2", VReg_64>;
-defm FLAT_ATOMIC_UMAX_X2 : FLAT_ATOMIC <0x58, "flat_atomic_umax_x2", VReg_64>;
-defm FLAT_ATOMIC_AND_X2 : FLAT_ATOMIC <0x59, "flat_atomic_and_x2", VReg_64>;
-defm FLAT_ATOMIC_OR_X2 : FLAT_ATOMIC <0x5a, "flat_atomic_or_x2", VReg_64>;
-defm FLAT_ATOMIC_XOR_X2 : FLAT_ATOMIC <0x5b, "flat_atomic_xor_x2", VReg_64>;
-defm FLAT_ATOMIC_INC_X2 : FLAT_ATOMIC <0x5c, "flat_atomic_inc_x2", VReg_64>;
-defm FLAT_ATOMIC_DEC_X2 : FLAT_ATOMIC <0x5d, "flat_atomic_dec_x2", VReg_64>;
-defm FLAT_ATOMIC_FCMPSWAP_X2 : FLAT_ATOMIC <
- 0x5e, "flat_atomic_fcmpswap_x2", VReg_64, VReg_128
+ flat<0x51, 0x61>, "flat_atomic_cmpswap_x2", VReg_64, VReg_128
+>;
+defm FLAT_ATOMIC_ADD_X2 : FLAT_ATOMIC <
+ flat<0x52, 0x62>, "flat_atomic_add_x2", VReg_64
+>;
+defm FLAT_ATOMIC_SUB_X2 : FLAT_ATOMIC <
+ flat<0x53, 0x63>, "flat_atomic_sub_x2", VReg_64
+>;
+defm FLAT_ATOMIC_SMIN_X2 : FLAT_ATOMIC <
+ flat<0x55, 0x64>, "flat_atomic_smin_x2", VReg_64
+>;
+defm FLAT_ATOMIC_UMIN_X2 : FLAT_ATOMIC <
+ flat<0x56, 0x65>, "flat_atomic_umin_x2", VReg_64
+>;
+defm FLAT_ATOMIC_SMAX_X2 : FLAT_ATOMIC <
+ flat<0x57, 0x66>, "flat_atomic_smax_x2", VReg_64
+>;
+defm FLAT_ATOMIC_UMAX_X2 : FLAT_ATOMIC <
+ flat<0x58, 0x67>, "flat_atomic_umax_x2", VReg_64
+>;
+defm FLAT_ATOMIC_AND_X2 : FLAT_ATOMIC <
+ flat<0x59, 0x68>, "flat_atomic_and_x2", VReg_64
+>;
+defm FLAT_ATOMIC_OR_X2 : FLAT_ATOMIC <
+ flat<0x5a, 0x69>, "flat_atomic_or_x2", VReg_64
+>;
+defm FLAT_ATOMIC_XOR_X2 : FLAT_ATOMIC <
+ flat<0x5b, 0x6a>, "flat_atomic_xor_x2", VReg_64
+>;
+defm FLAT_ATOMIC_INC_X2 : FLAT_ATOMIC <
+ flat<0x5c, 0x6b>, "flat_atomic_inc_x2", VReg_64
+>;
+defm FLAT_ATOMIC_DEC_X2 : FLAT_ATOMIC <
+ flat<0x5d, 0x6c>, "flat_atomic_dec_x2", VReg_64
>;
-defm FLAT_ATOMIC_FMIN_X2 : FLAT_ATOMIC <0x5f, "flat_atomic_fmin_x2", VReg_64>;
-defm FLAT_ATOMIC_FMAX_X2 : FLAT_ATOMIC <0x60, "flat_atomic_fmax_x2", VReg_64>;
} // End SubtargetPredicate = isCIVI
+// CI Only flat instructions
+
+let SubtargetPredicate = isCI, VIAssemblerPredicate = DisableInst in {
+
+defm FLAT_ATOMIC_FCMPSWAP : FLAT_ATOMIC <
+ flat<0x3e>, "flat_atomic_fcmpswap", VGPR_32, VReg_64
+>;
+defm FLAT_ATOMIC_FMIN : FLAT_ATOMIC <
+ flat<0x3f>, "flat_atomic_fmin", VGPR_32
+>;
+defm FLAT_ATOMIC_FMAX : FLAT_ATOMIC <
+ flat<0x40>, "flat_atomic_fmax", VGPR_32
+>;
+defm FLAT_ATOMIC_FCMPSWAP_X2 : FLAT_ATOMIC <
+ flat<0x5e>, "flat_atomic_fcmpswap_x2", VReg_64, VReg_128
+>;
+defm FLAT_ATOMIC_FMIN_X2 : FLAT_ATOMIC <
+ flat<0x5f>, "flat_atomic_fmin_x2", VReg_64
+>;
+defm FLAT_ATOMIC_FMAX_X2 : FLAT_ATOMIC <
+ flat<0x60>, "flat_atomic_fmax_x2", VReg_64
+>;
+
+} // End let SubtargetPredicate = isCI, VIAssemblerPredicate = DisableInst
+
//===----------------------------------------------------------------------===//
// Flat Patterns
//===----------------------------------------------------------------------===//
@@ -147,3 +300,80 @@ def : FLATStore_Pattern <FLAT_STORE_DWORDX4, v4i32, flat_store>;
} // End HasFlatAddressSpace predicate
+let Predicates = [isCI] in {
+
+// Convert (x - floor(x)) to fract(x)
+def : Pat <
+ (f32 (fsub (f32 (VOP3Mods f32:$x, i32:$mods)),
+ (f32 (ffloor (f32 (VOP3Mods f32:$x, i32:$mods)))))),
+ (V_FRACT_F32_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE)
+>;
+
+// Convert (x + (-floor(x))) to fract(x)
+def : Pat <
+ (f64 (fadd (f64 (VOP3Mods f64:$x, i32:$mods)),
+ (f64 (fneg (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))))))),
+ (V_FRACT_F64_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE)
+>;
+
+} // End Predicates = [isCI]
+
+
+//===----------------------------------------------------------------------===//
+// Patterns to generate flat for global
+//===----------------------------------------------------------------------===//
+
+def useFlatForGlobal : Predicate <
+ "Subtarget->useFlatForGlobal() || "
+ "Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS">;
+
+let Predicates = [useFlatForGlobal] in {
+
+// 1. Offset as 20bit DWORD immediate
+def : Pat <
+ (SIload_constant v4i32:$sbase, IMM20bit:$offset),
+ (S_BUFFER_LOAD_DWORD_IMM $sbase, (as_i32imm $offset))
+>;
+
+// Patterns for global loads with no offset
+class FlatLoadPat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat <
+ (vt (node i64:$addr)),
+ (inst $addr, 0, 0, 0)
+>;
+
+def : FlatLoadPat <FLAT_LOAD_UBYTE, az_extloadi8_global, i32>;
+def : FlatLoadPat <FLAT_LOAD_SBYTE, sextloadi8_global, i32>;
+def : FlatLoadPat <FLAT_LOAD_USHORT, az_extloadi16_global, i32>;
+def : FlatLoadPat <FLAT_LOAD_SSHORT, sextloadi16_global, i32>;
+def : FlatLoadPat <FLAT_LOAD_DWORD, global_load, i32>;
+def : FlatLoadPat <FLAT_LOAD_DWORDX2, global_load, v2i32>;
+def : FlatLoadPat <FLAT_LOAD_DWORDX4, global_load, v4i32>;
+
+class FlatStorePat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat <
+ (node vt:$data, i64:$addr),
+ (inst $data, $addr, 0, 0, 0)
+>;
+
+def : FlatStorePat <FLAT_STORE_BYTE, truncstorei8_global, i32>;
+def : FlatStorePat <FLAT_STORE_SHORT, truncstorei16_global, i32>;
+def : FlatStorePat <FLAT_STORE_DWORD, global_store, i32>;
+def : FlatStorePat <FLAT_STORE_DWORDX2, global_store, v2i32>;
+def : FlatStorePat <FLAT_STORE_DWORDX4, global_store, v4i32>;
+
+class FlatAtomicPat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat <
+ (vt (node i64:$addr, vt:$data)),
+ (inst $addr, $data, 0, 0)
+>;
+
+def : FlatAtomicPat <FLAT_ATOMIC_ADD_RTN, atomic_add_global, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_AND_RTN, atomic_and_global, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_SUB_RTN, atomic_sub_global, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_SMAX_RTN, atomic_max_global, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_UMAX_RTN, atomic_umax_global, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_SMIN_RTN, atomic_min_global, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_UMIN_RTN, atomic_umin_global, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_OR_RTN, atomic_or_global, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_SWAP_RTN, atomic_swap_global, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_XOR_RTN, atomic_xor_global, i32>;
+
+} // End Predicates = [useFlatForGlobal]
diff --git a/contrib/llvm/lib/Target/AMDGPU/CaymanInstructions.td b/contrib/llvm/lib/Target/AMDGPU/CaymanInstructions.td
index ba4df82..a6c3785 100644
--- a/contrib/llvm/lib/Target/AMDGPU/CaymanInstructions.td
+++ b/contrib/llvm/lib/Target/AMDGPU/CaymanInstructions.td
@@ -82,6 +82,10 @@ def RAT_STORE_DWORD32 : RAT_STORE_DWORD <R600_TReg32_X, i32, 0x1>;
def RAT_STORE_DWORD64 : RAT_STORE_DWORD <R600_Reg64, v2i32, 0x3>;
def RAT_STORE_DWORD128 : RAT_STORE_DWORD <R600_Reg128, v4i32, 0xf>;
+def RAT_STORE_TYPED_cm: CF_MEM_RAT_STORE_TYPED<0> {
+ let eop = 0; // This bit is not used on Cayman.
+}
+
class VTX_READ_cm <string name, bits<8> buffer_id, dag outs, list<dag> pattern>
: VTX_WORD0_cm, VTX_READ<name, buffer_id, outs, pattern> {
diff --git a/contrib/llvm/lib/Target/AMDGPU/EvergreenInstructions.td b/contrib/llvm/lib/Target/AMDGPU/EvergreenInstructions.td
index 7adcd46..779a14e 100644
--- a/contrib/llvm/lib/Target/AMDGPU/EvergreenInstructions.td
+++ b/contrib/llvm/lib/Target/AMDGPU/EvergreenInstructions.td
@@ -40,6 +40,15 @@ class CF_MEM_RAT <bits<6> rat_inst, bits<4> rat_id, dag ins, string name,
: EG_CF_RAT <0x56, rat_inst, rat_id, 0xf /* mask */, (outs), ins,
"MEM_RAT "#name, pattern>;
+class CF_MEM_RAT_STORE_TYPED<bits<1> has_eop>
+ : CF_MEM_RAT <0x1, ?, (ins R600_Reg128:$rw_gpr, R600_Reg128:$index_gpr,
+ i32imm:$rat_id, InstFlag:$eop),
+ "STORE_TYPED RAT($rat_id) $rw_gpr, $index_gpr"
+ #!if(has_eop, ", $eop", ""),
+ [(int_r600_rat_store_typed R600_Reg128:$rw_gpr,
+ R600_Reg128:$index_gpr,
+ (i32 imm:$rat_id))]>;
+
def RAT_MSKOR : CF_MEM_RAT <0x11, 0,
(ins R600_Reg128:$rw_gpr, R600_TReg32_X:$index_gpr),
"MSKOR $rw_gpr.XW, $index_gpr",
@@ -105,6 +114,8 @@ def RAT_WRITE_CACHELESS_128_eg : CF_MEM_RAT_CACHELESS <0x2, 0, 0xf,
[(global_store v4i32:$rw_gpr, i32:$index_gpr)]
>;
+def RAT_STORE_TYPED_eg: CF_MEM_RAT_STORE_TYPED<1>;
+
} // End usesCustomInserter = 1
class VTX_READ_eg <string name, bits<8> buffer_id, dag outs, list<dag> pattern>
diff --git a/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp b/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp
index e811d5c..a187de8 100644
--- a/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp
@@ -16,6 +16,7 @@
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
using namespace llvm;
@@ -283,8 +284,13 @@ void AMDGPUInstPrinter::printImmediate64(uint64_t Imm, raw_ostream &O) {
O << "4.0";
else if (Imm == DoubleToBits(-4.0))
O << "-4.0";
- else
- llvm_unreachable("64-bit literal constants not supported");
+ else {
+ assert(isUInt<32>(Imm));
+
+ // In rare situations, we will have a 32-bit literal in a 64-bit
+ // operand. This is technically allowed for the encoding of s_mov_b64.
+ O << formatHex(static_cast<uint64_t>(Imm));
+ }
}
void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
@@ -592,11 +598,11 @@ void AMDGPUInstPrinter::printSendMsg(const MCInst *MI, unsigned OpNo,
} else {
unsigned Stream = (SImm16 >> 8) & 0x3;
if (Op == 1)
- O << "cut";
+ O << "cut";
else if (Op == 2)
- O << "emit";
+ O << "emit";
else if (Op == 3)
- O << "emit-cut";
+ O << "emit-cut";
O << " stream " << Stream;
}
O << "), [m0] ";
diff --git a/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h b/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h
index 14fb511..90541d8 100644
--- a/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h
+++ b/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h
@@ -13,9 +13,7 @@
#ifndef LLVM_LIB_TARGET_R600_INSTPRINTER_AMDGPUINSTPRINTER_H
#define LLVM_LIB_TARGET_R600_INSTPRINTER_AMDGPUINSTPRINTER_H
-#include "llvm/ADT/StringRef.h"
#include "llvm/MC/MCInstPrinter.h"
-#include "llvm/Support/raw_ostream.h"
namespace llvm {
diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
index 4434d9b..60e8c8f 100644
--- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
@@ -99,14 +99,22 @@ void AMDGPUAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
case AMDGPU::fixup_si_rodata: {
uint32_t *Dst = (uint32_t*)(Data + Fixup.getOffset());
- *Dst = Value;
- break;
- }
-
- case AMDGPU::fixup_si_end_of_text: {
- uint32_t *Dst = (uint32_t*)(Data + Fixup.getOffset());
- // The value points to the last instruction in the text section, so we
- // need to add 4 bytes to get to the start of the constants.
+ // We emit constant data at the end of the text section and generate its
+ // address using the following code sequence:
+ // s_getpc_b64 s[0:1]
+ // s_add_u32 s0, s0, $symbol
+ // s_addc_u32 s1, s1, 0
+ //
+ // s_getpc_b64 returns the address of the s_add_u32 instruction and then
+ // the fixup replaces $symbol with a literal constant, which is a
+ // pc-relative offset from the encoding of the $symbol operand to the
+ // constant data.
+ //
+ // What we want here is an offset from the start of the s_add_u32
+ // instruction to the constant data, but since the encoding of $symbol
+ // starts 4 bytes after the start of the add instruction, we end up
+ // with an offset that is 4 bytes too small. This requires us to
+ // add 4 to the fixup value before applying it.
*Dst = Value + 4;
break;
}
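A worked example of the +4 adjustment, using made-up addresses:

#include <cassert>
#include <cstdint>

int main() {
  const uint64_t AddInstAddr   = 0x100; // start of the s_add_u32 instruction
  const uint64_t ConstDataAddr = 0x180; // start of the constant data
  // The pc-relative fixup is evaluated at the $symbol operand, which is
  // encoded 4 bytes into the s_add_u32 instruction.
  uint64_t FixupValue = ConstDataAddr - (AddInstAddr + 4);
  // The sequence needs the offset from the start of s_add_u32 itself,
  // which is what adding 4 restores.
  assert(FixupValue + 4 == ConstDataAddr - AddInstAddr);
  return 0;
}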
@@ -136,8 +144,7 @@ const MCFixupKindInfo &AMDGPUAsmBackend::getFixupKindInfo(
const static MCFixupKindInfo Infos[AMDGPU::NumTargetFixupKinds] = {
// name offset bits flags
{ "fixup_si_sopp_br", 0, 16, MCFixupKindInfo::FKF_IsPCRel },
- { "fixup_si_rodata", 0, 32, 0 },
- { "fixup_si_end_of_text", 0, 32, MCFixupKindInfo::FKF_IsPCRel }
+ { "fixup_si_rodata", 0, 32, MCFixupKindInfo::FKF_IsPCRel }
};
if (Kind < FirstTargetFixupKind)
diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp
new file mode 100644
index 0000000..9ff9fe7
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp
@@ -0,0 +1,26 @@
+//===-------- AMDGPUELFStreamer.cpp - ELF Object Output -------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUELFStreamer.h"
+#include "Utils/AMDGPUBaseInfo.h"
+
+using namespace llvm;
+
+void AMDGPUELFStreamer::InitSections(bool NoExecStack) {
+ // Start with the .hsatext section by default.
+ SwitchSection(AMDGPU::getHSATextSection(getContext()));
+}
+
+MCELFStreamer *llvm::createAMDGPUELFStreamer(MCContext &Context,
+ MCAsmBackend &MAB,
+ raw_pwrite_stream &OS,
+ MCCodeEmitter *Emitter,
+ bool RelaxAll) {
+ return new AMDGPUELFStreamer(Context, MAB, OS, Emitter);
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h
new file mode 100644
index 0000000..488d7e7
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h
@@ -0,0 +1,40 @@
+//===-------- AMDGPUELFStreamer.h - ELF Object Output ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is a custom MCELFStreamer which allows us to insert some hooks before
+// emitting data into an actual object file.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUELFSTREAMER_H
+#define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUELFSTREAMER_H
+
+#include "llvm/MC/MCELFStreamer.h"
+
+namespace llvm {
+class MCAsmBackend;
+class MCCodeEmitter;
+class MCContext;
+class MCSubtargetInfo;
+
+class AMDGPUELFStreamer : public MCELFStreamer {
+public:
+ AMDGPUELFStreamer(MCContext &Context, MCAsmBackend &MAB, raw_pwrite_stream &OS,
+ MCCodeEmitter *Emitter)
+ : MCELFStreamer(Context, MAB, OS, Emitter) { }
+
+  virtual void InitSections(bool NoExecStack) override;
+};
+
+MCELFStreamer *createAMDGPUELFStreamer(MCContext &Context, MCAsmBackend &MAB,
+ raw_pwrite_stream &OS,
+ MCCodeEmitter *Emitter, bool RelaxAll);
+} // namespace llvm.
+
+#endif
diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUFixupKinds.h b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUFixupKinds.h
index 01021d6..59a9178 100644
--- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUFixupKinds.h
+++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUFixupKinds.h
@@ -21,9 +21,6 @@ enum Fixups {
/// fixup for global addresses with constant initializers
fixup_si_rodata,
- /// fixup for offset from instruction to end of text section
- fixup_si_end_of_text,
-
// Marker
LastTargetFixupKind,
NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind
diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp
index 028a86d..68b1d1a 100644
--- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp
@@ -22,13 +22,6 @@ AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(const Triple &TT) : MCAsmInfoELF() {
InlineAsmEnd = ";#ASMEND";
//===--- Data Emission Directives -------------------------------------===//
- ZeroDirective = ".zero";
- AsciiDirective = ".ascii\t";
- AscizDirective = ".asciz\t";
- Data8bitsDirective = ".byte\t";
- Data16bitsDirective = ".short\t";
- Data32bitsDirective = ".long\t";
- Data64bitsDirective = ".quad\t";
SunStyleELFSectionSwitchSyntax = true;
UsesELFSectionDirectiveForBSS = true;
@@ -41,3 +34,10 @@ AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(const Triple &TT) : MCAsmInfoELF() {
//===--- Dwarf Emission Directives -----------------------------------===//
SupportsDebugInformation = true;
}
+
+bool AMDGPUMCAsmInfo::shouldOmitSectionDirective(StringRef SectionName) const {
+ return SectionName == ".hsatext" || SectionName == ".hsadata_global_agent" ||
+ SectionName == ".hsadata_global_program" ||
+ SectionName == ".hsarodata_readonly_agent" ||
+ MCAsmInfo::shouldOmitSectionDirective(SectionName);
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.h b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.h
index a5bac51..a546961 100644
--- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.h
+++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.h
@@ -21,12 +21,13 @@ class Triple;
// If you need to create another MCAsmInfo class, which inherits from MCAsmInfo,
// you will need to make sure your new class sets PrivateGlobalPrefix to
-// a prefix that won't appeary in a fuction name. The default value
+// a prefix that won't appear in a function name. The default value
// for PrivateGlobalPrefix is 'L', so it will consider any function starting
// with 'L' as a local symbol.
class AMDGPUMCAsmInfo : public MCAsmInfoELF {
public:
explicit AMDGPUMCAsmInfo(const Triple &TT);
+ bool shouldOmitSectionDirective(StringRef SectionName) const override;
};
} // namespace llvm
#endif
diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
index c709741..f704094 100644
--- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
@@ -13,6 +13,7 @@
//===----------------------------------------------------------------------===//
#include "AMDGPUMCTargetDesc.h"
+#include "AMDGPUELFStreamer.h"
#include "AMDGPUMCAsmInfo.h"
#include "AMDGPUTargetStreamer.h"
#include "InstPrinter/AMDGPUInstPrinter.h"
@@ -85,6 +86,15 @@ static MCTargetStreamer * createAMDGPUObjectTargetStreamer(
return new AMDGPUTargetELFStreamer(S);
}
+static MCStreamer *createMCStreamer(const Triple &T, MCContext &Context,
+ MCAsmBackend &MAB, raw_pwrite_stream &OS,
+ MCCodeEmitter *Emitter, bool RelaxAll) {
+ if (T.getOS() == Triple::AMDHSA)
+ return createAMDGPUELFStreamer(Context, MAB, OS, Emitter, RelaxAll);
+
+ return createELFStreamer(Context, MAB, OS, Emitter, RelaxAll);
+}
+
extern "C" void LLVMInitializeAMDGPUTargetMC() {
for (Target *T : {&TheAMDGPUTarget, &TheGCNTarget}) {
RegisterMCAsmInfo<AMDGPUMCAsmInfo> X(*T);
@@ -95,6 +105,7 @@ extern "C" void LLVMInitializeAMDGPUTargetMC() {
TargetRegistry::RegisterMCSubtargetInfo(*T, createAMDGPUMCSubtargetInfo);
TargetRegistry::RegisterMCInstPrinter(*T, createAMDGPUMCInstPrinter);
TargetRegistry::RegisterMCAsmBackend(*T, createAMDGPUAsmBackend);
+ TargetRegistry::RegisterELFStreamer(*T, createMCStreamer);
}
// R600 specific registration
diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
index 09e6cb1..b91134d 100644
--- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
@@ -13,6 +13,7 @@
#include "AMDGPUTargetStreamer.h"
#include "SIDefines.h"
+#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/Twine.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCELFStreamer.h"
@@ -220,6 +221,26 @@ AMDGPUTargetAsmStreamer::EmitAMDKernelCodeT(const amd_kernel_code_t &Header) {
}
+void AMDGPUTargetAsmStreamer::EmitAMDGPUSymbolType(StringRef SymbolName,
+ unsigned Type) {
+ switch (Type) {
+ default: llvm_unreachable("Invalid AMDGPU symbol type");
+ case ELF::STT_AMDGPU_HSA_KERNEL:
+ OS << "\t.amdgpu_hsa_kernel " << SymbolName << '\n' ;
+ break;
+ }
+}
+
+void AMDGPUTargetAsmStreamer::EmitAMDGPUHsaModuleScopeGlobal(
+ StringRef GlobalName) {
+ OS << "\t.amdgpu_hsa_module_global " << GlobalName << '\n';
+}
+
+void AMDGPUTargetAsmStreamer::EmitAMDGPUHsaProgramScopeGlobal(
+ StringRef GlobalName) {
+ OS << "\t.amdgpu_hsa_program_global " << GlobalName << '\n';
+}
+
//===----------------------------------------------------------------------===//
// AMDGPUTargetELFStreamer
//===----------------------------------------------------------------------===//
@@ -291,7 +312,35 @@ AMDGPUTargetELFStreamer::EmitAMDKernelCodeT(const amd_kernel_code_t &Header) {
MCStreamer &OS = getStreamer();
OS.PushSection();
- OS.SwitchSection(OS.getContext().getObjectFileInfo()->getTextSection());
+ // The MCObjectFileInfo that is available to the assembler is a generic
+ // implementation and not AMDGPUHSATargetObjectFile, so we can't use
+ // MCObjectFileInfo::getTextSection() here for fetching the HSATextSection.
+ OS.SwitchSection(AMDGPU::getHSATextSection(OS.getContext()));
OS.EmitBytes(StringRef((const char*)&Header, sizeof(Header)));
OS.PopSection();
}
+
+void AMDGPUTargetELFStreamer::EmitAMDGPUSymbolType(StringRef SymbolName,
+ unsigned Type) {
+ MCSymbolELF *Symbol = cast<MCSymbolELF>(
+ getStreamer().getContext().getOrCreateSymbol(SymbolName));
+ Symbol->setType(ELF::STT_AMDGPU_HSA_KERNEL);
+}
+
+void AMDGPUTargetELFStreamer::EmitAMDGPUHsaModuleScopeGlobal(
+ StringRef GlobalName) {
+
+ MCSymbolELF *Symbol = cast<MCSymbolELF>(
+ getStreamer().getContext().getOrCreateSymbol(GlobalName));
+ Symbol->setType(ELF::STT_OBJECT);
+ Symbol->setBinding(ELF::STB_LOCAL);
+}
+
+void AMDGPUTargetELFStreamer::EmitAMDGPUHsaProgramScopeGlobal(
+ StringRef GlobalName) {
+
+ MCSymbolELF *Symbol = cast<MCSymbolELF>(
+ getStreamer().getContext().getOrCreateSymbol(GlobalName));
+ Symbol->setType(ELF::STT_OBJECT);
+ Symbol->setBinding(ELF::STB_GLOBAL);
+}
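A hypothetical call site (not part of this patch) showing how an AsmPrinter-side caller could drive the new hooks through the common AMDGPUTargetStreamer interface; the symbol names are placeholders:

#include "AMDGPUTargetStreamer.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/ELF.h"

using namespace llvm;

// With the asm streamer this emits the ".amdgpu_hsa_kernel" and
// ".amdgpu_hsa_program_global" directives; with the ELF streamer it sets the
// corresponding symbol type and binding instead.
static void emitHSASymbolInfo(AMDGPUTargetStreamer &TS, StringRef Kernel,
                              StringRef Global) {
  TS.EmitAMDGPUSymbolType(Kernel, ELF::STT_AMDGPU_HSA_KERNEL);
  TS.EmitAMDGPUHsaProgramScopeGlobal(Global);
}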
diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
index d37677c..83bb728 100644
--- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
+++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
@@ -7,6 +7,9 @@
//
//===----------------------------------------------------------------------===//
+#ifndef LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUTARGETSTREAMER_H
+#define LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUTARGETSTREAMER_H
+
#include "AMDKernelCodeT.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
@@ -27,6 +30,12 @@ public:
StringRef ArchName) = 0;
virtual void EmitAMDKernelCodeT(const amd_kernel_code_t &Header) = 0;
+
+ virtual void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type) = 0;
+
+ virtual void EmitAMDGPUHsaModuleScopeGlobal(StringRef GlobalName) = 0;
+
+ virtual void EmitAMDGPUHsaProgramScopeGlobal(StringRef GlobalName) = 0;
};
class AMDGPUTargetAsmStreamer : public AMDGPUTargetStreamer {
@@ -41,6 +50,12 @@ public:
StringRef ArchName) override;
void EmitAMDKernelCodeT(const amd_kernel_code_t &Header) override;
+
+ void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type) override;
+
+ void EmitAMDGPUHsaModuleScopeGlobal(StringRef GlobalName) override;
+
+ void EmitAMDGPUHsaProgramScopeGlobal(StringRef GlobalName) override;
};
class AMDGPUTargetELFStreamer : public AMDGPUTargetStreamer {
@@ -72,6 +87,12 @@ public:
void EmitAMDKernelCodeT(const amd_kernel_code_t &Header) override;
+ void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type) override;
+
+ void EmitAMDGPUHsaModuleScopeGlobal(StringRef GlobalName) override;
+
+ void EmitAMDGPUHsaProgramScopeGlobal(StringRef GlobalName) override;
};
}
+#endif
diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp
index e683498..3c1142d 100644
--- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp
@@ -37,7 +37,6 @@ class R600MCCodeEmitter : public AMDGPUMCCodeEmitter {
const MCRegisterInfo &MRI;
public:
-
R600MCCodeEmitter(const MCInstrInfo &mcii, const MCRegisterInfo &mri)
: MCII(mcii), MRI(mri) { }
@@ -50,8 +49,8 @@ public:
uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const override;
-private:
+private:
void EmitByte(unsigned int byte, raw_ostream &OS) const;
void Emit(uint32_t value, raw_ostream &OS) const;
@@ -59,7 +58,6 @@ private:
unsigned getHWRegChan(unsigned reg) const;
unsigned getHWReg(unsigned regNo) const;
-
};
} // End anonymous namespace
@@ -83,7 +81,7 @@ enum FCInstr {
MCCodeEmitter *llvm::createR600MCCodeEmitter(const MCInstrInfo &MCII,
const MCRegisterInfo &MRI,
- MCContext &Ctx) {
+ MCContext &Ctx) {
return new R600MCCodeEmitter(MCII, MRI);
}
diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
index 65a0eeb..9eb3dad 100644
--- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
@@ -36,7 +36,6 @@ class SIMCCodeEmitter : public AMDGPUMCCodeEmitter {
void operator=(const SIMCCodeEmitter &) = delete;
const MCInstrInfo &MCII;
const MCRegisterInfo &MRI;
- MCContext &Ctx;
/// \brief Can this operand also contain immediate values?
bool isSrcOperand(const MCInstrDesc &Desc, unsigned OpNo) const;
@@ -47,7 +46,7 @@ class SIMCCodeEmitter : public AMDGPUMCCodeEmitter {
public:
SIMCCodeEmitter(const MCInstrInfo &mcii, const MCRegisterInfo &mri,
MCContext &ctx)
- : MCII(mcii), MRI(mri), Ctx(ctx) { }
+ : MCII(mcii), MRI(mri) { }
~SIMCCodeEmitter() override {}
@@ -250,17 +249,7 @@ uint64_t SIMCCodeEmitter::getMachineOpValue(const MCInst &MI,
if (MO.isExpr()) {
const MCSymbolRefExpr *Expr = cast<MCSymbolRefExpr>(MO.getExpr());
- MCFixupKind Kind;
- const MCSymbol *Sym =
- Ctx.getOrCreateSymbol(StringRef(END_OF_TEXT_LABEL_NAME));
-
- if (&Expr->getSymbol() == Sym) {
- // Add the offset to the beginning of the constant values.
- Kind = (MCFixupKind)AMDGPU::fixup_si_end_of_text;
- } else {
- // This is used for constant data stored in .rodata.
- Kind = (MCFixupKind)AMDGPU::fixup_si_rodata;
- }
+ MCFixupKind Kind = (MCFixupKind)AMDGPU::fixup_si_rodata;
Fixups.push_back(MCFixup::create(4, Expr, Kind, MI.getLoc()));
}
diff --git a/contrib/llvm/lib/Target/AMDGPU/Processors.td b/contrib/llvm/lib/Target/AMDGPU/Processors.td
index d9a0723..a1584a2 100644
--- a/contrib/llvm/lib/Target/AMDGPU/Processors.td
+++ b/contrib/llvm/lib/Target/AMDGPU/Processors.td
@@ -142,3 +142,7 @@ def : ProcessorModel<"carrizo", SIQuarterSpeedModel,
def : ProcessorModel<"fiji", SIQuarterSpeedModel,
[FeatureVolcanicIslands, FeatureISAVersion8_0_1]
>;
+
+def : ProcessorModel<"stoney", SIQuarterSpeedModel,
+ [FeatureVolcanicIslands, FeatureISAVersion8_0_1]
+>;
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp b/contrib/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp
index c8f37f6..bd80bb2 100644
--- a/contrib/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp
@@ -405,8 +405,8 @@ private:
if (MO.isReg() && MO.isInternalRead())
MO.setIsInternalRead(false);
}
- getLiteral(BI, Literals);
- ClauseContent.push_back(BI);
+ getLiteral(&*BI, Literals);
+ ClauseContent.push_back(&*BI);
}
I = BI;
DeleteMI->eraseFromParent();
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
index 4e4d554..124a9c6 100644
--- a/contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
@@ -190,6 +190,10 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM,
setSchedulingPreference(Sched::Source);
}
+static inline bool isEOP(MachineBasicBlock::iterator I) {
+ return std::next(I)->getOpcode() == AMDGPU::RETURN;
+}
+
MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
MachineInstr * MI, MachineBasicBlock * BB) const {
MachineFunction * MF = BB->getParent();
@@ -276,12 +280,18 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
case AMDGPU::RAT_WRITE_CACHELESS_64_eg:
case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
- unsigned EOP = (std::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
-
BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
.addOperand(MI->getOperand(0))
.addOperand(MI->getOperand(1))
- .addImm(EOP); // Set End of program bit
+ .addImm(isEOP(I)); // Set End of program bit
+ break;
+ }
+ case AMDGPU::RAT_STORE_TYPED_eg: {
+ BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
+ .addOperand(MI->getOperand(0))
+ .addOperand(MI->getOperand(1))
+ .addOperand(MI->getOperand(2))
+ .addImm(isEOP(I)); // Set End of program bit
break;
}
@@ -539,7 +549,7 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
}
}
}
- bool EOP = (std::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
+ bool EOP = isEOP(I);
if (!EOP && !isLastInstructionOfItsType)
return BB;
unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz)? 84 : 40;
@@ -946,6 +956,8 @@ SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
SDValue Arg = Op.getOperand(0);
SDLoc DL(Op);
+
+ // TODO: Should this propagate fast-math-flags?
SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, DL, VT,
DAG.getNode(ISD::FADD, DL, VT,
DAG.getNode(ISD::FMUL, DL, VT, Arg,
@@ -1936,6 +1948,7 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
Arg->getOperand(0).getOperand(Element));
}
}
+ break;
}
case ISD::SELECT_CC: {
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp
index 855fa9f..8b6eea1 100644
--- a/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp
@@ -922,7 +922,7 @@ bool
R600InstrInfo::isProfitableToIfCvt(MachineBasicBlock &MBB,
unsigned NumCyles,
unsigned ExtraPredCycles,
- const BranchProbability &Probability) const{
+ BranchProbability Probability) const{
return true;
}
@@ -933,14 +933,14 @@ R600InstrInfo::isProfitableToIfCvt(MachineBasicBlock &TMBB,
MachineBasicBlock &FMBB,
unsigned NumFCycles,
unsigned ExtraFCycles,
- const BranchProbability &Probability) const {
+ BranchProbability Probability) const {
return true;
}
bool
R600InstrInfo::isProfitableToDupForIfCvt(MachineBasicBlock &MBB,
unsigned NumCyles,
- const BranchProbability &Probability)
+ BranchProbability Probability)
const {
return true;
}
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.h b/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.h
index dee4c2b..e7251c3 100644
--- a/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.h
+++ b/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.h
@@ -174,18 +174,18 @@ namespace llvm {
bool
isProfitableToDupForIfCvt(MachineBasicBlock &MBB, unsigned NumCyles,
- const BranchProbability &Probability) const override;
+ BranchProbability Probability) const override;
bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCyles,
unsigned ExtraPredCycles,
- const BranchProbability &Probability) const override ;
+ BranchProbability Probability) const override;
bool
isProfitableToIfCvt(MachineBasicBlock &TMBB,
unsigned NumTCycles, unsigned ExtraTCycles,
MachineBasicBlock &FMBB,
unsigned NumFCycles, unsigned ExtraFCycles,
- const BranchProbability &Probability) const override;
+ BranchProbability Probability) const override;
bool DefinesPredicate(MachineInstr *MI,
std::vector<MachineOperand> &Pred) const override;
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600Instructions.td b/contrib/llvm/lib/Target/AMDGPU/R600Instructions.td
index 7beed09..33ef6a4 100644
--- a/contrib/llvm/lib/Target/AMDGPU/R600Instructions.td
+++ b/contrib/llvm/lib/Target/AMDGPU/R600Instructions.td
@@ -1655,7 +1655,7 @@ def : InsertVerticalPat <R600_INSERT_ELT_V4, v4f32, f32>;
// ISel Patterns
//===----------------------------------------------------------------------===//
-// CND*_INT Pattterns for f32 True / False values
+// CND*_INT Patterns for f32 True / False values
class CND_INT_f32 <InstR600 cnd, CondCode cc> : Pat <
(selectcc i32:$src0, 0, f32:$src1, f32:$src2, cc),
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp b/contrib/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp
index 0c06ccc..5efb3b9 100644
--- a/contrib/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp
@@ -318,7 +318,7 @@ bool R600VectorRegMerger::runOnMachineFunction(MachineFunction &Fn) {
MRI = &(Fn.getRegInfo());
for (MachineFunction::iterator MBB = Fn.begin(), MBBe = Fn.end();
MBB != MBBe; ++MBB) {
- MachineBasicBlock *MB = MBB;
+ MachineBasicBlock *MB = &*MBB;
PreviousRegSeq.clear();
PreviousRegSeqByReg.clear();
PreviousRegSeqByUndefCount.clear();
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600Packetizer.cpp b/contrib/llvm/lib/Target/AMDGPU/R600Packetizer.cpp
index deee5bc..2126961 100644
--- a/contrib/llvm/lib/Target/AMDGPU/R600Packetizer.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/R600Packetizer.cpp
@@ -81,11 +81,11 @@ private:
int LastDstChan = -1;
do {
bool isTrans = false;
- int BISlot = getSlot(BI);
+ int BISlot = getSlot(&*BI);
if (LastDstChan >= BISlot)
isTrans = true;
LastDstChan = BISlot;
- if (TII->isPredicated(BI))
+ if (TII->isPredicated(&*BI))
continue;
int OperandIdx = TII->getOperandIdx(BI->getOpcode(), AMDGPU::OpName::write);
if (OperandIdx > -1 && BI->getOperand(OperandIdx).getImm() == 0)
@@ -95,7 +95,7 @@ private:
continue;
}
unsigned Dst = BI->getOperand(DstIdx).getReg();
- if (isTrans || TII->isTransOnly(BI)) {
+ if (isTrans || TII->isTransOnly(&*BI)) {
Result[Dst] = AMDGPU::PS;
continue;
}
@@ -149,7 +149,7 @@ private:
public:
// Ctor.
R600PacketizerList(MachineFunction &MF, MachineLoopInfo &MLI)
- : VLIWPacketizerList(MF, MLI, true),
+ : VLIWPacketizerList(MF, MLI, nullptr),
TII(static_cast<const R600InstrInfo *>(
MF.getSubtarget().getInstrInfo())),
TRI(TII->getRegisterInfo()) {
@@ -162,14 +162,14 @@ public:
}
// ignorePseudoInstruction - Ignore bundling of pseudo instructions.
- bool ignorePseudoInstruction(MachineInstr *MI,
- MachineBasicBlock *MBB) override {
+ bool ignorePseudoInstruction(const MachineInstr *MI,
+ const MachineBasicBlock *MBB) override {
return false;
}
// isSoloInstruction - return true if instruction MI can not be packetized
// with any other instruction, which means that MI itself is a packet.
- bool isSoloInstruction(MachineInstr *MI) override {
+ bool isSoloInstruction(const MachineInstr *MI) override {
if (TII->isVector(*MI))
return true;
if (!TII->isALUInstr(MI->getOpcode()))
@@ -375,7 +375,7 @@ bool R600Packetizer::runOnMachineFunction(MachineFunction &Fn) {
// instruction stream until we find the nearest boundary.
MachineBasicBlock::iterator I = RegionEnd;
for(;I != MBB->begin(); --I, --RemainingCount) {
- if (TII->isSchedulingBoundary(std::prev(I), MBB, Fn))
+ if (TII->isSchedulingBoundary(&*std::prev(I), &*MBB, Fn))
break;
}
I = MBB->begin();
@@ -392,7 +392,7 @@ bool R600Packetizer::runOnMachineFunction(MachineFunction &Fn) {
continue;
}
- Packetizer.PacketizeMIs(MBB, I, RegionEnd);
+ Packetizer.PacketizeMIs(&*MBB, &*I, RegionEnd);
RegionEnd = I;
}
}
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.h b/contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.h
index 9713e60..4f8a129 100644
--- a/contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.h
+++ b/contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.h
@@ -35,7 +35,7 @@ struct R600RegisterInfo : public AMDGPURegisterInfo {
/// \brief get the register class of the specified type to use in the
/// CFGStructurizer
- const TargetRegisterClass * getCFGStructurizerRegClass(MVT VT) const override;
+ const TargetRegisterClass *getCFGStructurizerRegClass(MVT VT) const;
const RegClassWeight &
getRegClassWeight(const TargetRegisterClass *RC) const override;
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp b/contrib/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
index ccfbf1b..fa4d24a 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
@@ -312,11 +312,10 @@ void SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) {
if (std::find(Latches.begin(), Latches.end(), *PI) == Latches.end())
Preds.push_back(*PI);
}
- BB = llvm::SplitBlockPredecessors(BB, Preds, "endcf.split", nullptr, DT,
- LI, false);
+ BB = llvm::SplitBlockPredecessors(BB, Preds, "endcf.split", DT, LI, false);
}
- CallInst::Create(EndCf, popSaved(), "", BB->getFirstInsertionPt());
+ CallInst::Create(EndCf, popSaved(), "", &*BB->getFirstInsertionPt());
}
/// \brief Annotate the control flow with intrinsics so the backend can
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIDefines.h b/contrib/llvm/lib/Target/AMDGPU/SIDefines.h
index 4c32639..7f79dd3 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIDefines.h
+++ b/contrib/llvm/lib/Target/AMDGPU/SIDefines.h
@@ -37,7 +37,8 @@ enum {
MIMG = 1 << 18,
FLAT = 1 << 19,
WQM = 1 << 20,
- VGPRSpill = 1 << 21
+ VGPRSpill = 1 << 21,
+ VOPAsmPrefer32Bit = 1 << 22
};
}
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIFixControlFlowLiveIntervals.cpp b/contrib/llvm/lib/Target/AMDGPU/SIFixControlFlowLiveIntervals.cpp
index 5fe8d19..636750d 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIFixControlFlowLiveIntervals.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/SIFixControlFlowLiveIntervals.cpp
@@ -16,15 +16,9 @@
#include "AMDGPU.h"
#include "SIInstrInfo.h"
-#include "SIRegisterInfo.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachinePostDominators.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetMachine.h"
using namespace llvm;
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/contrib/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index 23502b4..96e37c5 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -82,22 +82,10 @@ using namespace llvm;
namespace {
class SIFixSGPRCopies : public MachineFunctionPass {
-
-private:
+public:
static char ID;
- const TargetRegisterClass *inferRegClassFromUses(const SIRegisterInfo *TRI,
- const MachineRegisterInfo &MRI,
- unsigned Reg,
- unsigned SubReg) const;
- const TargetRegisterClass *inferRegClassFromDef(const SIRegisterInfo *TRI,
- const MachineRegisterInfo &MRI,
- unsigned Reg,
- unsigned SubReg) const;
- bool isVGPRToSGPRCopy(const MachineInstr &Copy, const SIRegisterInfo *TRI,
- const MachineRegisterInfo &MRI) const;
-public:
- SIFixSGPRCopies(TargetMachine &tm) : MachineFunctionPass(ID) { }
+ SIFixSGPRCopies() : MachineFunctionPass(ID) { }
bool runOnMachineFunction(MachineFunction &MF) override;
@@ -105,14 +93,23 @@ public:
return "SI Fix SGPR copies";
}
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
};
} // End anonymous namespace
+INITIALIZE_PASS(SIFixSGPRCopies, DEBUG_TYPE,
+ "SI Fix SGPR copies", false, false)
+
char SIFixSGPRCopies::ID = 0;
-FunctionPass *llvm::createSIFixSGPRCopiesPass(TargetMachine &tm) {
- return new SIFixSGPRCopies(tm);
+char &llvm::SIFixSGPRCopiesID = SIFixSGPRCopies::ID;
+
+FunctionPass *llvm::createSIFixSGPRCopiesPass() {
+ return new SIFixSGPRCopies();
}
static bool hasVGPROperands(const MachineInstr &MI, const SIRegisterInfo *TRI) {
@@ -128,77 +125,115 @@ static bool hasVGPROperands(const MachineInstr &MI, const SIRegisterInfo *TRI) {
return false;
}
-/// This functions walks the use list of Reg until it finds an Instruction
-/// that isn't a COPY returns the register class of that instruction.
-/// \return The register defined by the first non-COPY instruction.
-const TargetRegisterClass *SIFixSGPRCopies::inferRegClassFromUses(
- const SIRegisterInfo *TRI,
- const MachineRegisterInfo &MRI,
- unsigned Reg,
- unsigned SubReg) const {
-
- const TargetRegisterClass *RC
- = TargetRegisterInfo::isVirtualRegister(Reg) ?
- MRI.getRegClass(Reg) :
- TRI->getPhysRegClass(Reg);
-
- RC = TRI->getSubRegClass(RC, SubReg);
- for (MachineRegisterInfo::use_instr_iterator
- I = MRI.use_instr_begin(Reg), E = MRI.use_instr_end(); I != E; ++I) {
- switch (I->getOpcode()) {
- case AMDGPU::COPY:
- RC = TRI->getCommonSubClass(RC, inferRegClassFromUses(TRI, MRI,
- I->getOperand(0).getReg(),
- I->getOperand(0).getSubReg()));
- break;
- }
- }
+static std::pair<const TargetRegisterClass *, const TargetRegisterClass *>
+getCopyRegClasses(const MachineInstr &Copy,
+ const SIRegisterInfo &TRI,
+ const MachineRegisterInfo &MRI) {
+ unsigned DstReg = Copy.getOperand(0).getReg();
+ unsigned SrcReg = Copy.getOperand(1).getReg();
+
+ const TargetRegisterClass *SrcRC =
+ TargetRegisterInfo::isVirtualRegister(SrcReg) ?
+ MRI.getRegClass(SrcReg) :
+ TRI.getPhysRegClass(SrcReg);
- return RC;
+ // We don't really care about the subregister here.
+ // SrcRC = TRI.getSubRegClass(SrcRC, Copy.getOperand(1).getSubReg());
+
+ const TargetRegisterClass *DstRC =
+ TargetRegisterInfo::isVirtualRegister(DstReg) ?
+ MRI.getRegClass(DstReg) :
+ TRI.getPhysRegClass(DstReg);
+
+ return std::make_pair(SrcRC, DstRC);
}
-const TargetRegisterClass *SIFixSGPRCopies::inferRegClassFromDef(
- const SIRegisterInfo *TRI,
- const MachineRegisterInfo &MRI,
- unsigned Reg,
- unsigned SubReg) const {
- if (!TargetRegisterInfo::isVirtualRegister(Reg)) {
- const TargetRegisterClass *RC = TRI->getPhysRegClass(Reg);
- return TRI->getSubRegClass(RC, SubReg);
- }
- MachineInstr *Def = MRI.getVRegDef(Reg);
- if (Def->getOpcode() != AMDGPU::COPY) {
- return TRI->getSubRegClass(MRI.getRegClass(Reg), SubReg);
- }
+static bool isVGPRToSGPRCopy(const TargetRegisterClass *SrcRC,
+ const TargetRegisterClass *DstRC,
+ const SIRegisterInfo &TRI) {
+ return TRI.isSGPRClass(DstRC) && TRI.hasVGPRs(SrcRC);
+}
- return inferRegClassFromDef(TRI, MRI, Def->getOperand(1).getReg(),
- Def->getOperand(1).getSubReg());
+static bool isSGPRToVGPRCopy(const TargetRegisterClass *SrcRC,
+ const TargetRegisterClass *DstRC,
+ const SIRegisterInfo &TRI) {
+ return TRI.isSGPRClass(SrcRC) && TRI.hasVGPRs(DstRC);
}
-bool SIFixSGPRCopies::isVGPRToSGPRCopy(const MachineInstr &Copy,
- const SIRegisterInfo *TRI,
- const MachineRegisterInfo &MRI) const {
+// Distribute an SGPR->VGPR copy of a REG_SEQUENCE into a VGPR REG_SEQUENCE.
+//
+// SGPRx = ...
+// SGPRy = REG_SEQUENCE SGPRx, sub0 ...
+// VGPRz = COPY SGPRy
+//
+// ==>
+//
+// VGPRx = COPY SGPRx
+// VGPRz = REG_SEQUENCE VGPRx, sub0
+//
+// This exposes immediate folding opportunities when materializing 64-bit
+// immediates.
+static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI,
+ const SIRegisterInfo *TRI,
+ const SIInstrInfo *TII,
+ MachineRegisterInfo &MRI) {
+ assert(MI.isRegSequence());
+
+ unsigned DstReg = MI.getOperand(0).getReg();
+ if (!TRI->isSGPRClass(MRI.getRegClass(DstReg)))
+ return false;
- unsigned DstReg = Copy.getOperand(0).getReg();
- unsigned SrcReg = Copy.getOperand(1).getReg();
- unsigned SrcSubReg = Copy.getOperand(1).getSubReg();
+ if (!MRI.hasOneUse(DstReg))
+ return false;
- if (!TargetRegisterInfo::isVirtualRegister(DstReg)) {
- // If the destination register is a physical register there isn't really
- // much we can do to fix this.
+ MachineInstr &CopyUse = *MRI.use_instr_begin(DstReg);
+ if (!CopyUse.isCopy())
return false;
- }
- const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg);
+ const TargetRegisterClass *SrcRC, *DstRC;
+ std::tie(SrcRC, DstRC) = getCopyRegClasses(CopyUse, *TRI, MRI);
- const TargetRegisterClass *SrcRC;
+ if (!isSGPRToVGPRCopy(SrcRC, DstRC, *TRI))
+ return false;
- if (!TargetRegisterInfo::isVirtualRegister(SrcReg) ||
- MRI.getRegClass(SrcReg) == &AMDGPU::VReg_1RegClass)
+ // TODO: Could have multiple extracts?
+ unsigned SubReg = CopyUse.getOperand(1).getSubReg();
+ if (SubReg != AMDGPU::NoSubRegister)
return false;
- SrcRC = TRI->getSubRegClass(MRI.getRegClass(SrcReg), SrcSubReg);
- return TRI->isSGPRClass(DstRC) && TRI->hasVGPRs(SrcRC);
+ MRI.setRegClass(DstReg, DstRC);
+
+ // SGPRx = ...
+ // SGPRy = REG_SEQUENCE SGPRx, sub0 ...
+ // VGPRz = COPY SGPRy
+
+ // =>
+ // VGPRx = COPY SGPRx
+ // VGPRz = REG_SEQUENCE VGPRx, sub0
+
+ MI.getOperand(0).setReg(CopyUse.getOperand(0).getReg());
+
+ for (unsigned I = 1, N = MI.getNumOperands(); I != N; I += 2) {
+ unsigned SrcReg = MI.getOperand(I).getReg();
+ unsigned SrcSubReg = MI.getOperand(I).getSubReg();
+
+ const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
+ assert(TRI->isSGPRClass(SrcRC) &&
+ "Expected SGPR REG_SEQUENCE to only have SGPR inputs");
+
+ SrcRC = TRI->getSubRegClass(SrcRC, SrcSubReg);
+ const TargetRegisterClass *NewSrcRC = TRI->getEquivalentVGPRClass(SrcRC);
+
+ unsigned TmpReg = MRI.createVirtualRegister(NewSrcRC);
+
+ BuildMI(*MI.getParent(), &MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY), TmpReg)
+ .addOperand(MI.getOperand(I));
+
+ MI.getOperand(I).setReg(TmpReg);
+ }
+
+ CopyUse.eraseFromParent();
+ return true;
}
bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
@@ -207,40 +242,38 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
static_cast<const SIRegisterInfo *>(MF.getSubtarget().getRegisterInfo());
const SIInstrInfo *TII =
static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
+
+ SmallVector<MachineInstr *, 16> Worklist;
+
for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
BI != BE; ++BI) {
MachineBasicBlock &MBB = *BI;
for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
- I != E; ++I) {
+ I != E; ++I) {
MachineInstr &MI = *I;
- if (MI.getOpcode() == AMDGPU::COPY && isVGPRToSGPRCopy(MI, TRI, MRI)) {
- DEBUG(dbgs() << "Fixing VGPR -> SGPR copy:\n");
- DEBUG(MI.print(dbgs()));
- TII->moveToVALU(MI);
-
- }
switch (MI.getOpcode()) {
- default: continue;
- case AMDGPU::PHI: {
- DEBUG(dbgs() << "Fixing PHI: " << MI);
-
- for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
- const MachineOperand &Op = MI.getOperand(i);
- unsigned Reg = Op.getReg();
- const TargetRegisterClass *RC
- = inferRegClassFromDef(TRI, MRI, Reg, Op.getSubReg());
+ default:
+ continue;
+ case AMDGPU::COPY: {
+ // If the destination register is a physical register there isn't really
+ // much we can do to fix this.
+ if (!TargetRegisterInfo::isVirtualRegister(MI.getOperand(0).getReg()))
+ continue;
- MRI.constrainRegClass(Op.getReg(), RC);
- }
- unsigned Reg = MI.getOperand(0).getReg();
- const TargetRegisterClass *RC = inferRegClassFromUses(TRI, MRI, Reg,
- MI.getOperand(0).getSubReg());
- if (TRI->getCommonSubClass(RC, &AMDGPU::VGPR_32RegClass)) {
- MRI.constrainRegClass(Reg, &AMDGPU::VGPR_32RegClass);
+ const TargetRegisterClass *SrcRC, *DstRC;
+ std::tie(SrcRC, DstRC) = getCopyRegClasses(MI, *TRI, MRI);
+ if (isVGPRToSGPRCopy(SrcRC, DstRC, *TRI)) {
+ DEBUG(dbgs() << "Fixing VGPR -> SGPR copy: " << MI);
+ TII->moveToVALU(MI);
}
+ break;
+ }
+ case AMDGPU::PHI: {
+ DEBUG(dbgs() << "Fixing PHI: " << MI);
+ unsigned Reg = MI.getOperand(0).getReg();
if (!TRI->isSGPRClass(MRI.getRegClass(Reg)))
break;
@@ -310,8 +343,10 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
}
case AMDGPU::REG_SEQUENCE: {
if (TRI->hasVGPRs(TII->getOpRegClass(MI, 0)) ||
- !hasVGPROperands(MI, TRI))
+ !hasVGPROperands(MI, TRI)) {
+ foldVGPRCopyIntoRegSequence(MI, TRI, TII, MRI);
continue;
+ }
DEBUG(dbgs() << "Fixing REG_SEQUENCE: " << MI);
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIFixSGPRLiveRanges.cpp b/contrib/llvm/lib/Target/AMDGPU/SIFixSGPRLiveRanges.cpp
index 0c54446..8bda283 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIFixSGPRLiveRanges.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/SIFixSGPRLiveRanges.cpp
@@ -7,9 +7,8 @@
//
//===----------------------------------------------------------------------===//
//
-/// \file
-/// SALU instructions ignore control flow, so we need to modify the live ranges
-/// of the registers they define in some cases.
+/// \file SALU instructions ignore the execution mask, so we need to modify the
+/// live ranges of the registers they define in some cases.
///
/// The main case we need to handle is when a def is used in one side of a
/// branch and not another. For example:
@@ -42,13 +41,15 @@
/// ENDIF
/// %use
///
-/// Adding this use will make the def live thoughout the IF branch, which is
+/// Adding this use will make the def live throughout the IF branch, which is
/// what we want.
#include "AMDGPU.h"
#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
+#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/LiveVariables.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachinePostDominators.h"
@@ -79,9 +80,13 @@ public:
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<LiveIntervals>();
+ AU.addRequired<LiveVariables>();
+ AU.addPreserved<LiveVariables>();
+
AU.addRequired<MachinePostDominatorTree>();
+ AU.addPreserved<MachinePostDominatorTree>();
AU.setPreservesCFG();
+
MachineFunctionPass::getAnalysisUsage(AU);
}
};
@@ -90,7 +95,7 @@ public:
INITIALIZE_PASS_BEGIN(SIFixSGPRLiveRanges, DEBUG_TYPE,
"SI Fix SGPR Live Ranges", false, false)
-INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
+INITIALIZE_PASS_DEPENDENCY(LiveVariables)
INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
INITIALIZE_PASS_END(SIFixSGPRLiveRanges, DEBUG_TYPE,
"SI Fix SGPR Live Ranges", false, false)
@@ -108,40 +113,48 @@ bool SIFixSGPRLiveRanges::runOnMachineFunction(MachineFunction &MF) {
const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(
MF.getSubtarget().getRegisterInfo());
- LiveIntervals *LIS = &getAnalysis<LiveIntervals>();
- MachinePostDominatorTree *PDT = &getAnalysis<MachinePostDominatorTree>();
- std::vector<std::pair<unsigned, LiveRange *>> SGPRLiveRanges;
+ bool MadeChange = false;
+
+ MachinePostDominatorTree *PDT = &getAnalysis<MachinePostDominatorTree>();
+ SmallVector<unsigned, 16> SGPRLiveRanges;
+
+ LiveVariables *LV = &getAnalysis<LiveVariables>();
+ MachineBasicBlock *Entry = &MF.front();
- // First pass, collect all live intervals for SGPRs
- for (const MachineBasicBlock &MBB : MF) {
- for (const MachineInstr &MI : MBB) {
+ // Use a depth first order so that in SSA, we encounter all defs before
+ // uses. Once the defs of the block have been found, attempt to insert
+ // SGPR_USE instructions in successor blocks if required.
+ for (MachineBasicBlock *MBB : depth_first(Entry)) {
+ for (const MachineInstr &MI : *MBB) {
for (const MachineOperand &MO : MI.defs()) {
- if (MO.isImplicit())
- continue;
+ // We should never see a live out def of a physical register, so we also
+ // do not need to worry about implicit_defs().
unsigned Def = MO.getReg();
if (TargetRegisterInfo::isVirtualRegister(Def)) {
- if (TRI->isSGPRClass(MRI.getRegClass(Def)))
- SGPRLiveRanges.push_back(
- std::make_pair(Def, &LIS->getInterval(Def)));
- } else if (TRI->isSGPRClass(TRI->getPhysRegClass(Def))) {
- SGPRLiveRanges.push_back(
- std::make_pair(Def, &LIS->getRegUnit(Def)));
+ if (TRI->isSGPRClass(MRI.getRegClass(Def))) {
+ // Only consider defs that are live outs. We don't care about def /
+ // use within the same block.
+
+ // LiveVariables does not consider registers that are only used in a
+ // phi in a successor block as live out, unlike LiveIntervals.
+ //
+ // This is OK because SIFixSGPRCopies replaced any SGPR phis with
+ // VGPRs.
+ if (LV->isLiveOut(Def, *MBB))
+ SGPRLiveRanges.push_back(Def);
+ }
}
}
}
- }
- // Second pass fix the intervals
- for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
- BI != BE; ++BI) {
- MachineBasicBlock &MBB = *BI;
- if (MBB.succ_size() < 2)
+ if (MBB->succ_size() < 2)
continue;
- // We have structured control flow, so number of succesors should be two.
- assert(MBB.succ_size() == 2);
- MachineBasicBlock *SuccA = *MBB.succ_begin();
- MachineBasicBlock *SuccB = *(++MBB.succ_begin());
+ // We have structured control flow, so the number of successors should be
+ // two.
+ assert(MBB->succ_size() == 2);
+ MachineBasicBlock *SuccA = *MBB->succ_begin();
+ MachineBasicBlock *SuccB = *(++MBB->succ_begin());
MachineBasicBlock *NCD = PDT->findNearestCommonDominator(SuccA, SuccB);
if (!NCD)
@@ -156,37 +169,51 @@ bool SIFixSGPRLiveRanges::runOnMachineFunction(MachineFunction &MF) {
NCD = PDT->findNearestCommonDominator(*NCD->succ_begin(),
*(++NCD->succ_begin()));
}
- assert(SuccA && SuccB);
- for (std::pair<unsigned, LiveRange*> RegLR : SGPRLiveRanges) {
- unsigned Reg = RegLR.first;
- LiveRange *LR = RegLR.second;
-
- // FIXME: We could be smarter here. If the register is Live-In to
- // one block, but the other doesn't have any SGPR defs, then there
- // won't be a conflict. Also, if the branch decision is based on
- // a value in an SGPR, then there will be no conflict.
- bool LiveInToA = LIS->isLiveInToMBB(*LR, SuccA);
- bool LiveInToB = LIS->isLiveInToMBB(*LR, SuccB);
-
- if ((!LiveInToA && !LiveInToB) ||
- (LiveInToA && LiveInToB))
+
+ for (unsigned Reg : SGPRLiveRanges) {
+ // FIXME: We could be smarter here. If the register is Live-In to one
+ // block, but the other doesn't have any SGPR defs, then there won't be a
+ // conflict. Also, if the branch condition is uniform then there will be
+ // no conflict.
+ bool LiveInToA = LV->isLiveIn(Reg, *SuccA);
+ bool LiveInToB = LV->isLiveIn(Reg, *SuccB);
+
+ if (!LiveInToA && !LiveInToB) {
+ DEBUG(dbgs() << PrintReg(Reg, TRI, 0)
+ << " is live into neither successor\n");
continue;
+ }
+
+ if (LiveInToA && LiveInToB) {
+ DEBUG(dbgs() << PrintReg(Reg, TRI, 0)
+ << " is live into both successors\n");
+ continue;
+ }
// This interval is live in to one successor, but not the other, so
// we need to update its range so it is live in to both.
- DEBUG(dbgs() << "Possible SGPR conflict detected " << " in " << *LR <<
- " BB#" << SuccA->getNumber() << ", BB#" <<
- SuccB->getNumber() <<
- " with NCD = " << NCD->getNumber() << '\n');
+ DEBUG(dbgs() << "Possible SGPR conflict detected for "
+ << PrintReg(Reg, TRI, 0)
+ << " BB#" << SuccA->getNumber()
+ << ", BB#" << SuccB->getNumber()
+ << " with NCD = BB#" << NCD->getNumber() << '\n');
+
+ assert(TargetRegisterInfo::isVirtualRegister(Reg) &&
+ "Not expecting to extend live range of physreg");
// FIXME: Need to figure out how to update LiveRange here so this pass
// will be able to preserve LiveInterval analysis.
- BuildMI(*NCD, NCD->getFirstNonPHI(), DebugLoc(),
- TII->get(AMDGPU::SGPR_USE))
- .addReg(Reg, RegState::Implicit);
- DEBUG(NCD->getFirstNonPHI()->dump());
+ MachineInstr *NCDSGPRUse =
+ BuildMI(*NCD, NCD->getFirstNonPHI(), DebugLoc(),
+ TII->get(AMDGPU::SGPR_USE))
+ .addReg(Reg, RegState::Implicit);
+
+ MadeChange = true;
+ LV->HandleVirtRegUse(Reg, NCD, NCDSGPRUse);
+
+ DEBUG(NCDSGPRUse->dump());
}
}
- return false;
+ return MadeChange;
}
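A minimal sketch of the llvm::depth_first traversal the rewritten pass relies on, assuming only a MachineFunction reference; it is not specific to this pass.

#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"

static void walkBlocksDepthFirst(llvm::MachineFunction &MF) {
  // Visit blocks depth-first from the entry block; in SSA form this sees a
  // block's defs before any uses in blocks reached through it.
  for (llvm::MachineBasicBlock *MBB : llvm::depth_first(&MF.front())) {
    (void)MBB;
  }
}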
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/contrib/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index c288725..02a3930 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -45,6 +45,7 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<MachineDominatorTree>();
+ AU.addPreserved<MachineDominatorTree>();
AU.setPreservesCFG();
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -164,8 +165,8 @@ static bool tryAddToFoldList(std::vector<FoldCandidate> &FoldList,
// Operand is not legal, so try to commute the instruction to
// see if this makes it possible to fold.
- unsigned CommuteIdx0;
- unsigned CommuteIdx1;
+ unsigned CommuteIdx0 = TargetInstrInfo::CommuteAnyOperandIndex;
+ unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex;
bool CanCommute = TII->findCommutedOpIndices(MI, CommuteIdx0, CommuteIdx1);
if (CanCommute) {
@@ -175,7 +176,16 @@ static bool tryAddToFoldList(std::vector<FoldCandidate> &FoldList,
OpNo = CommuteIdx0;
}
- if (!CanCommute || !TII->commuteInstruction(MI))
+ // One of the operands might be an Imm operand, and OpNo may refer to it after
+ // the call to commuteInstruction() below. Such situations are avoided
+ // here explicitly as OpNo must be a register operand to be a candidate
+ // for memory folding.
+ if (CanCommute && (!MI->getOperand(CommuteIdx0).isReg() ||
+ !MI->getOperand(CommuteIdx1).isReg()))
+ return false;
+
+ if (!CanCommute ||
+ !TII->commuteInstruction(MI, false, CommuteIdx0, CommuteIdx1))
return false;
if (!TII->isOperandLegal(MI, OpNo, OpToFold))
@@ -186,6 +196,110 @@ static bool tryAddToFoldList(std::vector<FoldCandidate> &FoldList,
return true;
}
+static void foldOperand(MachineOperand &OpToFold, MachineInstr *UseMI,
+ unsigned UseOpIdx,
+ std::vector<FoldCandidate> &FoldList,
+ SmallVectorImpl<MachineInstr *> &CopiesToReplace,
+ const SIInstrInfo *TII, const SIRegisterInfo &TRI,
+ MachineRegisterInfo &MRI) {
+ const MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);
+
+ // FIXME: Fold operands with subregs.
+ if (UseOp.isReg() && ((UseOp.getSubReg() && OpToFold.isReg()) ||
+ UseOp.isImplicit())) {
+ return;
+ }
+
+ bool FoldingImm = OpToFold.isImm();
+ APInt Imm;
+
+ if (FoldingImm) {
+ unsigned UseReg = UseOp.getReg();
+ const TargetRegisterClass *UseRC
+ = TargetRegisterInfo::isVirtualRegister(UseReg) ?
+ MRI.getRegClass(UseReg) :
+ TRI.getPhysRegClass(UseReg);
+
+ Imm = APInt(64, OpToFold.getImm());
+
+ const MCInstrDesc &FoldDesc = TII->get(OpToFold.getParent()->getOpcode());
+ const TargetRegisterClass *FoldRC =
+ TRI.getRegClass(FoldDesc.OpInfo[0].RegClass);
+
+ // Split 64-bit constants into 32-bits for folding.
+ if (FoldRC->getSize() == 8 && UseOp.getSubReg()) {
+ if (UseRC->getSize() != 8)
+ return;
+
+ if (UseOp.getSubReg() == AMDGPU::sub0) {
+ Imm = Imm.getLoBits(32);
+ } else {
+ assert(UseOp.getSubReg() == AMDGPU::sub1);
+ Imm = Imm.getHiBits(32);
+ }
+ }
+
+ // In order to fold immediates into copies, we need to change the
+ // copy to a MOV.
+ if (UseMI->getOpcode() == AMDGPU::COPY) {
+ unsigned DestReg = UseMI->getOperand(0).getReg();
+ const TargetRegisterClass *DestRC
+ = TargetRegisterInfo::isVirtualRegister(DestReg) ?
+ MRI.getRegClass(DestReg) :
+ TRI.getPhysRegClass(DestReg);
+
+ unsigned MovOp = TII->getMovOpcode(DestRC);
+ if (MovOp == AMDGPU::COPY)
+ return;
+
+ UseMI->setDesc(TII->get(MovOp));
+ CopiesToReplace.push_back(UseMI);
+ }
+ }
+
+ // Special case for REG_SEQUENCE: We can't fold literals into
+ // REG_SEQUENCE instructions, so we have to fold them into the
+ // uses of REG_SEQUENCE.
+ if (UseMI->getOpcode() == AMDGPU::REG_SEQUENCE) {
+ unsigned RegSeqDstReg = UseMI->getOperand(0).getReg();
+ unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm();
+
+ for (MachineRegisterInfo::use_iterator
+ RSUse = MRI.use_begin(RegSeqDstReg),
+ RSE = MRI.use_end(); RSUse != RSE; ++RSUse) {
+
+ MachineInstr *RSUseMI = RSUse->getParent();
+ if (RSUse->getSubReg() != RegSeqDstSubReg)
+ continue;
+
+ foldOperand(OpToFold, RSUseMI, RSUse.getOperandNo(), FoldList,
+ CopiesToReplace, TII, TRI, MRI);
+ }
+ return;
+ }
+
+ const MCInstrDesc &UseDesc = UseMI->getDesc();
+
+ // Don't fold into target independent nodes. Target independent opcodes
+ // don't have defined register classes.
+ if (UseDesc.isVariadic() ||
+ UseDesc.OpInfo[UseOpIdx].RegClass == -1)
+ return;
+
+ if (FoldingImm) {
+ MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue());
+ tryAddToFoldList(FoldList, UseMI, UseOpIdx, &ImmOp, TII);
+ return;
+ }
+
+ tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold, TII);
+
+ // FIXME: We could try to change the instruction from 64-bit to 32-bit
+ // to enable more folding opportunities. The shrink operands pass
+ // already does this.
+ return;
+}
+
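A small standalone sketch, with a made-up value, of the getLoBits/getHiBits split that foldOperand applies to 64-bit immediates used through sub0 / sub1:

#include "llvm/ADT/APInt.h"

static void splitImmExample() {
  llvm::APInt Imm(64, 0x0000000100000002ULL);
  llvm::APInt Lo = Imm.getLoBits(32); // what the sub0 use would receive: 0x00000002
  llvm::APInt Hi = Imm.getHiBits(32); // what the sub1 use would receive: 0x00000001
  (void)Lo;
  (void)Hi;
}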
bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
MachineRegisterInfo &MRI = MF.getRegInfo();
const SIInstrInfo *TII =
@@ -226,88 +340,36 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
OpToFold.getSubReg()))
continue;
+
+ // We need to mutate the operands of new mov instructions to add implicit
+ // uses of EXEC, but adding them invalidates the use_iterator, so defer
+ // this.
+ SmallVector<MachineInstr *, 4> CopiesToReplace;
+
std::vector<FoldCandidate> FoldList;
for (MachineRegisterInfo::use_iterator
Use = MRI.use_begin(MI.getOperand(0).getReg()), E = MRI.use_end();
Use != E; ++Use) {
MachineInstr *UseMI = Use->getParent();
- const MachineOperand &UseOp = UseMI->getOperand(Use.getOperandNo());
- // FIXME: Fold operands with subregs.
- if (UseOp.isReg() && ((UseOp.getSubReg() && OpToFold.isReg()) ||
- UseOp.isImplicit())) {
- continue;
- }
-
- APInt Imm;
-
- if (FoldingImm) {
- unsigned UseReg = UseOp.getReg();
- const TargetRegisterClass *UseRC
- = TargetRegisterInfo::isVirtualRegister(UseReg) ?
- MRI.getRegClass(UseReg) :
- TRI.getPhysRegClass(UseReg);
-
- Imm = APInt(64, OpToFold.getImm());
-
- // Split 64-bit constants into 32-bits for folding.
- if (UseOp.getSubReg()) {
- if (UseRC->getSize() != 8)
- continue;
-
- if (UseOp.getSubReg() == AMDGPU::sub0) {
- Imm = Imm.getLoBits(32);
- } else {
- assert(UseOp.getSubReg() == AMDGPU::sub1);
- Imm = Imm.getHiBits(32);
- }
- }
-
- // In order to fold immediates into copies, we need to change the
- // copy to a MOV.
- if (UseMI->getOpcode() == AMDGPU::COPY) {
- unsigned DestReg = UseMI->getOperand(0).getReg();
- const TargetRegisterClass *DestRC
- = TargetRegisterInfo::isVirtualRegister(DestReg) ?
- MRI.getRegClass(DestReg) :
- TRI.getPhysRegClass(DestReg);
-
- unsigned MovOp = TII->getMovOpcode(DestRC);
- if (MovOp == AMDGPU::COPY)
- continue;
-
- UseMI->setDesc(TII->get(MovOp));
- }
- }
-
- const MCInstrDesc &UseDesc = UseMI->getDesc();
-
- // Don't fold into target independent nodes. Target independent opcodes
- // don't have defined register classes.
- if (UseDesc.isVariadic() ||
- UseDesc.OpInfo[Use.getOperandNo()].RegClass == -1)
- continue;
-
- if (FoldingImm) {
- MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue());
- tryAddToFoldList(FoldList, UseMI, Use.getOperandNo(), &ImmOp, TII);
- continue;
- }
-
- tryAddToFoldList(FoldList, UseMI, Use.getOperandNo(), &OpToFold, TII);
-
- // FIXME: We could try to change the instruction from 64-bit to 32-bit
- // to enable more folding opportunites. The shrink operands pass
- // already does this.
+ foldOperand(OpToFold, UseMI, Use.getOperandNo(), FoldList,
+ CopiesToReplace, TII, TRI, MRI);
}
+ // Make sure we add EXEC uses to any new v_mov instructions created.
+ for (MachineInstr *Copy : CopiesToReplace)
+ Copy->addImplicitDefUseOperands(MF);
+
for (FoldCandidate &Fold : FoldList) {
if (updateOperand(Fold, TRI)) {
// Clear kill flags.
if (!Fold.isImm()) {
assert(Fold.OpToFold && Fold.OpToFold->isReg());
- Fold.OpToFold->setIsKill(false);
+ // FIXME: Probably shouldn't bother trying to fold if not an
+ // SGPR. PeepholeOptimizer can eliminate redundant VGPR->VGPR
+ // copies.
+ MRI.clearKillFlags(Fold.OpToFold->getReg());
}
DEBUG(dbgs() << "Folded source from " << MI << " into OpNo " <<
Fold.UseOpNo << " of " << *Fold.UseMI << '\n');
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
new file mode 100644
index 0000000..6b3c81c
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -0,0 +1,243 @@
+//===----------------------- SIFrameLowering.cpp --------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+
+#include "SIFrameLowering.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "SIRegisterInfo.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+
+using namespace llvm;
+
+
+static bool hasOnlySGPRSpills(const SIMachineFunctionInfo *FuncInfo,
+ const MachineFrameInfo *FrameInfo) {
+ if (!FuncInfo->hasSpilledSGPRs())
+ return false;
+
+ if (FuncInfo->hasSpilledVGPRs())
+ return false;
+
+ for (int I = FrameInfo->getObjectIndexBegin(),
+ E = FrameInfo->getObjectIndexEnd(); I != E; ++I) {
+ if (!FrameInfo->isSpillSlotObjectIndex(I))
+ return false;
+ }
+
+ return true;
+}
+
+static ArrayRef<MCPhysReg> getAllSGPR128() {
+ return makeArrayRef(AMDGPU::SReg_128RegClass.begin(),
+ AMDGPU::SReg_128RegClass.getNumRegs());
+}
+
+static ArrayRef<MCPhysReg> getAllSGPRs() {
+ return makeArrayRef(AMDGPU::SGPR_32RegClass.begin(),
+ AMDGPU::SGPR_32RegClass.getNumRegs());
+}
+
+void SIFrameLowering::emitPrologue(MachineFunction &MF,
+ MachineBasicBlock &MBB) const {
+ if (!MF.getFrameInfo()->hasStackObjects())
+ return;
+
+ assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");
+
+ SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+
+ // If we only have SGPR spills, we won't actually be using scratch memory
+ // since these spill to VGPRs.
+ //
+ // FIXME: We should be cleaning up these unused SGPR spill frame indices
+ // somewhere.
+ if (hasOnlySGPRSpills(MFI, MF.getFrameInfo()))
+ return;
+
+ const SIInstrInfo *TII =
+ static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
+ const SIRegisterInfo *TRI = &TII->getRegisterInfo();
+ const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
+
+ // We need to insert initialization of the scratch resource descriptor.
+ unsigned ScratchRsrcReg = MFI->getScratchRSrcReg();
+ assert(ScratchRsrcReg != AMDGPU::NoRegister);
+
+ unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();
+ assert(ScratchWaveOffsetReg != AMDGPU::NoRegister);
+
+ unsigned PreloadedScratchWaveOffsetReg = TRI->getPreloadedValue(
+ MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
+
+ unsigned PreloadedPrivateBufferReg = AMDGPU::NoRegister;
+ if (ST.isAmdHsaOS()) {
+ PreloadedPrivateBufferReg = TRI->getPreloadedValue(
+ MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER);
+ }
+
+ // If we reserved the original input registers, we don't need to copy to the
+ // reserved registers.
+ if (ScratchRsrcReg == PreloadedPrivateBufferReg) {
+ // We should always reserve these 5 registers at the same time.
+ assert(ScratchWaveOffsetReg == PreloadedScratchWaveOffsetReg &&
+ "scratch wave offset and private segment buffer inconsistent");
+ return;
+ }
+
+
+ // We added live-ins during argument lowering, but since they were not used
+ // they were deleted. We're adding the uses now, so add them back.
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
+ MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
+
+ if (ST.isAmdHsaOS()) {
+ MRI.addLiveIn(PreloadedPrivateBufferReg);
+ MBB.addLiveIn(PreloadedPrivateBufferReg);
+ }
+
+ // We reserved the last registers for this. Shift it down to the end of those
+ // which were actually used.
+ //
+ // FIXME: It might be safer to use a pseudoregister before replacement.
+
+ // FIXME: We should be able to eliminate unused input registers. We only
+ // cannot do this for the resources required for scratch access. For now we
+ // skip over user SGPRs and may leave unused holes.
+
+ // We find the resource first because it has an alignment requirement.
+ if (ScratchRsrcReg == TRI->reservedPrivateSegmentBufferReg(MF)) {
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ unsigned NumPreloaded = MFI->getNumPreloadedSGPRs() / 4;
+ // Skip the last 2 elements because the last one is reserved for VCC, and
+ // this is the 2nd to last element already.
+ for (MCPhysReg Reg : getAllSGPR128().drop_back(2).slice(NumPreloaded)) {
+ // Pick the first unallocated one. Make sure we don't clobber the other
+ // reserved input we needed.
+ if (!MRI.isPhysRegUsed(Reg)) {
+ assert(MRI.isAllocatable(Reg));
+ MRI.replaceRegWith(ScratchRsrcReg, Reg);
+ ScratchRsrcReg = Reg;
+ MFI->setScratchRSrcReg(ScratchRsrcReg);
+ break;
+ }
+ }
+ }
+
+ if (ScratchWaveOffsetReg == TRI->reservedPrivateSegmentWaveByteOffsetReg(MF)) {
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ // Skip the last 2 elements because the last one is reserved for VCC, and
+ // this is the 2nd to last element already.
+ unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();
+ for (MCPhysReg Reg : getAllSGPRs().drop_back(6).slice(NumPreloaded)) {
+ // Pick the first unallocated SGPR. Be careful not to pick an alias of the
+ // scratch descriptor, since we haven’t added its uses yet.
+ if (!MRI.isPhysRegUsed(Reg)) {
+ assert(MRI.isAllocatable(Reg) &&
+ !TRI->isSubRegisterEq(ScratchRsrcReg, Reg));
+
+ MRI.replaceRegWith(ScratchWaveOffsetReg, Reg);
+ ScratchWaveOffsetReg = Reg;
+ MFI->setScratchWaveOffsetReg(ScratchWaveOffsetReg);
+ break;
+ }
+ }
+ }
+
+
+ assert(!TRI->isSubRegister(ScratchRsrcReg, ScratchWaveOffsetReg));
+
+ const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
+ MachineBasicBlock::iterator I = MBB.begin();
+ DebugLoc DL;
+
+ if (PreloadedScratchWaveOffsetReg != ScratchWaveOffsetReg) {
+ // Make sure we emit the copy for the offset first. We may have chosen to copy
+ // the buffer resource into a register that aliases the input offset register.
+ BuildMI(MBB, I, DL, SMovB32, ScratchWaveOffsetReg)
+ .addReg(PreloadedScratchWaveOffsetReg, RegState::Kill);
+ }
+
+ if (ST.isAmdHsaOS()) {
+ // Insert copies from argument register.
+ assert(
+ !TRI->isSubRegisterEq(PreloadedPrivateBufferReg, ScratchRsrcReg) &&
+ !TRI->isSubRegisterEq(PreloadedPrivateBufferReg, ScratchWaveOffsetReg));
+
+ unsigned Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);
+ unsigned Rsrc23 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2_sub3);
+
+ unsigned Lo = TRI->getSubReg(PreloadedPrivateBufferReg, AMDGPU::sub0_sub1);
+ unsigned Hi = TRI->getSubReg(PreloadedPrivateBufferReg, AMDGPU::sub2_sub3);
+
+ const MCInstrDesc &SMovB64 = TII->get(AMDGPU::S_MOV_B64);
+
+ BuildMI(MBB, I, DL, SMovB64, Rsrc01)
+ .addReg(Lo, RegState::Kill);
+ BuildMI(MBB, I, DL, SMovB64, Rsrc23)
+ .addReg(Hi, RegState::Kill);
+ } else {
+ unsigned Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
+ unsigned Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
+ unsigned Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2);
+ unsigned Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);
+
+ // Use relocations to get the pointer, and setup the other bits manually.
+ uint64_t Rsrc23 = TII->getScratchRsrcWords23();
+ BuildMI(MBB, I, DL, SMovB32, Rsrc0)
+ .addExternalSymbol("SCRATCH_RSRC_DWORD0")
+ .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+
+ BuildMI(MBB, I, DL, SMovB32, Rsrc1)
+ .addExternalSymbol("SCRATCH_RSRC_DWORD1")
+ .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+
+ BuildMI(MBB, I, DL, SMovB32, Rsrc2)
+ .addImm(Rsrc23 & 0xffffffff)
+ .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+
+ BuildMI(MBB, I, DL, SMovB32, Rsrc3)
+ .addImm(Rsrc23 >> 32)
+ .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+ }
+
+ // Make the register selected live throughout the function.
+ for (MachineBasicBlock &OtherBB : MF) {
+ if (&OtherBB == &MBB)
+ continue;
+
+ OtherBB.addLiveIn(ScratchRsrcReg);
+ OtherBB.addLiveIn(ScratchWaveOffsetReg);
+ }
+}
+
+void SIFrameLowering::processFunctionBeforeFrameFinalized(
+ MachineFunction &MF,
+ RegScavenger *RS) const {
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+
+ if (!MFI->hasStackObjects())
+ return;
+
+ bool MayNeedScavengingEmergencySlot = MFI->hasStackObjects();
+
+ assert((RS || !MayNeedScavengingEmergencySlot) &&
+ "RegScavenger required if spilling");
+
+ if (MayNeedScavengingEmergencySlot) {
+ int ScavengeFI = MFI->CreateSpillStackObject(
+ AMDGPU::SGPR_32RegClass.getSize(),
+ AMDGPU::SGPR_32RegClass.getAlignment());
+ RS->addScavengingFrameIndex(ScavengeFI);
+ }
+}
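A standalone sketch, with made-up values, of the ArrayRef slicing emitPrologue uses to walk candidate SGPRs: preloaded registers are skipped at the front and reserved registers are dropped at the back.

#include "llvm/ADT/ArrayRef.h"

static void candidateSliceExample() {
  int Regs[8] = {0, 1, 2, 3, 4, 5, 6, 7};
  llvm::ArrayRef<int> All(Regs);
  // Drop the two reserved entries at the end, then skip two "preloaded" ones:
  llvm::ArrayRef<int> Candidates = All.drop_back(2).slice(2); // {2, 3, 4, 5}
  (void)Candidates;
}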
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.h b/contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.h
new file mode 100644
index 0000000..a9152fd
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.h
@@ -0,0 +1,34 @@
+//===--------------------- SIFrameLowering.h --------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_SIFRAMELOWERING_H
+#define LLVM_LIB_TARGET_AMDGPU_SIFRAMELOWERING_H
+
+#include "AMDGPUFrameLowering.h"
+
+namespace llvm {
+
+class SIFrameLowering final : public AMDGPUFrameLowering {
+public:
+ SIFrameLowering(StackDirection D, unsigned StackAl, int LAO,
+ unsigned TransAl = 1) :
+ AMDGPUFrameLowering(D, StackAl, LAO, TransAl) {}
+ ~SIFrameLowering() override {}
+
+ void emitPrologue(MachineFunction &MF,
+ MachineBasicBlock &MBB) const override;
+
+ void processFunctionBeforeFrameFinalized(
+ MachineFunction &MF,
+ RegScavenger *RS = nullptr) const override;
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index c2db9ff..0e043cb 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -20,6 +20,7 @@
#include "SIISelLowering.h"
#include "AMDGPU.h"
+#include "AMDGPUDiagnosticInfoUnsupported.h"
#include "AMDGPUIntrinsicInfo.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
@@ -51,6 +52,9 @@ SITargetLowering::SITargetLowering(TargetMachine &TM,
addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass);
+ addRegisterClass(MVT::v2i64, &AMDGPU::SReg_128RegClass);
+ addRegisterClass(MVT::v2f64, &AMDGPU::SReg_128RegClass);
+
addRegisterClass(MVT::v4i32, &AMDGPU::SReg_128RegClass);
addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass);
@@ -103,6 +107,7 @@ SITargetLowering::SITargetLowering(TargetMachine &TM,
setOperationAction(ISD::SETCC, MVT::v4i1, Expand);
setOperationAction(ISD::BSWAP, MVT::i32, Legal);
+ setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Legal);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom);
@@ -155,13 +160,30 @@ SITargetLowering::SITargetLowering(TargetMachine &TM,
for (MVT VT : MVT::fp_valuetypes())
setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
+
setTruncStoreAction(MVT::i64, MVT::i32, Expand);
setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
+
+ setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand);
+
+ setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
+ setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);
+
setOperationAction(ISD::LOAD, MVT::i1, Custom);
+ setOperationAction(ISD::LOAD, MVT::v2i64, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32);
+
+ setOperationAction(ISD::STORE, MVT::v2i64, Promote);
+ AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32);
+
+ setOperationAction(ISD::ConstantPool, MVT::v2i64, Expand);
+
setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
@@ -173,9 +195,14 @@ SITargetLowering::SITargetLowering(TargetMachine &TM,
setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);
setOperationAction(ISD::SELECT, MVT::i1, Promote);
+ setOperationAction(ISD::TRUNCATE, MVT::v2i32, Expand);
+
+
+ setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand);
+
// We only support LOAD/STORE and vector manipulation ops for vectors
// with > 4 elements.
- for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32}) {
+ for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64}) {
for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
switch(Op) {
case ISD::LOAD:
@@ -186,6 +213,7 @@ SITargetLowering::SITargetLowering(TargetMachine &TM,
case ISD::INSERT_VECTOR_ELT:
case ISD::INSERT_SUBVECTOR:
case ISD::EXTRACT_SUBVECTOR:
+ case ISD::SCALAR_TO_VECTOR:
break;
case ISD::CONCAT_VECTORS:
setOperationAction(Op, VT, Custom);
@@ -197,6 +225,22 @@ SITargetLowering::SITargetLowering(TargetMachine &TM,
}
}
+ // Most operations are naturally 32-bit vector operations. We only support
+ // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
+ for (MVT Vec64 : { MVT::v2i64, MVT::v2f64 }) {
+ setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
+ AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);
+
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
+ AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);
+
+ setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
+ AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);
+
+ setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
+ AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
+ }
+
if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
setOperationAction(ISD::FCEIL, MVT::f64, Legal);
@@ -261,6 +305,41 @@ bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const {
return AM.BaseOffs == 0 && (AM.Scale == 0 || AM.Scale == 1);
}
+bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
+ // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
+ // additionally can do r + r + i with addr64. 32-bit has more addressing
+ // mode options. Depending on the resource constant, it can also do
+ // (i64 r0) + (i32 r1) * (i14 i).
+ //
+ // Private arrays end up using a scratch buffer most of the time, so also
+ // assume those use MUBUF instructions. Scratch loads / stores are currently
+ // implemented as mubuf instructions with offen bit set, so slightly
+ // different than the normal addr64.
+ if (!isUInt<12>(AM.BaseOffs))
+ return false;
+
+ // FIXME: Since we can split immediate into soffset and immediate offset,
+ // would it make sense to allow any immediate?
+
+ switch (AM.Scale) {
+ case 0: // r + i or just i, depending on HasBaseReg.
+ return true;
+ case 1:
+ return true; // We have r + r or r + i.
+ case 2:
+ if (AM.HasBaseReg) {
+ // Reject 2 * r + r.
+ return false;
+ }
+
+ // Allow 2 * r as r + r
+ // Or 2 * r + i is allowed as r + r + i.
+ return true;
+ default: // Don't allow n * r
+ return false;
+ }
+}
+
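A few worked examples, assuming a null BaseGV, of addressing modes run against the MUBUF checks above:

// {BaseOffs = 4092, Scale = 0}              -> legal   (r + i, isUInt<12>(4092))
// {BaseOffs = 4096, Scale = 0}              -> illegal (offset needs 13 bits)
// {BaseOffs = 0,    Scale = 2, HasBaseReg}  -> illegal (would be 2 * r + r)
// {BaseOffs = 8,    Scale = 2, !HasBaseReg} -> legal   (2 * r + i folds to r + r + i)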
bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
const AddrMode &AM, Type *Ty,
unsigned AS) const {
@@ -269,7 +348,7 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
return false;
switch (AS) {
- case AMDGPUAS::GLOBAL_ADDRESS:
+ case AMDGPUAS::GLOBAL_ADDRESS: {
if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
// Assume that we will use FLAT for all global memory accesses
// on VI.
@@ -282,51 +361,51 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
// because it has never been validated.
return isLegalFlatAddressingMode(AM);
}
- // fall-through
- case AMDGPUAS::PRIVATE_ADDRESS:
- case AMDGPUAS::CONSTANT_ADDRESS: // XXX - Should we assume SMRD instructions?
- case AMDGPUAS::UNKNOWN_ADDRESS_SPACE: {
- // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
- // additionally can do r + r + i with addr64. 32-bit has more addressing
- // mode options. Depending on the resource constant, it can also do
- // (i64 r0) + (i32 r1) * (i14 i).
- //
- // SMRD instructions have an 8-bit, dword offset.
- //
- // Assume nonunifom access, since the address space isn't enough to know
- // what instruction we will use, and since we don't know if this is a load
- // or store and scalar stores are only available on VI.
- //
- // We also know if we are doing an extload, we can't do a scalar load.
- //
- // Private arrays end up using a scratch buffer most of the time, so also
- // assume those use MUBUF instructions. Scratch loads / stores are currently
- // implemented as mubuf instructions with offen bit set, so slightly
- // different than the normal addr64.
- if (!isUInt<12>(AM.BaseOffs))
- return false;
- // FIXME: Since we can split immediate into soffset and immediate offset,
- // would it make sense to allow any immediate?
+ return isLegalMUBUFAddressingMode(AM);
+ }
+ case AMDGPUAS::CONSTANT_ADDRESS: {
+ // If the offset isn't a multiple of 4, it probably isn't going to be
+ // correctly aligned.
+ if (AM.BaseOffs % 4 != 0)
+ return isLegalMUBUFAddressingMode(AM);
+
+ // There are no SMRD extloads, so if we have to do a small type access we
+ // will use a MUBUF load.
+ // FIXME?: We also need to do this if unaligned, but we don't know the
+ // alignment here.
+ if (DL.getTypeStoreSize(Ty) < 4)
+ return isLegalMUBUFAddressingMode(AM);
+
+ if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
+ // SMRD instructions have an 8-bit, dword offset on SI.
+ if (!isUInt<8>(AM.BaseOffs / 4))
+ return false;
+ } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
+ // On CI+, this can also be a 32-bit literal constant offset. If it fits
+ // in 8 bits, it can use a smaller encoding.
+ if (!isUInt<32>(AM.BaseOffs / 4))
+ return false;
+ } else if (Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+ // On VI, these use the SMEM format and the offset is a 20-bit byte offset.
+ if (!isUInt<20>(AM.BaseOffs))
+ return false;
+ } else
+ llvm_unreachable("unhandled generation");
- switch (AM.Scale) {
- case 0: // r + i or just i, depending on HasBaseReg.
+ if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
return true;
- case 1:
- return true; // We have r + r or r + i.
- case 2:
- if (AM.HasBaseReg) {
- // Reject 2 * r + r.
- return false;
- }
- // Allow 2 * r as r + r
- // Or 2 * r + i is allowed as r + r + i.
+ if (AM.Scale == 1 && AM.HasBaseReg)
return true;
- default: // Don't allow n * r
- return false;
- }
+
+ return false;
}
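As a rough worked example of the generation-specific bounds above: on SI the 8-bit dword offset allows byte offsets up to 255 * 4 = 1020; on CI the 32-bit literal dword offset effectively removes the practical limit (byte offsets up to (2^32 - 1) * 4); on VI the 20-bit byte offset allows up to 0xFFFFF = 1,048,575 bytes. A dword-aligned offset of 1024 is therefore rejected on SI (1024 / 4 = 256 does not fit in 8 bits) but accepted on CI and VI.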
+
+ case AMDGPUAS::PRIVATE_ADDRESS:
+ case AMDGPUAS::UNKNOWN_ADDRESS_SPACE:
+ return isLegalMUBUFAddressingMode(AM);
+
case AMDGPUAS::LOCAL_ADDRESS:
case AMDGPUAS::REGION_ADDRESS: {
// Basic, single offset DS instructions allow a 16-bit unsigned immediate
@@ -374,7 +453,10 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
// ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte
// aligned, 8 byte access in a single operation using ds_read2/write2_b32
// with adjacent offsets.
- return Align % 4 == 0;
+ bool AlignedBy4 = (Align % 4 == 0);
+ if (IsFast)
+ *IsFast = AlignedBy4;
+ return AlignedBy4;
}
// Smaller than dword value must be aligned.
@@ -411,6 +493,32 @@ EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
return MVT::Other;
}
+static bool isFlatGlobalAddrSpace(unsigned AS) {
+ return AS == AMDGPUAS::GLOBAL_ADDRESS ||
+ AS == AMDGPUAS::FLAT_ADDRESS ||
+ AS == AMDGPUAS::CONSTANT_ADDRESS;
+}
+
+bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
+ unsigned DestAS) const {
+ return isFlatGlobalAddrSpace(SrcAS) && isFlatGlobalAddrSpace(DestAS);
+}
+
+
+bool SITargetLowering::isMemOpUniform(const SDNode *N) const {
+ const MemSDNode *MemNode = cast<MemSDNode>(N);
+ const Value *Ptr = MemNode->getMemOperand()->getValue();
+
+ // UndefValue means this is a load of a kernel input. These are uniform.
+ // Sometimes LDS instructions have constant pointers
+ if (isa<UndefValue>(Ptr) || isa<Argument>(Ptr) || isa<Constant>(Ptr) ||
+ isa<GlobalValue>(Ptr))
+ return true;
+
+ const Instruction *I = dyn_cast_or_null<Instruction>(Ptr);
+ return I && I->getMetadata("amdgpu.uniform");
+}
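The "amdgpu.uniform" marker checked here is presumably attached earlier (the new AMDGPUAnnotateUniformValues pass in this update is the obvious candidate); a minimal hedged sketch of tagging a load so that isMemOpUniform() will accept it:

#include "llvm/IR/Instructions.h"
#include "llvm/IR/Metadata.h"
using namespace llvm;

// Sketch only: attach the marker metadata to a load that has been proven
// uniform. Only the presence of the node matters, so an empty MDNode is used.
static void sketchMarkUniform(LoadInst *LI) {
  LI->setMetadata("amdgpu.uniform", MDNode::get(LI->getContext(), {}));
}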
+
TargetLoweringBase::LegalizeTypeAction
SITargetLowering::getPreferredVectorAction(EVT VT) const {
if (VT.getVectorNumElements() != 1 && VT.getScalarType().bitsLE(MVT::i16))
@@ -426,12 +534,6 @@ bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
return TII->isInlineConstant(Imm);
}
-static EVT toIntegerVT(EVT VT) {
- if (VT.isVector())
- return VT.changeVectorElementTypeToInteger();
- return MVT::getIntegerVT(VT.getSizeInBits());
-}
-
SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
SDLoc SL, SDValue Chain,
unsigned Offset, bool Signed) const {
@@ -439,7 +541,7 @@ SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
MachineFunction &MF = DAG.getMachineFunction();
const SIRegisterInfo *TRI =
static_cast<const SIRegisterInfo*>(Subtarget->getRegisterInfo());
- unsigned InputPtrReg = TRI->getPreloadedValue(MF, SIRegisterInfo::INPUT_PTR);
+ unsigned InputPtrReg = TRI->getPreloadedValue(MF, SIRegisterInfo::KERNARG_SEGMENT_PTR);
Type *Ty = VT.getTypeForEVT(*DAG.getContext());
@@ -455,30 +557,10 @@ SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
unsigned Align = DL.getABITypeAlignment(Ty);
- if (VT != MemVT && VT.isFloatingPoint()) {
- // Do an integer load and convert.
- // FIXME: This is mostly because load legalization after type legalization
- // doesn't handle FP extloads.
- assert(VT.getScalarType() == MVT::f32 &&
- MemVT.getScalarType() == MVT::f16);
-
- EVT IVT = toIntegerVT(VT);
- EVT MemIVT = toIntegerVT(MemVT);
- SDValue Load = DAG.getLoad(ISD::UNINDEXED, ISD::ZEXTLOAD,
- IVT, SL, Chain, Ptr, PtrOffset, PtrInfo, MemIVT,
- false, // isVolatile
- true, // isNonTemporal
- true, // isInvariant
- Align); // Alignment
- SDValue Ops[] = {
- DAG.getNode(ISD::FP16_TO_FP, SL, VT, Load),
- Load.getValue(1)
- };
-
- return DAG.getMergeValues(Ops, SL);
- }
-
ISD::LoadExtType ExtTy = Signed ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
+ if (MemVT.isFloatingPoint())
+ ExtTy = ISD::EXTLOAD;
+
return DAG.getLoad(ISD::UNINDEXED, ExtTy,
VT, SL, Chain, Ptr, PtrOffset, PtrInfo, MemVT,
false, // isVolatile
@@ -497,8 +579,16 @@ SDValue SITargetLowering::LowerFormalArguments(
MachineFunction &MF = DAG.getMachineFunction();
FunctionType *FType = MF.getFunction()->getFunctionType();
SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+ const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
+
+ if (Subtarget->isAmdHsaOS() && Info->getShaderType() != ShaderType::COMPUTE) {
+ const Function *Fn = MF.getFunction();
+ DiagnosticInfoUnsupported NoGraphicsHSA(*Fn, "non-compute shaders with HSA");
+ DAG.getContext()->diagnose(NoGraphicsHSA);
+ return SDValue();
+ }
- assert(CallConv == CallingConv::C);
+ // FIXME: We currently assume all calling conventions are kernels.
SmallVector<ISD::InputArg, 16> Splits;
BitVector Skipped(Ins.size());
@@ -513,7 +603,7 @@ SDValue SITargetLowering::LowerFormalArguments(
assert((PSInputNum <= 15) && "Too many PS inputs!");
if (!Arg.Used) {
- // We can savely skip PS inputs
+ // We can safely skip PS inputs
Skipped.set(i);
++PSInputNum;
continue;
@@ -530,7 +620,7 @@ SDValue SITargetLowering::LowerFormalArguments(
// We REALLY want the ORIGINAL number of vertex elements here, e.g. a
// three or five element vertex only needs three or five registers,
- // NOT four or eigth.
+ // NOT four or eight.
Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
unsigned NumElements = ParamType->getVectorNumElements();
@@ -556,41 +646,30 @@ SDValue SITargetLowering::LowerFormalArguments(
CCInfo.AllocateReg(AMDGPU::VGPR1);
}
- // The pointer to the list of arguments is stored in SGPR0, SGPR1
- // The pointer to the scratch buffer is stored in SGPR2, SGPR3
- if (Info->getShaderType() == ShaderType::COMPUTE) {
- if (Subtarget->isAmdHsaOS())
- Info->NumUserSGPRs = 2; // FIXME: Need to support scratch buffers.
- else
- Info->NumUserSGPRs = 4;
-
- unsigned InputPtrReg =
- TRI->getPreloadedValue(MF, SIRegisterInfo::INPUT_PTR);
- unsigned InputPtrRegLo =
- TRI->getPhysRegSubReg(InputPtrReg, &AMDGPU::SReg_32RegClass, 0);
- unsigned InputPtrRegHi =
- TRI->getPhysRegSubReg(InputPtrReg, &AMDGPU::SReg_32RegClass, 1);
-
- unsigned ScratchPtrReg =
- TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_PTR);
- unsigned ScratchPtrRegLo =
- TRI->getPhysRegSubReg(ScratchPtrReg, &AMDGPU::SReg_32RegClass, 0);
- unsigned ScratchPtrRegHi =
- TRI->getPhysRegSubReg(ScratchPtrReg, &AMDGPU::SReg_32RegClass, 1);
-
- CCInfo.AllocateReg(InputPtrRegLo);
- CCInfo.AllocateReg(InputPtrRegHi);
- CCInfo.AllocateReg(ScratchPtrRegLo);
- CCInfo.AllocateReg(ScratchPtrRegHi);
- MF.addLiveIn(InputPtrReg, &AMDGPU::SReg_64RegClass);
- MF.addLiveIn(ScratchPtrReg, &AMDGPU::SReg_64RegClass);
- }
-
if (Info->getShaderType() == ShaderType::COMPUTE) {
getOriginalFunctionArgs(DAG, DAG.getMachineFunction().getFunction(), Ins,
Splits);
}
+ // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
+ if (Info->hasPrivateSegmentBuffer()) {
+ unsigned PrivateSegmentBufferReg = Info->addPrivateSegmentBuffer(*TRI);
+ MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SReg_128RegClass);
+ CCInfo.AllocateReg(PrivateSegmentBufferReg);
+ }
+
+ if (Info->hasDispatchPtr()) {
+ unsigned DispatchPtrReg = Info->addDispatchPtr(*TRI);
+ MF.addLiveIn(DispatchPtrReg, &AMDGPU::SReg_64RegClass);
+ CCInfo.AllocateReg(DispatchPtrReg);
+ }
+
+ if (Info->hasKernargSegmentPtr()) {
+ unsigned InputPtrReg = Info->addKernargSegmentPtr(*TRI);
+ MF.addLiveIn(InputPtrReg, &AMDGPU::SReg_64RegClass);
+ CCInfo.AllocateReg(InputPtrReg);
+ }
+
AnalyzeFormalArguments(CCInfo, Splits);
SmallVector<SDValue, 16> Chains;
@@ -617,7 +696,7 @@ SDValue SITargetLowering::LowerFormalArguments(
Offset, Ins[i].Flags.isSExt());
Chains.push_back(Arg.getValue(1));
- const PointerType *ParamTy =
+ auto *ParamTy =
dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
@@ -678,10 +757,113 @@ SDValue SITargetLowering::LowerFormalArguments(
InVals.push_back(Val);
}
- if (Info->getShaderType() != ShaderType::COMPUTE) {
- unsigned ScratchIdx = CCInfo.getFirstUnallocated(ArrayRef<MCPhysReg>(
- AMDGPU::SGPR_32RegClass.begin(), AMDGPU::SGPR_32RegClass.getNumRegs()));
- Info->ScratchOffsetReg = AMDGPU::SGPR_32RegClass.getRegister(ScratchIdx);
+ // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
+ // these from the dispatch pointer.
+
+ // Start adding system SGPRs.
+ if (Info->hasWorkGroupIDX()) {
+ unsigned Reg = Info->addWorkGroupIDX();
+ MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass);
+ CCInfo.AllocateReg(Reg);
+ } else
+ llvm_unreachable("work group id x is always enabled");
+
+ if (Info->hasWorkGroupIDY()) {
+ unsigned Reg = Info->addWorkGroupIDY();
+ MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass);
+ CCInfo.AllocateReg(Reg);
+ }
+
+ if (Info->hasWorkGroupIDZ()) {
+ unsigned Reg = Info->addWorkGroupIDZ();
+ MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass);
+ CCInfo.AllocateReg(Reg);
+ }
+
+ if (Info->hasWorkGroupInfo()) {
+ unsigned Reg = Info->addWorkGroupInfo();
+ MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass);
+ CCInfo.AllocateReg(Reg);
+ }
+
+ if (Info->hasPrivateSegmentWaveByteOffset()) {
+ // Scratch wave offset passed in system SGPR.
+ unsigned PrivateSegmentWaveByteOffsetReg
+ = Info->addPrivateSegmentWaveByteOffset();
+
+ MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
+ CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
+ }
+
+ // Now that we've figured out where the scratch register inputs are, see if
+ // we should reserve the arguments and use them directly.
+
+ bool HasStackObjects = MF.getFrameInfo()->hasStackObjects();
+
+ if (ST.isAmdHsaOS()) {
+ // TODO: Assume we will spill without optimizations.
+ if (HasStackObjects) {
+ // If we have stack objects, we unquestionably need the private buffer
+ // resource. For the HSA ABI, this will be the first 4 user SGPR
+ // inputs. We can reserve those and use them directly.
+
+ unsigned PrivateSegmentBufferReg = TRI->getPreloadedValue(
+ MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER);
+ Info->setScratchRSrcReg(PrivateSegmentBufferReg);
+
+ unsigned PrivateSegmentWaveByteOffsetReg = TRI->getPreloadedValue(
+ MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
+ Info->setScratchWaveOffsetReg(PrivateSegmentWaveByteOffsetReg);
+ } else {
+ unsigned ReservedBufferReg
+ = TRI->reservedPrivateSegmentBufferReg(MF);
+ unsigned ReservedOffsetReg
+ = TRI->reservedPrivateSegmentWaveByteOffsetReg(MF);
+
+ // We tentatively reserve the last registers (skipping the last two
+ // which may contain VCC). After register allocation, we'll replace
+ // these with the ones immediately after those which were really
+ // allocated. In the prologue, copies will be inserted from the argument
+ // to these reserved registers.
+ Info->setScratchRSrcReg(ReservedBufferReg);
+ Info->setScratchWaveOffsetReg(ReservedOffsetReg);
+ }
+ } else {
+ unsigned ReservedBufferReg = TRI->reservedPrivateSegmentBufferReg(MF);
+
+ // Without HSA, relocations are used for the scratch pointer and the
+ // buffer resource setup is always inserted in the prologue. Scratch wave
+ // offset is still in an input SGPR.
+ Info->setScratchRSrcReg(ReservedBufferReg);
+
+ if (HasStackObjects) {
+ unsigned ScratchWaveOffsetReg = TRI->getPreloadedValue(
+ MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
+ Info->setScratchWaveOffsetReg(ScratchWaveOffsetReg);
+ } else {
+ unsigned ReservedOffsetReg
+ = TRI->reservedPrivateSegmentWaveByteOffsetReg(MF);
+ Info->setScratchWaveOffsetReg(ReservedOffsetReg);
+ }
+ }
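To summarize the four branches above, a hedged sketch of the decision (hypothetical helper and names; the real code writes straight into SIMachineFunctionInfo):

// Sketch only: where the scratch resource descriptor and the scratch wave
// byte offset come from, per the logic above.
struct SketchScratchSetup {
  bool RsrcFromUserSGPRs;        // vs. tentatively reserved high SGPRs
  bool WaveOffsetFromInputSGPR;  // vs. tentatively reserved SGPR
};

static SketchScratchSetup sketchPickScratchRegs(bool IsHSA,
                                                bool HasStackObjects) {
  if (IsHSA)
    return { HasStackObjects, HasStackObjects };
  // Non-HSA: the buffer resource is always materialized in the prologue from
  // reserved registers; only the wave offset may come in as an input SGPR.
  return { false, HasStackObjects };
}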
+
+ if (Info->hasWorkItemIDX()) {
+ unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X);
+ MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
+ CCInfo.AllocateReg(Reg);
+ } else
+ llvm_unreachable("workitem id x should always be enabled");
+
+ if (Info->hasWorkItemIDY()) {
+ unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y);
+ MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
+ CCInfo.AllocateReg(Reg);
+ }
+
+ if (Info->hasWorkItemIDZ()) {
+ unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Z);
+ MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
+ CCInfo.AllocateReg(Reg);
}
if (Chains.empty())
@@ -693,27 +875,11 @@ SDValue SITargetLowering::LowerFormalArguments(
MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
MachineInstr * MI, MachineBasicBlock * BB) const {
- MachineBasicBlock::iterator I = *MI;
- const SIInstrInfo *TII =
- static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
-
switch (MI->getOpcode()) {
default:
return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
case AMDGPU::BRANCH:
return BB;
- case AMDGPU::SI_RegisterStorePseudo: {
- MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
- unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
- MachineInstrBuilder MIB =
- BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::SI_RegisterStore),
- Reg);
- for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i)
- MIB.addOperand(MI->getOperand(i));
-
- MI->eraseFromParent();
- break;
- }
}
return BB;
}
@@ -944,20 +1110,8 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
const GlobalValue *GV = GSD->getGlobal();
MVT PtrVT = getPointerTy(DAG.getDataLayout(), GSD->getAddressSpace());
- SDValue Ptr = DAG.getNode(AMDGPUISD::CONST_DATA_PTR, DL, PtrVT);
SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32);
-
- SDValue PtrLo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Ptr,
- DAG.getConstant(0, DL, MVT::i32));
- SDValue PtrHi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Ptr,
- DAG.getConstant(1, DL, MVT::i32));
-
- SDValue Lo = DAG.getNode(ISD::ADDC, DL, DAG.getVTList(MVT::i32, MVT::Glue),
- PtrLo, GA);
- SDValue Hi = DAG.getNode(ISD::ADDE, DL, DAG.getVTList(MVT::i32, MVT::Glue),
- PtrHi, DAG.getConstant(0, DL, MVT::i32),
- SDValue(Lo.getNode(), 1));
- return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Lo, Hi);
+ return DAG.getNode(AMDGPUISD::CONST_DATA_PTR, DL, PtrVT, GA);
}
SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain, SDLoc DL,
@@ -977,6 +1131,18 @@ SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain, SDLoc DL,
// a glue result.
}
+SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG,
+ SDValue Op,
+ MVT VT,
+ unsigned Offset) const {
+ SDLoc SL(Op);
+ SDValue Param = LowerParameter(DAG, MVT::i32, MVT::i32, SL,
+ DAG.getEntryNode(), Offset, false);
+ // The local size values will have the hi 16-bits as zero.
+ return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
+ DAG.getValueType(VT));
+}
+
SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
@@ -988,7 +1154,13 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SDLoc DL(Op);
unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ // TODO: Should this propagate fast-math-flags?
+
switch (IntrinsicID) {
+ case Intrinsic::amdgcn_dispatch_ptr:
+ return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass,
+ TRI->getPreloadedValue(MF, SIRegisterInfo::DISPATCH_PTR), VT);
+
case Intrinsic::r600_read_ngroups_x:
return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
SI::KernelInputOffsets::NGROUPS_X, false);
@@ -1008,37 +1180,36 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
SI::KernelInputOffsets::GLOBAL_SIZE_Z, false);
case Intrinsic::r600_read_local_size_x:
- return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
- SI::KernelInputOffsets::LOCAL_SIZE_X, false);
+ return lowerImplicitZextParam(DAG, Op, MVT::i16,
+ SI::KernelInputOffsets::LOCAL_SIZE_X);
case Intrinsic::r600_read_local_size_y:
- return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
- SI::KernelInputOffsets::LOCAL_SIZE_Y, false);
+ return lowerImplicitZextParam(DAG, Op, MVT::i16,
+ SI::KernelInputOffsets::LOCAL_SIZE_Y);
case Intrinsic::r600_read_local_size_z:
- return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
- SI::KernelInputOffsets::LOCAL_SIZE_Z, false);
-
+ return lowerImplicitZextParam(DAG, Op, MVT::i16,
+ SI::KernelInputOffsets::LOCAL_SIZE_Z);
case Intrinsic::AMDGPU_read_workdim:
- return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
- getImplicitParameterOffset(MFI, GRID_DIM), false);
-
+ // Really only 2 bits.
+ return lowerImplicitZextParam(DAG, Op, MVT::i8,
+ getImplicitParameterOffset(MFI, GRID_DIM));
case Intrinsic::r600_read_tgid_x:
return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
- TRI->getPreloadedValue(MF, SIRegisterInfo::TGID_X), VT);
+ TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_X), VT);
case Intrinsic::r600_read_tgid_y:
return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
- TRI->getPreloadedValue(MF, SIRegisterInfo::TGID_Y), VT);
+ TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_Y), VT);
case Intrinsic::r600_read_tgid_z:
return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
- TRI->getPreloadedValue(MF, SIRegisterInfo::TGID_Z), VT);
+ TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_Z), VT);
case Intrinsic::r600_read_tidig_x:
return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass,
- TRI->getPreloadedValue(MF, SIRegisterInfo::TIDIG_X), VT);
+ TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X), VT);
case Intrinsic::r600_read_tidig_y:
return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass,
- TRI->getPreloadedValue(MF, SIRegisterInfo::TIDIG_Y), VT);
+ TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y), VT);
case Intrinsic::r600_read_tidig_z:
return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass,
- TRI->getPreloadedValue(MF, SIRegisterInfo::TIDIG_Z), VT);
+ TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Z), VT);
case AMDGPUIntrinsic::SI_load_const: {
SDValue Ops[] = {
Op.getOperand(1),
@@ -1077,6 +1248,10 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
DAG.getConstant(2, DL, MVT::i32), // P0
Op.getOperand(1), Op.getOperand(2), Glue);
}
+ case AMDGPUIntrinsic::SI_packf16:
+ if (Op.getOperand(1).isUndef() && Op.getOperand(2).isUndef())
+ return DAG.getUNDEF(MVT::i32);
+ return Op;
case AMDGPUIntrinsic::SI_fs_interp: {
SDValue IJ = Op.getOperand(4);
SDValue I = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, IJ,
@@ -1092,6 +1267,19 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return DAG.getNode(AMDGPUISD::INTERP_P2, DL, MVT::f32, P1, J,
Op.getOperand(1), Op.getOperand(2), Glue);
}
+ case Intrinsic::amdgcn_interp_p1: {
+ SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4));
+ SDValue Glue = M0.getValue(1);
+ return DAG.getNode(AMDGPUISD::INTERP_P1, DL, MVT::f32, Op.getOperand(1),
+ Op.getOperand(2), Op.getOperand(3), Glue);
+ }
+ case Intrinsic::amdgcn_interp_p2: {
+ SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(5));
+ SDValue Glue = SDValue(M0.getNode(), 1);
+ return DAG.getNode(AMDGPUISD::INTERP_P2, DL, MVT::f32, Op.getOperand(1),
+ Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
+ Glue);
+ }
default:
return AMDGPUTargetLowering::LowerOperation(Op, DAG);
}
@@ -1152,16 +1340,29 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
"Custom lowering for non-i32 vectors hasn't been implemented.");
unsigned NumElements = Op.getValueType().getVectorNumElements();
assert(NumElements != 2 && "v2 loads are supported for all address spaces.");
+
switch (Load->getAddressSpace()) {
default: break;
+ case AMDGPUAS::CONSTANT_ADDRESS:
+ if (isMemOpUniform(Load))
+ break;
+ // Non-uniform loads will be selected to MUBUF instructions, so they
+ // have the same legalization requirements as global and private
+ // loads.
+ //
+ // Fall-through
case AMDGPUAS::GLOBAL_ADDRESS:
case AMDGPUAS::PRIVATE_ADDRESS:
+ if (NumElements >= 8)
+ return SplitVectorLoad(Op, DAG);
+
// v4 loads are supported for private and global memory.
if (NumElements <= 4)
break;
// fall-through
case AMDGPUAS::LOCAL_ADDRESS:
- return ScalarizeVectorLoad(Op, DAG);
+ // If properly aligned, splitting may let us use ds_read_b64.
+ return SplitVectorLoad(Op, DAG);
}
}
@@ -1236,8 +1437,10 @@ SDValue SITargetLowering::LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const {
if (Unsafe) {
// Turn into multiply by the reciprocal.
// x / y -> x * (1.0 / y)
+ SDNodeFlags Flags;
+ Flags.setUnsafeAlgebra(true);
SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
- return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip);
+ return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, &Flags);
}
return SDValue();
@@ -1274,6 +1477,8 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One);
+ // TODO: Should this propagate fast-math-flags?
+
r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3);
SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1);
@@ -1379,7 +1584,7 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
return Ret;
if (VT.isVector() && VT.getVectorNumElements() >= 8)
- return ScalarizeVectorStore(Op, DAG);
+ return SplitVectorStore(Op, DAG);
if (VT == MVT::i1)
return DAG.getTruncStore(Store->getChain(), DL,
@@ -1393,6 +1598,7 @@ SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
SDLoc DL(Op);
EVT VT = Op.getValueType();
SDValue Arg = Op.getOperand(0);
+ // TODO: Should this propagate fast-math-flags?
SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, DL, VT,
DAG.getNode(ISD::FMUL, DL, VT, Arg,
DAG.getConstantFP(0.5/M_PI, DL,
@@ -2125,9 +2331,14 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI,
static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
- TII->legalizeOperands(MI);
- if (TII->isMIMG(MI->getOpcode())) {
+ if (TII->isVOP3(MI->getOpcode())) {
+ // Make sure constant bus requirements are respected.
+ TII->legalizeOperandsVOP3(MRI, MI);
+ return;
+ }
+
+ if (TII->isMIMG(*MI)) {
unsigned VReg = MI->getOperand(0).getReg();
unsigned Writemask = MI->getOperand(1).getImm();
unsigned BitsSet = 0;
@@ -2169,53 +2380,38 @@ MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
SDLoc DL,
SDValue Ptr) const {
const SIInstrInfo *TII =
- static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
-#if 1
- // XXX - Workaround for moveToVALU not handling different register class
- // inserts for REG_SEQUENCE.
-
- // Build the half of the subregister with the constants.
- const SDValue Ops0[] = {
- DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
- buildSMovImm32(DAG, DL, 0),
- DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
- buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
- DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)
- };
-
- SDValue SubRegHi = SDValue(DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL,
- MVT::v2i32, Ops0), 0);
-
- // Combine the constants and the pointer.
- const SDValue Ops1[] = {
- DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32),
- Ptr,
- DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32),
- SubRegHi,
- DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)
- };
+ static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
+
+ // Build the half of the subregister with the constants before building the
+ // full 128-bit register. If we are building multiple resource descriptors,
+ // this will allow CSEing of the 2-component register.
+ const SDValue Ops0[] = {
+ DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
+ buildSMovImm32(DAG, DL, 0),
+ DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
+ buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
+ DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)
+ };
- return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
-#else
- const SDValue Ops[] = {
- DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, MVT::i32),
- Ptr,
- DAG.getTargetConstant(AMDGPU::sub0_sub1, MVT::i32),
- buildSMovImm32(DAG, DL, 0),
- DAG.getTargetConstant(AMDGPU::sub2, MVT::i32),
- buildSMovImm32(DAG, DL, TII->getDefaultRsrcFormat() >> 32),
- DAG.getTargetConstant(AMDGPU::sub3, MVT::i32)
- };
+ SDValue SubRegHi = SDValue(DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL,
+ MVT::v2i32, Ops0), 0);
- return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
+ // Combine the constants and the pointer.
+ const SDValue Ops1[] = {
+ DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32),
+ Ptr,
+ DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32),
+ SubRegHi,
+ DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)
+ };
-#endif
+ return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
}
/// \brief Return a resource descriptor with the 'Add TID' bit enabled
-/// The TID (Thread ID) is multipled by the stride value (bits [61:48]
-/// of the resource descriptor) to create an offset, which is added to the
-/// resource ponter.
+/// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
+/// of the resource descriptor) to create an offset, which is added to
+/// the resource pointer.
MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG,
SDLoc DL,
SDValue Ptr,
@@ -2248,15 +2444,6 @@ MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG,
return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
}
-MachineSDNode *SITargetLowering::buildScratchRSRC(SelectionDAG &DAG,
- SDLoc DL,
- SDValue Ptr) const {
- const SIInstrInfo *TII =
- static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
-
- return buildRSRC(DAG, DL, Ptr, 0, TII->getScratchRsrcWords23());
-}
-
SDValue SITargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
const TargetRegisterClass *RC,
unsigned Reg, EVT VT) const {
@@ -2274,13 +2461,41 @@ std::pair<unsigned, const TargetRegisterClass *>
SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
StringRef Constraint,
MVT VT) const {
- if (Constraint == "r") {
- switch(VT.SimpleTy) {
- default: llvm_unreachable("Unhandled type for 'r' inline asm constraint");
- case MVT::i64:
- return std::make_pair(0U, &AMDGPU::SGPR_64RegClass);
- case MVT::i32:
+
+ if (Constraint.size() == 1) {
+ switch (Constraint[0]) {
+ case 's':
+ case 'r':
+ switch (VT.getSizeInBits()) {
+ default:
+ return std::make_pair(0U, nullptr);
+ case 32:
return std::make_pair(0U, &AMDGPU::SGPR_32RegClass);
+ case 64:
+ return std::make_pair(0U, &AMDGPU::SGPR_64RegClass);
+ case 128:
+ return std::make_pair(0U, &AMDGPU::SReg_128RegClass);
+ case 256:
+ return std::make_pair(0U, &AMDGPU::SReg_256RegClass);
+ }
+
+ case 'v':
+ switch (VT.getSizeInBits()) {
+ default:
+ return std::make_pair(0U, nullptr);
+ case 32:
+ return std::make_pair(0U, &AMDGPU::VGPR_32RegClass);
+ case 64:
+ return std::make_pair(0U, &AMDGPU::VReg_64RegClass);
+ case 96:
+ return std::make_pair(0U, &AMDGPU::VReg_96RegClass);
+ case 128:
+ return std::make_pair(0U, &AMDGPU::VReg_128RegClass);
+ case 256:
+ return std::make_pair(0U, &AMDGPU::VReg_256RegClass);
+ case 512:
+ return std::make_pair(0U, &AMDGPU::VReg_512RegClass);
+ }
}
}
@@ -2301,3 +2516,16 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
}
return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
}
+
+SITargetLowering::ConstraintType
+SITargetLowering::getConstraintType(StringRef Constraint) const {
+ if (Constraint.size() == 1) {
+ switch (Constraint[0]) {
+ default: break;
+ case 's':
+ case 'v':
+ return C_RegisterClass;
+ }
+ }
+ return TargetLowering::getConstraintType(Constraint);
+}
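With getConstraintType() and getRegForInlineAsmConstraint() agreeing on 's' and 'v', inline assembly can request scalar or vector registers by constraint letter, with the register class picked by value size. A hedged usage sketch (the asm string is illustrative only, not taken from this change):

// Sketch: "s" requests an SGPR operand, "v" a VGPR operand; a 32-bit value
// selects the 32-bit class, a 64-bit value the 64-bit class, and so on.
static inline unsigned sketchCopySGPRToVGPR(unsigned SVal) {
  unsigned VVal;
  __asm__ volatile("v_mov_b32 %0, %1" : "=v"(VVal) : "s"(SVal));
  return VVal;
}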
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.h b/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.h
index d84c32e..e2f8cb1 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -28,6 +28,9 @@ class SITargetLowering : public AMDGPUTargetLowering {
SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op,
SelectionDAG &DAG) const override;
+ SDValue lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op,
+ MVT VT, unsigned Offset) const;
+
SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const;
@@ -57,6 +60,7 @@ class SITargetLowering : public AMDGPUTargetLowering {
SDValue performSetCCCombine(SDNode *N, DAGCombinerInfo &DCI) const;
bool isLegalFlatAddressingMode(const AddrMode &AM) const;
+ bool isLegalMUBUFAddressingMode(const AddrMode &AM) const;
public:
SITargetLowering(TargetMachine &tm, const AMDGPUSubtarget &STI);
@@ -76,6 +80,9 @@ public:
bool MemcpyStrSrc,
MachineFunction &MF) const override;
+ bool isMemOpUniform(const SDNode *N) const;
+ bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override;
+
TargetLoweringBase::LegalizeTypeAction
getPreferredVectorAction(EVT VT) const override;
@@ -112,13 +119,10 @@ public:
SDValue Ptr,
uint32_t RsrcDword1,
uint64_t RsrcDword2And3) const;
- MachineSDNode *buildScratchRSRC(SelectionDAG &DAG,
- SDLoc DL,
- SDValue Ptr) const;
-
std::pair<unsigned, const TargetRegisterClass *>
getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
StringRef Constraint, MVT VT) const override;
+ ConstraintType getConstraintType(StringRef Constraint) const override;
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, SDLoc DL, SDValue V) const;
};
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp b/contrib/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp
index 90a37f1..821aada 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp
@@ -91,7 +91,8 @@ private:
bool isOpRelevant(MachineOperand &Op);
/// \brief Get register interval an operand affects.
- RegInterval getRegInterval(MachineOperand &Op);
+ RegInterval getRegInterval(const TargetRegisterClass *RC,
+ const MachineOperand &Reg) const;
/// \brief Handle instructions async components
void pushInstruction(MachineBasicBlock &MBB,
@@ -121,9 +122,13 @@ public:
bool runOnMachineFunction(MachineFunction &MF) override;
const char *getPassName() const override {
- return "SI insert wait instructions";
+ return "SI insert wait instructions";
}
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
};
} // End anonymous namespace
@@ -138,9 +143,8 @@ FunctionPass *llvm::createSIInsertWaits(TargetMachine &tm) {
}
Counters SIInsertWaits::getHwCounts(MachineInstr &MI) {
-
- uint64_t TSFlags = TII->get(MI.getOpcode()).TSFlags;
- Counters Result;
+ uint64_t TSFlags = MI.getDesc().TSFlags;
+ Counters Result = { { 0, 0, 0 } };
Result.Named.VM = !!(TSFlags & SIInstrFlags::VM_CNT);
@@ -151,15 +155,22 @@ Counters SIInsertWaits::getHwCounts(MachineInstr &MI) {
// LGKM may use larger values
if (TSFlags & SIInstrFlags::LGKM_CNT) {
- if (TII->isSMRD(MI.getOpcode())) {
-
- MachineOperand &Op = MI.getOperand(0);
- assert(Op.isReg() && "First LGKM operand must be a register!");
-
- unsigned Reg = Op.getReg();
- unsigned Size = TRI->getMinimalPhysRegClass(Reg)->getSize();
- Result.Named.LGKM = Size > 4 ? 2 : 1;
-
+ if (TII->isSMRD(MI)) {
+
+ if (MI.getNumOperands() != 0) {
+ assert(MI.getOperand(0).isReg() &&
+ "First LGKM operand must be a register!");
+
+ // XXX - What if this is a write into a super register?
+ const TargetRegisterClass *RC = TII->getOpRegClass(MI, 0);
+ unsigned Size = RC->getSize();
+ Result.Named.LGKM = Size > 4 ? 2 : 1;
+ } else {
+ // s_dcache_inv etc. do not have a destination register. Assume we
+ // want a wait on these.
+ // XXX - What is the right value?
+ Result.Named.LGKM = 1;
+ }
} else {
// DS
Result.Named.LGKM = 1;
@@ -173,9 +184,8 @@ Counters SIInsertWaits::getHwCounts(MachineInstr &MI) {
}
bool SIInsertWaits::isOpRelevant(MachineOperand &Op) {
-
// Constants are always irrelevant
- if (!Op.isReg())
+ if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()))
return false;
// Defines are always relevant
@@ -196,7 +206,7 @@ bool SIInsertWaits::isOpRelevant(MachineOperand &Op) {
// operand comes before the value operand and it may have
// multiple data operands.
- if (TII->isDS(MI.getOpcode())) {
+ if (TII->isDS(MI)) {
MachineOperand *Data = TII->getNamedOperand(MI, AMDGPU::OpName::data);
if (Data && Op.isIdenticalTo(*Data))
return true;
@@ -224,18 +234,13 @@ bool SIInsertWaits::isOpRelevant(MachineOperand &Op) {
return false;
}
-RegInterval SIInsertWaits::getRegInterval(MachineOperand &Op) {
-
- if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()))
- return std::make_pair(0, 0);
-
- unsigned Reg = Op.getReg();
- unsigned Size = TRI->getMinimalPhysRegClass(Reg)->getSize();
-
+RegInterval SIInsertWaits::getRegInterval(const TargetRegisterClass *RC,
+ const MachineOperand &Reg) const {
+ unsigned Size = RC->getSize();
assert(Size >= 4);
RegInterval Result;
- Result.first = TRI->getEncodingValue(Reg);
+ Result.first = TRI->getEncodingValue(Reg.getReg());
Result.second = Result.first + Size / 4;
return Result;
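A short worked example of the interval arithmetic above (encoding values are illustrative): a VReg_128-class operand whose first register encodes as 8 has RC->getSize() == 16, so the interval is [8, 8 + 16/4) = [8, 12), i.e. the bookkeeping below touches the entries for v8, v9, v10 and v11.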
@@ -246,10 +251,13 @@ void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB,
// Get the hardware counter increments and sum them up
Counters Increment = getHwCounts(*I);
+ Counters Limit = ZeroCounts;
unsigned Sum = 0;
for (unsigned i = 0; i < 3; ++i) {
LastIssued.Array[i] += Increment.Array[i];
+ if (Increment.Array[i])
+ Limit.Array[i] = LastIssued.Array[i];
Sum += Increment.Array[i];
}
@@ -261,7 +269,7 @@ void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB,
if (MBB.getParent()->getSubtarget<AMDGPUSubtarget>().getGeneration() >=
AMDGPUSubtarget::VOLCANIC_ISLANDS) {
- // Any occurence of consecutive VMEM or SMEM instructions forms a VMEM
+ // Any occurrence of consecutive VMEM or SMEM instructions forms a VMEM
// or SMEM clause, respectively.
//
// The temporary workaround is to break the clauses with S_NOP.
@@ -270,7 +278,7 @@ void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB,
// and destination registers don't overlap, e.g. this is illegal:
// r0 = load r2
// r2 = load r0
- if ((LastOpcodeType == SMEM && TII->isSMRD(I->getOpcode())) ||
+ if ((LastOpcodeType == SMEM && TII->isSMRD(*I)) ||
(LastOpcodeType == VMEM && Increment.Named.VM)) {
// Insert a NOP to break the clause.
BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_NOP))
@@ -278,7 +286,7 @@ void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB,
LastInstWritesM0 = false;
}
- if (TII->isSMRD(I->getOpcode()))
+ if (TII->isSMRD(*I))
LastOpcodeType = SMEM;
else if (Increment.Named.VM)
LastOpcodeType = VMEM;
@@ -290,21 +298,21 @@ void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB,
}
for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
-
MachineOperand &Op = I->getOperand(i);
if (!isOpRelevant(Op))
continue;
- RegInterval Interval = getRegInterval(Op);
+ const TargetRegisterClass *RC = TII->getOpRegClass(*I, i);
+ RegInterval Interval = getRegInterval(RC, Op);
for (unsigned j = Interval.first; j < Interval.second; ++j) {
// Remember which registers we define
if (Op.isDef())
- DefinedRegs[j] = LastIssued;
+ DefinedRegs[j] = Limit;
// and which one we are using
if (Op.isUse())
- UsedRegs[j] = LastIssued;
+ UsedRegs[j] = Limit;
}
}
}
@@ -390,12 +398,18 @@ Counters SIInsertWaits::handleOperands(MachineInstr &MI) {
if (MI.getOpcode() == AMDGPU::S_SENDMSG)
return LastIssued;
- // For each register affected by this
- // instruction increase the result sequence
+ // For each register affected by this instruction increase the result
+ // sequence.
+ //
+ // TODO: We could probably just look at explicit operands if we removed VCC /
+ // EXEC from SMRD dest reg classes.
for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
-
MachineOperand &Op = MI.getOperand(i);
- RegInterval Interval = getRegInterval(Op);
+ if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()))
+ continue;
+
+ const TargetRegisterClass *RC = TII->getOpRegClass(MI, i);
+ RegInterval Interval = getRegInterval(RC, Op);
for (unsigned j = Interval.first; j < Interval.second; ++j) {
if (Op.isDef()) {
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstrFormats.td b/contrib/llvm/lib/Target/AMDGPU/SIInstrFormats.td
index 211666a..0e883f6 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIInstrFormats.td
+++ b/contrib/llvm/lib/Target/AMDGPU/SIInstrFormats.td
@@ -41,6 +41,10 @@ class InstSI <dag outs, dag ins, string asm, list<dag> pattern> :
field bits<1> WQM = 0;
field bits<1> VGPRSpill = 0;
+ // This bit tells the assembler to use the 32-bit encoding in case it
+ // is unable to infer the encoding from the operands.
+ field bits<1> VOPAsmPrefer32Bit = 0;
+
// These need to be kept in sync with the enum in SIInstrFlags.
let TSFlags{0} = VM_CNT;
let TSFlags{1} = EXP_CNT;
@@ -68,10 +72,8 @@ class InstSI <dag outs, dag ins, string asm, list<dag> pattern> :
let TSFlags{19} = FLAT;
let TSFlags{20} = WQM;
let TSFlags{21} = VGPRSpill;
+ let TSFlags{22} = VOPAsmPrefer32Bit;
- // Most instructions require adjustments after selection to satisfy
- // operand requirements.
- let hasPostISelHook = 1;
let SchedRW = [Write32Bit];
}
@@ -86,7 +88,6 @@ class Enc64 {
}
class VOPDstOperand <RegisterClass rc> : RegisterOperand <rc, "printVOPDst">;
-def VOPDstVCC : VOPDstOperand <VCCReg>;
let Uses = [EXEC] in {
@@ -101,11 +102,11 @@ class VOPAnyCommon <dag outs, dag ins, string asm, list<dag> pattern> :
}
class VOPCCommon <dag ins, string asm, list<dag> pattern> :
- VOPAnyCommon <(outs VOPDstVCC:$dst), ins, asm, pattern> {
+ VOPAnyCommon <(outs), ins, asm, pattern> {
- let DisableEncoding = "$dst";
let VOPC = 1;
let Size = 4;
+ let Defs = [VCC];
}
class VOP1Common <dag outs, dag ins, string asm, list<dag> pattern> :
@@ -138,6 +139,11 @@ class VOP3Common <dag outs, dag ins, string asm, list<dag> pattern> :
let isCodeGenOnly = 0;
int Size = 8;
+
+ // Because SGPRs may be allowed if there are multiple operands, we
+ // need a post-isel hook to insert copies in order to avoid
+ // violating constant bus requirements.
+ let hasPostISelHook = 1;
}
} // End Uses = [EXEC]
@@ -222,6 +228,20 @@ class SMRDe <bits<5> op, bits<1> imm> : Enc32 {
let Inst{31-27} = 0x18; //encoding
}
+class SMRD_IMMe_ci <bits<5> op> : Enc64 {
+ bits<7> sdst;
+ bits<7> sbase;
+ bits<32> offset;
+
+ let Inst{7-0} = 0xff;
+ let Inst{8} = 0;
+ let Inst{14-9} = sbase{6-1};
+ let Inst{21-15} = sdst;
+ let Inst{26-22} = op;
+ let Inst{31-27} = 0x18; //encoding
+ let Inst{63-32} = offset;
+}
+
let SchedRW = [WriteSALU] in {
class SOP1 <dag outs, dag ins, string asm, list<dag> pattern> :
InstSI<outs, ins, asm, pattern> {
@@ -249,13 +269,13 @@ class SOP2 <dag outs, dag ins, string asm, list<dag> pattern> :
class SOPC <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
InstSI<outs, ins, asm, pattern>, SOPCe <op> {
- let DisableEncoding = "$dst";
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
let SALU = 1;
let SOPC = 1;
let isCodeGenOnly = 0;
+ let Defs = [SCC];
let UseNamedOperandTable = 1;
}
@@ -598,15 +618,13 @@ class VINTRPCommon <dag outs, dag ins, string asm, list<dag> pattern> :
// Vector I/O operations
//===----------------------------------------------------------------------===//
-let Uses = [EXEC] in {
-
class DS <dag outs, dag ins, string asm, list<dag> pattern> :
InstSI <outs, ins, asm, pattern> {
let LGKM_CNT = 1;
let DS = 1;
let UseNamedOperandTable = 1;
- let Uses = [M0];
+ let Uses = [M0, EXEC];
// Most instruction load and store data, so set this as the default.
let mayLoad = 1;
@@ -623,6 +641,7 @@ class MUBUF <dag outs, dag ins, string asm, list<dag> pattern> :
let VM_CNT = 1;
let EXP_CNT = 1;
let MUBUF = 1;
+ let Uses = [EXEC];
let hasSideEffects = 0;
let UseNamedOperandTable = 1;
@@ -636,6 +655,7 @@ class MTBUF <dag outs, dag ins, string asm, list<dag> pattern> :
let VM_CNT = 1;
let EXP_CNT = 1;
let MTBUF = 1;
+ let Uses = [EXEC];
let hasSideEffects = 0;
let UseNamedOperandTable = 1;
@@ -665,9 +685,7 @@ class MIMG <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
let VM_CNT = 1;
let EXP_CNT = 1;
let MIMG = 1;
+ let Uses = [EXEC];
let hasSideEffects = 0; // XXX ????
}
-
-
-} // End Uses = [EXEC]
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index cfd2c42..a08a5a8 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -82,6 +82,7 @@ bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr *MI,
switch (MI->getOpcode()) {
case AMDGPU::V_MOV_B32_e32:
case AMDGPU::V_MOV_B32_e64:
+ case AMDGPU::V_MOV_B64_PSEUDO:
return true;
default:
return false;
@@ -204,7 +205,8 @@ bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg,
unsigned &Offset,
const TargetRegisterInfo *TRI) const {
unsigned Opc = LdSt->getOpcode();
- if (isDS(Opc)) {
+
+ if (isDS(*LdSt)) {
const MachineOperand *OffsetImm = getNamedOperand(*LdSt,
AMDGPU::OpName::offset);
if (OffsetImm) {
@@ -254,7 +256,7 @@ bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg,
return false;
}
- if (isMUBUF(Opc) || isMTBUF(Opc)) {
+ if (isMUBUF(*LdSt) || isMTBUF(*LdSt)) {
if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset) != -1)
return false;
@@ -270,7 +272,7 @@ bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg,
return true;
}
- if (isSMRD(Opc)) {
+ if (isSMRD(*LdSt)) {
const MachineOperand *OffsetImm = getNamedOperand(*LdSt,
AMDGPU::OpName::offset);
if (!OffsetImm)
@@ -289,20 +291,18 @@ bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg,
bool SIInstrInfo::shouldClusterLoads(MachineInstr *FirstLdSt,
MachineInstr *SecondLdSt,
unsigned NumLoads) const {
- unsigned Opc0 = FirstLdSt->getOpcode();
- unsigned Opc1 = SecondLdSt->getOpcode();
-
// TODO: This needs finer tuning
if (NumLoads > 4)
return false;
- if (isDS(Opc0) && isDS(Opc1))
+ if (isDS(*FirstLdSt) && isDS(*SecondLdSt))
return true;
- if (isSMRD(Opc0) && isSMRD(Opc1))
+ if (isSMRD(*FirstLdSt) && isSMRD(*SecondLdSt))
return true;
- if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1)))
+ if ((isMUBUF(*FirstLdSt) || isMTBUF(*FirstLdSt)) &&
+ (isMUBUF(*SecondLdSt) || isMTBUF(*SecondLdSt)))
return true;
return false;
@@ -323,28 +323,45 @@ SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
- AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15, 0
+ AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
+ };
+
+ static const int16_t Sub0_15_64[] = {
+ AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
+ AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
+ AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
+ AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
};
static const int16_t Sub0_7[] = {
AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
- AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, 0
+ AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
+ };
+
+ static const int16_t Sub0_7_64[] = {
+ AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
+ AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
};
static const int16_t Sub0_3[] = {
- AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, 0
+ AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
+ };
+
+ static const int16_t Sub0_3_64[] = {
+ AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
};
static const int16_t Sub0_2[] = {
- AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, 0
+ AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2,
};
static const int16_t Sub0_1[] = {
- AMDGPU::sub0, AMDGPU::sub1, 0
+ AMDGPU::sub0, AMDGPU::sub1,
};
unsigned Opcode;
- const int16_t *SubIndices;
+ ArrayRef<int16_t> SubIndices;
+ bool Forward;
if (AMDGPU::SReg_32RegClass.contains(DestReg)) {
assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
@@ -360,7 +377,7 @@ SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
} else {
// FIXME: Hack until VReg_1 removed.
assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
- BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_I32_e32), AMDGPU::VCC)
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_I32_e32))
.addImm(0)
.addReg(SrcReg, getKillRegState(KillSrc));
}
@@ -375,18 +392,18 @@ SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
} else if (AMDGPU::SReg_128RegClass.contains(DestReg)) {
assert(AMDGPU::SReg_128RegClass.contains(SrcReg));
- Opcode = AMDGPU::S_MOV_B32;
- SubIndices = Sub0_3;
+ Opcode = AMDGPU::S_MOV_B64;
+ SubIndices = Sub0_3_64;
} else if (AMDGPU::SReg_256RegClass.contains(DestReg)) {
assert(AMDGPU::SReg_256RegClass.contains(SrcReg));
- Opcode = AMDGPU::S_MOV_B32;
- SubIndices = Sub0_7;
+ Opcode = AMDGPU::S_MOV_B64;
+ SubIndices = Sub0_7_64;
} else if (AMDGPU::SReg_512RegClass.contains(DestReg)) {
assert(AMDGPU::SReg_512RegClass.contains(SrcReg));
- Opcode = AMDGPU::S_MOV_B32;
- SubIndices = Sub0_15;
+ Opcode = AMDGPU::S_MOV_B64;
+ SubIndices = Sub0_15_64;
} else if (AMDGPU::VGPR_32RegClass.contains(DestReg)) {
assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
@@ -428,13 +445,27 @@ SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
llvm_unreachable("Can't copy register!");
}
- while (unsigned SubIdx = *SubIndices++) {
+ if (RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg))
+ Forward = true;
+ else
+ Forward = false;
+
+ for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
+ unsigned SubIdx;
+ if (Forward)
+ SubIdx = SubIndices[Idx];
+ else
+ SubIdx = SubIndices[SubIndices.size() - Idx - 1];
+
MachineInstrBuilder Builder = BuildMI(MBB, MI, DL,
get(Opcode), RI.getSubReg(DestReg, SubIdx));
- Builder.addReg(RI.getSubReg(SrcReg, SubIdx), getKillRegState(KillSrc));
+ Builder.addReg(RI.getSubReg(SrcReg, SubIdx));
+
+ if (Idx == SubIndices.size() - 1)
+ Builder.addReg(SrcReg, RegState::Kill | RegState::Implicit);
- if (*SubIndices)
+ if (Idx == 0)
Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
}
}
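The direction choice matters when the source and destination tuples overlap; a worked example as comments (register numbers are illustrative):

// Copying s[6:9] <- s[4:7]: the destination HW index (6) is greater than the
// source index (4), so the loop runs backward:
//   s_mov_b64 s[8:9], s[6:7]   ; high half first, while s[6:7] still holds
//                              ; the original source value
//   s_mov_b64 s[6:7], s[4:5]   ; then the low half
// Running forward would clobber s[6:7] before it is read as a source. The
// mirrored case, s[4:7] <- s[6:9], is safe in the forward direction.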
@@ -471,6 +502,40 @@ unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
return AMDGPU::COPY;
}
+static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
+ switch (Size) {
+ case 4:
+ return AMDGPU::SI_SPILL_S32_SAVE;
+ case 8:
+ return AMDGPU::SI_SPILL_S64_SAVE;
+ case 16:
+ return AMDGPU::SI_SPILL_S128_SAVE;
+ case 32:
+ return AMDGPU::SI_SPILL_S256_SAVE;
+ case 64:
+ return AMDGPU::SI_SPILL_S512_SAVE;
+ default:
+ llvm_unreachable("unknown register size");
+ }
+}
+
+static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
+ switch (Size) {
+ case 4:
+ return AMDGPU::SI_SPILL_V32_SAVE;
+ case 8:
+ return AMDGPU::SI_SPILL_V64_SAVE;
+ case 16:
+ return AMDGPU::SI_SPILL_V128_SAVE;
+ case 32:
+ return AMDGPU::SI_SPILL_V256_SAVE;
+ case 64:
+ return AMDGPU::SI_SPILL_V512_SAVE;
+ default:
+ llvm_unreachable("unknown register size");
+ }
+}
+
void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
unsigned SrcReg, bool isKill,
@@ -481,47 +546,83 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
MachineFrameInfo *FrameInfo = MF->getFrameInfo();
DebugLoc DL = MBB.findDebugLoc(MI);
- int Opcode = -1;
+
+ unsigned Size = FrameInfo->getObjectSize(FrameIndex);
+ unsigned Align = FrameInfo->getObjectAlignment(FrameIndex);
+ MachinePointerInfo PtrInfo
+ = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
+ MachineMemOperand *MMO
+ = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
+ Size, Align);
if (RI.isSGPRClass(RC)) {
+ MFI->setHasSpilledSGPRs();
+
// We are only allowed to create one new instruction when spilling
// registers, so we need to use pseudo instruction for spilling
// SGPRs.
- switch (RC->getSize() * 8) {
- case 32: Opcode = AMDGPU::SI_SPILL_S32_SAVE; break;
- case 64: Opcode = AMDGPU::SI_SPILL_S64_SAVE; break;
- case 128: Opcode = AMDGPU::SI_SPILL_S128_SAVE; break;
- case 256: Opcode = AMDGPU::SI_SPILL_S256_SAVE; break;
- case 512: Opcode = AMDGPU::SI_SPILL_S512_SAVE; break;
- }
- } else if(RI.hasVGPRs(RC) && ST.isVGPRSpillingEnabled(MFI)) {
- MFI->setHasSpilledVGPRs();
-
- switch(RC->getSize() * 8) {
- case 32: Opcode = AMDGPU::SI_SPILL_V32_SAVE; break;
- case 64: Opcode = AMDGPU::SI_SPILL_V64_SAVE; break;
- case 96: Opcode = AMDGPU::SI_SPILL_V96_SAVE; break;
- case 128: Opcode = AMDGPU::SI_SPILL_V128_SAVE; break;
- case 256: Opcode = AMDGPU::SI_SPILL_V256_SAVE; break;
- case 512: Opcode = AMDGPU::SI_SPILL_V512_SAVE; break;
- }
+ unsigned Opcode = getSGPRSpillSaveOpcode(RC->getSize());
+ BuildMI(MBB, MI, DL, get(Opcode))
+ .addReg(SrcReg) // src
+ .addFrameIndex(FrameIndex) // frame_idx
+ .addMemOperand(MMO);
+
+ return;
}
- if (Opcode != -1) {
- FrameInfo->setObjectAlignment(FrameIndex, 4);
- BuildMI(MBB, MI, DL, get(Opcode))
- .addReg(SrcReg)
- .addFrameIndex(FrameIndex)
- // Place-holder registers, these will be filled in by
- // SIPrepareScratchRegs.
- .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef)
- .addReg(AMDGPU::SGPR0, RegState::Undef);
- } else {
+ if (!ST.isVGPRSpillingEnabled(MFI)) {
LLVMContext &Ctx = MF->getFunction()->getContext();
Ctx.emitError("SIInstrInfo::storeRegToStackSlot - Do not know how to"
" spill register");
BuildMI(MBB, MI, DL, get(AMDGPU::KILL))
- .addReg(SrcReg);
+ .addReg(SrcReg);
+
+ return;
+ }
+
+ assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected");
+
+ unsigned Opcode = getVGPRSpillSaveOpcode(RC->getSize());
+ MFI->setHasSpilledVGPRs();
+ BuildMI(MBB, MI, DL, get(Opcode))
+ .addReg(SrcReg) // src
+ .addFrameIndex(FrameIndex) // frame_idx
+ .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
+ .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset
+ .addMemOperand(MMO);
+}
+
+static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
+ switch (Size) {
+ case 4:
+ return AMDGPU::SI_SPILL_S32_RESTORE;
+ case 8:
+ return AMDGPU::SI_SPILL_S64_RESTORE;
+ case 16:
+ return AMDGPU::SI_SPILL_S128_RESTORE;
+ case 32:
+ return AMDGPU::SI_SPILL_S256_RESTORE;
+ case 64:
+ return AMDGPU::SI_SPILL_S512_RESTORE;
+ default:
+ llvm_unreachable("unknown register size");
+ }
+}
+
+static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
+ switch (Size) {
+ case 4:
+ return AMDGPU::SI_SPILL_V32_RESTORE;
+ case 8:
+ return AMDGPU::SI_SPILL_V64_RESTORE;
+ case 16:
+ return AMDGPU::SI_SPILL_V128_RESTORE;
+ case 32:
+ return AMDGPU::SI_SPILL_V256_RESTORE;
+ case 64:
+ return AMDGPU::SI_SPILL_V512_RESTORE;
+ default:
+ llvm_unreachable("unknown register size");
}
}
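As a quick usage note, these helpers key off the register class size in bytes, so for example an SReg_128 spill resolves as follows (sketch):

// TargetRegisterClass::getSize() is 16 bytes for SReg_128, so:
unsigned SaveOpc    = getSGPRSpillSaveOpcode(16);    // SI_SPILL_S128_SAVE
unsigned RestoreOpc = getSGPRSpillRestoreOpcode(16); // SI_SPILL_S128_RESTORE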
@@ -534,42 +635,43 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
MachineFrameInfo *FrameInfo = MF->getFrameInfo();
DebugLoc DL = MBB.findDebugLoc(MI);
- int Opcode = -1;
-
- if (RI.isSGPRClass(RC)){
- switch(RC->getSize() * 8) {
- case 32: Opcode = AMDGPU::SI_SPILL_S32_RESTORE; break;
- case 64: Opcode = AMDGPU::SI_SPILL_S64_RESTORE; break;
- case 128: Opcode = AMDGPU::SI_SPILL_S128_RESTORE; break;
- case 256: Opcode = AMDGPU::SI_SPILL_S256_RESTORE; break;
- case 512: Opcode = AMDGPU::SI_SPILL_S512_RESTORE; break;
- }
- } else if(RI.hasVGPRs(RC) && ST.isVGPRSpillingEnabled(MFI)) {
- switch(RC->getSize() * 8) {
- case 32: Opcode = AMDGPU::SI_SPILL_V32_RESTORE; break;
- case 64: Opcode = AMDGPU::SI_SPILL_V64_RESTORE; break;
- case 96: Opcode = AMDGPU::SI_SPILL_V96_RESTORE; break;
- case 128: Opcode = AMDGPU::SI_SPILL_V128_RESTORE; break;
- case 256: Opcode = AMDGPU::SI_SPILL_V256_RESTORE; break;
- case 512: Opcode = AMDGPU::SI_SPILL_V512_RESTORE; break;
- }
- }
+ unsigned Align = FrameInfo->getObjectAlignment(FrameIndex);
+ unsigned Size = FrameInfo->getObjectSize(FrameIndex);
+
+ MachinePointerInfo PtrInfo
+ = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
- if (Opcode != -1) {
- FrameInfo->setObjectAlignment(FrameIndex, 4);
+ MachineMemOperand *MMO = MF->getMachineMemOperand(
+ PtrInfo, MachineMemOperand::MOLoad, Size, Align);
+
+ if (RI.isSGPRClass(RC)) {
+ // FIXME: Maybe this should not include a memoperand because it will be
+ // lowered to non-memory instructions.
+ unsigned Opcode = getSGPRSpillRestoreOpcode(RC->getSize());
BuildMI(MBB, MI, DL, get(Opcode), DestReg)
- .addFrameIndex(FrameIndex)
- // Place-holder registers, these will be filled in by
- // SIPrepareScratchRegs.
- .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef)
- .addReg(AMDGPU::SGPR0, RegState::Undef);
+ .addFrameIndex(FrameIndex) // frame_idx
+ .addMemOperand(MMO);
- } else {
+ return;
+ }
+
+ if (!ST.isVGPRSpillingEnabled(MFI)) {
LLVMContext &Ctx = MF->getFunction()->getContext();
Ctx.emitError("SIInstrInfo::loadRegFromStackSlot - Do not know how to"
" restore register");
BuildMI(MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), DestReg);
+
+ return;
}
+
+ assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected");
+
+ unsigned Opcode = getVGPRSpillRestoreOpcode(RC->getSize());
+ BuildMI(MBB, MI, DL, get(Opcode), DestReg)
+ .addFrameIndex(FrameIndex) // frame_idx
+ .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
+ .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset
+ .addMemOperand(MMO);
}
/// \param @Offset Offset in bytes of the FrameIndex being spilled
@@ -601,17 +703,21 @@ unsigned SIInstrInfo::calculateLDSSpillAddress(MachineBasicBlock &MBB,
if (MFI->getShaderType() == ShaderType::COMPUTE &&
WorkGroupSize > WavefrontSize) {
- unsigned TIDIGXReg = TRI->getPreloadedValue(*MF, SIRegisterInfo::TIDIG_X);
- unsigned TIDIGYReg = TRI->getPreloadedValue(*MF, SIRegisterInfo::TIDIG_Y);
- unsigned TIDIGZReg = TRI->getPreloadedValue(*MF, SIRegisterInfo::TIDIG_Z);
+ unsigned TIDIGXReg
+ = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_X);
+ unsigned TIDIGYReg
+ = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_Y);
+ unsigned TIDIGZReg
+ = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_Z);
unsigned InputPtrReg =
- TRI->getPreloadedValue(*MF, SIRegisterInfo::INPUT_PTR);
+ TRI->getPreloadedValue(*MF, SIRegisterInfo::KERNARG_SEGMENT_PTR);
for (unsigned Reg : {TIDIGXReg, TIDIGYReg, TIDIGZReg}) {
if (!Entry.isLiveIn(Reg))
Entry.addLiveIn(Reg);
}
RS->enterBasicBlock(&Entry);
+ // FIXME: Can we scavenge an SReg_64 and access the subregs?
unsigned STmp0 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
unsigned STmp1 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp0)
@@ -667,8 +773,8 @@ unsigned SIInstrInfo::calculateLDSSpillAddress(MachineBasicBlock &MBB,
return TmpReg;
}
-void SIInstrInfo::insertNOPs(MachineBasicBlock::iterator MI,
- int Count) const {
+void SIInstrInfo::insertWaitStates(MachineBasicBlock::iterator MI,
+ int Count) const {
while (Count > 0) {
int Arg;
if (Count >= 8)
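
The hunk stops inside the loop body; a minimal sketch of how the loop presumably finishes, assuming s_nop takes a 3-bit immediate N meaning N+1 wait states (so one instruction covers at most 8):

    while (Count > 0) {
      int Arg;
      if (Count >= 8)
        Arg = 7;             // one s_nop 7 covers 8 wait states
      else
        Arg = Count - 1;     // s_nop N waits N + 1 cycles
      Count -= Arg + 1;
      BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), get(AMDGPU::S_NOP))
        .addImm(Arg);
    }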
@@ -687,26 +793,6 @@ bool SIInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
switch (MI->getOpcode()) {
default: return AMDGPUInstrInfo::expandPostRAPseudo(MI);
- case AMDGPU::SI_CONSTDATA_PTR: {
- unsigned Reg = MI->getOperand(0).getReg();
- unsigned RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
- unsigned RegHi = RI.getSubReg(Reg, AMDGPU::sub1);
-
- BuildMI(MBB, MI, DL, get(AMDGPU::S_GETPC_B64), Reg);
-
- // Add 32-bit offset from this instruction to the start of the constant data.
- BuildMI(MBB, MI, DL, get(AMDGPU::S_ADD_U32), RegLo)
- .addReg(RegLo)
- .addTargetIndex(AMDGPU::TI_CONSTDATA_START)
- .addReg(AMDGPU::SCC, RegState::Define | RegState::Implicit);
- BuildMI(MBB, MI, DL, get(AMDGPU::S_ADDC_U32), RegHi)
- .addReg(RegHi)
- .addImm(0)
- .addReg(AMDGPU::SCC, RegState::Define | RegState::Implicit)
- .addReg(AMDGPU::SCC, RegState::Implicit);
- MI->eraseFromParent();
- break;
- }
case AMDGPU::SGPR_USE:
// This is just a placeholder for register allocation.
MI->eraseFromParent();
@@ -760,49 +846,90 @@ bool SIInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
MI->eraseFromParent();
break;
}
+
+ case AMDGPU::SI_CONSTDATA_PTR: {
+ const SIRegisterInfo *TRI =
+ static_cast<const SIRegisterInfo *>(ST.getRegisterInfo());
+ MachineFunction &MF = *MBB.getParent();
+ unsigned Reg = MI->getOperand(0).getReg();
+ unsigned RegLo = TRI->getSubReg(Reg, AMDGPU::sub0);
+ unsigned RegHi = TRI->getSubReg(Reg, AMDGPU::sub1);
+
+ // Create a bundle so these instructions won't be re-ordered by the
+ // post-RA scheduler.
+ MIBundleBuilder Bundler(MBB, MI);
+ Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
+
+ // Add 32-bit offset from this instruction to the start of the
+ // constant data.
+ Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo)
+ .addReg(RegLo)
+ .addOperand(MI->getOperand(1)));
+ Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
+ .addReg(RegHi)
+ .addImm(0));
+
+ llvm::finalizeBundle(MBB, Bundler.begin());
+
+ MI->eraseFromParent();
+ break;
+ }
}
return true;
}
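
Roughly the sequence the SI_CONSTDATA_PTR bundle expands to, with illustrative register numbers; the three instructions are kept together so the 32-bit offset is applied to the PC captured immediately before it:

    // s_getpc_b64 s[0:1]
    // s_add_u32   s0, s0, <offset to start of constant data>   ; low 32 bits
    // s_addc_u32  s1, s1, 0                                     ; carry into high half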
-MachineInstr *SIInstrInfo::commuteInstruction(MachineInstr *MI,
- bool NewMI) const {
-
- if (MI->getNumOperands() < 3)
- return nullptr;
-
+/// Commutes the operands in the given instruction.
+/// The commutable operands are specified by their indices OpIdx0 and OpIdx1.
+///
+/// Do not call this method for a non-commutable instruction or for a
+/// non-commutable pair of operand indices OpIdx0 and OpIdx1.
+/// Even though the instruction is commutable, the method may still
+/// fail to commute the operands; a null pointer is returned in such cases.
+MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr *MI,
+ bool NewMI,
+ unsigned OpIdx0,
+ unsigned OpIdx1) const {
int CommutedOpcode = commuteOpcode(*MI);
if (CommutedOpcode == -1)
return nullptr;
int Src0Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
AMDGPU::OpName::src0);
- assert(Src0Idx != -1 && "Should always have src0 operand");
-
MachineOperand &Src0 = MI->getOperand(Src0Idx);
if (!Src0.isReg())
return nullptr;
int Src1Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
AMDGPU::OpName::src1);
- if (Src1Idx == -1)
+
+ if ((OpIdx0 != static_cast<unsigned>(Src0Idx) ||
+ OpIdx1 != static_cast<unsigned>(Src1Idx)) &&
+ (OpIdx0 != static_cast<unsigned>(Src1Idx) ||
+ OpIdx1 != static_cast<unsigned>(Src0Idx)))
return nullptr;
MachineOperand &Src1 = MI->getOperand(Src1Idx);
- // Make sure it's legal to commute operands for VOP2.
- if (isVOP2(MI->getOpcode()) &&
- (!isOperandLegal(MI, Src0Idx, &Src1) ||
- !isOperandLegal(MI, Src1Idx, &Src0))) {
- return nullptr;
+
+ if (isVOP2(*MI)) {
+ const MCInstrDesc &InstrDesc = MI->getDesc();
+ // For VOP2 instructions, any operand type is valid to use for src0. Make
+ // sure we can use src1 as src0.
+ //
+ // We could be stricter here and only allow commuting if there is a reason
+ // to do so; e.g. if both operands are VGPRs there is no real benefit,
+ // although MachineCSE attempts to find matches by commuting.
+ const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
+ if (!isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src0))
+ return nullptr;
}
if (!Src1.isReg()) {
// Allow commuting instructions with Imm operands.
if (NewMI || !Src1.isImm() ||
- (!isVOP2(MI->getOpcode()) && !isVOP3(MI->getOpcode()))) {
+ (!isVOP2(*MI) && !isVOP3(*MI))) {
return nullptr;
}
-
// Be sure to copy the source modifiers to the right place.
if (MachineOperand *Src0Mods
= getNamedOperand(*MI, AMDGPU::OpName::src0_modifiers)) {
@@ -832,7 +959,7 @@ MachineInstr *SIInstrInfo::commuteInstruction(MachineInstr *MI,
Src1.ChangeToRegister(Reg, false);
Src1.setSubReg(SubReg);
} else {
- MI = TargetInstrInfo::commuteInstruction(MI, NewMI);
+ MI = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx0, OpIdx1);
}
if (MI)
@@ -845,8 +972,8 @@ MachineInstr *SIInstrInfo::commuteInstruction(MachineInstr *MI,
// between the true commutable operands, and the base
// TargetInstrInfo::commuteInstruction uses it.
bool SIInstrInfo::findCommutedOpIndices(MachineInstr *MI,
- unsigned &SrcOpIdx1,
- unsigned &SrcOpIdx2) const {
+ unsigned &SrcOpIdx0,
+ unsigned &SrcOpIdx1) const {
const MCInstrDesc &MCID = MI->getDesc();
if (!MCID.isCommutable())
return false;
@@ -857,7 +984,8 @@ bool SIInstrInfo::findCommutedOpIndices(MachineInstr *MI,
return false;
// FIXME: Workaround TargetInstrInfo::commuteInstruction asserting on
- // immediate.
+ // immediate. Also, an immediate src0 operand is not handled in
+ // SIInstrInfo::commuteInstruction().
if (!MI->getOperand(Src0Idx).isReg())
return false;
@@ -865,18 +993,22 @@ bool SIInstrInfo::findCommutedOpIndices(MachineInstr *MI,
if (Src1Idx == -1)
return false;
- if (!MI->getOperand(Src1Idx).isReg())
- return false;
-
- // If any source modifiers are set, the generic instruction commuting won't
- // understand how to copy the source modifiers.
- if (hasModifiersSet(*MI, AMDGPU::OpName::src0_modifiers) ||
- hasModifiersSet(*MI, AMDGPU::OpName::src1_modifiers))
+ MachineOperand &Src1 = MI->getOperand(Src1Idx);
+ if (Src1.isImm()) {
+ // SIInstrInfo::commuteInstruction() does support commuting the immediate
+ // operand src1 in 2 and 3 operand instructions.
+ if (!isVOP2(MI->getOpcode()) && !isVOP3(MI->getOpcode()))
+ return false;
+ } else if (Src1.isReg()) {
+ // If any source modifiers are set, the generic instruction commuting won't
+ // understand how to copy the source modifiers.
+ if (hasModifiersSet(*MI, AMDGPU::OpName::src0_modifiers) ||
+ hasModifiersSet(*MI, AMDGPU::OpName::src1_modifiers))
+ return false;
+ } else
return false;
- SrcOpIdx1 = Src0Idx;
- SrcOpIdx2 = Src1Idx;
- return true;
+ return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
}
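
For context, a minimal sketch of how callers are expected to drive these two hooks, assuming the TargetInstrInfo conventions this override plugs into (CommuteAnyOperandIndex sentinels filled in by fixCommutedOpIndices); the variable names are illustrative:

    unsigned Idx0 = TargetInstrInfo::CommuteAnyOperandIndex;
    unsigned Idx1 = TargetInstrInfo::CommuteAnyOperandIndex;
    if (TII->findCommutedOpIndices(MI, Idx0, Idx1)) {
      // Either commutes MI in place (NewMI == false) or creates a new
      // instruction; returns nullptr if the commute could not be performed.
      MachineInstr *Commuted = TII->commuteInstruction(MI, /*NewMI=*/false,
                                                       Idx0, Idx1);
      (void)Commuted;
    }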
MachineInstr *SIInstrInfo::buildMovInstr(MachineBasicBlock *MBB,
@@ -898,11 +1030,6 @@ bool SIInstrInfo::isMov(unsigned Opcode) const {
}
}
-bool
-SIInstrInfo::isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const {
- return RC != &AMDGPU::EXECRegRegClass;
-}
-
static void removeModOperands(MachineInstr &MI) {
unsigned Opc = MI.getOpcode();
int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc,
@@ -984,9 +1111,6 @@ bool SIInstrInfo::FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI,
AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
}
- UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc,
- AMDGPU::OpName::src2));
- // ChangingToImmediate adds Src2 back to the instruction.
Src2->ChangeToImmediate(Imm);
removeModOperands(*UseMI);
@@ -1045,18 +1169,6 @@ bool SIInstrInfo::FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI,
return false;
}
-bool
-SIInstrInfo::isTriviallyReMaterializable(const MachineInstr *MI,
- AliasAnalysis *AA) const {
- switch(MI->getOpcode()) {
- default: return AMDGPUInstrInfo::isTriviallyReMaterializable(MI, AA);
- case AMDGPU::S_MOV_B32:
- case AMDGPU::S_MOV_B64:
- case AMDGPU::V_MOV_B32_e32:
- return MI->getOperand(1).isImm();
- }
-}
-
static bool offsetsDoNotOverlap(int WidthA, int OffsetA,
int WidthB, int OffsetB) {
int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
@@ -1088,9 +1200,6 @@ bool SIInstrInfo::checkInstOffsetsDoNotOverlap(MachineInstr *MIa,
bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr *MIa,
MachineInstr *MIb,
AliasAnalysis *AA) const {
- unsigned Opc0 = MIa->getOpcode();
- unsigned Opc1 = MIb->getOpcode();
-
assert(MIa && (MIa->mayLoad() || MIa->mayStore()) &&
"MIa must load from or modify a memory location");
assert(MIb && (MIb->mayLoad() || MIb->mayStore()) &&
@@ -1105,32 +1214,32 @@ bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr *MIa,
// TODO: Should we check the address space from the MachineMemOperand? That
// would allow us to distinguish objects we know don't alias based on the
- // underlying addres space, even if it was lowered to a different one,
+ // underlying address space, even if it was lowered to a different one,
// e.g. private accesses lowered to use MUBUF instructions on a scratch
// buffer.
- if (isDS(Opc0)) {
- if (isDS(Opc1))
+ if (isDS(*MIa)) {
+ if (isDS(*MIb))
return checkInstOffsetsDoNotOverlap(MIa, MIb);
- return !isFLAT(Opc1);
+ return !isFLAT(*MIb);
}
- if (isMUBUF(Opc0) || isMTBUF(Opc0)) {
- if (isMUBUF(Opc1) || isMTBUF(Opc1))
+ if (isMUBUF(*MIa) || isMTBUF(*MIa)) {
+ if (isMUBUF(*MIb) || isMTBUF(*MIb))
return checkInstOffsetsDoNotOverlap(MIa, MIb);
- return !isFLAT(Opc1) && !isSMRD(Opc1);
+ return !isFLAT(*MIb) && !isSMRD(*MIb);
}
- if (isSMRD(Opc0)) {
- if (isSMRD(Opc1))
+ if (isSMRD(*MIa)) {
+ if (isSMRD(*MIb))
return checkInstOffsetsDoNotOverlap(MIa, MIb);
- return !isFLAT(Opc1) && !isMUBUF(Opc0) && !isMTBUF(Opc0);
+ return !isFLAT(*MIb) && !isMUBUF(*MIa) && !isMTBUF(*MIa);
}
- if (isFLAT(Opc0)) {
- if (isFLAT(Opc1))
+ if (isFLAT(*MIa)) {
+ if (isFLAT(*MIb))
return checkInstOffsetsDoNotOverlap(MIa, MIb);
return false;
@@ -1319,6 +1428,26 @@ bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
return false;
}
+static unsigned findImplicitSGPRRead(const MachineInstr &MI) {
+ for (const MachineOperand &MO : MI.implicit_operands()) {
+ // We only care about reads.
+ if (MO.isDef())
+ continue;
+
+ switch (MO.getReg()) {
+ case AMDGPU::VCC:
+ case AMDGPU::M0:
+ case AMDGPU::FLAT_SCR:
+ return MO.getReg();
+
+ default:
+ break;
+ }
+ }
+
+ return AMDGPU::NoRegister;
+}
+
bool SIInstrInfo::verifyInstruction(const MachineInstr *MI,
StringRef &ErrInfo) const {
uint16_t Opcode = MI->getOpcode();
@@ -1335,7 +1464,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI,
return false;
}
- // Make sure the register classes are correct
+ // Make sure the register classes are correct.
for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
if (MI->getOperand(i).isFPImm()) {
ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
@@ -1392,14 +1521,17 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI,
// Verify VOP*
- if (isVOP1(Opcode) || isVOP2(Opcode) || isVOP3(Opcode) || isVOPC(Opcode)) {
+ if (isVOP1(*MI) || isVOP2(*MI) || isVOP3(*MI) || isVOPC(*MI)) {
// Only look at the true operands. Only a real operand can use the constant
// bus, and we don't want to check pseudo-operands like the source modifier
// flags.
const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx };
unsigned ConstantBusCount = 0;
- unsigned SGPRUsed = AMDGPU::NoRegister;
+ unsigned SGPRUsed = findImplicitSGPRRead(*MI);
+ if (SGPRUsed != AMDGPU::NoRegister)
+ ++ConstantBusCount;
+
for (int OpIdx : OpIndices) {
if (OpIdx == -1)
break;
@@ -1435,6 +1567,16 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI,
}
}
+ // Make sure we aren't losing exec uses in the td files. This mostly requires
+ // being careful when using let Uses to try to add other use registers.
+ if (!isGenericOpcode(Opcode) && !isSALU(Opcode) && !isSMRD(Opcode)) {
+ const MachineOperand *Exec = MI->findRegisterUseOperand(AMDGPU::EXEC);
+ if (!Exec || !Exec->isImplicit()) {
+ ErrInfo = "VALU instruction does not implicitly read exec mask";
+ return false;
+ }
+ }
+
return true;
}
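
An illustration of the constant bus rule being verified, assuming the usual GCN restriction that a single VALU instruction may read at most one SGPR (or one literal) over the constant bus, where implicit reads found by findImplicitSGPRRead count as well:

    // v_add_f32_e32 v0, s1, v2           ; OK  - one SGPR read over the constant bus
    // v_add_f32_e32 v0, s1, s2           ; BAD - two distinct SGPRs need two reads
    // v_cndmask_b32_e32 v0, s1, v2, vcc  ; BAD - the implicit VCC read already
    //                                    ;       occupies the bus, so s1 makes two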
@@ -1483,11 +1625,17 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) {
case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e32;
case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e32;
case AMDGPU::S_LOAD_DWORD_IMM:
- case AMDGPU::S_LOAD_DWORD_SGPR: return AMDGPU::BUFFER_LOAD_DWORD_ADDR64;
+ case AMDGPU::S_LOAD_DWORD_SGPR:
+ case AMDGPU::S_LOAD_DWORD_IMM_ci:
+ return AMDGPU::BUFFER_LOAD_DWORD_ADDR64;
case AMDGPU::S_LOAD_DWORDX2_IMM:
- case AMDGPU::S_LOAD_DWORDX2_SGPR: return AMDGPU::BUFFER_LOAD_DWORDX2_ADDR64;
+ case AMDGPU::S_LOAD_DWORDX2_SGPR:
+ case AMDGPU::S_LOAD_DWORDX2_IMM_ci:
+ return AMDGPU::BUFFER_LOAD_DWORDX2_ADDR64;
case AMDGPU::S_LOAD_DWORDX4_IMM:
- case AMDGPU::S_LOAD_DWORDX4_SGPR: return AMDGPU::BUFFER_LOAD_DWORDX4_ADDR64;
+ case AMDGPU::S_LOAD_DWORDX4_SGPR:
+ case AMDGPU::S_LOAD_DWORDX4_IMM_ci:
+ return AMDGPU::BUFFER_LOAD_DWORDX4_ADDR64;
case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
@@ -1562,17 +1710,21 @@ unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI,
unsigned SubIdx,
const TargetRegisterClass *SubRC)
const {
- assert(SuperReg.isReg());
-
- unsigned NewSuperReg = MRI.createVirtualRegister(SuperRC);
+ MachineBasicBlock *MBB = MI->getParent();
+ DebugLoc DL = MI->getDebugLoc();
unsigned SubReg = MRI.createVirtualRegister(SubRC);
+ if (SuperReg.getSubReg() == AMDGPU::NoSubRegister) {
+ BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
+ .addReg(SuperReg.getReg(), 0, SubIdx);
+ return SubReg;
+ }
+
// Just in case the super register is itself a sub-register, copy it to a new
// value so we don't need to worry about merging its subreg index with the
// SubIdx passed to this function. The register coalescer should be able to
// eliminate this extra copy.
- MachineBasicBlock *MBB = MI->getParent();
- DebugLoc DL = MI->getDebugLoc();
+ unsigned NewSuperReg = MRI.createVirtualRegister(SuperRC);
BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), NewSuperReg)
.addReg(SuperReg.getReg(), 0, SuperReg.getSubReg());
@@ -1605,36 +1757,6 @@ MachineOperand SIInstrInfo::buildExtractSubRegOrImm(
return MachineOperand::CreateReg(SubReg, false);
}
-unsigned SIInstrInfo::split64BitImm(SmallVectorImpl<MachineInstr *> &Worklist,
- MachineBasicBlock::iterator MI,
- MachineRegisterInfo &MRI,
- const TargetRegisterClass *RC,
- const MachineOperand &Op) const {
- MachineBasicBlock *MBB = MI->getParent();
- DebugLoc DL = MI->getDebugLoc();
- unsigned LoDst = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
- unsigned HiDst = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
- unsigned Dst = MRI.createVirtualRegister(RC);
-
- MachineInstr *Lo = BuildMI(*MBB, MI, DL, get(AMDGPU::S_MOV_B32),
- LoDst)
- .addImm(Op.getImm() & 0xFFFFFFFF);
- MachineInstr *Hi = BuildMI(*MBB, MI, DL, get(AMDGPU::S_MOV_B32),
- HiDst)
- .addImm(Op.getImm() >> 32);
-
- BuildMI(*MBB, MI, DL, get(TargetOpcode::REG_SEQUENCE), Dst)
- .addReg(LoDst)
- .addImm(AMDGPU::sub0)
- .addReg(HiDst)
- .addImm(AMDGPU::sub1);
-
- Worklist.push_back(Lo);
- Worklist.push_back(Hi);
-
- return Dst;
-}
-
// Change the order of operands from (0, 1, 2) to (0, 2, 1)
void SIInstrInfo::swapOperands(MachineBasicBlock::iterator Inst) const {
assert(Inst->getNumExplicitOperands() == 3);
@@ -1643,6 +1765,41 @@ void SIInstrInfo::swapOperands(MachineBasicBlock::iterator Inst) const {
Inst->addOperand(Op1);
}
+bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI,
+ const MCOperandInfo &OpInfo,
+ const MachineOperand &MO) const {
+ if (!MO.isReg())
+ return false;
+
+ unsigned Reg = MO.getReg();
+ const TargetRegisterClass *RC =
+ TargetRegisterInfo::isVirtualRegister(Reg) ?
+ MRI.getRegClass(Reg) :
+ RI.getPhysRegClass(Reg);
+
+ // In order to be legal, the common sub-class must be equal to the
+ // class of the current operand. For example:
+ //
+ // v_mov_b32 s0 ; Operand defined as vsrc_32
+ // ; RI.getCommonSubClass(s0,vsrc_32) = sgpr ; LEGAL
+ //
+ // s_sendmsg 0, s0 ; Operand defined as m0reg
+ // ; RI.getCommonSubClass(s0,m0reg) = m0reg ; NOT LEGAL
+
+ return RI.getCommonSubClass(RC, RI.getRegClass(OpInfo.RegClass)) == RC;
+}
+
+bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI,
+ const MCOperandInfo &OpInfo,
+ const MachineOperand &MO) const {
+ if (MO.isReg())
+ return isLegalRegOperand(MRI, OpInfo, MO);
+
+ // Handle non-register types that are treated like immediates.
+ assert(MO.isImm() || MO.isTargetIndex() || MO.isFI());
+ return true;
+}
+
bool SIInstrInfo::isOperandLegal(const MachineInstr *MI, unsigned OpIdx,
const MachineOperand *MO) const {
const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
@@ -1653,7 +1810,7 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr *MI, unsigned OpIdx,
if (!MO)
MO = &MI->getOperand(OpIdx);
- if (isVALU(InstDesc.Opcode) &&
+ if (isVALU(*MI) &&
usesConstantBus(MRI, *MO, DefinedRC->getSize())) {
unsigned SGPRUsed =
MO->isReg() ? MO->getReg() : (unsigned)AMDGPU::NoRegister;
@@ -1670,21 +1827,7 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr *MI, unsigned OpIdx,
if (MO->isReg()) {
assert(DefinedRC);
- const TargetRegisterClass *RC =
- TargetRegisterInfo::isVirtualRegister(MO->getReg()) ?
- MRI.getRegClass(MO->getReg()) :
- RI.getPhysRegClass(MO->getReg());
-
- // In order to be legal, the common sub-class must be equal to the
- // class of the current operand. For example:
- //
- // v_mov_b32 s0 ; Operand defined as vsrc_32
- // ; RI.getCommonSubClass(s0,vsrc_32) = sgpr ; LEGAL
- //
- // s_sendmsg 0, s0 ; Operand defined as m0reg
- // ; RI.getCommonSubClass(s0,m0reg) = m0reg ; NOT LEGAL
-
- return RI.getCommonSubClass(RC, RI.getRegClass(OpInfo.RegClass)) == RC;
+ return isLegalRegOperand(MRI, OpInfo, *MO);
}
@@ -1699,81 +1842,143 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr *MI, unsigned OpIdx,
return isImmOperandLegal(MI, OpIdx, *MO);
}
-void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
- MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
+void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
+ MachineInstr *MI) const {
+ unsigned Opc = MI->getOpcode();
+ const MCInstrDesc &InstrDesc = get(Opc);
- int Src0Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
- AMDGPU::OpName::src0);
- int Src1Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
- AMDGPU::OpName::src1);
- int Src2Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
- AMDGPU::OpName::src2);
+ int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
+ MachineOperand &Src1 = MI->getOperand(Src1Idx);
- // Legalize VOP2
- if (isVOP2(MI->getOpcode()) && Src1Idx != -1) {
- // Legalize src0
- if (!isOperandLegal(MI, Src0Idx))
+ // If there is an implicit SGPR use such as the VCC use for
+ // v_addc_u32/v_subb_u32, it already consumes the single constant bus use we
+ // are allowed.
+ //
+ // Note we do not need to worry about literal constants here. They are
+ // disabled in the operand type for these instructions because they will
+ // always violate the one constant bus use rule.
+ bool HasImplicitSGPR = findImplicitSGPRRead(*MI) != AMDGPU::NoRegister;
+ if (HasImplicitSGPR) {
+ int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
+ MachineOperand &Src0 = MI->getOperand(Src0Idx);
+
+ if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg()))
legalizeOpWithMove(MI, Src0Idx);
+ }
- // Legalize src1
- if (isOperandLegal(MI, Src1Idx))
- return;
+ // The src0 operand of VOP2 instructions accepts all operand types, so we
+ // don't need to check its legality. If src1 is already legal, we don't need
+ // to do anything.
+ if (isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src1))
+ return;
- // Usually src0 of VOP2 instructions allow more types of inputs
- // than src1, so try to commute the instruction to decrease our
- // chances of having to insert a MOV instruction to legalize src1.
- if (MI->isCommutable()) {
- if (commuteInstruction(MI))
- // If we are successful in commuting, then we know MI is legal, so
- // we are done.
- return;
- }
+ // We do not use commuteInstruction here because it is too aggressive and will
+ // commute whenever possible. We only want to commute here if it improves
+ // legality. This can be called a fairly large number of times, so don't waste
+ // compile time pointlessly swapping and checking legality again.
+ if (HasImplicitSGPR || !MI->isCommutable()) {
+ legalizeOpWithMove(MI, Src1Idx);
+ return;
+ }
+
+ int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
+ MachineOperand &Src0 = MI->getOperand(Src0Idx);
+ // If src0 can be used as src1, commuting will make the operands legal.
+ // Otherwise we have to give up and insert a move.
+ //
+ // TODO: Other immediate-like operand kinds could be commuted if there was a
+ // MachineOperand::ChangeTo* for them.
+ if ((!Src1.isImm() && !Src1.isReg()) ||
+ !isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src0)) {
legalizeOpWithMove(MI, Src1Idx);
return;
}
- // XXX - Do any VOP3 instructions read VCC?
- // Legalize VOP3
- if (isVOP3(MI->getOpcode())) {
- int VOP3Idx[3] = { Src0Idx, Src1Idx, Src2Idx };
+ int CommutedOpc = commuteOpcode(*MI);
+ if (CommutedOpc == -1) {
+ legalizeOpWithMove(MI, Src1Idx);
+ return;
+ }
- // Find the one SGPR operand we are allowed to use.
- unsigned SGPRReg = findUsedSGPR(MI, VOP3Idx);
+ MI->setDesc(get(CommutedOpc));
- for (unsigned i = 0; i < 3; ++i) {
- int Idx = VOP3Idx[i];
- if (Idx == -1)
- break;
- MachineOperand &MO = MI->getOperand(Idx);
+ unsigned Src0Reg = Src0.getReg();
+ unsigned Src0SubReg = Src0.getSubReg();
+ bool Src0Kill = Src0.isKill();
- if (MO.isReg()) {
- if (!RI.isSGPRClass(MRI.getRegClass(MO.getReg())))
- continue; // VGPRs are legal
+ if (Src1.isImm())
+ Src0.ChangeToImmediate(Src1.getImm());
+ else if (Src1.isReg()) {
+ Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill());
+ Src0.setSubReg(Src1.getSubReg());
+ } else
+ llvm_unreachable("Should only have register or immediate operands");
- assert(MO.getReg() != AMDGPU::SCC && "SCC operand to VOP3 instruction");
+ Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill);
+ Src1.setSubReg(Src0SubReg);
+}
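
A small before/after illustration of what legalizeOperandsVOP2 achieves for a commutable opcode, assuming the usual VOP2 constraint that src1 must be a VGPR while src0 also accepts SGPRs and inline constants:

    // Before: v_mul_f32_e32 v0, v1, s2   ; s2 in src1 is not encodable
    // After:  v_mul_f32_e32 v0, s2, v1   ; commuted in place, no extra v_mov
    //
    // If the commute is not possible, legalizeOpWithMove falls back to:
    //   v_mov_b32_e32 v3, s2
    //   v_mul_f32_e32 v0, v1, v3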
- if (SGPRReg == AMDGPU::NoRegister || SGPRReg == MO.getReg()) {
- SGPRReg = MO.getReg();
- // We can use one SGPR in each VOP3 instruction.
- continue;
- }
- } else if (!isLiteralConstant(MO, getOpSize(MI->getOpcode(), Idx))) {
- // If it is not a register and not a literal constant, then it must be
- // an inline constant which is always legal.
- continue;
- }
- // If we make it this far, then the operand is not legal and we must
- // legalize it.
- legalizeOpWithMove(MI, Idx);
+// Legalize VOP3 operands. Because all operand types are supported for any
+// operand, and since literal constants are not allowed and should never be
+// seen, we only need to worry about inserting copies if we use multiple SGPR
+// operands.
+void SIInstrInfo::legalizeOperandsVOP3(
+ MachineRegisterInfo &MRI,
+ MachineInstr *MI) const {
+ unsigned Opc = MI->getOpcode();
+
+ int VOP3Idx[3] = {
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1),
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)
+ };
+
+ // Find the one SGPR operand we are allowed to use.
+ unsigned SGPRReg = findUsedSGPR(MI, VOP3Idx);
+
+ for (unsigned i = 0; i < 3; ++i) {
+ int Idx = VOP3Idx[i];
+ if (Idx == -1)
+ break;
+ MachineOperand &MO = MI->getOperand(Idx);
+
+ // We should never see a VOP3 instruction with an illegal immediate operand.
+ if (!MO.isReg())
+ continue;
+
+ if (!RI.isSGPRClass(MRI.getRegClass(MO.getReg())))
+ continue; // VGPRs are legal
+
+ if (SGPRReg == AMDGPU::NoRegister || SGPRReg == MO.getReg()) {
+ SGPRReg = MO.getReg();
+ // We can use one SGPR in each VOP3 instruction.
+ continue;
}
+
+ // If we make it this far, then the operand is not legal and we must
+ // legalize it.
+ legalizeOpWithMove(MI, Idx);
+ }
+}
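
An illustration of the single-SGPR constraint this loop enforces, matching the example given later next to findUsedSGPR (register numbers are only illustrative):

    // v_fma_f32 v0, s0, s0, s0   ; legal   - only one distinct SGPR is read
    // v_fma_f32 v0, s0, s1, s0   ; illegal - s1 is a second SGPR, so it gets
    //                            ;           copied into a VGPR by legalizeOpWithMove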
+
+void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
+ MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
+
+ // Legalize VOP2
+ if (isVOP2(*MI)) {
+ legalizeOperandsVOP2(MRI, MI);
+ return;
+ }
+
+ // Legalize VOP3
+ if (isVOP3(*MI)) {
+ legalizeOperandsVOP3(MRI, MI);
+ return;
}
// Legalize REG_SEQUENCE and PHI
// The register class of the operands must be the same type as the register
// class of the output.
- if (MI->getOpcode() == AMDGPU::REG_SEQUENCE ||
- MI->getOpcode() == AMDGPU::PHI) {
+ if (MI->getOpcode() == AMDGPU::PHI) {
const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;
for (unsigned i = 1, e = MI->getNumOperands(); i != e; i+=2) {
if (!MI->getOperand(i).isReg() ||
@@ -1802,26 +2007,53 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
}
// Update all the operands so they have the same type.
- for (unsigned i = 1, e = MI->getNumOperands(); i != e; i+=2) {
- if (!MI->getOperand(i).isReg() ||
- !TargetRegisterInfo::isVirtualRegister(MI->getOperand(i).getReg()))
+ for (unsigned I = 1, E = MI->getNumOperands(); I != E; I += 2) {
+ MachineOperand &Op = MI->getOperand(I);
+ if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg()))
continue;
unsigned DstReg = MRI.createVirtualRegister(RC);
- MachineBasicBlock *InsertBB;
- MachineBasicBlock::iterator Insert;
- if (MI->getOpcode() == AMDGPU::REG_SEQUENCE) {
- InsertBB = MI->getParent();
- Insert = MI;
- } else {
- // MI is a PHI instruction.
- InsertBB = MI->getOperand(i + 1).getMBB();
- Insert = InsertBB->getFirstTerminator();
+
+ // MI is a PHI instruction.
+ MachineBasicBlock *InsertBB = MI->getOperand(I + 1).getMBB();
+ MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator();
+
+ BuildMI(*InsertBB, Insert, MI->getDebugLoc(), get(AMDGPU::COPY), DstReg)
+ .addOperand(Op);
+ Op.setReg(DstReg);
+ }
+ }
+
+ // REG_SEQUENCE doesn't really require operand legalization, but if one has a
+ // VGPR dest type and SGPR sources, insert copies so all operands are
+ // VGPRs. This seems to help operand folding / the register coalescer.
+ if (MI->getOpcode() == AMDGPU::REG_SEQUENCE) {
+ MachineBasicBlock *MBB = MI->getParent();
+ const TargetRegisterClass *DstRC = getOpRegClass(*MI, 0);
+ if (RI.hasVGPRs(DstRC)) {
+ // Update all the operands so they are VGPR register classes. These may
+ // not be the same register class because REG_SEQUENCE supports mixing
+ // subregister index types, e.g. sub0_sub1 + sub2 + sub3.
+ for (unsigned I = 1, E = MI->getNumOperands(); I != E; I += 2) {
+ MachineOperand &Op = MI->getOperand(I);
+ if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg()))
+ continue;
+
+ const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg());
+ const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC);
+ if (VRC == OpRC)
+ continue;
+
+ unsigned DstReg = MRI.createVirtualRegister(VRC);
+
+ BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::COPY), DstReg)
+ .addOperand(Op);
+
+ Op.setReg(DstReg);
+ Op.setIsKill();
}
- BuildMI(*InsertBB, Insert, MI->getDebugLoc(),
- get(AMDGPU::COPY), DstReg)
- .addOperand(MI->getOperand(i));
- MI->getOperand(i).setReg(DstReg);
}
+
+ return;
}
// Legalize INSERT_SUBREG
@@ -1858,15 +2090,10 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
}
MachineBasicBlock &MBB = *MI->getParent();
- // Extract the ptr from the resource descriptor.
-
- // SRsrcPtrLo = srsrc:sub0
- unsigned SRsrcPtrLo = buildExtractSubReg(MI, MRI, *SRsrc,
- &AMDGPU::VReg_128RegClass, AMDGPU::sub0, &AMDGPU::VGPR_32RegClass);
- // SRsrcPtrHi = srsrc:sub1
- unsigned SRsrcPtrHi = buildExtractSubReg(MI, MRI, *SRsrc,
- &AMDGPU::VReg_128RegClass, AMDGPU::sub1, &AMDGPU::VGPR_32RegClass);
+ // Extract the ptr from the resource descriptor.
+ unsigned SRsrcPtr = buildExtractSubReg(MI, MRI, *SRsrc,
+ &AMDGPU::VReg_128RegClass, AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);
// Create an empty resource descriptor
unsigned Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
@@ -1891,80 +2118,112 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
.addImm(RsrcDataFormat >> 32);
// NewSRsrc = {Zero64, SRsrcFormat}
- BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
- NewSRsrc)
- .addReg(Zero64)
- .addImm(AMDGPU::sub0_sub1)
- .addReg(SRsrcFormatLo)
- .addImm(AMDGPU::sub2)
- .addReg(SRsrcFormatHi)
- .addImm(AMDGPU::sub3);
+ BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewSRsrc)
+ .addReg(Zero64)
+ .addImm(AMDGPU::sub0_sub1)
+ .addReg(SRsrcFormatLo)
+ .addImm(AMDGPU::sub2)
+ .addReg(SRsrcFormatHi)
+ .addImm(AMDGPU::sub3);
MachineOperand *VAddr = getNamedOperand(*MI, AMDGPU::OpName::vaddr);
unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
- unsigned NewVAddrLo;
- unsigned NewVAddrHi;
if (VAddr) {
// This is already an ADDR64 instruction so we need to add the pointer
// extracted from the resource descriptor to the current value of VAddr.
- NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
-
- // NewVaddrLo = SRsrcPtrLo + VAddr:sub0
- BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::V_ADD_I32_e32),
- NewVAddrLo)
- .addReg(SRsrcPtrLo)
- .addReg(VAddr->getReg(), 0, AMDGPU::sub0)
- .addReg(AMDGPU::VCC, RegState::ImplicitDefine);
-
- // NewVaddrHi = SRsrcPtrHi + VAddr:sub1
- BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::V_ADDC_U32_e32),
- NewVAddrHi)
- .addReg(SRsrcPtrHi)
- .addReg(VAddr->getReg(), 0, AMDGPU::sub1)
- .addReg(AMDGPU::VCC, RegState::ImplicitDefine)
- .addReg(AMDGPU::VCC, RegState::Implicit);
-
+ unsigned NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ unsigned NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+
+ // NewVaddrLo = SRsrcPtr:sub0 + VAddr:sub0
+ DebugLoc DL = MI->getDebugLoc();
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), NewVAddrLo)
+ .addReg(SRsrcPtr, 0, AMDGPU::sub0)
+ .addReg(VAddr->getReg(), 0, AMDGPU::sub0);
+
+ // NewVaddrHi = SRsrcPtr:sub1 + VAddr:sub1
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e32), NewVAddrHi)
+ .addReg(SRsrcPtr, 0, AMDGPU::sub1)
+ .addReg(VAddr->getReg(), 0, AMDGPU::sub1);
+
+ // NewVaddr = {NewVaddrHi, NewVaddrLo}
+ BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
+ .addReg(NewVAddrLo)
+ .addImm(AMDGPU::sub0)
+ .addReg(NewVAddrHi)
+ .addImm(AMDGPU::sub1);
} else {
// This instructions is the _OFFSET variant, so we need to convert it to
// ADDR64.
+ assert(MBB.getParent()->getSubtarget<AMDGPUSubtarget>().getGeneration()
+ < AMDGPUSubtarget::VOLCANIC_ISLANDS &&
+ "FIXME: Need to emit flat atomics here");
+
MachineOperand *VData = getNamedOperand(*MI, AMDGPU::OpName::vdata);
MachineOperand *Offset = getNamedOperand(*MI, AMDGPU::OpName::offset);
MachineOperand *SOffset = getNamedOperand(*MI, AMDGPU::OpName::soffset);
-
- // Create the new instruction.
unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI->getOpcode());
- MachineInstr *Addr64 =
- BuildMI(MBB, MI, MI->getDebugLoc(), get(Addr64Opcode))
- .addOperand(*VData)
- .addReg(AMDGPU::NoRegister) // Dummy value for vaddr.
- // This will be replaced later
- // with the new value of vaddr.
- .addOperand(*SRsrc)
- .addOperand(*SOffset)
- .addOperand(*Offset)
- .addImm(0) // glc
- .addImm(0) // slc
- .addImm(0); // tfe
+
+ // Atomics with return have an additional tied operand and are
+ // missing some of the special bits.
+ MachineOperand *VDataIn = getNamedOperand(*MI, AMDGPU::OpName::vdata_in);
+ MachineInstr *Addr64;
+
+ if (!VDataIn) {
+ // Regular buffer load / store.
+ MachineInstrBuilder MIB
+ = BuildMI(MBB, MI, MI->getDebugLoc(), get(Addr64Opcode))
+ .addOperand(*VData)
+ .addReg(AMDGPU::NoRegister) // Dummy value for vaddr.
+ // This will be replaced later
+ // with the new value of vaddr.
+ .addOperand(*SRsrc)
+ .addOperand(*SOffset)
+ .addOperand(*Offset);
+
+ // Atomics do not have this operand.
+ if (const MachineOperand *GLC
+ = getNamedOperand(*MI, AMDGPU::OpName::glc)) {
+ MIB.addImm(GLC->getImm());
+ }
+
+ MIB.addImm(getNamedImmOperand(*MI, AMDGPU::OpName::slc));
+
+ if (const MachineOperand *TFE
+ = getNamedOperand(*MI, AMDGPU::OpName::tfe)) {
+ MIB.addImm(TFE->getImm());
+ }
+
+ MIB.setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+ Addr64 = MIB;
+ } else {
+ // Atomics with return.
+ Addr64 = BuildMI(MBB, MI, MI->getDebugLoc(), get(Addr64Opcode))
+ .addOperand(*VData)
+ .addOperand(*VDataIn)
+ .addReg(AMDGPU::NoRegister) // Dummy value for vaddr.
+ // This will be replaced later
+ // with the new value of vaddr.
+ .addOperand(*SRsrc)
+ .addOperand(*SOffset)
+ .addOperand(*Offset)
+ .addImm(getNamedImmOperand(*MI, AMDGPU::OpName::slc))
+ .setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+ }
MI->removeFromParent();
MI = Addr64;
- NewVAddrLo = SRsrcPtrLo;
- NewVAddrHi = SRsrcPtrHi;
+ // NewVaddr = {NewVaddrHi, NewVaddrLo}
+ BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
+ .addReg(SRsrcPtr, 0, AMDGPU::sub0)
+ .addImm(AMDGPU::sub0)
+ .addReg(SRsrcPtr, 0, AMDGPU::sub1)
+ .addImm(AMDGPU::sub1);
+
VAddr = getNamedOperand(*MI, AMDGPU::OpName::vaddr);
SRsrc = getNamedOperand(*MI, AMDGPU::OpName::srsrc);
}
- // NewVaddr = {NewVaddrHi, NewVaddrLo}
- BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
- NewVAddr)
- .addReg(NewVAddrLo)
- .addImm(AMDGPU::sub0)
- .addReg(NewVAddrHi)
- .addImm(AMDGPU::sub1);
-
-
// Update the instruction to use NewVaddr
VAddr->setReg(NewVAddr);
// Update the instruction to use NewSRsrc
@@ -2028,53 +2287,64 @@ void SIInstrInfo::splitSMRD(MachineInstr *MI,
.addOperand(*SOff);
unsigned OffsetSGPR = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
BuildMI(*MBB, MI, DL, get(AMDGPU::S_ADD_I32), OffsetSGPR)
- .addOperand(*SOff)
- .addImm(HalfSize);
- Hi = BuildMI(*MBB, MI, DL, get(HalfSGPROp))
+ .addReg(SOff->getReg(), 0, SOff->getSubReg())
+ .addImm(HalfSize);
+ Hi = BuildMI(*MBB, MI, DL, get(HalfSGPROp), RegHi)
.addReg(SBase->getReg(), getKillRegState(IsKill),
SBase->getSubReg())
.addReg(OffsetSGPR);
}
unsigned SubLo, SubHi;
+ const TargetRegisterClass *NewDstRC;
switch (HalfSize) {
case 4:
SubLo = AMDGPU::sub0;
SubHi = AMDGPU::sub1;
+ NewDstRC = &AMDGPU::VReg_64RegClass;
break;
case 8:
SubLo = AMDGPU::sub0_sub1;
SubHi = AMDGPU::sub2_sub3;
+ NewDstRC = &AMDGPU::VReg_128RegClass;
break;
case 16:
SubLo = AMDGPU::sub0_sub1_sub2_sub3;
SubHi = AMDGPU::sub4_sub5_sub6_sub7;
+ NewDstRC = &AMDGPU::VReg_256RegClass;
break;
case 32:
SubLo = AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7;
SubHi = AMDGPU::sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15;
+ NewDstRC = &AMDGPU::VReg_512RegClass;
break;
default:
llvm_unreachable("Unhandled HalfSize");
}
- BuildMI(*MBB, MI, DL, get(AMDGPU::REG_SEQUENCE))
- .addOperand(MI->getOperand(0))
- .addReg(RegLo)
- .addImm(SubLo)
- .addReg(RegHi)
- .addImm(SubHi);
+ unsigned OldDst = MI->getOperand(0).getReg();
+ unsigned NewDst = MRI.createVirtualRegister(NewDstRC);
+
+ MRI.replaceRegWith(OldDst, NewDst);
+
+ BuildMI(*MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewDst)
+ .addReg(RegLo)
+ .addImm(SubLo)
+ .addReg(RegHi)
+ .addImm(SubHi);
}
-void SIInstrInfo::moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI) const {
+void SIInstrInfo::moveSMRDToVALU(MachineInstr *MI,
+ MachineRegisterInfo &MRI,
+ SmallVectorImpl<MachineInstr *> &Worklist) const {
MachineBasicBlock *MBB = MI->getParent();
- switch (MI->getOpcode()) {
- case AMDGPU::S_LOAD_DWORD_IMM:
- case AMDGPU::S_LOAD_DWORD_SGPR:
- case AMDGPU::S_LOAD_DWORDX2_IMM:
- case AMDGPU::S_LOAD_DWORDX2_SGPR:
- case AMDGPU::S_LOAD_DWORDX4_IMM:
- case AMDGPU::S_LOAD_DWORDX4_SGPR: {
+ int DstIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst);
+ assert(DstIdx != -1);
+ unsigned DstRCID = get(MI->getOpcode()).OpInfo[DstIdx].RegClass;
+ switch(RI.getRegClass(DstRCID)->getSize()) {
+ case 4:
+ case 8:
+ case 16: {
unsigned NewOpcode = getVALUOp(*MI);
unsigned RegOffset;
unsigned ImmOffset;
@@ -2118,53 +2388,55 @@ void SIInstrInfo::moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI) con
BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord3)
.addImm(RsrcDataFormat >> 32);
BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), SRsrc)
- .addReg(DWord0)
- .addImm(AMDGPU::sub0)
- .addReg(DWord1)
- .addImm(AMDGPU::sub1)
- .addReg(DWord2)
- .addImm(AMDGPU::sub2)
- .addReg(DWord3)
- .addImm(AMDGPU::sub3);
- MI->setDesc(get(NewOpcode));
- if (MI->getOperand(2).isReg()) {
- MI->getOperand(2).setReg(SRsrc);
- } else {
- MI->getOperand(2).ChangeToRegister(SRsrc, false);
- }
- MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(0));
- MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(ImmOffset));
- MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(0)); // glc
- MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(0)); // slc
- MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(0)); // tfe
-
- const TargetRegisterClass *NewDstRC =
- RI.getRegClass(get(NewOpcode).OpInfo[0].RegClass);
-
- unsigned DstReg = MI->getOperand(0).getReg();
+ .addReg(DWord0)
+ .addImm(AMDGPU::sub0)
+ .addReg(DWord1)
+ .addImm(AMDGPU::sub1)
+ .addReg(DWord2)
+ .addImm(AMDGPU::sub2)
+ .addReg(DWord3)
+ .addImm(AMDGPU::sub3);
+
+ const MCInstrDesc &NewInstDesc = get(NewOpcode);
+ const TargetRegisterClass *NewDstRC
+ = RI.getRegClass(NewInstDesc.OpInfo[0].RegClass);
unsigned NewDstReg = MRI.createVirtualRegister(NewDstRC);
+ unsigned DstReg = MI->getOperand(0).getReg();
MRI.replaceRegWith(DstReg, NewDstReg);
+
+ MachineInstr *NewInst =
+ BuildMI(*MBB, MI, MI->getDebugLoc(), NewInstDesc, NewDstReg)
+ .addOperand(MI->getOperand(1)) // sbase
+ .addReg(SRsrc)
+ .addImm(0)
+ .addImm(ImmOffset)
+ .addImm(0) // glc
+ .addImm(0) // slc
+ .addImm(0) // tfe
+ .setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+ MI->eraseFromParent();
+
+ legalizeOperands(NewInst);
+ addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
break;
}
- case AMDGPU::S_LOAD_DWORDX8_IMM:
- case AMDGPU::S_LOAD_DWORDX8_SGPR: {
+ case 32: {
MachineInstr *Lo, *Hi;
splitSMRD(MI, &AMDGPU::SReg_128RegClass, AMDGPU::S_LOAD_DWORDX4_IMM,
AMDGPU::S_LOAD_DWORDX4_SGPR, Lo, Hi);
MI->eraseFromParent();
- moveSMRDToVALU(Lo, MRI);
- moveSMRDToVALU(Hi, MRI);
+ moveSMRDToVALU(Lo, MRI, Worklist);
+ moveSMRDToVALU(Hi, MRI, Worklist);
break;
}
- case AMDGPU::S_LOAD_DWORDX16_IMM:
- case AMDGPU::S_LOAD_DWORDX16_SGPR: {
+ case 64: {
MachineInstr *Lo, *Hi;
splitSMRD(MI, &AMDGPU::SReg_256RegClass, AMDGPU::S_LOAD_DWORDX8_IMM,
AMDGPU::S_LOAD_DWORDX8_SGPR, Lo, Hi);
MI->eraseFromParent();
- moveSMRDToVALU(Lo, MRI);
- moveSMRDToVALU(Hi, MRI);
+ moveSMRDToVALU(Lo, MRI, Worklist);
+ moveSMRDToVALU(Hi, MRI, Worklist);
break;
}
}
@@ -2185,51 +2457,28 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
// Handle some special cases
switch (Opcode) {
default:
- if (isSMRD(Inst->getOpcode())) {
- moveSMRDToVALU(Inst, MRI);
+ if (isSMRD(*Inst)) {
+ moveSMRDToVALU(Inst, MRI, Worklist);
+ continue;
}
break;
- case AMDGPU::S_MOV_B64: {
- DebugLoc DL = Inst->getDebugLoc();
-
- // If the source operand is a register we can replace this with a
- // copy.
- if (Inst->getOperand(1).isReg()) {
- MachineInstr *Copy = BuildMI(*MBB, Inst, DL, get(TargetOpcode::COPY))
- .addOperand(Inst->getOperand(0))
- .addOperand(Inst->getOperand(1));
- Worklist.push_back(Copy);
- } else {
- // Otherwise, we need to split this into two movs, because there is
- // no 64-bit VALU move instruction.
- unsigned Reg = Inst->getOperand(0).getReg();
- unsigned Dst = split64BitImm(Worklist,
- Inst,
- MRI,
- MRI.getRegClass(Reg),
- Inst->getOperand(1));
- MRI.replaceRegWith(Reg, Dst);
- }
- Inst->eraseFromParent();
- continue;
- }
case AMDGPU::S_AND_B64:
- splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32);
+ splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_AND_B32_e64);
Inst->eraseFromParent();
continue;
case AMDGPU::S_OR_B64:
- splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32);
+ splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_OR_B32_e64);
Inst->eraseFromParent();
continue;
case AMDGPU::S_XOR_B64:
- splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32);
+ splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_XOR_B32_e64);
Inst->eraseFromParent();
continue;
case AMDGPU::S_NOT_B64:
- splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32);
+ splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::V_NOT_B32_e32);
Inst->eraseFromParent();
continue;
@@ -2281,6 +2530,11 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
}
break;
+ case AMDGPU::S_ABS_I32:
+ lowerScalarAbs(Worklist, Inst);
+ Inst->eraseFromParent();
+ continue;
+
case AMDGPU::S_BFE_U64:
case AMDGPU::S_BFM_B64:
llvm_unreachable("Moving this op to VALU not implemented");
@@ -2319,7 +2573,7 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
Inst->addOperand(MachineOperand::CreateImm(0));
}
- addDescImplicitUseDef(NewDesc, Inst);
+ Inst->addImplicitDefUseOperands(*Inst->getParent()->getParent());
if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
const MachineOperand &OffsetWidthOp = Inst->getOperand(2);
@@ -2337,27 +2591,9 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
}
// Update the destination register class.
-
- const TargetRegisterClass *NewDstRC = getOpRegClass(*Inst, 0);
-
- switch (Opcode) {
- // For target instructions, getOpRegClass just returns the virtual
- // register class associated with the operand, so we need to find an
- // equivalent VGPR register class in order to move the instruction to the
- // VALU.
- case AMDGPU::COPY:
- case AMDGPU::PHI:
- case AMDGPU::REG_SEQUENCE:
- case AMDGPU::INSERT_SUBREG:
- if (RI.hasVGPRs(NewDstRC))
- continue;
- NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
- if (!NewDstRC)
- continue;
- break;
- default:
- break;
- }
+ const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(*Inst);
+ if (!NewDstRC)
+ continue;
unsigned DstReg = Inst->getOperand(0).getReg();
unsigned NewDstReg = MRI.createVirtualRegister(NewDstRC);
@@ -2366,13 +2602,7 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
// Legalize the operands
legalizeOperands(Inst);
- for (MachineRegisterInfo::use_iterator I = MRI.use_begin(NewDstReg),
- E = MRI.use_end(); I != E; ++I) {
- MachineInstr &UseMI = *I->getParent();
- if (!canReadVGPR(UseMI, I.getOperandNo())) {
- Worklist.push_back(&UseMI);
- }
- }
+ addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
}
}
@@ -2390,6 +2620,30 @@ const TargetRegisterClass *SIInstrInfo::getIndirectAddrRegClass() const {
return &AMDGPU::VGPR_32RegClass;
}
+void SIInstrInfo::lowerScalarAbs(SmallVectorImpl<MachineInstr *> &Worklist,
+ MachineInstr *Inst) const {
+ MachineBasicBlock &MBB = *Inst->getParent();
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ MachineBasicBlock::iterator MII = Inst;
+ DebugLoc DL = Inst->getDebugLoc();
+
+ MachineOperand &Dest = Inst->getOperand(0);
+ MachineOperand &Src = Inst->getOperand(1);
+ unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+
+ BuildMI(MBB, MII, DL, get(AMDGPU::V_SUB_I32_e32), TmpReg)
+ .addImm(0)
+ .addReg(Src.getReg());
+
+ BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
+ .addReg(Src.getReg())
+ .addReg(TmpReg);
+
+ MRI.replaceRegWith(Dest.getReg(), ResultReg);
+ addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
+}
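
The lowering implements abs(x) = max(x, 0 - x) in VALU instructions; for a concrete s_abs_i32 the resulting sequence is roughly (virtual register names are illustrative):

    // s_abs_i32 %sdst, %ssrc          becomes
    //   v_sub_i32_e32 %tmp, 0, %src       ; %tmp = -x (carry-out is unused)
    //   v_max_i32_e64 %dst, %src, %tmp    ; max(x, -x)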
+
void SIInstrInfo::splitScalar64BitUnaryOp(
SmallVectorImpl<MachineInstr *> &Worklist,
MachineInstr *Inst,
@@ -2414,20 +2668,21 @@ void SIInstrInfo::splitScalar64BitUnaryOp(
AMDGPU::sub0, Src0SubRC);
const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
- const TargetRegisterClass *DestSubRC = RI.getSubRegClass(DestRC, AMDGPU::sub0);
+ const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
+ const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0);
- unsigned DestSub0 = MRI.createVirtualRegister(DestRC);
- MachineInstr *LoHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub0)
+ unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
+ BuildMI(MBB, MII, DL, InstDesc, DestSub0)
.addOperand(SrcReg0Sub0);
MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
AMDGPU::sub1, Src0SubRC);
- unsigned DestSub1 = MRI.createVirtualRegister(DestSubRC);
- MachineInstr *HiHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub1)
+ unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
+ BuildMI(MBB, MII, DL, InstDesc, DestSub1)
.addOperand(SrcReg0Sub1);
- unsigned FullDestReg = MRI.createVirtualRegister(DestRC);
+ unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC);
BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
.addReg(DestSub0)
.addImm(AMDGPU::sub0)
@@ -2436,10 +2691,11 @@ void SIInstrInfo::splitScalar64BitUnaryOp(
MRI.replaceRegWith(Dest.getReg(), FullDestReg);
- // Try to legalize the operands in case we need to swap the order to keep it
- // valid.
- Worklist.push_back(LoHalf);
- Worklist.push_back(HiHalf);
+ // We don't need to call legalizeOperands here because for a single operand,
+ // src0 will support any kind of input.
+
+ // Move all users of this moved value.
+ addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
}
void SIInstrInfo::splitScalar64BitBinaryOp(
@@ -2474,9 +2730,10 @@ void SIInstrInfo::splitScalar64BitBinaryOp(
AMDGPU::sub0, Src1SubRC);
const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
- const TargetRegisterClass *DestSubRC = RI.getSubRegClass(DestRC, AMDGPU::sub0);
+ const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
+ const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0);
- unsigned DestSub0 = MRI.createVirtualRegister(DestRC);
+ unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
MachineInstr *LoHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub0)
.addOperand(SrcReg0Sub0)
.addOperand(SrcReg1Sub0);
@@ -2486,12 +2743,12 @@ void SIInstrInfo::splitScalar64BitBinaryOp(
MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
AMDGPU::sub1, Src1SubRC);
- unsigned DestSub1 = MRI.createVirtualRegister(DestSubRC);
+ unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
MachineInstr *HiHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub1)
.addOperand(SrcReg0Sub1)
.addOperand(SrcReg1Sub1);
- unsigned FullDestReg = MRI.createVirtualRegister(DestRC);
+ unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC);
BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
.addReg(DestSub0)
.addImm(AMDGPU::sub0)
@@ -2502,8 +2759,11 @@ void SIInstrInfo::splitScalar64BitBinaryOp(
// Try to legalize the operands in case we need to swap the order to keep it
// valid.
- Worklist.push_back(LoHalf);
- Worklist.push_back(HiHalf);
+ legalizeOperands(LoHalf);
+ legalizeOperands(HiHalf);
+
+ // Move all users of this moved value.
+ addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
}
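
The split follows the opcode mapping chosen in moveToVALU above (e.g. S_AND_B64 becomes V_AND_B32_e64): operate on each 32-bit half and reassemble the result, roughly:

    // s_and_b64 %dst, %a, %b          becomes
    //   v_and_b32_e64 %dst.lo, %a:sub0, %b:sub0
    //   v_and_b32_e64 %dst.hi, %a:sub1, %b:sub1
    //   %dst = REG_SEQUENCE %dst.lo, sub0, %dst.hi, sub1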
void SIInstrInfo::splitScalar64BitBCNT(SmallVectorImpl<MachineInstr *> &Worklist,
@@ -2532,18 +2792,19 @@ void SIInstrInfo::splitScalar64BitBCNT(SmallVectorImpl<MachineInstr *> &Worklist
MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
AMDGPU::sub1, SrcSubRC);
- MachineInstr *First = BuildMI(MBB, MII, DL, InstDesc, MidReg)
+ BuildMI(MBB, MII, DL, InstDesc, MidReg)
.addOperand(SrcRegSub0)
.addImm(0);
- MachineInstr *Second = BuildMI(MBB, MII, DL, InstDesc, ResultReg)
+ BuildMI(MBB, MII, DL, InstDesc, ResultReg)
.addOperand(SrcRegSub1)
.addReg(MidReg);
MRI.replaceRegWith(Dest.getReg(), ResultReg);
- Worklist.push_back(First);
- Worklist.push_back(Second);
+ // We don't need to legalize operands here. src0 for either instruction can be
+ // an SGPR, and the second input is unused or determined here.
+ addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
}
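
The 64-bit population count is split so that the second 32-bit count accumulates into the first through the extra addend operand, i.e. bcnt64(x) = bcnt32(x.lo) + bcnt32(x.hi):

    // s_bcnt1_i32_b64 %dst, %src      becomes
    //   v_bcnt_u32_b32 %mid, %src:sub0, 0      ; popcount of the low half
    //   v_bcnt_u32_b32 %dst, %src:sub1, %mid   ; plus popcount of the high half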
void SIInstrInfo::splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist,
@@ -2587,6 +2848,7 @@ void SIInstrInfo::splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist,
.addImm(AMDGPU::sub1);
MRI.replaceRegWith(Dest.getReg(), ResultReg);
+ addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
return;
}
@@ -2605,33 +2867,53 @@ void SIInstrInfo::splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist,
.addImm(AMDGPU::sub1);
MRI.replaceRegWith(Dest.getReg(), ResultReg);
+ addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
}
-void SIInstrInfo::addDescImplicitUseDef(const MCInstrDesc &NewDesc,
- MachineInstr *Inst) const {
- // Add the implict and explicit register definitions.
- if (NewDesc.ImplicitUses) {
- for (unsigned i = 0; NewDesc.ImplicitUses[i]; ++i) {
- unsigned Reg = NewDesc.ImplicitUses[i];
- Inst->addOperand(MachineOperand::CreateReg(Reg, false, true));
+void SIInstrInfo::addUsersToMoveToVALUWorklist(
+ unsigned DstReg,
+ MachineRegisterInfo &MRI,
+ SmallVectorImpl<MachineInstr *> &Worklist) const {
+ for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg),
+ E = MRI.use_end(); I != E; ++I) {
+ MachineInstr &UseMI = *I->getParent();
+ if (!canReadVGPR(UseMI, I.getOperandNo())) {
+ Worklist.push_back(&UseMI);
}
}
+}
- if (NewDesc.ImplicitDefs) {
- for (unsigned i = 0; NewDesc.ImplicitDefs[i]; ++i) {
- unsigned Reg = NewDesc.ImplicitDefs[i];
- Inst->addOperand(MachineOperand::CreateReg(Reg, true, true));
- }
+const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
+ const MachineInstr &Inst) const {
+ const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0);
+
+ switch (Inst.getOpcode()) {
+ // For target instructions, getOpRegClass just returns the virtual register
+ // class associated with the operand, so we need to find an equivalent VGPR
+ // register class in order to move the instruction to the VALU.
+ case AMDGPU::COPY:
+ case AMDGPU::PHI:
+ case AMDGPU::REG_SEQUENCE:
+ case AMDGPU::INSERT_SUBREG:
+ if (RI.hasVGPRs(NewDstRC))
+ return nullptr;
+
+ NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
+ if (!NewDstRC)
+ return nullptr;
+ return NewDstRC;
+ default:
+ return NewDstRC;
}
}
+// Find the one SGPR operand we are allowed to use.
unsigned SIInstrInfo::findUsedSGPR(const MachineInstr *MI,
int OpIndices[3]) const {
- const MCInstrDesc &Desc = get(MI->getOpcode());
+ const MCInstrDesc &Desc = MI->getDesc();
// Find the one SGPR operand we are allowed to use.
- unsigned SGPRReg = AMDGPU::NoRegister;
-
+ //
// First we need to consider the instruction's operand requirements before
// legalizing. Some operands are required to be SGPRs, such as implicit uses
// of VCC, but we are still bound by the constant bus requirement to only use
@@ -2639,17 +2921,9 @@ unsigned SIInstrInfo::findUsedSGPR(const MachineInstr *MI,
//
// If the operand's class is an SGPR, we can never move it.
- for (const MachineOperand &MO : MI->implicit_operands()) {
- // We only care about reads.
- if (MO.isDef())
- continue;
-
- if (MO.getReg() == AMDGPU::VCC)
- return AMDGPU::VCC;
-
- if (MO.getReg() == AMDGPU::FLAT_SCR)
- return AMDGPU::FLAT_SCR;
- }
+ unsigned SGPRReg = findImplicitSGPRRead(*MI);
+ if (SGPRReg != AMDGPU::NoRegister)
+ return SGPRReg;
unsigned UsedSGPRs[3] = { AMDGPU::NoRegister };
const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
@@ -2660,15 +2934,22 @@ unsigned SIInstrInfo::findUsedSGPR(const MachineInstr *MI,
break;
const MachineOperand &MO = MI->getOperand(Idx);
- if (RI.isSGPRClassID(Desc.OpInfo[Idx].RegClass))
- SGPRReg = MO.getReg();
+ if (!MO.isReg())
+ continue;
- if (MO.isReg() && RI.isSGPRClass(MRI.getRegClass(MO.getReg())))
- UsedSGPRs[i] = MO.getReg();
- }
+ // Is this operand statically required to be an SGPR based on the operand
+ // constraints?
+ const TargetRegisterClass *OpRC = RI.getRegClass(Desc.OpInfo[Idx].RegClass);
+ bool IsRequiredSGPR = RI.isSGPRClass(OpRC);
+ if (IsRequiredSGPR)
+ return MO.getReg();
- if (SGPRReg != AMDGPU::NoRegister)
- return SGPRReg;
+ // If this could be a VGPR or an SGPR, check the dynamic register class.
+ unsigned Reg = MO.getReg();
+ const TargetRegisterClass *RegRC = MRI.getRegClass(Reg);
+ if (RI.isSGPRClass(RegRC))
+ UsedSGPRs[i] = Reg;
+ }
// We don't have a required SGPR operand, so we have a bit more freedom in
// selecting operands to move.
@@ -2680,6 +2961,9 @@ unsigned SIInstrInfo::findUsedSGPR(const MachineInstr *MI,
// V_FMA_F32 v0, s0, s0, s0 -> No moves
// V_FMA_F32 v0, s0, s1, s0 -> Move s1
+ // TODO: If some of the operands are 64-bit SGPRs and some 32, we should
+ // prefer those.
+
if (UsedSGPRs[0] != AMDGPU::NoRegister) {
if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2])
SGPRReg = UsedSGPRs[0];
@@ -2720,7 +3004,7 @@ MachineInstrBuilder SIInstrInfo::buildIndirectRead(
unsigned IndirectBaseReg = AMDGPU::VGPR_32RegClass.getRegister(
getIndirectIndexBegin(*MBB->getParent()));
- return BuildMI(*MBB, I, DL, get(AMDGPU::SI_INDIRECT_SRC))
+ return BuildMI(*MBB, I, DL, get(AMDGPU::SI_INDIRECT_SRC_V1))
.addOperand(I->getOperand(0))
.addOperand(I->getOperand(1))
.addReg(IndirectBaseReg)
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 5053786..307ef67 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -39,14 +39,11 @@ private:
unsigned SubIdx,
const TargetRegisterClass *SubRC) const;
- unsigned split64BitImm(SmallVectorImpl<MachineInstr *> &Worklist,
- MachineBasicBlock::iterator MI,
- MachineRegisterInfo &MRI,
- const TargetRegisterClass *RC,
- const MachineOperand &Op) const;
-
void swapOperands(MachineBasicBlock::iterator Inst) const;
+ void lowerScalarAbs(SmallVectorImpl<MachineInstr *> &Worklist,
+ MachineInstr *Inst) const;
+
void splitScalar64BitUnaryOp(SmallVectorImpl<MachineInstr *> &Worklist,
MachineInstr *Inst, unsigned Opcode) const;
@@ -58,13 +55,24 @@ private:
void splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist,
MachineInstr *Inst) const;
- void addDescImplicitUseDef(const MCInstrDesc &Desc, MachineInstr *MI) const;
+ void addUsersToMoveToVALUWorklist(
+ unsigned Reg, MachineRegisterInfo &MRI,
+ SmallVectorImpl<MachineInstr *> &Worklist) const;
+
+ const TargetRegisterClass *
+ getDestEquivalentVGPRClass(const MachineInstr &Inst) const;
bool checkInstOffsetsDoNotOverlap(MachineInstr *MIa,
MachineInstr *MIb) const;
unsigned findUsedSGPR(const MachineInstr *MI, int OpIndices[3]) const;
+protected:
+ MachineInstr *commuteInstructionImpl(MachineInstr *MI,
+ bool NewMI,
+ unsigned OpIdx0,
+ unsigned OpIdx1) const override;
+
public:
explicit SIInstrInfo(const AMDGPUSubtarget &st);
@@ -117,17 +125,14 @@ public:
// register. If there is no hardware instruction that can store to \p
// DstRC, then AMDGPU::COPY is returned.
unsigned getMovOpcode(const TargetRegisterClass *DstRC) const;
+
+ LLVM_READONLY
int commuteOpcode(const MachineInstr &MI) const;
- MachineInstr *commuteInstruction(MachineInstr *MI,
- bool NewMI = false) const override;
bool findCommutedOpIndices(MachineInstr *MI,
unsigned &SrcOpIdx1,
unsigned &SrcOpIdx2) const override;
- bool isTriviallyReMaterializable(const MachineInstr *MI,
- AliasAnalysis *AA = nullptr) const;
-
bool areMemAccessesTriviallyDisjoint(
MachineInstr *MIa, MachineInstr *MIb,
AliasAnalysis *AA = nullptr) const override;
@@ -137,8 +142,6 @@ public:
unsigned DstReg, unsigned SrcReg) const override;
bool isMov(unsigned Opcode) const override;
- bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const override;
-
bool FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI,
unsigned Reg, MachineRegisterInfo *MRI) const final;
@@ -148,78 +151,154 @@ public:
MachineBasicBlock::iterator &MI,
LiveVariables *LV) const override;
+ static bool isSALU(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::SALU;
+ }
+
bool isSALU(uint16_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::SALU;
}
+ static bool isVALU(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::VALU;
+ }
+
bool isVALU(uint16_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::VALU;
}
+ static bool isSOP1(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::SOP1;
+ }
+
bool isSOP1(uint16_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::SOP1;
}
+ static bool isSOP2(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::SOP2;
+ }
+
bool isSOP2(uint16_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::SOP2;
}
+ static bool isSOPC(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::SOPC;
+ }
+
bool isSOPC(uint16_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::SOPC;
}
+ static bool isSOPK(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::SOPK;
+ }
+
bool isSOPK(uint16_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::SOPK;
}
+ static bool isSOPP(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::SOPP;
+ }
+
bool isSOPP(uint16_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::SOPP;
}
+ static bool isVOP1(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::VOP1;
+ }
+
bool isVOP1(uint16_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::VOP1;
}
+ static bool isVOP2(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::VOP2;
+ }
+
bool isVOP2(uint16_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::VOP2;
}
+ static bool isVOP3(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::VOP3;
+ }
+
bool isVOP3(uint16_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::VOP3;
}
+ static bool isVOPC(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::VOPC;
+ }
+
bool isVOPC(uint16_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::VOPC;
}
+ static bool isMUBUF(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::MUBUF;
+ }
+
bool isMUBUF(uint16_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::MUBUF;
}
+ static bool isMTBUF(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::MTBUF;
+ }
+
bool isMTBUF(uint16_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::MTBUF;
}
+ static bool isSMRD(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::SMRD;
+ }
+
bool isSMRD(uint16_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::SMRD;
}
+ static bool isDS(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::DS;
+ }
+
bool isDS(uint16_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::DS;
}
+ static bool isMIMG(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::MIMG;
+ }
+
bool isMIMG(uint16_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::MIMG;
}
+ static bool isFLAT(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::FLAT;
+ }
+
bool isFLAT(uint16_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::FLAT;
}
+ static bool isWQM(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::WQM;
+ }
+
bool isWQM(uint16_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::WQM;
}
+ static bool isVGPRSpill(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::VGPRSpill;
+ }
+
bool isVGPRSpill(uint16_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::VGPRSpill;
}
@@ -302,6 +381,26 @@ public:
bool isOperandLegal(const MachineInstr *MI, unsigned OpIdx,
const MachineOperand *MO = nullptr) const;
+ /// \brief Check if \p MO would be a valid operand for the given operand
+ /// definition \p OpInfo. Note this does not attempt to validate constant bus
+ /// restrictions (e.g. literal constant usage).
+ bool isLegalVSrcOperand(const MachineRegisterInfo &MRI,
+ const MCOperandInfo &OpInfo,
+ const MachineOperand &MO) const;
+
+ /// \brief Check if \p MO (a register operand) is a legal register for the
+ /// given operand description.
+ bool isLegalRegOperand(const MachineRegisterInfo &MRI,
+ const MCOperandInfo &OpInfo,
+ const MachineOperand &MO) const;
+
+ /// \brief Legalize operands in \p MI by either commuting it or inserting a
+ /// copy of src1.
+ void legalizeOperandsVOP2(MachineRegisterInfo &MRI, MachineInstr *MI) const;
+
+ /// \brief Fix operands in \p MI to satisfy constant bus requirements.
+ void legalizeOperandsVOP3(MachineRegisterInfo &MRI, MachineInstr *MI) const;
+
/// \brief Legalize all operands in this instruction. This function may
/// create new instructions and insert them before \p MI.
void legalizeOperands(MachineInstr *MI) const;
@@ -312,7 +411,8 @@ public:
unsigned HalfImmOp, unsigned HalfSGPROp,
MachineInstr *&Lo, MachineInstr *&Hi) const;
- void moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI) const;
+ void moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI,
+ SmallVectorImpl<MachineInstr *> &Worklist) const;
/// \brief Replace this instruction's opcode with the equivalent VALU
/// opcode. This function will also move the users of \p MI to the
@@ -341,29 +441,49 @@ public:
void LoadM0(MachineInstr *MoveRel, MachineBasicBlock::iterator I,
unsigned SavReg, unsigned IndexReg) const;
- void insertNOPs(MachineBasicBlock::iterator MI, int Count) const;
+ void insertWaitStates(MachineBasicBlock::iterator MI, int Count) const;
/// \brief Returns the operand named \p Op. If \p MI does not have an
/// operand named \c Op, this function returns nullptr.
+ LLVM_READONLY
MachineOperand *getNamedOperand(MachineInstr &MI, unsigned OperandName) const;
+ LLVM_READONLY
const MachineOperand *getNamedOperand(const MachineInstr &MI,
unsigned OpName) const {
return getNamedOperand(const_cast<MachineInstr &>(MI), OpName);
}
+ /// Get required immediate operand
+ int64_t getNamedImmOperand(const MachineInstr &MI, unsigned OpName) const {
+ int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
+ return MI.getOperand(Idx).getImm();
+ }
+
uint64_t getDefaultRsrcDataFormat() const;
uint64_t getScratchRsrcWords23() const;
};
namespace AMDGPU {
-
+ LLVM_READONLY
int getVOPe64(uint16_t Opcode);
+
+ LLVM_READONLY
int getVOPe32(uint16_t Opcode);
+
+ LLVM_READONLY
int getCommuteRev(uint16_t Opcode);
+
+ LLVM_READONLY
int getCommuteOrig(uint16_t Opcode);
+
+ LLVM_READONLY
int getAddr64Inst(uint16_t Opcode);
+
+ LLVM_READONLY
int getAtomicRetOp(uint16_t Opcode);
+
+ LLVM_READONLY
int getAtomicNoRetOp(uint16_t Opcode);
const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL;
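Note: the SIInstrInfo.h changes above add static, MachineInstr-based forms of the TSFlags format queries (isSALU, isVALU, isSMRD, ..., isVGPRSpill) next to the existing opcode-based members, plus the getNamedImmOperand accessor and the legalizeOperandsVOP2/VOP3 helpers. A minimal usage sketch, assuming only the declarations shown above; the wrapper functions themselves are hypothetical:

    // Hypothetical callers; only SIInstrInfo's new static queries are assumed.
    #include "SIInstrInfo.h"
    #include "llvm/CodeGen/MachineInstr.h"

    using namespace llvm;

    // TSFlags-based checks no longer need an SIInstrInfo instance in hand.
    static bool readsScalarMemory(const MachineInstr &MI) {
      return SIInstrInfo::isSMRD(MI);
    }

    static bool isBufferOrFlatAccess(const MachineInstr &MI) {
      return SIInstrInfo::isMUBUF(MI) || SIInstrInfo::isMTBUF(MI) ||
             SIInstrInfo::isFLAT(MI);
    }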
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 8d8110b..10f2adde 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -8,9 +8,9 @@
//===----------------------------------------------------------------------===//
def isCI : Predicate<"Subtarget->getGeneration() "
">= AMDGPUSubtarget::SEA_ISLANDS">;
-def isVI : Predicate <
- "Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS">,
- AssemblerPredicate<"FeatureGCN3Encoding">;
+def isCIOnly : Predicate<"Subtarget->getGeneration() =="
+ "AMDGPUSubtarget::SEA_ISLANDS">,
+ AssemblerPredicate <"FeatureSeaIslands">;
def DisableInst : Predicate <"false">, AssemblerPredicate<"FeatureDisable">;
@@ -69,6 +69,15 @@ class sopk <bits<5> si, bits<5> vi = si> {
field bits<5> VI = vi;
}
+// Specify an SMRD opcode for SI and SMEM opcode for VI
+
+// FIXME: This should really be bits<5> si; TableGen crashes if a parameter's
+// default value is another parameter with a different bit size
+class smrd<bits<8> si, bits<8> vi = si> {
+ field bits<5> SI = si{4-0};
+ field bits<8> VI = vi;
+}
+
// Except for the NONE field, this must be kept in sync with the SISubtarget enum
// in AMDGPUInstrInfo.cpp
def SISubtarget {
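Note: the new smrd class above pairs each SI SMRD opcode with its VI SMEM opcode; the si{4-0} slice keeps only the five bits that the SI encoding actually has. A rough C++ rendering of the pairing (the struct and constant names are illustrative only; s_dcache_inv further down in this patch is defined as smrd<0x1f, 0x20>):

    #include <cstdint>

    // Illustration of the smrd<si, vi> wrapper: one opcode per subtarget encoding.
    struct SmrdOpcode {
      uint8_t SI; // SMRD opcode, only bits [4:0] are used (si{4-0})
      uint8_t VI; // SMEM opcode, full 8 bits
    };

    // e.g. smrd<0x1f, 0x20>, as used by S_DCACHE_INV in SIInstructions.td below.
    constexpr SmrdOpcode DCacheInv{0x1f & 0x1f, 0x20};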
@@ -121,9 +130,20 @@ def SIsampled : SDSample<"AMDGPUISD::SAMPLED">;
def SIsamplel : SDSample<"AMDGPUISD::SAMPLEL">;
def SIconstdata_ptr : SDNode<
- "AMDGPUISD::CONST_DATA_PTR", SDTypeProfile <1, 0, [SDTCisVT<0, i64>]>
+ "AMDGPUISD::CONST_DATA_PTR", SDTypeProfile <1, 1, [SDTCisVT<0, i64>,
+ SDTCisVT<0, i64>]>
>;
+def mubuf_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{
+ return isGlobalLoad(cast<LoadSDNode>(N)) ||
+ isConstantLoad(cast<LoadSDNode>(N), -1);
+}]>;
+
+def smrd_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{
+ return isConstantLoad(cast<LoadSDNode>(N), -1) &&
+ static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpUniform(N);
+}]>;
+
//===----------------------------------------------------------------------===//
// SDNodes and PatFrag for local loads and stores to enable s_mov_b32 m0, -1
// to be glued to the memory instructions.
@@ -328,9 +348,9 @@ class SGPRImm <dag frag> : PatLeaf<frag, [{
static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
U != E; ++U) {
- if (SIRI->isSGPRClass(getOperandRegClass(*U, U.getOperandNo()))) {
+ const TargetRegisterClass *RC = getOperandRegClass(*U, U.getOperandNo());
+ if (RC && SIRI->isSGPRClass(RC))
return true;
- }
}
return false;
}]>;
@@ -354,6 +374,8 @@ def sopp_brtarget : Operand<OtherVT> {
let ParserMatchClass = SoppBrTarget;
}
+def const_ga : Operand<iPTR>;
+
include "SIInstrFormats.td"
include "VIInstrFormats.td"
@@ -393,7 +415,7 @@ def GDS01MatchClass : GDSBaseMatchClass <"parseDSOff01OptionalOps">;
class GLCBaseMatchClass <string parser> : AsmOperandClass {
let Name = "GLC"#parser;
let PredicateMethod = "isImm";
- let ParserMethod = parser;
+ let ParserMethod = parser;
let RenderMethod = "addImmOperands";
}
@@ -436,6 +458,17 @@ def ClampMatchClass : AsmOperandClass {
let RenderMethod = "addImmOperands";
}
+class SMRDOffsetBaseMatchClass <string predicate> : AsmOperandClass {
+ let Name = "SMRDOffset"#predicate;
+ let PredicateMethod = predicate;
+ let RenderMethod = "addImmOperands";
+}
+
+def SMRDOffsetMatchClass : SMRDOffsetBaseMatchClass <"isSMRDOffset">;
+def SMRDLiteralOffsetMatchClass : SMRDOffsetBaseMatchClass <
+ "isSMRDLiteralOffset"
+>;
+
let OperandType = "OPERAND_IMMEDIATE" in {
def offen : Operand<i1> {
@@ -510,6 +543,16 @@ def ClampMod : Operand <i1> {
let ParserMatchClass = ClampMatchClass;
}
+def smrd_offset : Operand <i32> {
+ let PrintMethod = "printU32ImmOperand";
+ let ParserMatchClass = SMRDOffsetMatchClass;
+}
+
+def smrd_literal_offset : Operand <i32> {
+ let PrintMethod = "printU32ImmOperand";
+ let ParserMatchClass = SMRDLiteralOffsetMatchClass;
+}
+
} // End OperandType = "OPERAND_IMMEDIATE"
def VOPDstS64 : VOPDstOperand <SReg_64>;
@@ -528,6 +571,13 @@ def MUBUFScratch : ComplexPattern<i64, 4, "SelectMUBUFScratch">;
def MUBUFOffset : ComplexPattern<i64, 6, "SelectMUBUFOffset">;
def MUBUFOffsetAtomic : ComplexPattern<i64, 4, "SelectMUBUFOffset">;
+def SMRDImm : ComplexPattern<i64, 2, "SelectSMRDImm">;
+def SMRDImm32 : ComplexPattern<i64, 2, "SelectSMRDImm32">;
+def SMRDSgpr : ComplexPattern<i64, 2, "SelectSMRDSgpr">;
+def SMRDBufferImm : ComplexPattern<i32, 1, "SelectSMRDBufferImm">;
+def SMRDBufferImm32 : ComplexPattern<i32, 1, "SelectSMRDBufferImm32">;
+def SMRDBufferSgpr : ComplexPattern<i32, 1, "SelectSMRDBufferSgpr">;
+
def VOP3Mods0 : ComplexPattern<untyped, 4, "SelectVOP3Mods0">;
def VOP3NoMods0 : ComplexPattern<untyped, 4, "SelectVOP3NoMods0">;
def VOP3Mods0Clamp : ComplexPattern<untyped, 3, "SelectVOP3Mods0Clamp">;
@@ -717,19 +767,6 @@ class SOP2_Real_vi<sop2 op, string opName, dag outs, dag ins, string asm> :
let AssemblerPredicates = [isVI];
}
-multiclass SOP2_SELECT_32 <sop2 op, string opName, list<dag> pattern> {
- def "" : SOP2_Pseudo <opName, (outs SReg_32:$dst),
- (ins SSrc_32:$src0, SSrc_32:$src1, SCCReg:$scc), pattern>;
-
- def _si : SOP2_Real_si <op, opName, (outs SReg_32:$dst),
- (ins SSrc_32:$src0, SSrc_32:$src1, SCCReg:$scc),
- opName#" $dst, $src0, $src1 [$scc]">;
-
- def _vi : SOP2_Real_vi <op, opName, (outs SReg_32:$dst),
- (ins SSrc_32:$src0, SSrc_32:$src1, SCCReg:$scc),
- opName#" $dst, $src0, $src1 [$scc]">;
-}
-
multiclass SOP2_m <sop2 op, string opName, dag outs, dag ins, string asm,
list<dag> pattern> {
@@ -758,8 +795,10 @@ multiclass SOP2_64_32 <sop2 op, string opName, list<dag> pattern> : SOP2_m <
class SOPC_Helper <bits<7> op, RegisterOperand rc, ValueType vt,
string opName, PatLeaf cond> : SOPC <
- op, (outs SCCReg:$dst), (ins rc:$src0, rc:$src1),
- opName#" $src0, $src1", []>;
+ op, (outs), (ins rc:$src0, rc:$src1),
+ opName#" $src0, $src1", []> {
+ let Defs = [SCC];
+}
class SOPC_32<bits<7> op, string opName, PatLeaf cond = COND_NULL>
: SOPC_Helper<op, SSrc_32, i32, opName, cond>;
@@ -812,15 +851,20 @@ multiclass SOPK_32 <sopk op, string opName, list<dag> pattern> {
}
multiclass SOPK_SCC <sopk op, string opName, list<dag> pattern> {
- def "" : SOPK_Pseudo <opName, (outs SCCReg:$dst),
- (ins SReg_32:$src0, u16imm:$src1), pattern>;
+ def "" : SOPK_Pseudo <opName, (outs),
+ (ins SReg_32:$src0, u16imm:$src1), pattern> {
+ let Defs = [SCC];
+ }
+
- let DisableEncoding = "$dst" in {
- def _si : SOPK_Real_si <op, opName, (outs SCCReg:$dst),
- (ins SReg_32:$sdst, u16imm:$simm16), opName#" $sdst, $simm16">;
+ def _si : SOPK_Real_si <op, opName, (outs),
+ (ins SReg_32:$sdst, u16imm:$simm16), opName#" $sdst, $simm16"> {
+ let Defs = [SCC];
+ }
- def _vi : SOPK_Real_vi <op, opName, (outs SCCReg:$dst),
- (ins SReg_32:$sdst, u16imm:$simm16), opName#" $sdst, $simm16">;
+ def _vi : SOPK_Real_vi <op, opName, (outs),
+ (ins SReg_32:$sdst, u16imm:$simm16), opName#" $sdst, $simm16"> {
+ let Defs = [SCC];
}
}
@@ -868,35 +912,68 @@ class SMRD_Real_si <bits<5> op, string opName, bit imm, dag outs, dag ins,
}
class SMRD_Real_vi <bits<8> op, string opName, bit imm, dag outs, dag ins,
- string asm> :
- SMRD <outs, ins, asm, []>,
+ string asm, list<dag> pattern = []> :
+ SMRD <outs, ins, asm, pattern>,
SMEMe_vi <op, imm>,
SIMCInstr<opName, SISubtarget.VI> {
let AssemblerPredicates = [isVI];
}
-multiclass SMRD_m <bits<5> op, string opName, bit imm, dag outs, dag ins,
+multiclass SMRD_m <smrd op, string opName, bit imm, dag outs, dag ins,
string asm, list<dag> pattern> {
def "" : SMRD_Pseudo <opName, outs, ins, pattern>;
- def _si : SMRD_Real_si <op, opName, imm, outs, ins, asm>;
+ def _si : SMRD_Real_si <op.SI, opName, imm, outs, ins, asm>;
// glc is only applicable to scalar stores, which are not yet
// implemented.
let glc = 0 in {
- def _vi : SMRD_Real_vi <{0, 0, 0, op}, opName, imm, outs, ins, asm>;
+ def _vi : SMRD_Real_vi <op.VI, opName, imm, outs, ins, asm>;
}
}
-multiclass SMRD_Helper <bits<5> op, string opName, RegisterClass baseClass,
+multiclass SMRD_Inval <smrd op, string opName,
+ SDPatternOperator node> {
+ let hasSideEffects = 1, mayStore = 1 in {
+ def "" : SMRD_Pseudo <opName, (outs), (ins), [(node)]>;
+
+ let sbase = 0, offset = 0 in {
+ let sdst = 0 in {
+ def _si : SMRD_Real_si <op.SI, opName, 0, (outs), (ins), opName>;
+ }
+
+ let glc = 0, sdata = 0 in {
+ def _vi : SMRD_Real_vi <op.VI, opName, 0, (outs), (ins), opName>;
+ }
+ }
+ }
+}
+
+class SMEM_Inval <bits<8> op, string opName, SDPatternOperator node> :
+ SMRD_Real_vi<op, opName, 0, (outs), (ins), opName, [(node)]> {
+ let hasSideEffects = 1;
+ let mayStore = 1;
+ let sbase = 0;
+ let sdata = 0;
+ let glc = 0;
+ let offset = 0;
+}
+
+multiclass SMRD_Helper <smrd op, string opName, RegisterClass baseClass,
RegisterClass dstClass> {
defm _IMM : SMRD_m <
op, opName#"_IMM", 1, (outs dstClass:$dst),
- (ins baseClass:$sbase, u32imm:$offset),
+ (ins baseClass:$sbase, smrd_offset:$offset),
opName#" $dst, $sbase, $offset", []
>;
+ def _IMM_ci : SMRD <
+ (outs dstClass:$dst), (ins baseClass:$sbase, smrd_literal_offset:$offset),
+ opName#" $dst, $sbase, $offset", []>, SMRD_IMMe_ci <op.SI> {
+ let AssemblerPredicates = [isCIOnly];
+ }
+
defm _SGPR : SMRD_m <
op, opName#"_SGPR", 0, (outs dstClass:$dst),
(ins baseClass:$sbase, SReg_32:$soff),
@@ -922,11 +999,12 @@ def InputModsNoDefault : Operand <i32> {
let ParserMatchClass = InputModsMatchClass;
}
-class getNumSrcArgs<ValueType Src1, ValueType Src2> {
+class getNumSrcArgs<ValueType Src0, ValueType Src1, ValueType Src2> {
int ret =
- !if (!eq(Src1.Value, untyped.Value), 1, // VOP1
+ !if (!eq(Src0.Value, untyped.Value), 0,
+ !if (!eq(Src1.Value, untyped.Value), 1, // VOP1
!if (!eq(Src2.Value, untyped.Value), 2, // VOP2
- 3)); // VOP3
+ 3))); // VOP3
}
// Returns the register class to use for the destination of VOP[123C]
@@ -934,28 +1012,37 @@ class getNumSrcArgs<ValueType Src1, ValueType Src2> {
class getVALUDstForVT<ValueType VT> {
RegisterOperand ret = !if(!eq(VT.Size, 32), VOPDstOperand<VGPR_32>,
!if(!eq(VT.Size, 64), VOPDstOperand<VReg_64>,
- VOPDstOperand<SReg_64>)); // else VT == i1
+ !if(!eq(VT.Size, 16), VOPDstOperand<VGPR_32>,
+ VOPDstOperand<SReg_64>))); // else VT == i1
}
// Returns the register class to use for source 0 of VOP[12C]
// instructions for the given VT.
class getVOPSrc0ForVT<ValueType VT> {
- RegisterOperand ret = !if(!eq(VT.Size, 32), VSrc_32, VSrc_64);
+ RegisterOperand ret = !if(!eq(VT.Size, 64), VSrc_64, VSrc_32);
}
// Returns the register class to use for source 1 of VOP[12C] for the
// given VT.
class getVOPSrc1ForVT<ValueType VT> {
- RegisterClass ret = !if(!eq(VT.Size, 32), VGPR_32, VReg_64);
+ RegisterClass ret = !if(!eq(VT.Size, 64), VReg_64, VGPR_32);
}
// Returns the register class to use for sources of VOP3 instructions for the
// given VT.
class getVOP3SrcForVT<ValueType VT> {
- RegisterOperand ret = !if(!eq(VT.Size, 32), VCSrc_32, VCSrc_64);
+ RegisterOperand ret =
+ !if(!eq(VT.Size, 64),
+ VCSrc_64,
+ !if(!eq(VT.Value, i1.Value),
+ SCSrc_64,
+ VCSrc_32
+ )
+ );
}
// Returns 1 if the source arguments have modifiers, 0 if they do not.
+// XXX - do f16 instructions?
class hasModifiers<ValueType SrcVT> {
bit ret = !if(!eq(SrcVT.Value, f32.Value), 1,
!if(!eq(SrcVT.Value, f64.Value), 1, 0));
@@ -1009,17 +1096,20 @@ class getIns64 <RegisterOperand Src0RC, RegisterOperand Src1RC,
// Returns the assembly string for the inputs and outputs of a VOP[12C]
// instruction. This does not add the _e32 suffix, so it can be reused
// by getAsm64.
-class getAsm32 <int NumSrcArgs> {
+class getAsm32 <bit HasDst, int NumSrcArgs> {
+ string dst = "$dst";
+ string src0 = ", $src0";
string src1 = ", $src1";
string src2 = ", $src2";
- string ret = "$dst, $src0"#
- !if(!eq(NumSrcArgs, 1), "", src1)#
- !if(!eq(NumSrcArgs, 3), src2, "");
+ string ret = !if(HasDst, dst, "") #
+ !if(!eq(NumSrcArgs, 1), src0, "") #
+ !if(!eq(NumSrcArgs, 2), src0#src1, "") #
+ !if(!eq(NumSrcArgs, 3), src0#src1#src2, "");
}
// Returns the assembly string for the inputs and outputs of a VOP3
// instruction.
-class getAsm64 <int NumSrcArgs, bit HasModifiers> {
+class getAsm64 <bit HasDst, int NumSrcArgs, bit HasModifiers> {
string src0 = !if(!eq(NumSrcArgs, 1), "$src0_modifiers", "$src0_modifiers,");
string src1 = !if(!eq(NumSrcArgs, 1), "",
!if(!eq(NumSrcArgs, 2), " $src1_modifiers",
@@ -1027,11 +1117,10 @@ class getAsm64 <int NumSrcArgs, bit HasModifiers> {
string src2 = !if(!eq(NumSrcArgs, 3), " $src2_modifiers", "");
string ret =
!if(!eq(HasModifiers, 0),
- getAsm32<NumSrcArgs>.ret,
+ getAsm32<HasDst, NumSrcArgs>.ret,
"$dst, "#src0#src1#src2#"$clamp"#"$omod");
}
-
class VOPProfile <list<ValueType> _ArgVT> {
field list<ValueType> ArgVT = _ArgVT;
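Note: getAsm32 and getAsm64 now take a HasDst bit so that destination-less encodings can share the same string builders. A literal C++ transcription of the new getAsm32, kept faithful to the TableGen above; the leading ", " that remains when HasDst is 0 is never printed in practice, because profiles with HasDst32 = 0 (such as VOPC_Profile later in this file) override Asm32 explicitly:

    #include <string>

    // Faithful transcription of getAsm32<bit HasDst, int NumSrcArgs> from above.
    static std::string getAsm32(bool HasDst, int NumSrcArgs) {
      std::string Ret = HasDst ? "$dst" : "";
      if (NumSrcArgs >= 1) Ret += ", $src0";
      if (NumSrcArgs >= 2) Ret += ", $src1";
      if (NumSrcArgs >= 3) Ret += ", $src2";
      return Ret;
    }
    // getAsm32(true, 2)  -> "$dst, $src0, $src1"
    // getAsm32(false, 2) -> ", $src0, $src1"  (overridden where HasDst32 = 0)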
@@ -1047,29 +1136,38 @@ class VOPProfile <list<ValueType> _ArgVT> {
field RegisterOperand Src1RC64 = getVOP3SrcForVT<Src1VT>.ret;
field RegisterOperand Src2RC64 = getVOP3SrcForVT<Src2VT>.ret;
- field int NumSrcArgs = getNumSrcArgs<Src1VT, Src2VT>.ret;
+ field bit HasDst = !if(!eq(DstVT.Value, untyped.Value), 0, 1);
+ field bit HasDst32 = HasDst;
+ field int NumSrcArgs = getNumSrcArgs<Src0VT, Src1VT, Src2VT>.ret;
field bit HasModifiers = hasModifiers<Src0VT>.ret;
- field dag Outs = (outs DstRC:$dst);
+ field dag Outs = !if(HasDst,(outs DstRC:$dst),(outs));
+
+ // VOP3b instructions are a special case with a second explicit
+ // output. This is manually overridden for them.
+ field dag Outs32 = Outs;
+ field dag Outs64 = Outs;
field dag Ins32 = getIns32<Src0RC32, Src1RC32, NumSrcArgs>.ret;
field dag Ins64 = getIns64<Src0RC64, Src1RC64, Src2RC64, NumSrcArgs,
HasModifiers>.ret;
- field string Asm32 = getAsm32<NumSrcArgs>.ret;
- field string Asm64 = getAsm64<NumSrcArgs, HasModifiers>.ret;
+ field string Asm32 = getAsm32<HasDst, NumSrcArgs>.ret;
+ field string Asm64 = getAsm64<HasDst, NumSrcArgs, HasModifiers>.ret;
}
// FIXME: I think these F16/I16 profiles will need to use f16/i16 types in order
// for the instruction patterns to work.
-def VOP_F16_F16 : VOPProfile <[f32, f32, untyped, untyped]>;
-def VOP_F16_I16 : VOPProfile <[f32, i32, untyped, untyped]>;
-def VOP_I16_F16 : VOPProfile <[i32, f32, untyped, untyped]>;
+def VOP_F16_F16 : VOPProfile <[f16, f16, untyped, untyped]>;
+def VOP_F16_I16 : VOPProfile <[f16, i32, untyped, untyped]>;
+def VOP_I16_F16 : VOPProfile <[i32, f16, untyped, untyped]>;
-def VOP_F16_F16_F16 : VOPProfile <[f32, f32, f32, untyped]>;
-def VOP_F16_F16_I16 : VOPProfile <[f32, f32, i32, untyped]>;
+def VOP_F16_F16_F16 : VOPProfile <[f16, f16, f16, untyped]>;
+def VOP_F16_F16_I16 : VOPProfile <[f16, f16, i32, untyped]>;
def VOP_I16_I16_I16 : VOPProfile <[i32, i32, i32, untyped]>;
+def VOP_NONE : VOPProfile <[untyped, untyped, untyped, untyped]>;
+
def VOP_F32_F32 : VOPProfile <[f32, f32, untyped, untyped]>;
def VOP_F32_F64 : VOPProfile <[f32, f64, untyped, untyped]>;
def VOP_F32_I32 : VOPProfile <[f32, i32, untyped, untyped]>;
@@ -1087,25 +1185,76 @@ def VOP_F64_F64_I32 : VOPProfile <[f64, f64, i32, untyped]>;
def VOP_I32_F32_F32 : VOPProfile <[i32, f32, f32, untyped]>;
def VOP_I32_F32_I32 : VOPProfile <[i32, f32, i32, untyped]>;
def VOP_I32_I32_I32 : VOPProfile <[i32, i32, i32, untyped]>;
-def VOP_I32_I32_I32_VCC : VOPProfile <[i32, i32, i32, untyped]> {
+
+// Write out to vcc or arbitrary SGPR.
+def VOP2b_I32_I1_I32_I32 : VOPProfile<[i32, i32, i32, untyped]> {
+ let Asm32 = "$dst, vcc, $src0, $src1";
+ let Asm64 = "$dst, $sdst, $src0, $src1";
+ let Outs32 = (outs DstRC:$dst);
+ let Outs64 = (outs DstRC:$dst, SReg_64:$sdst);
+}
+
+// Write out to vcc or arbitrary SGPR and read in from vcc or
+// arbitrary SGPR.
+def VOP2b_I32_I1_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1]> {
+ // We use VCSrc_32 to exclude literal constants, even though the
+ // encoding normally allows them since the implicit VCC use means
+ // using one would always violate the constant bus
+ // restriction. SGPRs are still allowed because it should
+ // technically be possible to use VCC again as src0.
let Src0RC32 = VCSrc_32;
+ let Asm32 = "$dst, vcc, $src0, $src1, vcc";
+ let Asm64 = "$dst, $sdst, $src0, $src1, $src2";
+ let Outs32 = (outs DstRC:$dst);
+ let Outs64 = (outs DstRC:$dst, SReg_64:$sdst);
+
+ // Suppress src2 implied by type since the 32-bit encoding uses an
+ // implicit VCC use.
+ let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1);
}
-def VOP_I1_F32_I32 : VOPProfile <[i1, f32, i32, untyped]> {
- let Ins64 = (ins InputModsNoDefault:$src0_modifiers, Src0RC64:$src0, Src1RC64:$src1);
- let Asm64 = "$dst, $src0_modifiers, $src1";
+class VOP3b_Profile<ValueType vt> : VOPProfile<[vt, vt, vt, vt]> {
+ let Outs64 = (outs DstRC:$vdst, SReg_64:$sdst);
+ let Asm64 = "$vdst, $sdst, $src0_modifiers, $src1_modifiers, $src2_modifiers"#"$clamp"#"$omod";
+}
+
+def VOP3b_F32_I1_F32_F32_F32 : VOP3b_Profile<f32> {
+ // FIXME: Hack to stop printing _e64
+ let DstRC = RegisterOperand<VGPR_32>;
}
-def VOP_I1_F64_I32 : VOPProfile <[i1, f64, i32, untyped]> {
+def VOP3b_F64_I1_F64_F64_F64 : VOP3b_Profile<f64> {
+ // FIXME: Hack to stop printing _e64
+ let DstRC = RegisterOperand<VReg_64>;
+}
+
+// VOPC instructions are a special case because for the 32-bit
+// encoding, we want to display the implicit vcc write as if it were
+// an explicit $dst.
+class VOPC_Profile<ValueType vt0, ValueType vt1 = vt0> : VOPProfile <[i1, vt0, vt1, untyped]> {
+ let Asm32 = "vcc, $src0, $src1";
+ // The destination for 32-bit encoding is implicit.
+ let HasDst32 = 0;
+}
+
+class VOPC_Class_Profile<ValueType vt> : VOPC_Profile<vt, i32> {
let Ins64 = (ins InputModsNoDefault:$src0_modifiers, Src0RC64:$src0, Src1RC64:$src1);
let Asm64 = "$dst, $src0_modifiers, $src1";
}
+def VOPC_I1_F32_F32 : VOPC_Profile<f32>;
+def VOPC_I1_F64_F64 : VOPC_Profile<f64>;
+def VOPC_I1_I32_I32 : VOPC_Profile<i32>;
+def VOPC_I1_I64_I64 : VOPC_Profile<i64>;
+
+def VOPC_I1_F32_I32 : VOPC_Class_Profile<f32>;
+def VOPC_I1_F64_I32 : VOPC_Class_Profile<f64>;
+
def VOP_I64_I64_I32 : VOPProfile <[i64, i64, i32, untyped]>;
def VOP_I64_I32_I64 : VOPProfile <[i64, i32, i64, untyped]>;
def VOP_I64_I64_I64 : VOPProfile <[i64, i64, i64, untyped]>;
def VOP_CNDMASK : VOPProfile <[i32, i32, i32, untyped]> {
- let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, VCCReg:$src2);
+ let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1);
let Ins64 = (ins Src0RC64:$src0, Src1RC64:$src1, SSrc_64:$src2);
let Asm64 = "$dst, $src0, $src1, $src2";
}
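Note on the VCSrc_32 comment inside VOP2b_I32_I1_I32_I32_I1 above: these subtargets allow at most one constant-bus read per VOP instruction, and both a literal constant and the implicit VCC carry-in count as bus reads, so a literal src0 can never be legal here; an SGPR src0 stays allowed only because src0 could be VCC itself, which adds no second read. A small sketch of that counting argument (the function and its parameters are assumptions for illustration, not part of the patch):

    // Sketch only: the 32-bit VOP2b encoding already reads VCC implicitly, so any
    // additional constant-bus source (a literal, or an SGPR other than VCC)
    // would exceed the one-read-per-instruction limit.
    static bool violatesConstantBusLimit(bool Src0IsLiteral, bool Src0IsNonVCCSGPR) {
      int DistinctBusReads = 1;                 // the implicit VCC carry-in
      if (Src0IsLiteral || Src0IsNonVCCSGPR)
        ++DistinctBusReads;                     // a second, distinct bus source
      return DistinctBusReads > 1;              // literals therefore always violate it
    }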
@@ -1119,13 +1268,60 @@ def VOP_MAC : VOPProfile <[f32, f32, f32, f32]> {
let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, VGPR_32:$src2);
let Ins64 = getIns64<Src0RC64, Src1RC64, RegisterOperand<VGPR_32>, 3,
HasModifiers>.ret;
- let Asm32 = getAsm32<2>.ret;
- let Asm64 = getAsm64<2, HasModifiers>.ret;
+ let Asm32 = getAsm32<1, 2>.ret;
+ let Asm64 = getAsm64<1, 2, HasModifiers>.ret;
}
def VOP_F64_F64_F64_F64 : VOPProfile <[f64, f64, f64, f64]>;
def VOP_I32_I32_I32_I32 : VOPProfile <[i32, i32, i32, i32]>;
def VOP_I64_I32_I32_I64 : VOPProfile <[i64, i32, i32, i64]>;
+class SIInstAlias <string asm, Instruction inst, VOPProfile p> :
+ InstAlias <asm, (inst)>, PredicateControl {
+
+ field bit isCompare;
+ field bit isCommutable;
+
+ let ResultInst =
+ !if (p.HasDst32,
+ !if (!eq(p.NumSrcArgs, 0),
+ // 1 dst, 0 src
+ (inst p.DstRC:$dst),
+ !if (!eq(p.NumSrcArgs, 1),
+ // 1 dst, 1 src
+ (inst p.DstRC:$dst, p.Src0RC32:$src0),
+ !if (!eq(p.NumSrcArgs, 2),
+ // 1 dst, 2 src
+ (inst p.DstRC:$dst, p.Src0RC32:$src0, p.Src1RC32:$src1),
+ // else - unreachable
+ (inst)))),
+ // else
+ !if (!eq(p.NumSrcArgs, 2),
+ // 0 dst, 2 src
+ (inst p.Src0RC32:$src0, p.Src1RC32:$src1),
+ !if (!eq(p.NumSrcArgs, 1),
+ // 0 dst, 1 src
+ (inst p.Src0RC32:$src1),
+ // else
+ // 0 dst, 0 src
+ (inst))));
+}
+
+class SIInstAliasSI <string asm, string op_name, VOPProfile p> :
+ SIInstAlias <asm, !cast<Instruction>(op_name#"_e32_si"), p> {
+ let AssemblerPredicate = SIAssemblerPredicate;
+}
+
+class SIInstAliasVI <string asm, string op_name, VOPProfile p> :
+ SIInstAlias <asm, !cast<Instruction>(op_name#"_e32_vi"), p> {
+ let AssemblerPredicates = [isVI];
+}
+
+multiclass SIInstAliasBuilder <string asm, VOPProfile p> {
+
+ def : SIInstAliasSI <asm, NAME, p>;
+
+ def : SIInstAliasVI <asm, NAME, p>;
+}
class VOP <string opName> {
string OpName = opName;
@@ -1165,20 +1361,22 @@ class VOP1_Real_vi <string opName, vop1 op, dag outs, dag ins, string asm> :
let AssemblerPredicates = [isVI];
}
-multiclass VOP1_m <vop1 op, dag outs, dag ins, string asm, list<dag> pattern,
- string opName> {
- def "" : VOP1_Pseudo <outs, ins, pattern, opName>;
+multiclass VOP1_m <vop1 op, string opName, VOPProfile p, list<dag> pattern,
+ string asm = opName#p.Asm32> {
+ def "" : VOP1_Pseudo <p.Outs, p.Ins32, pattern, opName>;
- def _si : VOP1_Real_si <opName, op, outs, ins, asm>;
+ def _si : VOP1_Real_si <opName, op, p.Outs, p.Ins32, asm>;
+
+ def _vi : VOP1_Real_vi <opName, op, p.Outs, p.Ins32, asm>;
- def _vi : VOP1_Real_vi <opName, op, outs, ins, asm>;
}
-multiclass VOP1SI_m <vop1 op, dag outs, dag ins, string asm, list<dag> pattern,
- string opName> {
- def "" : VOP1_Pseudo <outs, ins, pattern, opName>;
+multiclass VOP1SI_m <vop1 op, string opName, VOPProfile p, list<dag> pattern,
+ string asm = opName#p.Asm32> {
+
+ def "" : VOP1_Pseudo <p.Outs, p.Ins32, pattern, opName>;
- def _si : VOP1_Real_si <opName, op, outs, ins, asm>;
+ def _si : VOP1_Real_si <opName, op, p.Outs, p.Ins32, asm>;
}
class VOP2_Pseudo <dag outs, dag ins, list<dag> pattern, string opName> :
@@ -1202,22 +1400,24 @@ class VOP2_Real_vi <string opName, vop2 op, dag outs, dag ins, string asm> :
let AssemblerPredicates = [isVI];
}
-multiclass VOP2SI_m <vop2 op, dag outs, dag ins, string asm, list<dag> pattern,
- string opName, string revOp> {
- def "" : VOP2_Pseudo <outs, ins, pattern, opName>,
+multiclass VOP2SI_m <vop2 op, string opName, VOPProfile p, list<dag> pattern,
+ string revOp> {
+
+ def "" : VOP2_Pseudo <p.Outs32, p.Ins32, pattern, opName>,
VOP2_REV<revOp#"_e32", !eq(revOp, opName)>;
- def _si : VOP2_Real_si <opName, op, outs, ins, asm>;
+ def _si : VOP2_Real_si <opName, op, p.Outs32, p.Ins32, p.Asm32>;
}
-multiclass VOP2_m <vop2 op, dag outs, dag ins, string asm, list<dag> pattern,
- string opName, string revOp> {
- def "" : VOP2_Pseudo <outs, ins, pattern, opName>,
+multiclass VOP2_m <vop2 op, string opName, VOPProfile p, list <dag> pattern,
+ string revOp> {
+
+ def "" : VOP2_Pseudo <p.Outs32, p.Ins32, pattern, opName>,
VOP2_REV<revOp#"_e32", !eq(revOp, opName)>;
- def _si : VOP2_Real_si <opName, op, outs, ins, asm>;
+ def _si : VOP2_Real_si <opName, op, p.Outs32, p.Ins32, p.Asm32>;
- def _vi : VOP2_Real_vi <opName, op, outs, ins, asm>;
+ def _vi : VOP2_Real_vi <opName, op, p.Outs32, p.Ins32, p.Asm32>;
}
@@ -1250,6 +1450,9 @@ class VOP3_Pseudo <dag outs, dag ins, list<dag> pattern, string opName> :
MnemonicAlias<opName#"_e64", opName> {
let isPseudo = 1;
let isCodeGenOnly = 1;
+
+ field bit vdst;
+ field bit src0;
}
class VOP3_Real_si <bits<9> op, dag outs, dag ins, string asm, string opName> :
@@ -1295,22 +1498,6 @@ multiclass VOP3_m <vop op, dag outs, dag ins, string asm, list<dag> pattern,
HasMods>;
}
-// VOP3_m without source modifiers
-multiclass VOP3_m_nomods <vop op, dag outs, dag ins, string asm, list<dag> pattern,
- string opName, int NumSrcArgs, bit HasMods = 1> {
-
- def "" : VOP3_Pseudo <outs, ins, pattern, opName>;
-
- let src0_modifiers = 0,
- src1_modifiers = 0,
- src2_modifiers = 0,
- clamp = 0,
- omod = 0 in {
- def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName>;
- def _vi : VOP3_Real_vi <op.VI3, outs, ins, asm, opName>;
- }
-}
-
multiclass VOP3_1_m <vop op, dag outs, dag ins, string asm,
list<dag> pattern, string opName, bit HasMods = 1> {
@@ -1335,7 +1522,7 @@ multiclass VOP3SI_1_m <vop op, dag outs, dag ins, string asm,
multiclass VOP3_2_m <vop op, dag outs, dag ins, string asm,
list<dag> pattern, string opName, string revOp,
- bit HasMods = 1, bit UseFullOp = 0> {
+ bit HasMods = 1> {
def "" : VOP3_Pseudo <outs, ins, pattern, opName>,
VOP2_REV<revOp#"_e64", !eq(revOp, opName)>;
@@ -1349,7 +1536,7 @@ multiclass VOP3_2_m <vop op, dag outs, dag ins, string asm,
multiclass VOP3SI_2_m <vop op, dag outs, dag ins, string asm,
list<dag> pattern, string opName, string revOp,
- bit HasMods = 1, bit UseFullOp = 0> {
+ bit HasMods = 1> {
def "" : VOP3_Pseudo <outs, ins, pattern, opName>,
VOP2_REV<revOp#"_e64", !eq(revOp, opName)>;
@@ -1360,54 +1547,41 @@ multiclass VOP3SI_2_m <vop op, dag outs, dag ins, string asm,
// No VI instruction. This class is for SI only.
}
-// XXX - Is v_div_scale_{f32|f64} only available in vop3b without
-// option of implicit vcc use?
-multiclass VOP3b_2_m <vop op, dag outs, dag ins, string asm,
- list<dag> pattern, string opName, string revOp,
- bit HasMods = 1, bit UseFullOp = 0> {
- def "" : VOP3_Pseudo <outs, ins, pattern, opName>,
- VOP2_REV<revOp#"_e64", !eq(revOp, opName)>;
-
- // The VOP2 variant puts the carry out into VCC, the VOP3 variant
- // can write it into any SGPR. We currently don't use the carry out,
- // so for now hardcode it to VCC as well.
- let sdst = SIOperand.VCC, Defs = [VCC] in {
- def _si : VOP3b_Real_si <op.SI3, outs, ins, asm, opName>,
- VOP3DisableFields<1, 0, HasMods>;
-
- def _vi : VOP3b_Real_vi <op.VI3, outs, ins, asm, opName>,
- VOP3DisableFields<1, 0, HasMods>;
- } // End sdst = SIOperand.VCC, Defs = [VCC]
-}
-
-multiclass VOP3b_3_m <vop op, dag outs, dag ins, string asm,
- list<dag> pattern, string opName, string revOp,
- bit HasMods = 1, bit UseFullOp = 0> {
+// Two operand VOP3b instruction that may have a 3rd SGPR bool operand
+// instead of an implicit VCC as in the VOP2b format.
+multiclass VOP3b_2_3_m <vop op, dag outs, dag ins, string asm,
+ list<dag> pattern, string opName, string revOp,
+ bit HasMods = 1, bit useSrc2Input = 0> {
def "" : VOP3_Pseudo <outs, ins, pattern, opName>;
-
def _si : VOP3b_Real_si <op.SI3, outs, ins, asm, opName>,
- VOP3DisableFields<1, 1, HasMods>;
+ VOP3DisableFields<1, useSrc2Input, HasMods>;
def _vi : VOP3b_Real_vi <op.VI3, outs, ins, asm, opName>,
- VOP3DisableFields<1, 1, HasMods>;
+ VOP3DisableFields<1, useSrc2Input, HasMods>;
}
multiclass VOP3_C_m <vop op, dag outs, dag ins, string asm,
list<dag> pattern, string opName,
- bit HasMods, bit defExec, string revOp> {
+ bit HasMods, bit defExec,
+ string revOp, list<SchedReadWrite> sched> {
def "" : VOP3_Pseudo <outs, ins, pattern, opName>,
- VOP2_REV<revOp#"_e64", !eq(revOp, opName)>;
+ VOP2_REV<revOp#"_e64", !eq(revOp, opName)> {
+ let Defs = !if(defExec, [EXEC], []);
+ let SchedRW = sched;
+ }
def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName>,
VOP3DisableFields<1, 0, HasMods> {
let Defs = !if(defExec, [EXEC], []);
+ let SchedRW = sched;
}
def _vi : VOP3_Real_vi <op.VI3, outs, ins, asm, opName>,
VOP3DisableFields<1, 0, HasMods> {
let Defs = !if(defExec, [EXEC], []);
+ let SchedRW = sched;
}
}
@@ -1432,32 +1606,28 @@ multiclass VOP2SI_3VI_m <vop3 op, string opName, dag outs, dag ins,
}
}
-multiclass VOP1_Helper <vop1 op, string opName, dag outs,
- dag ins32, string asm32, list<dag> pat32,
- dag ins64, string asm64, list<dag> pat64,
- bit HasMods> {
+multiclass VOP1_Helper <vop1 op, string opName, VOPProfile p, list<dag> pat32,
+ list<dag> pat64> {
- defm _e32 : VOP1_m <op, outs, ins32, opName#asm32, pat32, opName>;
+ defm _e32 : VOP1_m <op, opName, p, pat32>;
- defm _e64 : VOP3_1_m <op, outs, ins64, opName#asm64, pat64, opName, HasMods>;
+ defm _e64 : VOP3_1_m <op, p.Outs, p.Ins64, opName#p.Asm64, pat64, opName,
+ p.HasModifiers>;
}
multiclass VOP1Inst <vop1 op, string opName, VOPProfile P,
SDPatternOperator node = null_frag> : VOP1_Helper <
- op, opName, P.Outs,
- P.Ins32, P.Asm32, [],
- P.Ins64, P.Asm64,
+ op, opName, P, [],
!if(P.HasModifiers,
[(set P.DstVT:$dst, (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0,
i32:$src0_modifiers, i1:$clamp, i32:$omod))))],
- [(set P.DstVT:$dst, (node P.Src0VT:$src0))]),
- P.HasModifiers
+ [(set P.DstVT:$dst, (node P.Src0VT:$src0))])
>;
multiclass VOP1InstSI <vop1 op, string opName, VOPProfile P,
SDPatternOperator node = null_frag> {
- defm _e32 : VOP1SI_m <op, P.Outs, P.Ins32, opName#P.Asm32, [], opName>;
+ defm _e32 : VOP1SI_m <op, opName, P, []>;
defm _e64 : VOP3SI_1_m <op, P.Outs, P.Ins64, opName#P.Asm64,
!if(P.HasModifiers,
@@ -1467,36 +1637,33 @@ multiclass VOP1InstSI <vop1 op, string opName, VOPProfile P,
opName, P.HasModifiers>;
}
-multiclass VOP2_Helper <vop2 op, string opName, dag outs,
- dag ins32, string asm32, list<dag> pat32,
- dag ins64, string asm64, list<dag> pat64,
- string revOp, bit HasMods> {
- defm _e32 : VOP2_m <op, outs, ins32, asm32, pat32, opName, revOp>;
+multiclass VOP2_Helper <vop2 op, string opName, VOPProfile p, list<dag> pat32,
+ list<dag> pat64, string revOp> {
- defm _e64 : VOP3_2_m <op,
- outs, ins64, opName#asm64, pat64, opName, revOp, HasMods
- >;
+ defm _e32 : VOP2_m <op, opName, p, pat32, revOp>;
+
+ defm _e64 : VOP3_2_m <op, p.Outs, p.Ins64, opName#p.Asm64, pat64, opName,
+ revOp, p.HasModifiers>;
}
multiclass VOP2Inst <vop2 op, string opName, VOPProfile P,
SDPatternOperator node = null_frag,
string revOp = opName> : VOP2_Helper <
- op, opName, P.Outs,
- P.Ins32, P.Asm32, [],
- P.Ins64, P.Asm64,
+ op, opName, P, [],
!if(P.HasModifiers,
[(set P.DstVT:$dst,
(node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers,
i1:$clamp, i32:$omod)),
(P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))],
[(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1))]),
- revOp, P.HasModifiers
+ revOp
>;
multiclass VOP2InstSI <vop2 op, string opName, VOPProfile P,
SDPatternOperator node = null_frag,
string revOp = opName> {
- defm _e32 : VOP2SI_m <op, P.Outs, P.Ins32, P.Asm32, [], opName, revOp>;
+
+ defm _e32 : VOP2SI_m <op, opName, P, [], revOp>;
defm _e64 : VOP3SI_2_m <op, P.Outs, P.Ins64, opName#P.Asm64,
!if(P.HasModifiers,
@@ -1508,58 +1675,55 @@ multiclass VOP2InstSI <vop2 op, string opName, VOPProfile P,
opName, revOp, P.HasModifiers>;
}
-multiclass VOP2b_Helper <vop2 op, string opName, dag outs,
- dag ins32, string asm32, list<dag> pat32,
- dag ins64, string asm64, list<dag> pat64,
- string revOp, bit HasMods> {
+multiclass VOP2b_Helper <vop2 op, string opName, VOPProfile p,
+ list<dag> pat32, list<dag> pat64,
+ string revOp, bit useSGPRInput> {
- defm _e32 : VOP2_m <op, outs, ins32, asm32, pat32, opName, revOp>;
+ let SchedRW = [Write32Bit, WriteSALU] in {
+ let Uses = !if(useSGPRInput, [VCC, EXEC], [EXEC]), Defs = [VCC] in {
+ defm _e32 : VOP2_m <op, opName, p, pat32, revOp>;
+ }
- defm _e64 : VOP3b_2_m <op,
- outs, ins64, opName#asm64, pat64, opName, revOp, HasMods
- >;
+ defm _e64 : VOP3b_2_3_m <op, p.Outs64, p.Ins64, opName#p.Asm64, pat64,
+ opName, revOp, p.HasModifiers, useSGPRInput>;
+ }
}
multiclass VOP2bInst <vop2 op, string opName, VOPProfile P,
SDPatternOperator node = null_frag,
string revOp = opName> : VOP2b_Helper <
- op, opName, P.Outs,
- P.Ins32, P.Asm32, [],
- P.Ins64, P.Asm64,
+ op, opName, P, [],
!if(P.HasModifiers,
[(set P.DstVT:$dst,
(node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers,
i1:$clamp, i32:$omod)),
(P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))],
[(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1))]),
- revOp, P.HasModifiers
+ revOp, !eq(P.NumSrcArgs, 3)
>;
// A VOP2 instruction that is VOP3-only on VI.
-multiclass VOP2_VI3_Helper <vop23 op, string opName, dag outs,
- dag ins32, string asm32, list<dag> pat32,
- dag ins64, string asm64, list<dag> pat64,
- string revOp, bit HasMods> {
- defm _e32 : VOP2SI_m <op, outs, ins32, asm32, pat32, opName, revOp>;
+multiclass VOP2_VI3_Helper <vop23 op, string opName, VOPProfile p,
+ list<dag> pat32, list<dag> pat64, string revOp> {
- defm _e64 : VOP3_2_m <op, outs, ins64, opName#asm64, pat64, opName,
- revOp, HasMods>;
+ defm _e32 : VOP2SI_m <op, opName, p, pat32, revOp>;
+
+ defm _e64 : VOP3_2_m <op, p.Outs, p.Ins64, opName#p.Asm64, pat64, opName,
+ revOp, p.HasModifiers>;
}
multiclass VOP2_VI3_Inst <vop23 op, string opName, VOPProfile P,
SDPatternOperator node = null_frag,
string revOp = opName>
: VOP2_VI3_Helper <
- op, opName, P.Outs,
- P.Ins32, P.Asm32, [],
- P.Ins64, P.Asm64,
+ op, opName, P, [],
!if(P.HasModifiers,
[(set P.DstVT:$dst,
(node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers,
i1:$clamp, i32:$omod)),
(P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))],
[(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1))]),
- revOp, P.HasModifiers
+ revOp
>;
multiclass VOP2MADK <vop2 op, string opName, list<dag> pattern = []> {
@@ -1583,64 +1747,75 @@ let isCodeGenOnly = 0 in {
} // End isCodeGenOnly = 0
}
-class VOPC_Pseudo <dag outs, dag ins, list<dag> pattern, string opName> :
+class VOPC_Pseudo <dag ins, list<dag> pattern, string opName> :
VOPCCommon <ins, "", pattern>,
VOP <opName>,
- SIMCInstr<opName#"_e32", SISubtarget.NONE>,
- MnemonicAlias<opName#"_e32", opName> {
+ SIMCInstr<opName#"_e32", SISubtarget.NONE> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
-multiclass VOPC_m <vopc op, dag outs, dag ins, string asm, list<dag> pattern,
- string opName, bit DefExec, string revOpName = ""> {
- def "" : VOPC_Pseudo <outs, ins, pattern, opName>;
-
- def _si : VOPC<op.SI, ins, asm, []>,
- SIMCInstr <opName#"_e32", SISubtarget.SI> {
- let Defs = !if(DefExec, [EXEC], []);
- let hasSideEffects = DefExec;
- let AssemblerPredicates = [isSICI];
+multiclass VOPC_m <vopc op, dag ins, string op_asm, list<dag> pattern,
+ string opName, bit DefExec, VOPProfile p,
+ list<SchedReadWrite> sched,
+ string revOpName = "", string asm = opName#"_e32 "#op_asm,
+ string alias_asm = opName#" "#op_asm> {
+ def "" : VOPC_Pseudo <ins, pattern, opName> {
+ let Defs = !if(DefExec, [VCC, EXEC], [VCC]);
+ let SchedRW = sched;
}
- def _vi : VOPC<op.VI, ins, asm, []>,
- SIMCInstr <opName#"_e32", SISubtarget.VI> {
- let Defs = !if(DefExec, [EXEC], []);
- let hasSideEffects = DefExec;
- let AssemblerPredicates = [isVI];
- }
+ let AssemblerPredicates = [isSICI] in {
+ def _si : VOPC<op.SI, ins, asm, []>,
+ SIMCInstr <opName#"_e32", SISubtarget.SI> {
+ let Defs = !if(DefExec, [VCC, EXEC], [VCC]);
+ let hasSideEffects = DefExec;
+ let SchedRW = sched;
+ }
+
+ } // End AssemblerPredicates = [isSICI]
+
+ let AssemblerPredicates = [isVI] in {
+ def _vi : VOPC<op.VI, ins, asm, []>,
+ SIMCInstr <opName#"_e32", SISubtarget.VI> {
+ let Defs = !if(DefExec, [VCC, EXEC], [VCC]);
+ let hasSideEffects = DefExec;
+ let SchedRW = sched;
+ }
+
+ } // End AssemblerPredicates = [isVI]
+
+ defm : SIInstAliasBuilder<alias_asm, p>;
}
-multiclass VOPC_Helper <vopc op, string opName,
- dag ins32, string asm32, list<dag> pat32,
- dag out64, dag ins64, string asm64, list<dag> pat64,
- bit HasMods, bit DefExec, string revOp> {
- defm _e32 : VOPC_m <op, (outs), ins32, opName#asm32, pat32, opName, DefExec>;
+multiclass VOPC_Helper <vopc op, string opName, list<dag> pat32,
+ list<dag> pat64, bit DefExec, string revOp,
+ VOPProfile p, list<SchedReadWrite> sched> {
+ defm _e32 : VOPC_m <op, p.Ins32, p.Asm32, pat32, opName, DefExec, p, sched>;
- defm _e64 : VOP3_C_m <op, out64, ins64, opName#asm64, pat64,
- opName, HasMods, DefExec, revOp>;
+ defm _e64 : VOP3_C_m <op, (outs VOPDstS64:$dst), p.Ins64, opName#p.Asm64, pat64,
+ opName, p.HasModifiers, DefExec, revOp, sched>;
}
// Special case for class instructions which only have modifiers on
// the 1st source operand.
-multiclass VOPC_Class_Helper <vopc op, string opName,
- dag ins32, string asm32, list<dag> pat32,
- dag out64, dag ins64, string asm64, list<dag> pat64,
- bit HasMods, bit DefExec, string revOp> {
- defm _e32 : VOPC_m <op, (outs), ins32, opName#asm32, pat32, opName, DefExec>;
-
- defm _e64 : VOP3_C_m <op, out64, ins64, opName#asm64, pat64,
- opName, HasMods, DefExec, revOp>,
+multiclass VOPC_Class_Helper <vopc op, string opName, list<dag> pat32,
+ list<dag> pat64, bit DefExec, string revOp,
+ VOPProfile p, list<SchedReadWrite> sched> {
+ defm _e32 : VOPC_m <op, p.Ins32, p.Asm32, pat32, opName, DefExec, p, sched>;
+
+ defm _e64 : VOP3_C_m <op, (outs VOPDstS64:$dst), p.Ins64, opName#p.Asm64, pat64,
+ opName, p.HasModifiers, DefExec, revOp, sched>,
VOP3DisableModFields<1, 0, 0>;
}
multiclass VOPCInst <vopc op, string opName,
VOPProfile P, PatLeaf cond = COND_NULL,
string revOp = opName,
- bit DefExec = 0> : VOPC_Helper <
- op, opName,
- P.Ins32, P.Asm32, [],
- (outs VOPDstS64:$dst), P.Ins64, P.Asm64,
+ bit DefExec = 0,
+ list<SchedReadWrite> sched = [Write32Bit]> :
+ VOPC_Helper <
+ op, opName, [],
!if(P.HasModifiers,
[(set i1:$dst,
(setcc (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers,
@@ -1648,51 +1823,51 @@ multiclass VOPCInst <vopc op, string opName,
(P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)),
cond))],
[(set i1:$dst, (setcc P.Src0VT:$src0, P.Src1VT:$src1, cond))]),
- P.HasModifiers, DefExec, revOp
+ DefExec, revOp, P, sched
>;
multiclass VOPCClassInst <vopc op, string opName, VOPProfile P,
- bit DefExec = 0> : VOPC_Class_Helper <
- op, opName,
- P.Ins32, P.Asm32, [],
- (outs VOPDstS64:$dst), P.Ins64, P.Asm64,
+ bit DefExec = 0,
+ list<SchedReadWrite> sched> : VOPC_Class_Helper <
+ op, opName, [],
!if(P.HasModifiers,
[(set i1:$dst,
(AMDGPUfp_class (P.Src0VT (VOP3Mods0Clamp0OMod P.Src0VT:$src0, i32:$src0_modifiers)), P.Src1VT:$src1))],
[(set i1:$dst, (AMDGPUfp_class P.Src0VT:$src0, P.Src1VT:$src1))]),
- P.HasModifiers, DefExec, opName
+ DefExec, opName, P, sched
>;
multiclass VOPC_F32 <vopc op, string opName, PatLeaf cond = COND_NULL, string revOp = opName> :
- VOPCInst <op, opName, VOP_F32_F32_F32, cond, revOp>;
+ VOPCInst <op, opName, VOPC_I1_F32_F32, cond, revOp>;
multiclass VOPC_F64 <vopc op, string opName, PatLeaf cond = COND_NULL, string revOp = opName> :
- VOPCInst <op, opName, VOP_F64_F64_F64, cond, revOp>;
+ VOPCInst <op, opName, VOPC_I1_F64_F64, cond, revOp, 0, [WriteDoubleAdd]>;
multiclass VOPC_I32 <vopc op, string opName, PatLeaf cond = COND_NULL, string revOp = opName> :
- VOPCInst <op, opName, VOP_I32_I32_I32, cond, revOp>;
+ VOPCInst <op, opName, VOPC_I1_I32_I32, cond, revOp>;
multiclass VOPC_I64 <vopc op, string opName, PatLeaf cond = COND_NULL, string revOp = opName> :
- VOPCInst <op, opName, VOP_I64_I64_I64, cond, revOp>;
+ VOPCInst <op, opName, VOPC_I1_I64_I64, cond, revOp, 0, [Write64Bit]>;
multiclass VOPCX <vopc op, string opName, VOPProfile P,
PatLeaf cond = COND_NULL,
+ list<SchedReadWrite> sched,
string revOp = "">
- : VOPCInst <op, opName, P, cond, revOp, 1>;
+ : VOPCInst <op, opName, P, cond, revOp, 1, sched>;
multiclass VOPCX_F32 <vopc op, string opName, string revOp = opName> :
- VOPCX <op, opName, VOP_F32_F32_F32, COND_NULL, revOp>;
+ VOPCX <op, opName, VOPC_I1_F32_F32, COND_NULL, [Write32Bit], revOp>;
multiclass VOPCX_F64 <vopc op, string opName, string revOp = opName> :
- VOPCX <op, opName, VOP_F64_F64_F64, COND_NULL, revOp>;
+ VOPCX <op, opName, VOPC_I1_F64_F64, COND_NULL, [WriteDoubleAdd], revOp>;
multiclass VOPCX_I32 <vopc op, string opName, string revOp = opName> :
- VOPCX <op, opName, VOP_I32_I32_I32, COND_NULL, revOp>;
+ VOPCX <op, opName, VOPC_I1_I32_I32, COND_NULL, [Write32Bit], revOp>;
multiclass VOPCX_I64 <vopc op, string opName, string revOp = opName> :
- VOPCX <op, opName, VOP_I64_I64_I64, COND_NULL, revOp>;
+ VOPCX <op, opName, VOPC_I1_I64_I64, COND_NULL, [Write64Bit], revOp>;
multiclass VOP3_Helper <vop3 op, string opName, dag outs, dag ins, string asm,
list<dag> pat, int NumSrcArgs, bit HasMods> : VOP3_m <
@@ -1700,16 +1875,16 @@ multiclass VOP3_Helper <vop3 op, string opName, dag outs, dag ins, string asm,
>;
multiclass VOPC_CLASS_F32 <vopc op, string opName> :
- VOPCClassInst <op, opName, VOP_I1_F32_I32, 0>;
+ VOPCClassInst <op, opName, VOPC_I1_F32_I32, 0, [Write32Bit]>;
multiclass VOPCX_CLASS_F32 <vopc op, string opName> :
- VOPCClassInst <op, opName, VOP_I1_F32_I32, 1>;
+ VOPCClassInst <op, opName, VOPC_I1_F32_I32, 1, [Write32Bit]>;
multiclass VOPC_CLASS_F64 <vopc op, string opName> :
- VOPCClassInst <op, opName, VOP_I1_F64_I32, 0>;
+ VOPCClassInst <op, opName, VOPC_I1_F64_I32, 0, [WriteDoubleAdd]>;
multiclass VOPCX_CLASS_F64 <vopc op, string opName> :
- VOPCClassInst <op, opName, VOP_I1_F64_I32, 1>;
+ VOPCClassInst <op, opName, VOPC_I1_F64_I32, 1, [WriteDoubleAdd]>;
multiclass VOP3Inst <vop3 op, string opName, VOPProfile P,
SDPatternOperator node = null_frag> : VOP3_Helper <
@@ -1761,25 +1936,13 @@ multiclass VOP3_VCC_Inst <vop3 op, string opName,
3, 1
>;
-multiclass VOP3b_Helper <vop op, RegisterClass vrc, RegisterOperand arc,
- string opName, list<dag> pattern> :
- VOP3b_3_m <
- op, (outs vrc:$vdst, SReg_64:$sdst),
- (ins InputModsNoDefault:$src0_modifiers, arc:$src0,
- InputModsNoDefault:$src1_modifiers, arc:$src1,
- InputModsNoDefault:$src2_modifiers, arc:$src2,
- ClampMod:$clamp, omod:$omod),
- opName#" $vdst, $sdst, $src0_modifiers, $src1_modifiers, $src2_modifiers"#"$clamp"#"$omod", pattern,
- opName, opName, 1, 1
+multiclass VOP3bInst <vop op, string opName, VOPProfile P, list<dag> pattern = []> :
+ VOP3b_2_3_m <
+ op, P.Outs64, P.Ins64,
+ opName#" "#P.Asm64, pattern,
+ opName, "", 1, 1
>;
-multiclass VOP3b_64 <vop3 op, string opName, list<dag> pattern> :
- VOP3b_Helper <op, VReg_64, VSrc_64, opName, pattern>;
-
-multiclass VOP3b_32 <vop3 op, string opName, list<dag> pattern> :
- VOP3b_Helper <op, VGPR_32, VSrc_32, opName, pattern>;
-
-
class Vop3ModPat<Instruction Inst, VOPProfile P, SDPatternOperator node> : Pat<
(node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)),
(P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)),
@@ -1925,12 +2088,14 @@ multiclass DS_1A1D_RET <bits<8> op, string opName, RegisterClass rc,
dag ins = (ins VGPR_32:$addr, rc:$data0, ds_offset:$offset, gds:$gds),
string asm = opName#" $vdst, $addr, $data0"#"$offset$gds"> {
- def "" : DS_Pseudo <opName, outs, ins, []>,
- AtomicNoRet<noRetOp, 1>;
+ let hasPostISelHook = 1 in {
+ def "" : DS_Pseudo <opName, outs, ins, []>,
+ AtomicNoRet<noRetOp, 1>;
- let data1 = 0 in {
- def _si : DS_Off16_Real_si <op, opName, outs, ins, asm>;
- def _vi : DS_Off16_Real_vi <op, opName, outs, ins, asm>;
+ let data1 = 0 in {
+ def _si : DS_Off16_Real_si <op, opName, outs, ins, asm>;
+ def _vi : DS_Off16_Real_vi <op, opName, outs, ins, asm>;
+ }
}
}
@@ -1939,11 +2104,13 @@ multiclass DS_1A2D_RET_m <bits<8> op, string opName, RegisterClass rc,
dag outs = (outs rc:$vdst),
string asm = opName#" $vdst, $addr, $data0, $data1"#"$offset"#"$gds"> {
- def "" : DS_Pseudo <opName, outs, ins, []>,
- AtomicNoRet<noRetOp, 1>;
+ let hasPostISelHook = 1 in {
+ def "" : DS_Pseudo <opName, outs, ins, []>,
+ AtomicNoRet<noRetOp, 1>;
- def _si : DS_Off16_Real_si <op, opName, outs, ins, asm>;
- def _vi : DS_Off16_Real_vi <op, opName, outs, ins, asm>;
+ def _si : DS_Off16_Real_si <op, opName, outs, ins, asm>;
+ def _vi : DS_Off16_Real_vi <op, opName, outs, ins, asm>;
+ }
}
multiclass DS_1A2D_RET <bits<8> op, string asm, RegisterClass rc,
@@ -2214,7 +2381,7 @@ multiclass MUBUF_Atomic <mubuf op, string name, RegisterClass rc,
defm _ADDR64 : MUBUFAtomicAddr64_m <
op, name#"_addr64", (outs),
- (ins rc:$vdata, SReg_128:$srsrc, VReg_64:$vaddr,
+ (ins rc:$vdata, VReg_64:$vaddr, SReg_128:$srsrc,
SCSrc_32:$soffset, mbuf_offset:$offset, slc:$slc),
name#" $vdata, $vaddr, $srsrc, $soffset addr64"#"$offset"#"$slc", [], 0
>;
@@ -2233,7 +2400,7 @@ multiclass MUBUF_Atomic <mubuf op, string name, RegisterClass rc,
defm _RTN_ADDR64 : MUBUFAtomicAddr64_m <
op, name#"_rtn_addr64", (outs rc:$vdata),
- (ins rc:$vdata_in, SReg_128:$srsrc, VReg_64:$vaddr,
+ (ins rc:$vdata_in, VReg_64:$vaddr, SReg_128:$srsrc,
SCSrc_32:$soffset, mbuf_offset:$offset, slc:$slc),
name#" $vdata, $vaddr, $srsrc, $soffset addr64"#"$offset"#" glc"#"$slc",
[(set vt:$vdata,
@@ -2245,7 +2412,7 @@ multiclass MUBUF_Atomic <mubuf op, string name, RegisterClass rc,
op, name#"_rtn_offset", (outs rc:$vdata),
(ins rc:$vdata_in, SReg_128:$srsrc, SCSrc_32:$soffset,
mbuf_offset:$offset, slc:$slc),
- name#" $vdata, $srsrc, $soffset"#"$offset"#" glc $slc",
+ name#" $vdata, $srsrc, $soffset"#"$offset"#" glc$slc",
[(set vt:$vdata,
(atomic (MUBUFOffsetAtomic v4i32:$srsrc, i32:$soffset, i16:$offset,
i1:$slc), vt:$vdata_in))], 1
@@ -2256,6 +2423,8 @@ multiclass MUBUF_Atomic <mubuf op, string name, RegisterClass rc,
} // mayStore = 1, mayLoad = 1, hasPostISelHook = 1
}
+// FIXME: tfe can't be an operand because it requires a separate
+// opcode: it needs an N+1 register class dest register.
multiclass MUBUF_Load_Helper <mubuf op, string name, RegisterClass regClass,
ValueType load_vt = i32,
SDPatternOperator ld = null_frag> {
@@ -2368,47 +2537,121 @@ multiclass MUBUF_Store_Helper <mubuf op, string name, RegisterClass vdataClass,
} // End mayLoad = 0, mayStore = 1
}
-class FLAT_Load_Helper <bits<7> op, string asm, RegisterClass regClass> :
- FLAT <op, (outs regClass:$vdst),
- (ins VReg_64:$addr, glc_flat:$glc, slc_flat:$slc, tfe_flat:$tfe),
- asm#" $vdst, $addr"#"$glc"#"$slc"#"$tfe", []> {
- let data = 0;
- let mayLoad = 1;
+// For cache invalidation instructions.
+multiclass MUBUF_Invalidate <mubuf op, string opName, SDPatternOperator node> {
+ let hasSideEffects = 1, mayStore = 1, AsmMatchConverter = "" in {
+ def "" : MUBUF_Pseudo <opName, (outs), (ins), [(node)]>;
+
+ // Set everything to 0.
+ let offset = 0, offen = 0, idxen = 0, glc = 0, vaddr = 0,
+ vdata = 0, srsrc = 0, slc = 0, tfe = 0, soffset = 0 in {
+ let addr64 = 0 in {
+ def _si : MUBUF_Real_si <op, opName, (outs), (ins), opName>;
+ }
+
+ def _vi : MUBUF_Real_vi <op, opName, (outs), (ins), opName>;
+ }
+ } // End hasSideEffects = 1, mayStore = 1, AsmMatchConverter = ""
}
-class FLAT_Store_Helper <bits<7> op, string name, RegisterClass vdataClass> :
- FLAT <op, (outs), (ins vdataClass:$data, VReg_64:$addr,
- glc_flat:$glc, slc_flat:$slc, tfe_flat:$tfe),
- name#" $data, $addr"#"$glc"#"$slc"#"$tfe",
- []> {
+//===----------------------------------------------------------------------===//
+// FLAT classes
+//===----------------------------------------------------------------------===//
+
+class flat <bits<7> ci, bits<7> vi = ci> {
+ field bits<7> CI = ci;
+ field bits<7> VI = vi;
+}
- let mayLoad = 0;
- let mayStore = 1;
+class FLAT_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> :
+ FLAT <0, outs, ins, "", pattern>,
+ SIMCInstr<opName, SISubtarget.NONE> {
+ let isPseudo = 1;
+ let isCodeGenOnly = 1;
+}
- // Encoding
- let vdst = 0;
+class FLAT_Real_ci <bits<7> op, string opName, dag outs, dag ins, string asm> :
+ FLAT <op, outs, ins, asm, []>,
+ SIMCInstr<opName, SISubtarget.SI> {
+ let AssemblerPredicate = isCIOnly;
}
-multiclass FLAT_ATOMIC <bits<7> op, string name, RegisterClass vdst_rc,
- RegisterClass data_rc = vdst_rc> {
+class FLAT_Real_vi <bits<7> op, string opName, dag outs, dag ins, string asm> :
+ FLAT <op, outs, ins, asm, []>,
+ SIMCInstr<opName, SISubtarget.VI> {
+ let AssemblerPredicate = VIAssemblerPredicate;
+}
- let mayLoad = 1, mayStore = 1 in {
- def "" : FLAT <op, (outs),
- (ins VReg_64:$addr, data_rc:$data, slc_flat_atomic:$slc,
- tfe_flat_atomic:$tfe),
- name#" $addr, $data"#"$slc"#"$tfe", []>,
- AtomicNoRet <NAME, 0> {
- let glc = 0;
- let vdst = 0;
- }
+multiclass FLAT_AtomicRet_m <flat op, dag outs, dag ins, string asm,
+ list<dag> pattern> {
+ def "" : FLAT_Pseudo <NAME#"_RTN", outs, ins, pattern>,
+ AtomicNoRet <NAME, 1>;
- def _RTN : FLAT <op, (outs vdst_rc:$vdst),
- (ins VReg_64:$addr, data_rc:$data, slc_flat_atomic:$slc,
- tfe_flat_atomic:$tfe),
- name#" $vdst, $addr, $data glc"#"$slc"#"$tfe", []>,
- AtomicNoRet <NAME, 1> {
- let glc = 1;
- }
+ def _ci : FLAT_Real_ci <op.CI, NAME#"_RTN", outs, ins, asm>;
+
+ def _vi : FLAT_Real_vi <op.VI, NAME#"_RTN", outs, ins, asm>;
+}
+
+multiclass FLAT_Load_Helper <flat op, string asm_name,
+ RegisterClass regClass,
+ dag outs = (outs regClass:$vdst),
+ dag ins = (ins VReg_64:$addr, glc_flat:$glc, slc_flat:$slc, tfe_flat:$tfe),
+ string asm = asm_name#" $vdst, $addr"#"$glc"#"$slc"#"$tfe"> {
+
+ let data = 0, mayLoad = 1 in {
+
+ def "" : FLAT_Pseudo <NAME, outs, ins, []>;
+
+ def _ci : FLAT_Real_ci <op.CI, NAME, outs, ins, asm>;
+
+ def _vi : FLAT_Real_vi <op.VI, NAME, outs, ins, asm>;
+ }
+}
+
+multiclass FLAT_Store_Helper <flat op, string asm_name,
+ RegisterClass vdataClass,
+ dag outs = (outs),
+ dag ins = (ins vdataClass:$data, VReg_64:$addr, glc_flat:$glc,
+ slc_flat:$slc, tfe_flat:$tfe),
+ string asm = asm_name#" $data, $addr"#"$glc"#"$slc"#"$tfe"> {
+
+ let mayLoad = 0, mayStore = 1, vdst = 0 in {
+
+ def "" : FLAT_Pseudo <NAME, outs, ins, []>;
+
+ def _ci : FLAT_Real_ci <op.CI, NAME, outs, ins, asm>;
+
+ def _vi : FLAT_Real_vi <op.VI, NAME, outs, ins, asm>;
+ }
+}
+
+multiclass FLAT_ATOMIC <flat op, string asm_name, RegisterClass vdst_rc,
+ RegisterClass data_rc = vdst_rc,
+ dag outs_noret = (outs),
+ string asm_noret = asm_name#" $addr, $data"#"$slc"#"$tfe"> {
+
+ let mayLoad = 1, mayStore = 1, glc = 0, vdst = 0 in {
+ def "" : FLAT_Pseudo <NAME, outs_noret,
+ (ins VReg_64:$addr, data_rc:$data,
+ slc_flat_atomic:$slc, tfe_flat_atomic:$tfe), []>,
+ AtomicNoRet <NAME, 0>;
+
+ def _ci : FLAT_Real_ci <op.CI, NAME, outs_noret,
+ (ins VReg_64:$addr, data_rc:$data,
+ slc_flat_atomic:$slc, tfe_flat_atomic:$tfe),
+ asm_noret>;
+
+ def _vi : FLAT_Real_vi <op.VI, NAME, outs_noret,
+ (ins VReg_64:$addr, data_rc:$data,
+ slc_flat_atomic:$slc, tfe_flat_atomic:$tfe),
+ asm_noret>;
+ }
+
+ let glc = 1, hasPostISelHook = 1 in {
+ defm _RTN : FLAT_AtomicRet_m <op, (outs vdst_rc:$vdst),
+ (ins VReg_64:$addr, data_rc:$data, slc_flat_atomic:$slc,
+ tfe_flat_atomic:$tfe),
+ asm_name#" $vdst, $addr, $data glc"#"$slc"#"$tfe", []>;
}
}
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td b/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td
index e0eeea9..6f653c7 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -30,7 +30,9 @@ def isGCN : Predicate<"Subtarget->getGeneration() "
">= AMDGPUSubtarget::SOUTHERN_ISLANDS">,
AssemblerPredicate<"FeatureGCN">;
def isSI : Predicate<"Subtarget->getGeneration() "
- "== AMDGPUSubtarget::SOUTHERN_ISLANDS">;
+ "== AMDGPUSubtarget::SOUTHERN_ISLANDS">,
+ AssemblerPredicate<"FeatureSouthernIslands">;
+
def has16BankLDS : Predicate<"Subtarget->getLDSBankCount() == 16">;
def has32BankLDS : Predicate<"Subtarget->getLDSBankCount() == 32">;
@@ -62,36 +64,38 @@ let mayLoad = 1 in {
// We are using the SGPR_32 and not the SReg_32 register class for 32-bit
// SMRD instructions, because the SGPR_32 register class does not include M0
// and writing to M0 from an SMRD instruction will hang the GPU.
-defm S_LOAD_DWORD : SMRD_Helper <0x00, "s_load_dword", SReg_64, SGPR_32>;
-defm S_LOAD_DWORDX2 : SMRD_Helper <0x01, "s_load_dwordx2", SReg_64, SReg_64>;
-defm S_LOAD_DWORDX4 : SMRD_Helper <0x02, "s_load_dwordx4", SReg_64, SReg_128>;
-defm S_LOAD_DWORDX8 : SMRD_Helper <0x03, "s_load_dwordx8", SReg_64, SReg_256>;
-defm S_LOAD_DWORDX16 : SMRD_Helper <0x04, "s_load_dwordx16", SReg_64, SReg_512>;
+defm S_LOAD_DWORD : SMRD_Helper <smrd<0x00>, "s_load_dword", SReg_64, SGPR_32>;
+defm S_LOAD_DWORDX2 : SMRD_Helper <smrd<0x01>, "s_load_dwordx2", SReg_64, SReg_64>;
+defm S_LOAD_DWORDX4 : SMRD_Helper <smrd<0x02>, "s_load_dwordx4", SReg_64, SReg_128>;
+defm S_LOAD_DWORDX8 : SMRD_Helper <smrd<0x03>, "s_load_dwordx8", SReg_64, SReg_256>;
+defm S_LOAD_DWORDX16 : SMRD_Helper <smrd<0x04>, "s_load_dwordx16", SReg_64, SReg_512>;
defm S_BUFFER_LOAD_DWORD : SMRD_Helper <
- 0x08, "s_buffer_load_dword", SReg_128, SGPR_32
+ smrd<0x08>, "s_buffer_load_dword", SReg_128, SGPR_32
>;
defm S_BUFFER_LOAD_DWORDX2 : SMRD_Helper <
- 0x09, "s_buffer_load_dwordx2", SReg_128, SReg_64
+ smrd<0x09>, "s_buffer_load_dwordx2", SReg_128, SReg_64
>;
defm S_BUFFER_LOAD_DWORDX4 : SMRD_Helper <
- 0x0a, "s_buffer_load_dwordx4", SReg_128, SReg_128
+ smrd<0x0a>, "s_buffer_load_dwordx4", SReg_128, SReg_128
>;
defm S_BUFFER_LOAD_DWORDX8 : SMRD_Helper <
- 0x0b, "s_buffer_load_dwordx8", SReg_128, SReg_256
+ smrd<0x0b>, "s_buffer_load_dwordx8", SReg_128, SReg_256
>;
defm S_BUFFER_LOAD_DWORDX16 : SMRD_Helper <
- 0x0c, "s_buffer_load_dwordx16", SReg_128, SReg_512
+ smrd<0x0c>, "s_buffer_load_dwordx16", SReg_128, SReg_512
>;
} // mayLoad = 1
//def S_MEMTIME : SMRD_ <0x0000001e, "s_memtime", []>;
-//def S_DCACHE_INV : SMRD_ <0x0000001f, "s_dcache_inv", []>;
+
+defm S_DCACHE_INV : SMRD_Inval <smrd<0x1f, 0x20>, "s_dcache_inv",
+ int_amdgcn_s_dcache_inv>;
//===----------------------------------------------------------------------===//
// SOP1 Instructions
@@ -123,7 +127,7 @@ let Defs = [SCC] in {
defm S_BREV_B32 : SOP1_32 <sop1<0x0b, 0x08>, "s_brev_b32",
- [(set i32:$dst, (AMDGPUbrev i32:$src0))]
+ [(set i32:$dst, (bitreverse i32:$src0))]
>;
defm S_BREV_B64 : SOP1_64 <sop1<0x0c, 0x09>, "s_brev_b64", []>;
@@ -183,10 +187,14 @@ defm S_XNOR_SAVEEXEC_B64 : SOP1_64 <sop1<0x2b, 0x27>, "s_xnor_saveexec_b64", []>
defm S_QUADMASK_B32 : SOP1_32 <sop1<0x2c, 0x28>, "s_quadmask_b32", []>;
defm S_QUADMASK_B64 : SOP1_64 <sop1<0x2d, 0x29>, "s_quadmask_b64", []>;
+
+let Uses = [M0] in {
defm S_MOVRELS_B32 : SOP1_32 <sop1<0x2e, 0x2a>, "s_movrels_b32", []>;
defm S_MOVRELS_B64 : SOP1_64 <sop1<0x2f, 0x2b>, "s_movrels_b64", []>;
defm S_MOVRELD_B32 : SOP1_32 <sop1<0x30, 0x2c>, "s_movreld_b32", []>;
defm S_MOVRELD_B64 : SOP1_64 <sop1<0x31, 0x2d>, "s_movreld_b64", []>;
+} // End Uses = [M0]
+
defm S_CBRANCH_JOIN : SOP1_1 <sop1<0x32, 0x2e>, "s_cbranch_join", []>;
defm S_MOV_REGRD_B32 : SOP1_32 <sop1<0x33, 0x2f>, "s_mov_regrd_b32", []>;
let Defs = [SCC] in {
@@ -354,7 +362,7 @@ def S_CMP_LE_U32 : SOPC_32 <0x0000000b, "s_cmp_le_u32">;
// SOPK Instructions
//===----------------------------------------------------------------------===//
-let isReMaterializable = 1 in {
+let isReMaterializable = 1, isMoveImm = 1 in {
defm S_MOVK_I32 : SOPK_32 <sopk<0x00>, "s_movk_i32", []>;
} // End isReMaterializable = 1
let Uses = [SCC] in {
@@ -438,36 +446,38 @@ def S_BRANCH : SOPP <
let isBarrier = 1;
}
-let DisableEncoding = "$scc" in {
+let Uses = [SCC] in {
def S_CBRANCH_SCC0 : SOPP <
- 0x00000004, (ins sopp_brtarget:$simm16, SCCReg:$scc),
+ 0x00000004, (ins sopp_brtarget:$simm16),
"s_cbranch_scc0 $simm16"
>;
def S_CBRANCH_SCC1 : SOPP <
- 0x00000005, (ins sopp_brtarget:$simm16, SCCReg:$scc),
+ 0x00000005, (ins sopp_brtarget:$simm16),
"s_cbranch_scc1 $simm16"
>;
-} // End DisableEncoding = "$scc"
+} // End Uses = [SCC]
+let Uses = [VCC] in {
def S_CBRANCH_VCCZ : SOPP <
- 0x00000006, (ins sopp_brtarget:$simm16, VCCReg:$vcc),
+ 0x00000006, (ins sopp_brtarget:$simm16),
"s_cbranch_vccz $simm16"
>;
def S_CBRANCH_VCCNZ : SOPP <
- 0x00000007, (ins sopp_brtarget:$simm16, VCCReg:$vcc),
+ 0x00000007, (ins sopp_brtarget:$simm16),
"s_cbranch_vccnz $simm16"
>;
+} // End Uses = [VCC]
-let DisableEncoding = "$exec" in {
+let Uses = [EXEC] in {
def S_CBRANCH_EXECZ : SOPP <
- 0x00000008, (ins sopp_brtarget:$simm16, EXECReg:$exec),
+ 0x00000008, (ins sopp_brtarget:$simm16),
"s_cbranch_execz $simm16"
>;
def S_CBRANCH_EXECNZ : SOPP <
- 0x00000009, (ins sopp_brtarget:$simm16, EXECReg:$exec),
+ 0x00000009, (ins sopp_brtarget:$simm16),
"s_cbranch_execnz $simm16"
>;
-} // End DisableEncoding = "$exec"
+} // End Uses = [EXEC]
} // End isBranch = 1
@@ -477,11 +487,11 @@ let hasSideEffects = 1 in {
def S_BARRIER : SOPP <0x0000000a, (ins), "s_barrier",
[(int_AMDGPU_barrier_local)]
> {
+ let SchedRW = [WriteBarrier];
let simm16 = 0;
- let isBarrier = 1;
- let hasCtrlDep = 1;
let mayLoad = 1;
let mayStore = 1;
+ let isConvergent = 1;
}
def S_WAITCNT : SOPP <0x0000000c, (ins WAIT_FLAG:$simm16), "s_waitcnt $simm16">;
@@ -805,9 +815,6 @@ defm DS_CMPST_RTN_B32 : DS_1A2D_RET <0x30, "ds_cmpst_rtn_b32", VGPR_32, "ds_cmps
defm DS_CMPST_RTN_F32 : DS_1A2D_RET <0x31, "ds_cmpst_rtn_f32", VGPR_32, "ds_cmpst_f32">;
defm DS_MIN_RTN_F32 : DS_1A2D_RET <0x32, "ds_min_rtn_f32", VGPR_32, "ds_min_f32">;
defm DS_MAX_RTN_F32 : DS_1A2D_RET <0x33, "ds_max_rtn_f32", VGPR_32, "ds_max_f32">;
-let SubtargetPredicate = isCI in {
-defm DS_WRAP_RTN_F32 : DS_1A1D_RET <0x34, "ds_wrap_rtn_f32", VGPR_32, "ds_wrap_f32">;
-} // End isCI
defm DS_SWIZZLE_B32 : DS_1A_RET <0x35, "ds_swizzle_b32", VGPR_32>;
let mayStore = 0 in {
defm DS_READ_B32 : DS_1A_RET <0x36, "ds_read_b32", VGPR_32>;
@@ -905,11 +912,6 @@ defm DS_WRITE_SRC2_B64 : DS_1A <0xcc, "ds_write_src2_b64">;
defm DS_MIN_SRC2_F64 : DS_1A <0xd2, "ds_min_src2_f64">;
defm DS_MAX_SRC2_F64 : DS_1A <0xd3, "ds_max_src2_f64">;
-//let SubtargetPredicate = isCI in {
-// DS_CONDXCHG32_RTN_B64
-// DS_CONDXCHG32_RTN_B128
-//} // End isCI
-
//===----------------------------------------------------------------------===//
// MUBUF Instructions
//===----------------------------------------------------------------------===//
@@ -951,13 +953,13 @@ defm BUFFER_LOAD_SSHORT : MUBUF_Load_Helper <
mubuf<0x0b, 0x13>, "buffer_load_sshort", VGPR_32, i32, sextloadi16_global
>;
defm BUFFER_LOAD_DWORD : MUBUF_Load_Helper <
- mubuf<0x0c, 0x14>, "buffer_load_dword", VGPR_32, i32, global_load
+ mubuf<0x0c, 0x14>, "buffer_load_dword", VGPR_32, i32, mubuf_load
>;
defm BUFFER_LOAD_DWORDX2 : MUBUF_Load_Helper <
- mubuf<0x0d, 0x15>, "buffer_load_dwordx2", VReg_64, v2i32, global_load
+ mubuf<0x0d, 0x15>, "buffer_load_dwordx2", VReg_64, v2i32, mubuf_load
>;
defm BUFFER_LOAD_DWORDX4 : MUBUF_Load_Helper <
- mubuf<0x0e, 0x17>, "buffer_load_dwordx4", VReg_128, v4i32, global_load
+ mubuf<0x0e, 0x17>, "buffer_load_dwordx4", VReg_128, v4i32, mubuf_load
>;
defm BUFFER_STORE_BYTE : MUBUF_Store_Helper <
@@ -1034,9 +1036,12 @@ defm BUFFER_ATOMIC_XOR : MUBUF_Atomic <
//def BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_X2 <mubuf<0x5e>, "buffer_atomic_fcmpswap_x2", []>; // isn't on VI
//def BUFFER_ATOMIC_FMIN_X2 : MUBUF_X2 <mubuf<0x5f>, "buffer_atomic_fmin_x2", []>; // isn't on VI
//def BUFFER_ATOMIC_FMAX_X2 : MUBUF_X2 <mubuf<0x60>, "buffer_atomic_fmax_x2", []>; // isn't on VI
-//def BUFFER_WBINVL1_SC : MUBUF_WBINVL1 <mubuf<0x70>, "buffer_wbinvl1_sc", []>; // isn't on CI & VI
-//def BUFFER_WBINVL1_VOL : MUBUF_WBINVL1 <mubuf<0x70, 0x3f>, "buffer_wbinvl1_vol", []>; // isn't on SI
-//def BUFFER_WBINVL1 : MUBUF_WBINVL1 <mubuf<0x71, 0x3e>, "buffer_wbinvl1", []>;
+
+let SubtargetPredicate = isSI in {
+defm BUFFER_WBINVL1_SC : MUBUF_Invalidate <mubuf<0x70>, "buffer_wbinvl1_sc", int_amdgcn_buffer_wbinvl1_sc>; // isn't on CI & VI
+}
+
+defm BUFFER_WBINVL1 : MUBUF_Invalidate <mubuf<0x71, 0x3e>, "buffer_wbinvl1", int_amdgcn_buffer_wbinvl1>;
//===----------------------------------------------------------------------===//
// MTBUF Instructions
@@ -1155,8 +1160,8 @@ defm IMAGE_SAMPLE_C_CD_CL_O : MIMG_Sampler <0x0000006f, "image_sample_c_cd_cl_o"
// VOP1 Instructions
//===----------------------------------------------------------------------===//
-let vdst = 0, src0 = 0 in {
-defm V_NOP : VOP1_m <vop1<0x0>, (outs), (ins), "v_nop", [], "v_nop">;
+let vdst = 0, src0 = 0, VOPAsmPrefer32Bit = 1 in {
+defm V_NOP : VOP1Inst <vop1<0x0>, "v_nop", VOP_NONE>;
}
let isMoveImm = 1, isReMaterializable = 1, isAsCheapAsAMove = 1 in {
@@ -1292,7 +1297,9 @@ defm V_SQRT_F64 : VOP1Inst <vop1<0x34, 0x28>, "v_sqrt_f64",
VOP_F64_F64, fsqrt
>;
-} // let SchedRW = [WriteDouble]
+} // End SchedRW = [WriteDouble]
+
+let SchedRW = [WriteQuarterRate32] in {
defm V_SIN_F32 : VOP1Inst <vop1<0x35, 0x29>, "v_sin_f32",
VOP_F32_F32, AMDGPUsin
@@ -1300,6 +1307,9 @@ defm V_SIN_F32 : VOP1Inst <vop1<0x35, 0x29>, "v_sin_f32",
defm V_COS_F32 : VOP1Inst <vop1<0x36, 0x2a>, "v_cos_f32",
VOP_F32_F32, AMDGPUcos
>;
+
+} // End SchedRW = [WriteQuarterRate32]
+
defm V_NOT_B32 : VOP1Inst <vop1<0x37, 0x2b>, "v_not_b32", VOP_I32_I32>;
defm V_BFREV_B32 : VOP1Inst <vop1<0x38, 0x2c>, "v_bfrev_b32", VOP_I32_I32>;
defm V_FFBH_U32 : VOP1Inst <vop1<0x39, 0x2d>, "v_ffbh_u32", VOP_I32_I32>;
@@ -1308,24 +1318,33 @@ defm V_FFBH_I32 : VOP1Inst <vop1<0x3b, 0x2f>, "v_ffbh_i32", VOP_I32_I32>;
defm V_FREXP_EXP_I32_F64 : VOP1Inst <vop1<0x3c,0x30>, "v_frexp_exp_i32_f64",
VOP_I32_F64
>;
+
+let SchedRW = [WriteDoubleAdd] in {
defm V_FREXP_MANT_F64 : VOP1Inst <vop1<0x3d, 0x31>, "v_frexp_mant_f64",
VOP_F64_F64
>;
-defm V_FRACT_F64 : VOP1Inst <vop1<0x3e, 0x32>, "v_fract_f64", VOP_F64_F64>;
+
+defm V_FRACT_F64 : VOP1Inst <vop1<0x3e, 0x32>, "v_fract_f64",
+ VOP_F64_F64
+>;
+} // End SchedRW = [WriteDoubleAdd]
+
+
defm V_FREXP_EXP_I32_F32 : VOP1Inst <vop1<0x3f, 0x33>, "v_frexp_exp_i32_f32",
VOP_I32_F32
>;
defm V_FREXP_MANT_F32 : VOP1Inst <vop1<0x40, 0x34>, "v_frexp_mant_f32",
VOP_F32_F32
>;
-let vdst = 0, src0 = 0 in {
-defm V_CLREXCP : VOP1_m <vop1<0x41,0x35>, (outs), (ins), "v_clrexcp", [],
- "v_clrexcp"
->;
+let vdst = 0, src0 = 0, VOPAsmPrefer32Bit = 1 in {
+defm V_CLREXCP : VOP1Inst <vop1<0x41,0x35>, "v_clrexcp", VOP_NONE>;
}
+
+let Uses = [M0, EXEC] in {
defm V_MOVRELD_B32 : VOP1Inst <vop1<0x42, 0x36>, "v_movreld_b32", VOP_I32_I32>;
defm V_MOVRELS_B32 : VOP1Inst <vop1<0x43, 0x37>, "v_movrels_b32", VOP_I32_I32>;
defm V_MOVRELSD_B32 : VOP1Inst <vop1<0x44, 0x38>, "v_movrelsd_b32", VOP_I32_I32>;
+} // End Uses = [M0, EXEC]
// These instruction only exist on SI and CI
let SubtargetPredicate = isSICI in {
@@ -1343,7 +1362,7 @@ defm V_RSQ_LEGACY_F32 : VOP1InstSI <vop1<0x2d>, "v_rsq_legacy_f32",
VOP_F32_F32, AMDGPUrsq_legacy
>;
-} // End let SchedRW = [WriteQuarterRate32]
+} // End SchedRW = [WriteQuarterRate32]
let SchedRW = [WriteDouble] in {
@@ -1360,7 +1379,7 @@ defm V_RSQ_CLAMP_F64 : VOP1InstSI <vop1<0x32>, "v_rsq_clamp_f64",
// VINTRP Instructions
//===----------------------------------------------------------------------===//
-let Uses = [M0] in {
+let Uses = [M0, EXEC] in {
 // FIXME: Specify SchedRW for VINTRP instructions.
@@ -1405,16 +1424,14 @@ defm V_INTERP_MOV_F32 : VINTRP_m <
[(set f32:$dst, (AMDGPUinterp_mov (i32 imm:$src0), (i32 imm:$attr_chan),
(i32 imm:$attr)))]>;
-} // End Uses = [M0]
+} // End Uses = [M0, EXEC]
//===----------------------------------------------------------------------===//
// VOP2 Instructions
//===----------------------------------------------------------------------===//
multiclass V_CNDMASK <vop2 op, string name> {
- defm _e32 : VOP2_m <
- op, VOP_CNDMASK.Outs, VOP_CNDMASK.Ins32, VOP_CNDMASK.Asm32, [],
- name, name>;
+ defm _e32 : VOP2_m <op, name, VOP_CNDMASK, [], name>;
defm _e64 : VOP3_m <
op, VOP_CNDMASK.Outs, VOP_CNDMASK.Ins64,
@@ -1500,34 +1517,32 @@ let isCommutable = 1 in {
defm V_MADAK_F32 : VOP2MADK <vop2<0x21, 0x18>, "v_madak_f32">;
} // End isCommutable = 1
-let isCommutable = 1, Defs = [VCC] in { // Carry-out goes to VCC
+let isCommutable = 1 in {
// No patterns so that the scalar instructions are always selected.
// The scalar versions will be replaced with vector when needed later.
 // V_ADD_I32, V_SUB_I32, and V_SUBREV_I32 were renamed to *_U32 in VI,
// but the VI instructions behave the same as the SI versions.
defm V_ADD_I32 : VOP2bInst <vop2<0x25, 0x19>, "v_add_i32",
- VOP_I32_I32_I32, add
+ VOP2b_I32_I1_I32_I32
>;
-defm V_SUB_I32 : VOP2bInst <vop2<0x26, 0x1a>, "v_sub_i32", VOP_I32_I32_I32>;
+defm V_SUB_I32 : VOP2bInst <vop2<0x26, 0x1a>, "v_sub_i32", VOP2b_I32_I1_I32_I32>;
defm V_SUBREV_I32 : VOP2bInst <vop2<0x27, 0x1b>, "v_subrev_i32",
- VOP_I32_I32_I32, null_frag, "v_sub_i32"
+ VOP2b_I32_I1_I32_I32, null_frag, "v_sub_i32"
>;
-let Uses = [VCC] in { // Carry-in comes from VCC
defm V_ADDC_U32 : VOP2bInst <vop2<0x28, 0x1c>, "v_addc_u32",
- VOP_I32_I32_I32_VCC
+ VOP2b_I32_I1_I32_I32_I1
>;
defm V_SUBB_U32 : VOP2bInst <vop2<0x29, 0x1d>, "v_subb_u32",
- VOP_I32_I32_I32_VCC
+ VOP2b_I32_I1_I32_I32_I1
>;
defm V_SUBBREV_U32 : VOP2bInst <vop2<0x2a, 0x1e>, "v_subbrev_u32",
- VOP_I32_I32_I32_VCC, null_frag, "v_subb_u32"
+ VOP2b_I32_I1_I32_I32_I1, null_frag, "v_subb_u32"
>;
-} // End Uses = [VCC]
-} // End isCommutable = 1, Defs = [VCC]
+} // End isCommutable = 1
defm V_READLANE_B32 : VOP2SI_3VI_m <
vop3 <0x001, 0x289>,
@@ -1575,10 +1590,10 @@ defm V_BCNT_U32_B32 : VOP2_VI3_Inst <vop23<0x22, 0x28b>, "v_bcnt_u32_b32",
VOP_I32_I32_I32
>;
defm V_MBCNT_LO_U32_B32 : VOP2_VI3_Inst <vop23<0x23, 0x28c>, "v_mbcnt_lo_u32_b32",
- VOP_I32_I32_I32
+ VOP_I32_I32_I32, int_amdgcn_mbcnt_lo
>;
defm V_MBCNT_HI_U32_B32 : VOP2_VI3_Inst <vop23<0x24, 0x28d>, "v_mbcnt_hi_u32_b32",
- VOP_I32_I32_I32
+ VOP_I32_I32_I32, int_amdgcn_mbcnt_hi
>;
defm V_LDEXP_F32 : VOP2_VI3_Inst <vop23<0x2b, 0x288>, "v_ldexp_f32",
VOP_F32_F32_I32, AMDGPUldexp
@@ -1704,15 +1719,15 @@ defm V_DIV_FIXUP_F32 : VOP3Inst <
vop3<0x15f, 0x1de>, "v_div_fixup_f32", VOP_F32_F32_F32_F32, AMDGPUdiv_fixup
>;
-let SchedRW = [WriteDouble] in {
+let SchedRW = [WriteDoubleAdd] in {
defm V_DIV_FIXUP_F64 : VOP3Inst <
vop3<0x160, 0x1df>, "v_div_fixup_f64", VOP_F64_F64_F64_F64, AMDGPUdiv_fixup
>;
-} // let SchedRW = [WriteDouble]
+} // End SchedRW = [WriteDoubleAdd]
-let SchedRW = [WriteDouble] in {
+let SchedRW = [WriteDoubleAdd] in {
let isCommutable = 1 in {
defm V_ADD_F64 : VOP3Inst <vop3<0x164, 0x280>, "v_add_f64",
@@ -1735,7 +1750,7 @@ defm V_LDEXP_F64 : VOP3Inst <vop3<0x168, 0x284>, "v_ldexp_f64",
VOP_F64_F64_I32, AMDGPUldexp
>;
-} // let SchedRW = [WriteDouble]
+} // End SchedRW = [WriteDoubleAdd]
let isCommutable = 1, SchedRW = [WriteQuarterRate32] in {
@@ -1756,16 +1771,21 @@ defm V_MUL_HI_I32 : VOP3Inst <vop3<0x16c, 0x287>, "v_mul_hi_i32",
} // isCommutable = 1, SchedRW = [WriteQuarterRate32]
let SchedRW = [WriteFloatFMA, WriteSALU] in {
-defm V_DIV_SCALE_F32 : VOP3b_32 <vop3<0x16d, 0x1e0>, "v_div_scale_f32", []>;
+defm V_DIV_SCALE_F32 : VOP3bInst <vop3<0x16d, 0x1e0>, "v_div_scale_f32",
+ VOP3b_F32_I1_F32_F32_F32
+>;
}
let SchedRW = [WriteDouble, WriteSALU] in {
// Double precision division pre-scale.
-defm V_DIV_SCALE_F64 : VOP3b_64 <vop3<0x16e, 0x1e1>, "v_div_scale_f64", []>;
+defm V_DIV_SCALE_F64 : VOP3bInst <vop3<0x16e, 0x1e1>, "v_div_scale_f64",
+ VOP3b_F64_I1_F64_F64_F64
+>;
} // let SchedRW = [WriteDouble]
-let isCommutable = 1, Uses = [VCC] in {
+let isCommutable = 1, Uses = [VCC, EXEC] in {
+let SchedRW = [WriteFloatFMA] in {
// v_div_fmas_f32:
// result = src0 * src1 + src2
// if (vcc)
@@ -1774,6 +1794,7 @@ let isCommutable = 1, Uses = [VCC] in {
defm V_DIV_FMAS_F32 : VOP3_VCC_Inst <vop3<0x16f, 0x1e2>, "v_div_fmas_f32",
VOP_F32_F32_F32_F32, AMDGPUdiv_fmas
>;
+}
let SchedRW = [WriteDouble] in {
// v_div_fmas_f64:
@@ -1786,7 +1807,7 @@ defm V_DIV_FMAS_F64 : VOP3_VCC_Inst <vop3<0x170, 0x1e3>, "v_div_fmas_f64",
>;
} // End SchedRW = [WriteDouble]
-} // End isCommutable = 1
+} // End isCommutable = 1, Uses = [VCC, EXEC]
//def V_MSAD_U8 : VOP3_U8 <0x00000171, "v_msad_u8", []>;
//def V_QSAD_U8 : VOP3_U8 <0x00000172, "v_qsad_u8", []>;
@@ -1835,13 +1856,13 @@ def V_CNDMASK_B64_PSEUDO : VOP3Common <(outs VReg_64:$dst),
(ins VSrc_64:$src0, VSrc_64:$src1, SSrc_64:$src2), "", []
>;
-let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in {
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {
// 64-bit vector move instruction. This is mainly used by the SIFoldOperands
// pass to enable folding of inline immediates.
def V_MOV_B64_PSEUDO : InstSI <(outs VReg_64:$dst), (ins VSrc_64:$src0), "", []>;
} // end let hasSideEffects = 0, mayLoad = 0, mayStore = 0
-let hasSideEffects = 1 in {
+let hasSideEffects = 1, SALU = 1 in {
def SGPR_USE : InstSI <(outs),(ins), "", []>;
}
@@ -1921,39 +1942,9 @@ def SI_KILL : InstSI <
let Uses = [EXEC], Defs = [EXEC,VCC,M0] in {
-//defm SI_ : RegisterLoadStore <VGPR_32, FRAMEri, ADDRIndirect>;
-
-let UseNamedOperandTable = 1 in {
-
-def SI_RegisterLoad : InstSI <
+class SI_INDIRECT_SRC<RegisterClass rc> : InstSI <
(outs VGPR_32:$dst, SReg_64:$temp),
- (ins FRAMEri32:$addr, i32imm:$chan),
- "", []
-> {
- let isRegisterLoad = 1;
- let mayLoad = 1;
-}
-
-class SIRegStore<dag outs> : InstSI <
- outs,
- (ins VGPR_32:$val, FRAMEri32:$addr, i32imm:$chan),
- "", []
-> {
- let isRegisterStore = 1;
- let mayStore = 1;
-}
-
-let usesCustomInserter = 1 in {
-def SI_RegisterStorePseudo : SIRegStore<(outs)>;
-} // End usesCustomInserter = 1
-def SI_RegisterStore : SIRegStore<(outs SReg_64:$temp)>;
-
-
-} // End UseNamedOperandTable = 1
-
-def SI_INDIRECT_SRC : InstSI <
- (outs VGPR_32:$dst, SReg_64:$temp),
- (ins unknown:$src, VSrc_32:$idx, i32imm:$off),
+ (ins rc:$src, VSrc_32:$idx, i32imm:$off),
"si_indirect_src $dst, $temp, $src, $idx, $off",
[]
>;
@@ -1967,6 +1958,13 @@ class SI_INDIRECT_DST<RegisterClass rc> : InstSI <
let Constraints = "$src = $dst";
}
+// TODO: We can support indirect SGPR access.
+def SI_INDIRECT_SRC_V1 : SI_INDIRECT_SRC<VGPR_32>;
+def SI_INDIRECT_SRC_V2 : SI_INDIRECT_SRC<VReg_64>;
+def SI_INDIRECT_SRC_V4 : SI_INDIRECT_SRC<VReg_128>;
+def SI_INDIRECT_SRC_V8 : SI_INDIRECT_SRC<VReg_256>;
+def SI_INDIRECT_SRC_V16 : SI_INDIRECT_SRC<VReg_512>;
+
def SI_INDIRECT_DST_V1 : SI_INDIRECT_DST<VGPR_32>;
def SI_INDIRECT_DST_V2 : SI_INDIRECT_DST<VReg_64>;
def SI_INDIRECT_DST_V4 : SI_INDIRECT_DST<VReg_128>;
@@ -1977,19 +1975,24 @@ def SI_INDIRECT_DST_V16 : SI_INDIRECT_DST<VReg_512>;
multiclass SI_SPILL_SGPR <RegisterClass sgpr_class> {
- let UseNamedOperandTable = 1 in {
+ let UseNamedOperandTable = 1, Uses = [EXEC] in {
def _SAVE : InstSI <
(outs),
- (ins sgpr_class:$src, i32imm:$frame_idx, SReg_128:$scratch_rsrc,
- SReg_32:$scratch_offset),
+ (ins sgpr_class:$src, i32imm:$frame_idx),
"", []
- >;
+ > {
+ let mayStore = 1;
+ let mayLoad = 0;
+ }
def _RESTORE : InstSI <
(outs sgpr_class:$dst),
- (ins i32imm:$frame_idx, SReg_128:$scratch_rsrc, SReg_32:$scratch_offset),
+ (ins i32imm:$frame_idx),
"", []
- >;
+ > {
+ let mayStore = 0;
+ let mayLoad = 1;
+ }
} // End UseNamedOperandTable = 1
}
@@ -2003,19 +2006,25 @@ defm SI_SPILL_S256 : SI_SPILL_SGPR <SReg_256>;
defm SI_SPILL_S512 : SI_SPILL_SGPR <SReg_512>;
multiclass SI_SPILL_VGPR <RegisterClass vgpr_class> {
- let UseNamedOperandTable = 1, VGPRSpill = 1 in {
+ let UseNamedOperandTable = 1, VGPRSpill = 1, Uses = [EXEC] in {
def _SAVE : InstSI <
(outs),
(ins vgpr_class:$src, i32imm:$frame_idx, SReg_128:$scratch_rsrc,
SReg_32:$scratch_offset),
"", []
- >;
+ > {
+ let mayStore = 1;
+ let mayLoad = 0;
+ }
def _RESTORE : InstSI <
(outs vgpr_class:$dst),
(ins i32imm:$frame_idx, SReg_128:$scratch_rsrc, SReg_32:$scratch_offset),
"", []
- >;
+ > {
+ let mayStore = 0;
+ let mayLoad = 1;
+ }
} // End UseNamedOperandTable = 1, VGPRSpill = 1
}
@@ -2030,9 +2039,11 @@ let Defs = [SCC] in {
def SI_CONSTDATA_PTR : InstSI <
(outs SReg_64:$dst),
- (ins),
- "", [(set SReg_64:$dst, (i64 SIconstdata_ptr))]
->;
+ (ins const_ga:$ptr),
+ "", [(set SReg_64:$dst, (i64 (SIconstdata_ptr (tglobaladdr:$ptr))))]
+> {
+ let SALU = 1;
+}
} // End Defs = [SCC]
@@ -2072,84 +2083,63 @@ def : Pat <
// SMRD Patterns
//===----------------------------------------------------------------------===//
-multiclass SMRD_Pattern <SMRD Instr_IMM, SMRD Instr_SGPR, ValueType vt> {
+multiclass SMRD_Pattern <string Instr, ValueType vt> {
- // 1. SI-CI: Offset as 8bit DWORD immediate
+ // 1. IMM offset
def : Pat <
- (constant_load (add i64:$sbase, (i64 IMM8bitDWORD:$offset))),
- (vt (Instr_IMM $sbase, (as_dword_i32imm $offset)))
+ (smrd_load (SMRDImm i64:$sbase, i32:$offset)),
+ (vt (!cast<SMRD>(Instr#"_IMM") $sbase, $offset))
>;
- // 2. Offset loaded in an 32bit SGPR
+ // 2. SGPR offset
def : Pat <
- (constant_load (add i64:$sbase, (i64 IMM32bit:$offset))),
- (vt (Instr_SGPR $sbase, (S_MOV_B32 (i32 (as_i32imm $offset)))))
+ (smrd_load (SMRDSgpr i64:$sbase, i32:$offset)),
+ (vt (!cast<SMRD>(Instr#"_SGPR") $sbase, $offset))
>;
- // 3. No offset at all
def : Pat <
- (constant_load i64:$sbase),
- (vt (Instr_IMM $sbase, 0))
- >;
+ (smrd_load (SMRDImm32 i64:$sbase, i32:$offset)),
+ (vt (!cast<SMRD>(Instr#"_IMM_ci") $sbase, $offset))
+ > {
+ let Predicates = [isCIOnly];
+ }
}
-multiclass SMRD_Pattern_vi <SMRD Instr_IMM, SMRD Instr_SGPR, ValueType vt> {
-
- // 1. VI: Offset as 20bit immediate in bytes
- def : Pat <
- (constant_load (add i64:$sbase, (i64 IMM20bit:$offset))),
- (vt (Instr_IMM $sbase, (as_i32imm $offset)))
- >;
-
- // 2. Offset loaded in an 32bit SGPR
- def : Pat <
- (constant_load (add i64:$sbase, (i64 IMM32bit:$offset))),
- (vt (Instr_SGPR $sbase, (S_MOV_B32 (i32 (as_i32imm $offset)))))
- >;
-
- // 3. No offset at all
- def : Pat <
- (constant_load i64:$sbase),
- (vt (Instr_IMM $sbase, 0))
- >;
-}
-
-let Predicates = [isSICI] in {
-defm : SMRD_Pattern <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, f32>;
-defm : SMRD_Pattern <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, i32>;
-defm : SMRD_Pattern <S_LOAD_DWORDX2_IMM, S_LOAD_DWORDX2_SGPR, v2i32>;
-defm : SMRD_Pattern <S_LOAD_DWORDX4_IMM, S_LOAD_DWORDX4_SGPR, v4i32>;
-defm : SMRD_Pattern <S_LOAD_DWORDX8_IMM, S_LOAD_DWORDX8_SGPR, v32i8>;
-defm : SMRD_Pattern <S_LOAD_DWORDX8_IMM, S_LOAD_DWORDX8_SGPR, v8i32>;
-defm : SMRD_Pattern <S_LOAD_DWORDX16_IMM, S_LOAD_DWORDX16_SGPR, v16i32>;
-} // End Predicates = [isSICI]
+// Global and constant loads can be selected to either MUBUF or SMRD
+// instructions, but SMRD instructions are faster so we want the instruction
+// selector to prefer those.
+let AddedComplexity = 100 in {
-let Predicates = [isVI] in {
-defm : SMRD_Pattern_vi <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, f32>;
-defm : SMRD_Pattern_vi <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, i32>;
-defm : SMRD_Pattern_vi <S_LOAD_DWORDX2_IMM, S_LOAD_DWORDX2_SGPR, v2i32>;
-defm : SMRD_Pattern_vi <S_LOAD_DWORDX4_IMM, S_LOAD_DWORDX4_SGPR, v4i32>;
-defm : SMRD_Pattern_vi <S_LOAD_DWORDX8_IMM, S_LOAD_DWORDX8_SGPR, v32i8>;
-defm : SMRD_Pattern_vi <S_LOAD_DWORDX8_IMM, S_LOAD_DWORDX8_SGPR, v8i32>;
-defm : SMRD_Pattern_vi <S_LOAD_DWORDX16_IMM, S_LOAD_DWORDX16_SGPR, v16i32>;
-} // End Predicates = [isVI]
+defm : SMRD_Pattern <"S_LOAD_DWORD", i32>;
+defm : SMRD_Pattern <"S_LOAD_DWORDX2", v2i32>;
+defm : SMRD_Pattern <"S_LOAD_DWORDX4", v4i32>;
+defm : SMRD_Pattern <"S_LOAD_DWORDX8", v32i8>;
+defm : SMRD_Pattern <"S_LOAD_DWORDX8", v8i32>;
+defm : SMRD_Pattern <"S_LOAD_DWORDX16", v16i32>;
-let Predicates = [isSICI] in {
+// 1. Offset as an immediate
+def : Pat <
+ (SIload_constant v4i32:$sbase, (SMRDBufferImm i32:$offset)),
+ (S_BUFFER_LOAD_DWORD_IMM $sbase, $offset)
+>;
-// 1. Offset as 8bit DWORD immediate
+// 2. Offset loaded in a 32-bit SGPR
def : Pat <
- (SIload_constant v4i32:$sbase, IMM8bitDWORD:$offset),
- (S_BUFFER_LOAD_DWORD_IMM $sbase, (as_dword_i32imm $offset))
+ (SIload_constant v4i32:$sbase, (SMRDBufferSgpr i32:$offset)),
+ (S_BUFFER_LOAD_DWORD_SGPR $sbase, $offset)
>;
-} // End Predicates = [isSICI]
+let Predicates = [isCI] in {
-// 2. Offset loaded in an 32bit SGPR
def : Pat <
- (SIload_constant v4i32:$sbase, imm:$offset),
- (S_BUFFER_LOAD_DWORD_SGPR $sbase, (S_MOV_B32 imm:$offset))
+ (SIload_constant v4i32:$sbase, (SMRDBufferImm32 i32:$offset)),
+ (S_BUFFER_LOAD_DWORD_IMM_ci $sbase, $offset)
>;
+} // End Predicates = [isCI]
+
+} // End AddedComplexity = 100
+
//===----------------------------------------------------------------------===//
// SOP1 Patterns
//===----------------------------------------------------------------------===//
@@ -2161,6 +2151,11 @@ def : Pat <
(S_MOV_B32 0), sub1))
>;
+def : Pat <
+ (i32 (smax i32:$x, (i32 (ineg i32:$x)))),
+ (S_ABS_I32 $x)
+>;
+
//===----------------------------------------------------------------------===//
// SOP2 Patterns
//===----------------------------------------------------------------------===//
@@ -2488,6 +2483,11 @@ def : Pat <
/********** Extraction, Insertion, Building and Casting **********/
/********** ============================================ **********/
+//def : Extract_Element<i64, v2i64, 0, sub0_sub1>;
+//def : Extract_Element<i64, v2i64, 1, sub2_sub3>;
+//def : Extract_Element<f64, v2f64, 0, sub0_sub1>;
+//def : Extract_Element<f64, v2f64, 1, sub2_sub3>;
+
foreach Index = 0-2 in {
def Extract_Element_v2i32_#Index : Extract_Element <
i32, v2i32, Index, !cast<SubRegIndex>(sub#Index)
@@ -2568,11 +2568,25 @@ def : BitConvert <v2i32, i64, VReg_64>;
def : BitConvert <i64, v2i32, VReg_64>;
def : BitConvert <v2f32, i64, VReg_64>;
def : BitConvert <i64, v2f32, VReg_64>;
+def : BitConvert <v2f32, f64, VReg_64>;
def : BitConvert <v2i32, f64, VReg_64>;
+def : BitConvert <f64, v2f32, VReg_64>;
def : BitConvert <f64, v2i32, VReg_64>;
def : BitConvert <v4f32, v4i32, VReg_128>;
def : BitConvert <v4i32, v4f32, VReg_128>;
+
+def : BitConvert <v2i64, v4i32, SReg_128>;
+def : BitConvert <v4i32, v2i64, SReg_128>;
+
+def : BitConvert <v2f64, v4f32, VReg_128>;
+def : BitConvert <v2f64, v4i32, VReg_128>;
+def : BitConvert <v4f32, v2f64, VReg_128>;
+def : BitConvert <v4i32, v2f64, VReg_128>;
+
+
+
+
def : BitConvert <v8f32, v8i32, SReg_256>;
def : BitConvert <v8i32, v8f32, SReg_256>;
def : BitConvert <v8i32, v32i8, SReg_256>;
@@ -2601,10 +2615,9 @@ def : Pat <
// Prevent expanding both fneg and fabs.
-// FIXME: Should use S_OR_B32
def : Pat <
(fneg (fabs f32:$src)),
- (V_OR_B32_e32 $src, (V_MOV_B32_e32 0x80000000)) /* Set sign bit */
+ (S_OR_B32 $src, 0x80000000) /* Set sign bit */
>;
// FIXME: Should use S_OR_B32
@@ -2836,10 +2849,6 @@ class DSAtomicRetPat<DS inst, ValueType vt, PatFrag frag> : Pat <
// -1. For the non-rtn variants, the manual says it does
// DS[A] = (DS[A] >= D0) ? 0 : DS[A] + 1, and setting D0 to uint_max
// will always do the increment so I'm assuming it's the same.
-//
-// We also load this -1 with s_mov_b32 / s_mov_b64 even though this
-// needs to be a VGPR. The SGPR copy pass will fix this, and it's
-// easier since there is no v_mov_b64.
class DSAtomicIncRetPat<DS inst, ValueType vt,
Instruction LoadImm, PatFrag frag> : Pat <
(frag (DS1Addr1Offset i32:$ptr, i32:$offset), (vt 1)),
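
As a concrete illustration of the ds_inc semantics described in the comment above, here is a small self-contained C++ model (not part of the patch); it only restates the documented DS[A] = (DS[A] >= D0) ? 0 : DS[A] + 1 rule and why an all-ones D0 makes it a plain increment:

#include <cstdint>
#include <limits>

// DS[A] = (DS[A] >= D0) ? 0 : DS[A] + 1; the old value is what a _RTN form
// would return.
static uint32_t ds_inc_u32(uint32_t &mem, uint32_t d0) {
  uint32_t old = mem;
  mem = (mem >= d0) ? 0 : mem + 1;
  return old;
}

int main() {
  uint32_t lds = 41;
  // With D0 = UINT32_MAX the wrap can only trigger at UINT32_MAX itself, so
  // the operation behaves as an ordinary atomic increment, which is why these
  // patterns materialize an all-ones constant for the data operand.
  ds_inc_u32(lds, std::numeric_limits<uint32_t>::max()); // lds == 42
  return 0;
}
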
@@ -2855,9 +2864,9 @@ class DSAtomicCmpXChg <DS inst, ValueType vt, PatFrag frag> : Pat <
// 32-bit atomics.
def : DSAtomicIncRetPat<DS_INC_RTN_U32, i32,
- S_MOV_B32, si_atomic_load_add_local>;
+ V_MOV_B32_e32, si_atomic_load_add_local>;
def : DSAtomicIncRetPat<DS_DEC_RTN_U32, i32,
- S_MOV_B32, si_atomic_load_sub_local>;
+ V_MOV_B32_e32, si_atomic_load_sub_local>;
def : DSAtomicRetPat<DS_WRXCHG_RTN_B32, i32, si_atomic_swap_local>;
def : DSAtomicRetPat<DS_ADD_RTN_U32, i32, si_atomic_load_add_local>;
@@ -2874,9 +2883,9 @@ def : DSAtomicCmpXChg<DS_CMPST_RTN_B32, i32, si_atomic_cmp_swap_32_local>;
// 64-bit atomics.
def : DSAtomicIncRetPat<DS_INC_RTN_U64, i64,
- S_MOV_B64, si_atomic_load_add_local>;
+ V_MOV_B64_PSEUDO, si_atomic_load_add_local>;
def : DSAtomicIncRetPat<DS_DEC_RTN_U64, i64,
- S_MOV_B64, si_atomic_load_sub_local>;
+ V_MOV_B64_PSEUDO, si_atomic_load_sub_local>;
def : DSAtomicRetPat<DS_WRXCHG_RTN_B64, i64, si_atomic_swap_local>;
def : DSAtomicRetPat<DS_ADD_RTN_U64, i64, si_atomic_load_add_local>;
@@ -3019,90 +3028,46 @@ def : MTBUF_StoreResource <v2i32, 2, TBUFFER_STORE_FORMAT_XY>;
def : MTBUF_StoreResource <v4i32, 3, TBUFFER_STORE_FORMAT_XYZ>;
def : MTBUF_StoreResource <v4i32, 4, TBUFFER_STORE_FORMAT_XYZW>;
-let SubtargetPredicate = isCI in {
-
-defm V_QSAD_PK_U16_U8 : VOP3Inst <vop3<0x173>, "v_qsad_pk_u16_u8",
- VOP_I32_I32_I32
->;
-defm V_MQSAD_U16_U8 : VOP3Inst <vop3<0x172>, "v_mqsad_u16_u8",
- VOP_I32_I32_I32
->;
-defm V_MQSAD_U32_U8 : VOP3Inst <vop3<0x175>, "v_mqsad_u32_u8",
- VOP_I32_I32_I32
->;
-
-let isCommutable = 1 in {
-defm V_MAD_U64_U32 : VOP3Inst <vop3<0x176>, "v_mad_u64_u32",
- VOP_I64_I32_I32_I64
->;
-
-// XXX - Does this set VCC?
-defm V_MAD_I64_I32 : VOP3Inst <vop3<0x177>, "v_mad_i64_i32",
- VOP_I64_I32_I32_I64
->;
-} // End isCommutable = 1
-
-// Remaining instructions:
-// FLAT_*
-// S_CBRANCH_CDBGUSER
-// S_CBRANCH_CDBGSYS
-// S_CBRANCH_CDBGSYS_OR_USER
-// S_CBRANCH_CDBGSYS_AND_USER
-// S_DCACHE_INV_VOL
-// DS_NOP
-// DS_GWS_SEMA_RELEASE_ALL
-// DS_WRAP_RTN_B32
-// DS_CNDXCHG32_RTN_B64
-// DS_WRITE_B96
-// DS_WRITE_B128
-// DS_CONDXCHG32_RTN_B128
-// DS_READ_B96
-// DS_READ_B128
-// BUFFER_LOAD_DWORDX3
-// BUFFER_STORE_DWORDX3
-
-} // End isCI
-
/********** ====================== **********/
 /********** Indirect addressing **********/
/********** ====================== **********/
-multiclass SI_INDIRECT_Pattern <ValueType vt, ValueType eltvt, SI_INDIRECT_DST IndDst> {
+multiclass SI_INDIRECT_Pattern <ValueType vt, ValueType eltvt, string VecSize> {
// 1. Extract with offset
def : Pat<
- (eltvt (vector_extract vt:$vec, (add i32:$idx, imm:$off))),
- (SI_INDIRECT_SRC $vec, $idx, imm:$off)
+ (eltvt (extractelt vt:$vec, (add i32:$idx, imm:$off))),
+ (!cast<Instruction>("SI_INDIRECT_SRC_"#VecSize) $vec, $idx, imm:$off)
>;
// 2. Extract without offset
def : Pat<
- (eltvt (vector_extract vt:$vec, i32:$idx)),
- (SI_INDIRECT_SRC $vec, $idx, 0)
+ (eltvt (extractelt vt:$vec, i32:$idx)),
+ (!cast<Instruction>("SI_INDIRECT_SRC_"#VecSize) $vec, $idx, 0)
>;
// 3. Insert with offset
def : Pat<
- (vector_insert vt:$vec, eltvt:$val, (add i32:$idx, imm:$off)),
- (IndDst $vec, $idx, imm:$off, $val)
+ (insertelt vt:$vec, eltvt:$val, (add i32:$idx, imm:$off)),
+ (!cast<Instruction>("SI_INDIRECT_DST_"#VecSize) $vec, $idx, imm:$off, $val)
>;
// 4. Insert without offset
def : Pat<
- (vector_insert vt:$vec, eltvt:$val, i32:$idx),
- (IndDst $vec, $idx, 0, $val)
+ (insertelt vt:$vec, eltvt:$val, i32:$idx),
+ (!cast<Instruction>("SI_INDIRECT_DST_"#VecSize) $vec, $idx, 0, $val)
>;
}
-defm : SI_INDIRECT_Pattern <v2f32, f32, SI_INDIRECT_DST_V2>;
-defm : SI_INDIRECT_Pattern <v4f32, f32, SI_INDIRECT_DST_V4>;
-defm : SI_INDIRECT_Pattern <v8f32, f32, SI_INDIRECT_DST_V8>;
-defm : SI_INDIRECT_Pattern <v16f32, f32, SI_INDIRECT_DST_V16>;
+defm : SI_INDIRECT_Pattern <v2f32, f32, "V2">;
+defm : SI_INDIRECT_Pattern <v4f32, f32, "V4">;
+defm : SI_INDIRECT_Pattern <v8f32, f32, "V8">;
+defm : SI_INDIRECT_Pattern <v16f32, f32, "V16">;
-defm : SI_INDIRECT_Pattern <v2i32, i32, SI_INDIRECT_DST_V2>;
-defm : SI_INDIRECT_Pattern <v4i32, i32, SI_INDIRECT_DST_V4>;
-defm : SI_INDIRECT_Pattern <v8i32, i32, SI_INDIRECT_DST_V8>;
-defm : SI_INDIRECT_Pattern <v16i32, i32, SI_INDIRECT_DST_V16>;
+defm : SI_INDIRECT_Pattern <v2i32, i32, "V2">;
+defm : SI_INDIRECT_Pattern <v4i32, i32, "V4">;
+defm : SI_INDIRECT_Pattern <v8i32, i32, "V8">;
+defm : SI_INDIRECT_Pattern <v16i32, i32, "V16">;
//===----------------------------------------------------------------------===//
// Conversion Patterns
@@ -3215,12 +3180,12 @@ def : Pat <
def : Pat <
(i1 (trunc i32:$a)),
- (V_CMP_EQ_I32_e64 (V_AND_B32_e64 (i32 1), $a), 1)
+ (V_CMP_EQ_I32_e64 (S_AND_B32 (i32 1), $a), 1)
>;
def : Pat <
(i1 (trunc i64:$a)),
- (V_CMP_EQ_I32_e64 (V_AND_B32_e64 (i32 1),
+ (V_CMP_EQ_I32_e64 (S_AND_B32 (i32 1),
(EXTRACT_SUBREG $a, sub0)), 1)
>;
@@ -3301,24 +3266,6 @@ def : Pat <
} // End Predicates = [isSI]
-let Predicates = [isCI] in {
-
-// Convert (x - floor(x)) to fract(x)
-def : Pat <
- (f32 (fsub (f32 (VOP3Mods f32:$x, i32:$mods)),
- (f32 (ffloor (f32 (VOP3Mods f32:$x, i32:$mods)))))),
- (V_FRACT_F32_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE)
->;
-
-// Convert (x + (-floor(x))) to fract(x)
-def : Pat <
- (f64 (fadd (f64 (VOP3Mods f64:$x, i32:$mods)),
- (f64 (fneg (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))))))),
- (V_FRACT_F64_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE)
->;
-
-} // End Predicates = [isCI]
-
//============================================================================//
// Miscellaneous Optimization Patterns
//============================================================================//
diff --git a/contrib/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/contrib/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
index c319b32..126f624 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -103,6 +103,10 @@ public:
return "SI Lower control flow instructions";
}
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
};
} // End anonymous namespace
@@ -140,8 +144,7 @@ void SILowerControlFlowPass::Skip(MachineInstr &From, MachineOperand &To) {
DebugLoc DL = From.getDebugLoc();
BuildMI(*From.getParent(), &From, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
- .addOperand(To)
- .addReg(AMDGPU::EXEC);
+ .addOperand(To);
}
void SILowerControlFlowPass::SkipIfDead(MachineInstr &MI) {
@@ -159,8 +162,7 @@ void SILowerControlFlowPass::SkipIfDead(MachineInstr &MI) {
// If the exec mask is non-zero, skip the next two instructions
BuildMI(MBB, Insert, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
- .addImm(3)
- .addReg(AMDGPU::EXEC);
+ .addImm(3);
// Exec mask is zero: Export to NULL target...
BuildMI(MBB, Insert, DL, TII->get(AMDGPU::EXP))
@@ -269,8 +271,7 @@ void SILowerControlFlowPass::Loop(MachineInstr &MI) {
.addReg(Src);
BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
- .addOperand(MI.getOperand(1))
- .addReg(AMDGPU::EXEC);
+ .addOperand(MI.getOperand(1));
MI.eraseFromParent();
}
@@ -316,7 +317,7 @@ void SILowerControlFlowPass::Kill(MachineInstr &MI) {
.addImm(0);
}
} else {
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMPX_LE_F32_e32), AMDGPU::VCC)
+ BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMPX_LE_F32_e32))
.addImm(0)
.addOperand(Op);
}
@@ -362,9 +363,9 @@ void SILowerControlFlowPass::LoadM0(MachineInstr &MI, MachineInstr *MovRel, int
.addReg(AMDGPU::VCC_LO);
// Compare the just read M0 value to all possible Idx values
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e32), AMDGPU::VCC)
- .addReg(AMDGPU::M0)
- .addReg(Idx);
+ BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e32))
+ .addReg(AMDGPU::M0)
+ .addReg(Idx);
// Update EXEC, save the original EXEC value to VCC
BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), AMDGPU::VCC)
@@ -385,8 +386,7 @@ void SILowerControlFlowPass::LoadM0(MachineInstr &MI, MachineInstr *MovRel, int
// Loop back to V_READFIRSTLANE_B32 if there are still variants to cover
BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
- .addImm(-7)
- .addReg(AMDGPU::EXEC);
+ .addImm(-7);
// Restore EXEC
BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
@@ -438,7 +438,6 @@ void SILowerControlFlowPass::IndirectSrc(MachineInstr &MI) {
MachineInstr *MovRel =
BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
.addReg(Reg)
- .addReg(AMDGPU::M0, RegState::Implicit)
.addReg(Vec, RegState::Implicit);
LoadM0(MI, MovRel, Off);
@@ -460,7 +459,6 @@ void SILowerControlFlowPass::IndirectDst(MachineInstr &MI) {
BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELD_B32_e32))
.addReg(Reg, RegState::Define)
.addReg(Val)
- .addReg(AMDGPU::M0, RegState::Implicit)
.addReg(Dst, RegState::Implicit);
LoadM0(MI, MovRel, Off);
@@ -486,11 +484,11 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
Next = std::next(I);
MachineInstr &MI = *I;
- if (TII->isWQM(MI.getOpcode()) || TII->isDS(MI.getOpcode()))
+ if (TII->isWQM(MI) || TII->isDS(MI))
NeedWQM = true;
// Flat uses m0 in case it needs to access LDS.
- if (TII->isFLAT(MI.getOpcode()))
+ if (TII->isFLAT(MI))
NeedFlat = true;
switch (MI.getOpcode()) {
@@ -541,7 +539,11 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
Branch(MI);
break;
- case AMDGPU::SI_INDIRECT_SRC:
+ case AMDGPU::SI_INDIRECT_SRC_V1:
+ case AMDGPU::SI_INDIRECT_SRC_V2:
+ case AMDGPU::SI_INDIRECT_SRC_V4:
+ case AMDGPU::SI_INDIRECT_SRC_V8:
+ case AMDGPU::SI_INDIRECT_SRC_V16:
IndirectSrc(MI);
break;
diff --git a/contrib/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp b/contrib/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
index 67421e2..a2fa5fd 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
@@ -48,6 +48,7 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<MachineDominatorTree>();
+ AU.addPreserved<MachineDominatorTree>();
AU.setPreservesCFG();
MachineFunctionPass::getAnalysisUsage(AU);
}
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 587ea63..935aad4 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -29,10 +29,114 @@ void SIMachineFunctionInfo::anchor() {}
SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
: AMDGPUMachineFunction(MF),
TIDReg(AMDGPU::NoRegister),
- HasSpilledVGPRs(false),
+ ScratchRSrcReg(AMDGPU::NoRegister),
+ ScratchWaveOffsetReg(AMDGPU::NoRegister),
+ PrivateSegmentBufferUserSGPR(AMDGPU::NoRegister),
+ DispatchPtrUserSGPR(AMDGPU::NoRegister),
+ QueuePtrUserSGPR(AMDGPU::NoRegister),
+ KernargSegmentPtrUserSGPR(AMDGPU::NoRegister),
+ DispatchIDUserSGPR(AMDGPU::NoRegister),
+ FlatScratchInitUserSGPR(AMDGPU::NoRegister),
+ PrivateSegmentSizeUserSGPR(AMDGPU::NoRegister),
+ GridWorkGroupCountXUserSGPR(AMDGPU::NoRegister),
+ GridWorkGroupCountYUserSGPR(AMDGPU::NoRegister),
+ GridWorkGroupCountZUserSGPR(AMDGPU::NoRegister),
+ WorkGroupIDXSystemSGPR(AMDGPU::NoRegister),
+ WorkGroupIDYSystemSGPR(AMDGPU::NoRegister),
+ WorkGroupIDZSystemSGPR(AMDGPU::NoRegister),
+ WorkGroupInfoSystemSGPR(AMDGPU::NoRegister),
+ PrivateSegmentWaveByteOffsetSystemSGPR(AMDGPU::NoRegister),
+ LDSWaveSpillSize(0),
PSInputAddr(0),
NumUserSGPRs(0),
- LDSWaveSpillSize(0) { }
+ NumSystemSGPRs(0),
+ HasSpilledSGPRs(false),
+ HasSpilledVGPRs(false),
+ PrivateSegmentBuffer(false),
+ DispatchPtr(false),
+ QueuePtr(false),
+ DispatchID(false),
+ KernargSegmentPtr(false),
+ FlatScratchInit(false),
+ GridWorkgroupCountX(false),
+ GridWorkgroupCountY(false),
+ GridWorkgroupCountZ(false),
+ WorkGroupIDX(true),
+ WorkGroupIDY(false),
+ WorkGroupIDZ(false),
+ WorkGroupInfo(false),
+ PrivateSegmentWaveByteOffset(false),
+ WorkItemIDX(true),
+ WorkItemIDY(false),
+ WorkItemIDZ(false) {
+ const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
+ const Function *F = MF.getFunction();
+
+ const MachineFrameInfo *FrameInfo = MF.getFrameInfo();
+
+ if (getShaderType() == ShaderType::COMPUTE)
+ KernargSegmentPtr = true;
+
+ if (F->hasFnAttribute("amdgpu-work-group-id-y"))
+ WorkGroupIDY = true;
+
+ if (F->hasFnAttribute("amdgpu-work-group-id-z"))
+ WorkGroupIDZ = true;
+
+ if (F->hasFnAttribute("amdgpu-work-item-id-y"))
+ WorkItemIDY = true;
+
+ if (F->hasFnAttribute("amdgpu-work-item-id-z"))
+ WorkItemIDZ = true;
+
+ bool MaySpill = ST.isVGPRSpillingEnabled(this);
+ bool HasStackObjects = FrameInfo->hasStackObjects();
+
+ if (HasStackObjects || MaySpill)
+ PrivateSegmentWaveByteOffset = true;
+
+ if (ST.isAmdHsaOS()) {
+ if (HasStackObjects || MaySpill)
+ PrivateSegmentBuffer = true;
+
+ if (F->hasFnAttribute("amdgpu-dispatch-ptr"))
+ DispatchPtr = true;
+ }
+
+ // X, XY, and XYZ are the only supported combinations, so make sure Y is
+ // enabled if Z is.
+ if (WorkItemIDZ)
+ WorkItemIDY = true;
+}
+
+unsigned SIMachineFunctionInfo::addPrivateSegmentBuffer(
+ const SIRegisterInfo &TRI) {
+ PrivateSegmentBufferUserSGPR = TRI.getMatchingSuperReg(
+ getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_128RegClass);
+ NumUserSGPRs += 4;
+ return PrivateSegmentBufferUserSGPR;
+}
+
+unsigned SIMachineFunctionInfo::addDispatchPtr(const SIRegisterInfo &TRI) {
+ DispatchPtrUserSGPR = TRI.getMatchingSuperReg(
+ getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass);
+ NumUserSGPRs += 2;
+ return DispatchPtrUserSGPR;
+}
+
+unsigned SIMachineFunctionInfo::addQueuePtr(const SIRegisterInfo &TRI) {
+ QueuePtrUserSGPR = TRI.getMatchingSuperReg(
+ getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass);
+ NumUserSGPRs += 2;
+ return QueuePtrUserSGPR;
+}
+
+unsigned SIMachineFunctionInfo::addKernargSegmentPtr(const SIRegisterInfo &TRI) {
+ KernargSegmentPtrUserSGPR = TRI.getMatchingSuperReg(
+ getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass);
+ NumUserSGPRs += 2;
+ return KernargSegmentPtrUserSGPR;
+}
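
To see how the add*() helpers above lay registers out, here is a tiny stand-alone sketch; it mirrors the increments in the functions above rather than calling them, and the particular sequence of inputs is only an example:

#include <cstdio>

int main() {
  unsigned NumUserSGPRs = 0;
  auto alloc = [&](const char *Name, unsigned Dwords) {
    // Mirrors getNextUserSGPR(): the next input starts at SGPR0 + NumUserSGPRs.
    printf("%-24s -> s[%u:%u]\n", Name, NumUserSGPRs,
           NumUserSGPRs + Dwords - 1);
    NumUserSGPRs += Dwords;
  };
  alloc("private segment buffer", 4); // addPrivateSegmentBuffer: s[0:3]
  alloc("dispatch ptr", 2);           // addDispatchPtr:          s[4:5]
  alloc("kernarg segment ptr", 2);    // addKernargSegmentPtr:    s[6:7]
  // System SGPRs (work-group IDs, wave byte offset, ...) then start at
  // SGPR0 + NumUserSGPRs + NumSystemSGPRs, as getNextSystemSGPR() computes.
  return 0;
}
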
SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg(
MachineFunction *MF,
@@ -53,7 +157,6 @@ SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg(
if (!LaneVGPRs.count(LaneVGPRIdx)) {
unsigned LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass);
LaneVGPRs[LaneVGPRIdx] = LaneVGPR;
- MRI.setPhysRegUsed(LaneVGPR);
// Add this register as live-in to all blocks to avoid machine verifer
// complaining about use of an undefined physical register.
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 667da4c..9c528d6 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -26,13 +26,83 @@ class MachineRegisterInfo;
/// This class keeps track of the SPI_SP_INPUT_ADDR config register, which
/// tells the hardware which interpolation parameters to load.
class SIMachineFunctionInfo : public AMDGPUMachineFunction {
+ // FIXME: This should be removed and getPreloadedValue moved here.
+ friend struct SIRegisterInfo;
void anchor() override;
unsigned TIDReg;
- bool HasSpilledVGPRs;
+
+ // Registers that may be reserved for spilling purposes. These may be the same
+ // as the input registers.
+ unsigned ScratchRSrcReg;
+ unsigned ScratchWaveOffsetReg;
+
+ // Input registers setup for the HSA ABI.
+ // User SGPRs in allocation order.
+ unsigned PrivateSegmentBufferUserSGPR;
+ unsigned DispatchPtrUserSGPR;
+ unsigned QueuePtrUserSGPR;
+ unsigned KernargSegmentPtrUserSGPR;
+ unsigned DispatchIDUserSGPR;
+ unsigned FlatScratchInitUserSGPR;
+ unsigned PrivateSegmentSizeUserSGPR;
+ unsigned GridWorkGroupCountXUserSGPR;
+ unsigned GridWorkGroupCountYUserSGPR;
+ unsigned GridWorkGroupCountZUserSGPR;
+
+ // System SGPRs in allocation order.
+ unsigned WorkGroupIDXSystemSGPR;
+ unsigned WorkGroupIDYSystemSGPR;
+ unsigned WorkGroupIDZSystemSGPR;
+ unsigned WorkGroupInfoSystemSGPR;
+ unsigned PrivateSegmentWaveByteOffsetSystemSGPR;
public:
+ // FIXME: Make private
+ unsigned LDSWaveSpillSize;
+ unsigned PSInputAddr;
+ std::map<unsigned, unsigned> LaneVGPRs;
+ unsigned ScratchOffsetReg;
+ unsigned NumUserSGPRs;
+ unsigned NumSystemSGPRs;
+
+private:
+ bool HasSpilledSGPRs;
+ bool HasSpilledVGPRs;
+
+ // Feature bits required for inputs passed in user SGPRs.
+ bool PrivateSegmentBuffer : 1;
+ bool DispatchPtr : 1;
+ bool QueuePtr : 1;
+ bool DispatchID : 1;
+ bool KernargSegmentPtr : 1;
+ bool FlatScratchInit : 1;
+ bool GridWorkgroupCountX : 1;
+ bool GridWorkgroupCountY : 1;
+ bool GridWorkgroupCountZ : 1;
+
+ // Feature bits required for inputs passed in system SGPRs.
+ bool WorkGroupIDX : 1; // Always initialized.
+ bool WorkGroupIDY : 1;
+ bool WorkGroupIDZ : 1;
+ bool WorkGroupInfo : 1;
+ bool PrivateSegmentWaveByteOffset : 1;
+
+ bool WorkItemIDX : 1; // Always initialized.
+ bool WorkItemIDY : 1;
+ bool WorkItemIDZ : 1;
+
+ MCPhysReg getNextUserSGPR() const {
+ assert(NumSystemSGPRs == 0 && "System SGPRs must be added after user SGPRs");
+ return AMDGPU::SGPR0 + NumUserSGPRs;
+ }
+
+ MCPhysReg getNextSystemSGPR() const {
+ return AMDGPU::SGPR0 + NumUserSGPRs + NumSystemSGPRs;
+ }
+
+public:
struct SpilledReg {
unsigned VGPR;
int Lane;
@@ -46,16 +116,162 @@ public:
SIMachineFunctionInfo(const MachineFunction &MF);
SpilledReg getSpilledReg(MachineFunction *MF, unsigned FrameIndex,
unsigned SubIdx);
- unsigned PSInputAddr;
- unsigned NumUserSGPRs;
- std::map<unsigned, unsigned> LaneVGPRs;
- unsigned LDSWaveSpillSize;
- unsigned ScratchOffsetReg;
bool hasCalculatedTID() const { return TIDReg != AMDGPU::NoRegister; };
unsigned getTIDReg() const { return TIDReg; };
void setTIDReg(unsigned Reg) { TIDReg = Reg; }
- bool hasSpilledVGPRs() const { return HasSpilledVGPRs; }
- void setHasSpilledVGPRs(bool Spill = true) { HasSpilledVGPRs = Spill; }
+
+ // Add user SGPRs.
+ unsigned addPrivateSegmentBuffer(const SIRegisterInfo &TRI);
+ unsigned addDispatchPtr(const SIRegisterInfo &TRI);
+ unsigned addQueuePtr(const SIRegisterInfo &TRI);
+ unsigned addKernargSegmentPtr(const SIRegisterInfo &TRI);
+
+ // Add system SGPRs.
+ unsigned addWorkGroupIDX() {
+ WorkGroupIDXSystemSGPR = getNextSystemSGPR();
+ NumSystemSGPRs += 1;
+ return WorkGroupIDXSystemSGPR;
+ }
+
+ unsigned addWorkGroupIDY() {
+ WorkGroupIDYSystemSGPR = getNextSystemSGPR();
+ NumSystemSGPRs += 1;
+ return WorkGroupIDYSystemSGPR;
+ }
+
+ unsigned addWorkGroupIDZ() {
+ WorkGroupIDZSystemSGPR = getNextSystemSGPR();
+ NumSystemSGPRs += 1;
+ return WorkGroupIDZSystemSGPR;
+ }
+
+ unsigned addWorkGroupInfo() {
+ WorkGroupInfoSystemSGPR = getNextSystemSGPR();
+ NumSystemSGPRs += 1;
+ return WorkGroupInfoSystemSGPR;
+ }
+
+ unsigned addPrivateSegmentWaveByteOffset() {
+ PrivateSegmentWaveByteOffsetSystemSGPR = getNextSystemSGPR();
+ NumSystemSGPRs += 1;
+ return PrivateSegmentWaveByteOffsetSystemSGPR;
+ }
+
+ bool hasPrivateSegmentBuffer() const {
+ return PrivateSegmentBuffer;
+ }
+
+ bool hasDispatchPtr() const {
+ return DispatchPtr;
+ }
+
+ bool hasQueuePtr() const {
+ return QueuePtr;
+ }
+
+ bool hasDispatchID() const {
+ return DispatchID;
+ }
+
+ bool hasKernargSegmentPtr() const {
+ return KernargSegmentPtr;
+ }
+
+ bool hasFlatScratchInit() const {
+ return FlatScratchInit;
+ }
+
+ bool hasGridWorkgroupCountX() const {
+ return GridWorkgroupCountX;
+ }
+
+ bool hasGridWorkgroupCountY() const {
+ return GridWorkgroupCountY;
+ }
+
+ bool hasGridWorkgroupCountZ() const {
+ return GridWorkgroupCountZ;
+ }
+
+ bool hasWorkGroupIDX() const {
+ return WorkGroupIDX;
+ }
+
+ bool hasWorkGroupIDY() const {
+ return WorkGroupIDY;
+ }
+
+ bool hasWorkGroupIDZ() const {
+ return WorkGroupIDZ;
+ }
+
+ bool hasWorkGroupInfo() const {
+ return WorkGroupInfo;
+ }
+
+ bool hasPrivateSegmentWaveByteOffset() const {
+ return PrivateSegmentWaveByteOffset;
+ }
+
+ bool hasWorkItemIDX() const {
+ return WorkItemIDX;
+ }
+
+ bool hasWorkItemIDY() const {
+ return WorkItemIDY;
+ }
+
+ bool hasWorkItemIDZ() const {
+ return WorkItemIDZ;
+ }
+
+ unsigned getNumUserSGPRs() const {
+ return NumUserSGPRs;
+ }
+
+ unsigned getNumPreloadedSGPRs() const {
+ return NumUserSGPRs + NumSystemSGPRs;
+ }
+
+ unsigned getPrivateSegmentWaveByteOffsetSystemSGPR() const {
+ return PrivateSegmentWaveByteOffsetSystemSGPR;
+ }
+
+ /// \brief Returns the physical register reserved for use as the resource
+ /// descriptor for scratch accesses.
+ unsigned getScratchRSrcReg() const {
+ return ScratchRSrcReg;
+ }
+
+ void setScratchRSrcReg(unsigned Reg) {
+ assert(Reg != AMDGPU::NoRegister && "Should never be unset");
+ ScratchRSrcReg = Reg;
+ }
+
+ unsigned getScratchWaveOffsetReg() const {
+ return ScratchWaveOffsetReg;
+ }
+
+ void setScratchWaveOffsetReg(unsigned Reg) {
+ assert(Reg != AMDGPU::NoRegister && "Should never be unset");
+ ScratchWaveOffsetReg = Reg;
+ }
+
+ bool hasSpilledSGPRs() const {
+ return HasSpilledSGPRs;
+ }
+
+ void setHasSpilledSGPRs(bool Spill = true) {
+ HasSpilledSGPRs = Spill;
+ }
+
+ bool hasSpilledVGPRs() const {
+ return HasSpilledVGPRs;
+ }
+
+ void setHasSpilledVGPRs(bool Spill = true) {
+ HasSpilledVGPRs = Spill;
+ }
unsigned getMaximumWorkGroupSize(const MachineFunction &MF) const;
};
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIPrepareScratchRegs.cpp b/contrib/llvm/lib/Target/AMDGPU/SIPrepareScratchRegs.cpp
deleted file mode 100644
index 2cd600d..0000000
--- a/contrib/llvm/lib/Target/AMDGPU/SIPrepareScratchRegs.cpp
+++ /dev/null
@@ -1,193 +0,0 @@
-//===-- SIPrepareScratchRegs.cpp - Use predicates for control flow --------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-///
-/// This pass loads scratch pointer and scratch offset into a register or a
-/// frame index which can be used anywhere in the program. These values will
-/// be used for spilling VGPRs.
-///
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPU.h"
-#include "AMDGPUSubtarget.h"
-#include "SIDefines.h"
-#include "SIInstrInfo.h"
-#include "SIMachineFunctionInfo.h"
-#include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/RegisterScavenging.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/LLVMContext.h"
-
-using namespace llvm;
-
-namespace {
-
-class SIPrepareScratchRegs : public MachineFunctionPass {
-
-private:
- static char ID;
-
-public:
- SIPrepareScratchRegs() : MachineFunctionPass(ID) { }
-
- bool runOnMachineFunction(MachineFunction &MF) override;
-
- const char *getPassName() const override {
- return "SI prepare scratch registers";
- }
-
-};
-
-} // End anonymous namespace
-
-char SIPrepareScratchRegs::ID = 0;
-
-FunctionPass *llvm::createSIPrepareScratchRegs() {
- return new SIPrepareScratchRegs();
-}
-
-bool SIPrepareScratchRegs::runOnMachineFunction(MachineFunction &MF) {
- SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
- const SIInstrInfo *TII =
- static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
- const SIRegisterInfo *TRI = &TII->getRegisterInfo();
- MachineRegisterInfo &MRI = MF.getRegInfo();
- MachineFrameInfo *FrameInfo = MF.getFrameInfo();
- MachineBasicBlock *Entry = MF.begin();
- MachineBasicBlock::iterator I = Entry->begin();
- DebugLoc DL = I->getDebugLoc();
-
- // FIXME: If we don't have enough VGPRs for SGPR spilling we will need to
- // run this pass.
- if (!MFI->hasSpilledVGPRs())
- return false;
-
- unsigned ScratchPtrPreloadReg =
- TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_PTR);
- unsigned ScratchOffsetPreloadReg =
- TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_WAVE_OFFSET);
-
- if (!Entry->isLiveIn(ScratchPtrPreloadReg))
- Entry->addLiveIn(ScratchPtrPreloadReg);
-
- if (!Entry->isLiveIn(ScratchOffsetPreloadReg))
- Entry->addLiveIn(ScratchOffsetPreloadReg);
-
- // Load the scratch offset.
- unsigned ScratchOffsetReg =
- TRI->findUnusedRegister(MRI, &AMDGPU::SGPR_32RegClass);
- int ScratchOffsetFI = -1;
-
- if (ScratchOffsetReg != AMDGPU::NoRegister) {
- // Found an SGPR to use
- MRI.setPhysRegUsed(ScratchOffsetReg);
- BuildMI(*Entry, I, DL, TII->get(AMDGPU::S_MOV_B32), ScratchOffsetReg)
- .addReg(ScratchOffsetPreloadReg);
- } else {
- // No SGPR is available, we must spill.
- ScratchOffsetFI = FrameInfo->CreateSpillStackObject(4,4);
- BuildMI(*Entry, I, DL, TII->get(AMDGPU::SI_SPILL_S32_SAVE))
- .addReg(ScratchOffsetPreloadReg)
- .addFrameIndex(ScratchOffsetFI)
- .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef)
- .addReg(AMDGPU::SGPR0, RegState::Undef);
- }
-
-
- // Now that we have the scratch pointer and offset values, we need to
- // add them to all the SI_SPILL_V* instructions.
-
- RegScavenger RS;
- unsigned ScratchRsrcFI = FrameInfo->CreateSpillStackObject(16, 4);
- RS.addScavengingFrameIndex(ScratchRsrcFI);
-
- for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
- BI != BE; ++BI) {
-
- MachineBasicBlock &MBB = *BI;
- // Add the scratch offset reg as a live-in so that the register scavenger
- // doesn't re-use it.
- if (!MBB.isLiveIn(ScratchOffsetReg) &&
- ScratchOffsetReg != AMDGPU::NoRegister)
- MBB.addLiveIn(ScratchOffsetReg);
- RS.enterBasicBlock(&MBB);
-
- for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
- I != E; ++I) {
- MachineInstr &MI = *I;
- RS.forward(I);
- DebugLoc DL = MI.getDebugLoc();
- if (!TII->isVGPRSpill(MI.getOpcode()))
- continue;
-
- // Scratch resource
- unsigned ScratchRsrcReg =
- RS.scavengeRegister(&AMDGPU::SReg_128RegClass, 0);
-
- uint64_t Rsrc23 = TII->getScratchRsrcWords23();
-
- unsigned Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
- unsigned Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
- unsigned Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2);
- unsigned Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);
-
- BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc0)
- .addExternalSymbol("SCRATCH_RSRC_DWORD0")
- .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
-
- BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc1)
- .addExternalSymbol("SCRATCH_RSRC_DWORD1")
- .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
-
- BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc2)
- .addImm(Rsrc23 & 0xffffffff)
- .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
-
- BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc3)
- .addImm(Rsrc23 >> 32)
- .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
-
- // Scratch Offset
- if (ScratchOffsetReg == AMDGPU::NoRegister) {
- ScratchOffsetReg = RS.scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
- BuildMI(MBB, I, DL, TII->get(AMDGPU::SI_SPILL_S32_RESTORE),
- ScratchOffsetReg)
- .addFrameIndex(ScratchOffsetFI)
- .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef)
- .addReg(AMDGPU::SGPR0, RegState::Undef);
- } else if (!MBB.isLiveIn(ScratchOffsetReg)) {
- MBB.addLiveIn(ScratchOffsetReg);
- }
-
- if (ScratchRsrcReg == AMDGPU::NoRegister ||
- ScratchOffsetReg == AMDGPU::NoRegister) {
- LLVMContext &Ctx = MF.getFunction()->getContext();
- Ctx.emitError("ran out of SGPRs for spilling VGPRs");
- ScratchRsrcReg = AMDGPU::SGPR0;
- ScratchOffsetReg = AMDGPU::SGPR0;
- }
- MI.getOperand(2).setReg(ScratchRsrcReg);
- MI.getOperand(2).setIsKill(true);
- MI.getOperand(2).setIsUndef(false);
- MI.getOperand(3).setReg(ScratchOffsetReg);
- MI.getOperand(3).setIsUndef(false);
- MI.getOperand(3).setIsKill(false);
- MI.addOperand(MachineOperand::CreateReg(Rsrc0, false, true, true));
- MI.addOperand(MachineOperand::CreateReg(Rsrc1, false, true, true));
- MI.addOperand(MachineOperand::CreateReg(Rsrc2, false, true, true));
- MI.addOperand(MachineOperand::CreateReg(Rsrc3, false, true, true));
- }
- }
- return true;
-}
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index e9e8412..3cdffef 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -12,7 +12,6 @@
//
//===----------------------------------------------------------------------===//
-
#include "SIRegisterInfo.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
@@ -33,6 +32,40 @@ void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved, unsigned Reg) co
Reserved.set(*R);
}
+unsigned SIRegisterInfo::reservedPrivateSegmentBufferReg(
+ const MachineFunction &MF) const {
+ const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
+ if (ST.hasSGPRInitBug()) {
+ unsigned BaseIdx = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 4 - 4;
+ unsigned BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx));
+ return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SReg_128RegClass);
+ }
+
+ if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+ // 98/99 need to be reserved for flat_scr, and 100/101 for vcc. This is the
+ // next sgpr128 down.
+ return AMDGPU::SGPR92_SGPR93_SGPR94_SGPR95;
+ }
+
+ return AMDGPU::SGPR96_SGPR97_SGPR98_SGPR99;
+}
+
+unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg(
+ const MachineFunction &MF) const {
+ const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
+ if (ST.hasSGPRInitBug()) {
+ unsigned Idx = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 4 - 5;
+ return AMDGPU::SGPR_32RegClass.getRegister(Idx);
+ }
+
+ if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+ // Next register before reservations for flat_scr and vcc.
+ return AMDGPU::SGPR97;
+ }
+
+ return AMDGPU::SGPR95;
+}
+
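
To make the index arithmetic above concrete, here is a throwaway C++ sketch; the value used for FIXED_SGPR_COUNT_FOR_INIT_BUG (80) is an assumption for illustration and should be checked against AMDGPUSubtarget.h:

#include <cstdio>

int main() {
  // Assumed value of AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG.
  const unsigned FixedSGPRCount = 80;
  // Tonga/Iceland (SGPR init bug): place the scratch registers just below the
  // 4 SGPRs kept back for FLAT_SCRATCH and VCC.
  unsigned RsrcBase   = FixedSGPRCount - 4 - 4; // start of the SReg_128 tuple
  unsigned WaveOffset = FixedSGPRCount - 4 - 5; // single SGPR below it
  printf("init-bug: rsrc = s[%u:%u], wave offset = s%u\n",
         RsrcBase, RsrcBase + 3, WaveOffset);
  // Other VI parts keep s98/s99 for FLAT_SCR and s100/s101 for VCC, so the
  // functions above return s[92:95] and s97; SI/CI return s[96:99] and s95.
  return 0;
}
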
BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
BitVector Reserved(getNumRegs());
Reserved.set(AMDGPU::INDIRECT_BASE_ADDR);
@@ -42,13 +75,22 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
reserveRegisterTuples(Reserved, AMDGPU::EXEC);
reserveRegisterTuples(Reserved, AMDGPU::FLAT_SCR);
- // Reserve some VGPRs to use as temp registers in case we have to spill VGPRs
- reserveRegisterTuples(Reserved, AMDGPU::VGPR254);
- reserveRegisterTuples(Reserved, AMDGPU::VGPR255);
+ // Reserve the last 2 registers so we will always have at least 2 more that
+ // will physically contain VCC.
+ reserveRegisterTuples(Reserved, AMDGPU::SGPR102_SGPR103);
+
+ const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
+
+ if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+ // SI/CI have 104 SGPRs. VI has 102. We need to shift down the reservation
+ // for VCC/FLAT_SCR.
+ reserveRegisterTuples(Reserved, AMDGPU::SGPR98_SGPR99);
+ reserveRegisterTuples(Reserved, AMDGPU::SGPR100_SGPR101);
+ }
// Tonga and Iceland can only allocate a fixed number of SGPRs due
// to a hw bug.
- if (MF.getSubtarget<AMDGPUSubtarget>().hasSGPRInitBug()) {
+ if (ST.hasSGPRInitBug()) {
unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
// Reserve some SGPRs for FLAT_SCRATCH and VCC (4 SGPRs).
// Assume XNACK_MASK is unused.
@@ -60,34 +102,57 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
}
}
+ const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+
+ unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();
+ if (ScratchWaveOffsetReg != AMDGPU::NoRegister) {
+ // Reserve 1 SGPR for scratch wave offset in case we need to spill.
+ reserveRegisterTuples(Reserved, ScratchWaveOffsetReg);
+ }
+
+ unsigned ScratchRSrcReg = MFI->getScratchRSrcReg();
+ if (ScratchRSrcReg != AMDGPU::NoRegister) {
+ // Reserve 4 SGPRs for the scratch buffer resource descriptor in case we need
+ // to spill.
+ // TODO: May need to reserve a VGPR if doing LDS spilling.
+ reserveRegisterTuples(Reserved, ScratchRSrcReg);
+ assert(!isSubRegister(ScratchRSrcReg, ScratchWaveOffsetReg));
+ }
+
return Reserved;
}
unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF,
unsigned Idx) const {
-
const AMDGPUSubtarget &STI = MF.getSubtarget<AMDGPUSubtarget>();
// FIXME: We should adjust the max number of waves based on LDS size.
unsigned SGPRLimit = getNumSGPRsAllowed(STI.getGeneration(),
STI.getMaxWavesPerCU());
unsigned VGPRLimit = getNumVGPRsAllowed(STI.getMaxWavesPerCU());
+ unsigned VSLimit = SGPRLimit + VGPRLimit;
+
for (regclass_iterator I = regclass_begin(), E = regclass_end();
I != E; ++I) {
+ const TargetRegisterClass *RC = *I;
- unsigned NumSubRegs = std::max((int)(*I)->getSize() / 4, 1);
+ unsigned NumSubRegs = std::max((int)RC->getSize() / 4, 1);
unsigned Limit;
- if (isSGPRClass(*I)) {
+ if (isPseudoRegClass(RC)) {
+ // FIXME: This is a hack. We should never be considering the pressure of
+ // these since no virtual register should ever have this class.
+ Limit = VSLimit;
+ } else if (isSGPRClass(RC)) {
Limit = SGPRLimit / NumSubRegs;
} else {
Limit = VGPRLimit / NumSubRegs;
}
- const int *Sets = getRegClassPressureSets(*I);
+ const int *Sets = getRegClassPressureSets(RC);
assert(Sets);
for (unsigned i = 0; Sets[i] != -1; ++i) {
- if (Sets[i] == (int)Idx)
+ if (Sets[i] == (int)Idx)
return Limit;
}
}
@@ -174,17 +239,17 @@ void SIRegisterInfo::buildScratchLoadStore(MachineBasicBlock::iterator MI,
unsigned SubReg = NumSubRegs > 1 ?
getPhysRegSubReg(Value, &AMDGPU::VGPR_32RegClass, i) :
Value;
- bool IsKill = (i == e - 1);
BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp))
- .addReg(SubReg, getDefRegState(IsLoad))
- .addReg(ScratchRsrcReg, getKillRegState(IsKill))
- .addReg(SOffset)
- .addImm(Offset)
- .addImm(0) // glc
- .addImm(0) // slc
- .addImm(0) // tfe
- .addReg(Value, RegState::Implicit | getDefRegState(IsLoad));
+ .addReg(SubReg, getDefRegState(IsLoad))
+ .addReg(ScratchRsrcReg)
+ .addReg(SOffset)
+ .addImm(Offset)
+ .addImm(0) // glc
+ .addImm(0) // slc
+ .addImm(0) // tfe
+ .addReg(Value, RegState::Implicit | getDefRegState(IsLoad))
+ .setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
}
}
@@ -228,6 +293,9 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
.addReg(SubReg)
.addImm(Spill.Lane);
+ // FIXME: Since this spills to another register instead of an actual
+ // frame index, we should delete the frame index when all references to
+ // it are fixed.
}
MI->eraseFromParent();
break;
@@ -263,16 +331,17 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
// TODO: only do this when it is needed
switch (MF->getSubtarget<AMDGPUSubtarget>().getGeneration()) {
case AMDGPUSubtarget::SOUTHERN_ISLANDS:
- // "VALU writes SGPR" -> "SMRD reads that SGPR" needs "S_NOP 3" on SI
- TII->insertNOPs(MI, 3);
+ // "VALU writes SGPR" -> "SMRD reads that SGPR" needs 4 wait states
+ // ("S_NOP 3") on SI
+ TII->insertWaitStates(MI, 4);
break;
case AMDGPUSubtarget::SEA_ISLANDS:
break;
default: // VOLCANIC_ISLANDS and later
- // "VALU writes SGPR -> VMEM reads that SGPR" needs "S_NOP 4" on VI
- // and later. This also applies to VALUs which write VCC, but we're
- // unlikely to see VMEM use VCC.
- TII->insertNOPs(MI, 4);
+ // "VALU writes SGPR -> VMEM reads that SGPR" needs 5 wait states
+ // ("S_NOP 4") on VI and later. This also applies to VALUs which write
+ // VCC, but we're unlikely to see VMEM use VCC.
+ TII->insertWaitStates(MI, 5);
}
MI->eraseFromParent();
@@ -322,22 +391,16 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
}
}
-const TargetRegisterClass * SIRegisterInfo::getCFGStructurizerRegClass(
- MVT VT) const {
- switch(VT.SimpleTy) {
- default:
- case MVT::i32: return &AMDGPU::VGPR_32RegClass;
- }
-}
-
unsigned SIRegisterInfo::getHWRegIndex(unsigned Reg) const {
return getEncodingValue(Reg) & 0xff;
}
+// FIXME: This is very slow. It might be worth creating a map from physreg to
+// register class.
const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const {
assert(!TargetRegisterInfo::isVirtualRegister(Reg));
- static const TargetRegisterClass *BaseClasses[] = {
+ static const TargetRegisterClass *const BaseClasses[] = {
&AMDGPU::VGPR_32RegClass,
&AMDGPU::SReg_32RegClass,
&AMDGPU::VReg_64RegClass,
@@ -359,33 +422,45 @@ const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const {
return nullptr;
}
+// TODO: It might be helpful to have some target specific flags in
+// TargetRegisterClass to mark which classes are VGPRs to make this trivial.
bool SIRegisterInfo::hasVGPRs(const TargetRegisterClass *RC) const {
- return getCommonSubClass(&AMDGPU::VGPR_32RegClass, RC) ||
- getCommonSubClass(&AMDGPU::VReg_64RegClass, RC) ||
- getCommonSubClass(&AMDGPU::VReg_96RegClass, RC) ||
- getCommonSubClass(&AMDGPU::VReg_128RegClass, RC) ||
- getCommonSubClass(&AMDGPU::VReg_256RegClass, RC) ||
- getCommonSubClass(&AMDGPU::VReg_512RegClass, RC);
+ switch (RC->getSize()) {
+ case 4:
+ return getCommonSubClass(&AMDGPU::VGPR_32RegClass, RC) != nullptr;
+ case 8:
+ return getCommonSubClass(&AMDGPU::VReg_64RegClass, RC) != nullptr;
+ case 12:
+ return getCommonSubClass(&AMDGPU::VReg_96RegClass, RC) != nullptr;
+ case 16:
+ return getCommonSubClass(&AMDGPU::VReg_128RegClass, RC) != nullptr;
+ case 32:
+ return getCommonSubClass(&AMDGPU::VReg_256RegClass, RC) != nullptr;
+ case 64:
+ return getCommonSubClass(&AMDGPU::VReg_512RegClass, RC) != nullptr;
+ default:
+ llvm_unreachable("Invalid register class size");
+ }
}
const TargetRegisterClass *SIRegisterInfo::getEquivalentVGPRClass(
const TargetRegisterClass *SRC) const {
- if (hasVGPRs(SRC)) {
- return SRC;
- } else if (SRC == &AMDGPU::SCCRegRegClass) {
- return &AMDGPU::VCCRegRegClass;
- } else if (getCommonSubClass(SRC, &AMDGPU::SGPR_32RegClass)) {
- return &AMDGPU::VGPR_32RegClass;
- } else if (getCommonSubClass(SRC, &AMDGPU::SGPR_64RegClass)) {
- return &AMDGPU::VReg_64RegClass;
- } else if (getCommonSubClass(SRC, &AMDGPU::SReg_128RegClass)) {
- return &AMDGPU::VReg_128RegClass;
- } else if (getCommonSubClass(SRC, &AMDGPU::SReg_256RegClass)) {
- return &AMDGPU::VReg_256RegClass;
- } else if (getCommonSubClass(SRC, &AMDGPU::SReg_512RegClass)) {
- return &AMDGPU::VReg_512RegClass;
- }
- return nullptr;
+ switch (SRC->getSize()) {
+ case 4:
+ return &AMDGPU::VGPR_32RegClass;
+ case 8:
+ return &AMDGPU::VReg_64RegClass;
+ case 12:
+ return &AMDGPU::VReg_96RegClass;
+ case 16:
+ return &AMDGPU::VReg_128RegClass;
+ case 32:
+ return &AMDGPU::VReg_256RegClass;
+ case 64:
+ return &AMDGPU::VReg_512RegClass;
+ default:
+ llvm_unreachable("Invalid register class size");
+ }
}
const TargetRegisterClass *SIRegisterInfo::getSubRegClass(
@@ -402,6 +477,30 @@ const TargetRegisterClass *SIRegisterInfo::getSubRegClass(
}
}
+bool SIRegisterInfo::shouldRewriteCopySrc(
+ const TargetRegisterClass *DefRC,
+ unsigned DefSubReg,
+ const TargetRegisterClass *SrcRC,
+ unsigned SrcSubReg) const {
+ // We want to prefer the smallest register class possible, so we don't want to
+ // stop and rewrite on anything that looks like a subregister
+ // extract. Operations mostly don't care about the super register class, so we
+ // only want to stop on the most basic of copies between the same register
+ // class.
+ //
+ // e.g. if we have something like
+ // vreg0 = ...
+ // vreg1 = ...
+ // vreg2 = REG_SEQUENCE vreg0, sub0, vreg1, sub1, vreg2, sub2
+ // vreg3 = COPY vreg2, sub0
+ //
+ // We want to look through the COPY to find:
+ // => vreg3 = COPY vreg0
+
+ // Plain copy.
+ return getCommonSubClass(DefRC, SrcRC) != nullptr;
+}
+
unsigned SIRegisterInfo::getPhysRegSubReg(unsigned Reg,
const TargetRegisterClass *SubRC,
unsigned Channel) const {
@@ -462,30 +561,47 @@ bool SIRegisterInfo::opCanUseInlineConstant(unsigned OpType) const {
return OpType == AMDGPU::OPERAND_REG_INLINE_C;
}
+// FIXME: Most of these are flexible with HSA and we don't need to reserve them
+// as input registers if unused. Whether the dispatch ptr is necessary should be
+// easy to detect from used intrinsics. Scratch setup is harder to know.
unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF,
enum PreloadedValue Value) const {
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
+ (void)ST;
switch (Value) {
- case SIRegisterInfo::TGID_X:
- return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 0);
- case SIRegisterInfo::TGID_Y:
- return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 1);
- case SIRegisterInfo::TGID_Z:
- return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 2);
- case SIRegisterInfo::SCRATCH_WAVE_OFFSET:
- if (MFI->getShaderType() != ShaderType::COMPUTE)
- return MFI->ScratchOffsetReg;
- return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 4);
- case SIRegisterInfo::SCRATCH_PTR:
- return AMDGPU::SGPR2_SGPR3;
- case SIRegisterInfo::INPUT_PTR:
- return AMDGPU::SGPR0_SGPR1;
- case SIRegisterInfo::TIDIG_X:
+ case SIRegisterInfo::WORKGROUP_ID_X:
+ assert(MFI->hasWorkGroupIDX());
+ return MFI->WorkGroupIDXSystemSGPR;
+ case SIRegisterInfo::WORKGROUP_ID_Y:
+ assert(MFI->hasWorkGroupIDY());
+ return MFI->WorkGroupIDYSystemSGPR;
+ case SIRegisterInfo::WORKGROUP_ID_Z:
+ assert(MFI->hasWorkGroupIDZ());
+ return MFI->WorkGroupIDZSystemSGPR;
+ case SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET:
+ return MFI->PrivateSegmentWaveByteOffsetSystemSGPR;
+ case SIRegisterInfo::PRIVATE_SEGMENT_BUFFER:
+ assert(ST.isAmdHsaOS() && "Non-HSA ABI currently uses relocations");
+ assert(MFI->hasPrivateSegmentBuffer());
+ return MFI->PrivateSegmentBufferUserSGPR;
+ case SIRegisterInfo::KERNARG_SEGMENT_PTR:
+ assert(MFI->hasKernargSegmentPtr());
+ return MFI->KernargSegmentPtrUserSGPR;
+ case SIRegisterInfo::DISPATCH_PTR:
+ assert(MFI->hasDispatchPtr());
+ return MFI->DispatchPtrUserSGPR;
+ case SIRegisterInfo::QUEUE_PTR:
+ llvm_unreachable("not implemented");
+ case SIRegisterInfo::WORKITEM_ID_X:
+ assert(MFI->hasWorkItemIDX());
return AMDGPU::VGPR0;
- case SIRegisterInfo::TIDIG_Y:
+ case SIRegisterInfo::WORKITEM_ID_Y:
+ assert(MFI->hasWorkItemIDY());
return AMDGPU::VGPR1;
- case SIRegisterInfo::TIDIG_Z:
+ case SIRegisterInfo::WORKITEM_ID_Z:
+ assert(MFI->hasWorkItemIDZ());
return AMDGPU::VGPR2;
}
llvm_unreachable("unexpected preloaded value type");
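For orientation, a minimal sketch (assumed caller, not part of this patch) of how lowering code might consume one of the renamed preloaded values; the helper name and insertion point are hypothetical:

  // Hypothetical helper: copy the workgroup X id SGPR into a fresh virtual
  // register so later passes can use it without clobbering the input.
  static unsigned copyWorkGroupIDX(MachineFunction &MF, MachineBasicBlock &MBB,
                                   MachineBasicBlock::iterator I, DebugLoc DL,
                                   const SIInstrInfo *TII,
                                   const SIRegisterInfo *TRI) {
    MachineRegisterInfo &MRI = MF.getRegInfo();
    unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_X);
    unsigned VReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(MBB, I, DL, TII->get(TargetOpcode::COPY), VReg).addReg(Reg);
    return VReg;
  }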
@@ -496,12 +612,9 @@ unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF,
// AMDGPU::NoRegister.
unsigned SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI,
const TargetRegisterClass *RC) const {
-
- for (TargetRegisterClass::iterator I = RC->begin(), E = RC->end();
- I != E; ++I) {
- if (!MRI.isPhysRegUsed(*I))
- return *I;
- }
+ for (unsigned Reg : *RC)
+ if (!MRI.isPhysRegUsed(Reg))
+ return Reg;
return AMDGPU::NoRegister;
}
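As a usage sketch of the simplified loop above (the surrounding spill-setup logic is assumed; only the two calls are from this file), a caller might probe for a free 128-bit SGPR tuple and fall back to the pre-reserved one when the class is exhausted:

  // Sketch only: AMDGPU::NoRegister signals that the whole class is in use.
  unsigned ScratchRsrc =
      TRI->findUnusedRegister(MRI, &AMDGPU::SReg_128RegClass);
  if (ScratchRsrc == AMDGPU::NoRegister)
    ScratchRsrc = TRI->reservedPrivateSegmentBufferReg(MF);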
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
index 7da6de2..1795237 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -18,6 +18,7 @@
#include "AMDGPURegisterInfo.h"
#include "AMDGPUSubtarget.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Support/Debug.h"
namespace llvm {
@@ -29,6 +30,15 @@ private:
public:
SIRegisterInfo();
+ /// Return the end register initially reserved for the scratch buffer in case
+ /// spilling is needed.
+ unsigned reservedPrivateSegmentBufferReg(const MachineFunction &MF) const;
+
+ /// Return the end register initially reserved for the scratch wave offset in
+ /// case spilling is needed.
+ unsigned reservedPrivateSegmentWaveByteOffsetReg(
+ const MachineFunction &MF) const;
+
BitVector getReservedRegs(const MachineFunction &MF) const override;
unsigned getRegPressureSetLimit(const MachineFunction &MF,
@@ -40,10 +50,6 @@ public:
unsigned FIOperandNum,
RegScavenger *RS) const override;
- /// \brief get the register class of the specified type to use in the
- /// CFGStructurizer
- const TargetRegisterClass * getCFGStructurizerRegClass(MVT VT) const override;
-
unsigned getHWRegIndex(unsigned Reg) const override;
/// \brief Return the 'base' register class for this register.
@@ -52,23 +58,30 @@ public:
/// \returns true if this class contains only SGPR registers
bool isSGPRClass(const TargetRegisterClass *RC) const {
- if (!RC)
- return false;
-
return !hasVGPRs(RC);
}
/// \returns true if this class ID contains only SGPR registers
bool isSGPRClassID(unsigned RCID) const {
- if (static_cast<int>(RCID) == -1)
- return false;
-
return isSGPRClass(getRegClass(RCID));
}
+ bool isSGPRReg(const MachineRegisterInfo &MRI, unsigned Reg) const {
+ if (TargetRegisterInfo::isVirtualRegister(Reg))
+ return isSGPRClass(MRI.getRegClass(Reg));
+ return getPhysRegClass(Reg);
+ }
+
/// \returns true if this class contains VGPR registers.
bool hasVGPRs(const TargetRegisterClass *RC) const;
+ /// \returns true if this is a pseudoregister class combination of VGPRs and
+ /// SGPRs for operand modeling. FIXME: We should set isAllocatable = 0 on
+ /// them.
+ static bool isPseudoRegClass(const TargetRegisterClass *RC) {
+ return RC == &AMDGPU::VS_32RegClass || RC == &AMDGPU::VS_64RegClass;
+ }
+
/// \returns A VGPR reg class with the same width as \p SRC
const TargetRegisterClass *getEquivalentVGPRClass(
const TargetRegisterClass *SRC) const;
@@ -79,6 +92,11 @@ public:
const TargetRegisterClass *getSubRegClass(const TargetRegisterClass *RC,
unsigned SubIdx) const;
+ bool shouldRewriteCopySrc(const TargetRegisterClass *DefRC,
+ unsigned DefSubReg,
+ const TargetRegisterClass *SrcRC,
+ unsigned SrcSubReg) const override;
+
/// \p Channel This is the register channel (e.g. a value from 0-16), not the
/// SubReg index.
/// \returns The sub-register of Reg that is in Channel.
@@ -91,19 +109,25 @@ public:
/// \returns True if operands defined with this operand type can accept
/// an inline constant. i.e. An integer value in the range (-16, 64) or
- /// -4.0f, -2.0f, -1.0f, -0.5f, 0.0f, 0.5f, 1.0f, 2.0f, 4.0f.
+ /// -4.0f, -2.0f, -1.0f, -0.5f, 0.0f, 0.5f, 1.0f, 2.0f, 4.0f.
bool opCanUseInlineConstant(unsigned OpType) const;
enum PreloadedValue {
- TGID_X,
- TGID_Y,
- TGID_Z,
- SCRATCH_WAVE_OFFSET,
- SCRATCH_PTR,
- INPUT_PTR,
- TIDIG_X,
- TIDIG_Y,
- TIDIG_Z
+ // SGPRS:
+ PRIVATE_SEGMENT_BUFFER = 0,
+ DISPATCH_PTR = 1,
+ QUEUE_PTR = 2,
+ KERNARG_SEGMENT_PTR = 3,
+ WORKGROUP_ID_X = 10,
+ WORKGROUP_ID_Y = 11,
+ WORKGROUP_ID_Z = 12,
+ PRIVATE_SEGMENT_WAVE_BYTE_OFFSET = 14,
+
+ // VGPRS:
+ FIRST_VGPR_VALUE = 15,
+ WORKITEM_ID_X = FIRST_VGPR_VALUE,
+ WORKITEM_ID_Y = 16,
+ WORKITEM_ID_Z = 17
};
/// \brief Returns the physical register that \p Value is stored in.
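The sparse numbering above leaves room for inputs that are not implemented yet, and FIRST_VGPR_VALUE marks where the SGPR-carried values end. A trivial, hypothetical helper makes the split explicit:

  // Hypothetical, not part of the header: values at or above FIRST_VGPR_VALUE
  // arrive in VGPRs (work-item ids); everything below arrives in SGPRs.
  static bool isVGPRPreloadedValue(SIRegisterInfo::PreloadedValue V) {
    return V >= SIRegisterInfo::FIRST_VGPR_VALUE;
  }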
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index 2a9017f..bfaf937 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -10,10 +10,13 @@
//===----------------------------------------------------------------------===//
// Declarations that describe the SI registers
//===----------------------------------------------------------------------===//
-
-class SIReg <string n, bits<16> encoding = 0> : Register<n> {
+class SIReg <string n, bits<16> regIdx = 0> : Register<n>,
+ DwarfRegNum<[!cast<int>(HWEncoding)]> {
let Namespace = "AMDGPU";
- let HWEncoding = encoding;
+
+ // This is not yet the complete register encoding. An additional
+ // bit is set for VGPRs.
+ let HWEncoding = regIdx;
}
// Special Registers
@@ -21,7 +24,8 @@ def VCC_LO : SIReg<"vcc_lo", 106>;
def VCC_HI : SIReg<"vcc_hi", 107>;
// VCC for 64-bit instructions
-def VCC : RegisterWithSubRegs<"vcc", [VCC_LO, VCC_HI]> {
+def VCC : RegisterWithSubRegs<"vcc", [VCC_LO, VCC_HI]>,
+ DwarfRegAlias<VCC_LO> {
let Namespace = "AMDGPU";
let SubRegIndices = [sub0, sub1];
let HWEncoding = 106;
@@ -30,7 +34,8 @@ def VCC : RegisterWithSubRegs<"vcc", [VCC_LO, VCC_HI]> {
def EXEC_LO : SIReg<"exec_lo", 126>;
def EXEC_HI : SIReg<"exec_hi", 127>;
-def EXEC : RegisterWithSubRegs<"EXEC", [EXEC_LO, EXEC_HI]> {
+def EXEC : RegisterWithSubRegs<"EXEC", [EXEC_LO, EXEC_HI]>,
+ DwarfRegAlias<EXEC_LO> {
let Namespace = "AMDGPU";
let SubRegIndices = [sub0, sub1];
let HWEncoding = 126;
@@ -39,18 +44,29 @@ def EXEC : RegisterWithSubRegs<"EXEC", [EXEC_LO, EXEC_HI]> {
def SCC : SIReg<"scc", 253>;
def M0 : SIReg <"m0", 124>;
-def FLAT_SCR_LO : SIReg<"flat_scr_lo", 104>; // Offset in units of 256-bytes.
-def FLAT_SCR_HI : SIReg<"flat_scr_hi", 105>; // Size is the per-thread scratch size, in bytes.
+multiclass FLAT_SCR_LOHI_m <string n, bits<16> ci_e, bits<16> vi_e> {
+ def _ci : SIReg<n, ci_e>;
+ def _vi : SIReg<n, vi_e>;
+ def "" : SIReg<"", 0>;
+}
-// Pair to indicate location of scratch space for flat accesses.
-def FLAT_SCR : RegisterWithSubRegs <"flat_scr", [FLAT_SCR_LO, FLAT_SCR_HI]> {
+class FlatReg <Register lo, Register hi, bits<16> encoding> :
+ RegisterWithSubRegs<"flat_scratch", [lo, hi]>,
+ DwarfRegAlias<lo> {
let Namespace = "AMDGPU";
let SubRegIndices = [sub0, sub1];
- let HWEncoding = 104;
+ let HWEncoding = encoding;
}
+defm FLAT_SCR_LO : FLAT_SCR_LOHI_m<"flat_scratch_lo", 104, 102>; // Offset in units of 256 bytes.
+defm FLAT_SCR_HI : FLAT_SCR_LOHI_m<"flat_scratch_hi", 105, 103>; // Size is the per-thread scratch size, in bytes.
+
+def FLAT_SCR_ci : FlatReg<FLAT_SCR_LO_ci, FLAT_SCR_HI_ci, 104>;
+def FLAT_SCR_vi : FlatReg<FLAT_SCR_LO_vi, FLAT_SCR_HI_vi, 102>;
+def FLAT_SCR : FlatReg<FLAT_SCR_LO, FLAT_SCR_HI, 0>;
+
// SGPR registers
-foreach Index = 0-101 in {
+foreach Index = 0-103 in {
def SGPR#Index : SIReg <"SGPR"#Index, Index>;
}
@@ -65,25 +81,27 @@ foreach Index = 0-255 in {
// Groupings using register classes and tuples
//===----------------------------------------------------------------------===//
+// TODO: Do we need to set DwarfRegAlias on register tuples?
+
// SGPR 32-bit registers
def SGPR_32 : RegisterClass<"AMDGPU", [i32, f32], 32,
- (add (sequence "SGPR%u", 0, 101))>;
+ (add (sequence "SGPR%u", 0, 103))>;
// SGPR 64-bit registers
def SGPR_64Regs : RegisterTuples<[sub0, sub1],
- [(add (decimate (trunc SGPR_32, 101), 2)),
+ [(add (decimate SGPR_32, 2)),
(add (decimate (shl SGPR_32, 1), 2))]>;
// SGPR 128-bit registers
def SGPR_128 : RegisterTuples<[sub0, sub1, sub2, sub3],
- [(add (decimate (trunc SGPR_32, 99), 4)),
+ [(add (decimate SGPR_32, 4)),
(add (decimate (shl SGPR_32, 1), 4)),
(add (decimate (shl SGPR_32, 2), 4)),
(add (decimate (shl SGPR_32, 3), 4))]>;
// SGPR 256-bit registers
def SGPR_256 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7],
- [(add (decimate (trunc SGPR_32, 95), 4)),
+ [(add (decimate SGPR_32, 4)),
(add (decimate (shl SGPR_32, 1), 4)),
(add (decimate (shl SGPR_32, 2), 4)),
(add (decimate (shl SGPR_32, 3), 4)),
@@ -95,7 +113,7 @@ def SGPR_256 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7],
// SGPR 512-bit registers
def SGPR_512 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7,
sub8, sub9, sub10, sub11, sub12, sub13, sub14, sub15],
- [(add (decimate (trunc SGPR_32, 87), 4)),
+ [(add (decimate SGPR_32, 4)),
(add (decimate (shl SGPR_32, 1), 4)),
(add (decimate (shl SGPR_32, 2), 4)),
(add (decimate (shl SGPR_32, 3), 4)),
@@ -174,44 +192,57 @@ class RegImmMatcher<string name> : AsmOperandClass {
let RenderMethod = "addRegOrImmOperands";
}
-// Special register classes for predicates and the M0 register
-def SCCReg : RegisterClass<"AMDGPU", [i32, i1], 32, (add SCC)> {
- let CopyCost = -1; // Theoretically it is possible to read from SCC,
- // but it should never be necessary.
-}
-
-def VCCReg : RegisterClass<"AMDGPU", [i64, i1], 64, (add VCC)>;
-def EXECReg : RegisterClass<"AMDGPU", [i64, i1], 64, (add EXEC)>;
-
// Register class for all scalar registers (SGPRs + Special Registers)
def SReg_32 : RegisterClass<"AMDGPU", [i32, f32], 32,
(add SGPR_32, M0, VCC_LO, VCC_HI, EXEC_LO, EXEC_HI, FLAT_SCR_LO, FLAT_SCR_HI)
>;
-def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64], 64, (add SGPR_64Regs)>;
+def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64], 32, (add SGPR_64Regs)>;
-def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, i1], 64,
- (add SGPR_64, VCCReg, EXECReg, FLAT_SCR)
+def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, i1], 32,
+ (add SGPR_64, VCC, EXEC, FLAT_SCR)
>;
-def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v16i8], 128, (add SGPR_128)>;
+def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32, (add SGPR_128)> {
+ // Requires 2 s_mov_b64 to copy
+ let CopyCost = 2;
+}
-def SReg_256 : RegisterClass<"AMDGPU", [v32i8, v8i32, v8f32], 256, (add SGPR_256)>;
+def SReg_256 : RegisterClass<"AMDGPU", [v32i8, v8i32, v8f32], 32, (add SGPR_256)> {
+ // Requires 4 s_mov_b64 to copy
+ let CopyCost = 4;
+}
-def SReg_512 : RegisterClass<"AMDGPU", [v64i8, v16i32], 512, (add SGPR_512)>;
+def SReg_512 : RegisterClass<"AMDGPU", [v64i8, v16i32], 32, (add SGPR_512)> {
+ // Requires 8 s_mov_b64 to copy
+ let CopyCost = 8;
+}
// Register class for all vector registers (VGPRs + Interpolation Registers)
-def VReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32], 64, (add VGPR_64)>;
+def VReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32], 32, (add VGPR_64)> {
+ // Requires 2 v_mov_b32 to copy
+ let CopyCost = 2;
+}
-def VReg_96 : RegisterClass<"AMDGPU", [untyped], 96, (add VGPR_96)> {
+def VReg_96 : RegisterClass<"AMDGPU", [untyped], 32, (add VGPR_96)> {
let Size = 96;
+
+ // Requires 3 v_mov_b32 to copy
+ let CopyCost = 3;
}
-def VReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32], 128, (add VGPR_128)>;
+def VReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64, v2f64], 32, (add VGPR_128)> {
+ // Requires 4 v_mov_b32 to copy
+ let CopyCost = 4;
+}
-def VReg_256 : RegisterClass<"AMDGPU", [v32i8, v8i32, v8f32], 256, (add VGPR_256)>;
+def VReg_256 : RegisterClass<"AMDGPU", [v32i8, v8i32, v8f32], 32, (add VGPR_256)> {
+ let CopyCost = 8;
+}
-def VReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 512, (add VGPR_512)>;
+def VReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32, (add VGPR_512)> {
+ let CopyCost = 16;
+}
def VReg_1 : RegisterClass<"AMDGPU", [i1], 32, (add VGPR_32)> {
let Size = 32;
@@ -253,7 +284,9 @@ def SCSrc_32 : RegInlineOperand<SReg_32> {
def VS_32 : RegisterClass<"AMDGPU", [i32, f32], 32, (add VGPR_32, SReg_32)>;
-def VS_64 : RegisterClass<"AMDGPU", [i64, f64], 64, (add VReg_64, SReg_64)>;
+def VS_64 : RegisterClass<"AMDGPU", [i64, f64], 32, (add VReg_64, SReg_64)> {
+ let CopyCost = 2;
+}
def VSrc_32 : RegisterOperand<VS_32> {
let OperandNamespace = "AMDGPU";
@@ -282,3 +315,13 @@ def VCSrc_64 : RegisterOperand<VS_64> {
let OperandType = "OPERAND_REG_INLINE_C";
let ParserMatchClass = RegImmMatcher<"VCSrc64">;
}
+
+//===----------------------------------------------------------------------===//
+// SCSrc_* Operands with an SGPR or an inline constant
+//===----------------------------------------------------------------------===//
+
+def SCSrc_64 : RegisterOperand<SReg_64> {
+ let OperandNamespace = "AMDGPU";
+ let OperandType = "OPERAND_REG_INLINE_C";
+ let ParserMatchClass = RegImmMatcher<"SCSrc64">;
+}
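The CopyCost values introduced above follow directly from the register widths: SGPR tuples are copied with s_mov_b64 (8 bytes per move) and VGPR tuples with v_mov_b32 (4 bytes per move). A small illustrative sketch of that arithmetic (the .td definitions above carry the authoritative values):

  // Illustrative only: moves needed to copy a class of SizeInBytes bytes.
  static unsigned sgprCopyCost(unsigned SizeInBytes) {
    return SizeInBytes / 8;  // s_mov_b64: SReg_128 -> 2, SReg_256 -> 4, SReg_512 -> 8
  }
  static unsigned vgprCopyCost(unsigned SizeInBytes) {
    return SizeInBytes / 4;  // v_mov_b32: VReg_64 -> 2, VReg_128 -> 4, VReg_512 -> 16
  }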
diff --git a/contrib/llvm/lib/Target/AMDGPU/SISchedule.td b/contrib/llvm/lib/Target/AMDGPU/SISchedule.td
index 9b1f676..cd77e51 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SISchedule.td
+++ b/contrib/llvm/lib/Target/AMDGPU/SISchedule.td
@@ -17,16 +17,28 @@ def WriteLDS : SchedWrite;
def WriteSALU : SchedWrite;
def WriteSMEM : SchedWrite;
def WriteVMEM : SchedWrite;
+def WriteBarrier : SchedWrite;
// Vector ALU instructions
def Write32Bit : SchedWrite;
def WriteQuarterRate32 : SchedWrite;
+def WriteFullOrQuarterRate32 : SchedWrite;
def WriteFloatFMA : SchedWrite;
-def WriteDouble : SchedWrite;
+// Slow quarter rate f64 instruction.
+def WriteDouble : SchedWrite;
+
+// Half rate f64 instruction (same as v_add_f64)
def WriteDoubleAdd : SchedWrite;
+// Half rate 64-bit instructions.
+def Write64Bit : SchedWrite;
+
+// FIXME: Should there be a class for instructions which are VALU
+// instructions and have VALU rates, but write to the SALU (i.e. VOPC
+// instructions)?
+
def SIFullSpeedModel : SchedMachineModel;
def SIQuarterSpeedModel : SchedMachineModel;
@@ -53,7 +65,7 @@ class HWVALUWriteRes<SchedWrite write, int latency> :
// The latency numbers are taken from AMD Accelerated Parallel Processing
-// guide. They may not be acurate.
+// guide. They may not be accurate.
// The latency values are 1 / (operations / cycle) / 4.
multiclass SICommonWriteRes {
@@ -64,8 +76,10 @@ multiclass SICommonWriteRes {
def : HWWriteRes<WriteSALU, [HWSALU], 1>;
def : HWWriteRes<WriteSMEM, [HWLGKM], 10>; // XXX: Guessed ???
def : HWWriteRes<WriteVMEM, [HWVMEM], 450>; // 300 - 600
+ def : HWWriteRes<WriteBarrier, [HWBranch], 500>; // XXX: Guessed ???
def : HWVALUWriteRes<Write32Bit, 1>;
+ def : HWVALUWriteRes<Write64Bit, 2>;
def : HWVALUWriteRes<WriteQuarterRate32, 4>;
}
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/contrib/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
index 5d00bdd..4f0913f 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -141,8 +141,7 @@ static void foldImmediates(MachineInstr &MI, const SIInstrInfo *TII,
if (!MRI.isSSA())
return;
- assert(TII->isVOP1(MI.getOpcode()) || TII->isVOP2(MI.getOpcode()) ||
- TII->isVOPC(MI.getOpcode()));
+ assert(TII->isVOP1(MI) || TII->isVOP2(MI) || TII->isVOPC(MI));
const SIRegisterInfo &TRI = TII->getRegisterInfo();
int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
@@ -187,6 +186,21 @@ static void foldImmediates(MachineInstr &MI, const SIInstrInfo *TII,
}
+// Copy MachineOperand with all flags except setting it as implicit.
+static MachineOperand copyRegOperandAsImplicit(const MachineOperand &Orig) {
+ assert(!Orig.isImplicit());
+ return MachineOperand::CreateReg(Orig.getReg(),
+ Orig.isDef(),
+ true,
+ Orig.isKill(),
+ Orig.isDead(),
+ Orig.isUndef(),
+ Orig.isEarlyClobber(),
+ Orig.getSubReg(),
+ Orig.isDebug(),
+ Orig.isInternalRead());
+}
+
bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
MachineRegisterInfo &MRI = MF.getRegInfo();
const SIInstrInfo *TII =
@@ -236,14 +250,10 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
if (TII->isVOPC(Op32)) {
unsigned DstReg = MI.getOperand(0).getReg();
if (TargetRegisterInfo::isVirtualRegister(DstReg)) {
- // VOPC instructions can only write to the VCC register. We can't
- // force them to use VCC here, because the register allocator has
- // trouble with sequences like this, which cause the allocator to run
- // out of registers if vreg0 and vreg1 belong to the VCCReg register
- // class:
- // vreg0 = VOPC;
- // vreg1 = VOPC;
- // S_AND_B64 vreg0, vreg1
+ // VOPC instructions can only write to the VCC register. We can't
+ // force them to use VCC here, because VCC is a single register and
+ // cannot satisfy sequences that would require multiple live copies of
+ // VCC at once, e.g. S_AND_B64 (vcc = V_CMP_...), (vcc = V_CMP_...)
//
// So, instead of forcing the instruction to write to VCC, we provide
// a hint to the register allocator to use VCC and then we will run
@@ -272,13 +282,22 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
}
// We can shrink this instruction
- DEBUG(dbgs() << "Shrinking "; MI.dump(); dbgs() << '\n';);
+ DEBUG(dbgs() << "Shrinking " << MI);
MachineInstrBuilder Inst32 =
BuildMI(MBB, I, MI.getDebugLoc(), TII->get(Op32));
- // dst
- Inst32.addOperand(MI.getOperand(0));
+ // Add the dst operand if the 32-bit encoding also has an explicit $dst.
+ // For VOPC instructions, this is replaced by an implicit def of vcc.
+ int Op32DstIdx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::dst);
+ if (Op32DstIdx != -1) {
+ // dst
+ Inst32.addOperand(MI.getOperand(0));
+ } else {
+ assert(MI.getOperand(0).getReg() == AMDGPU::VCC &&
+ "Unexpected case");
+ }
+
Inst32.addOperand(*TII->getNamedOperand(MI, AMDGPU::OpName::src0));
@@ -288,9 +307,19 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
Inst32.addOperand(*Src1);
const MachineOperand *Src2 =
- TII->getNamedOperand(MI, AMDGPU::OpName::src2);
- if (Src2)
- Inst32.addOperand(*Src2);
+ TII->getNamedOperand(MI, AMDGPU::OpName::src2);
+ if (Src2) {
+ int Op32Src2Idx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2);
+ if (Op32Src2Idx != -1) {
+ Inst32.addOperand(*Src2);
+ } else {
+ // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
+ // replaced with an implicit read of vcc.
+ assert(Src2->getReg() == AMDGPU::VCC &&
+ "Unexpected missing register operand");
+ Inst32.addOperand(copyRegOperandAsImplicit(*Src2));
+ }
+ }
++NumInstructionsShrunk;
MI.eraseFromParent();
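To summarize the new dst handling above in one place (a sketch under the patch's own names; the wrapper itself is hypothetical): the 32-bit encoding of a VOPC instruction has no explicit $dst operand, so the result is an implicit def of vcc instead.

  // Hypothetical wrapper around the check used above: does the 32-bit opcode
  // still carry an explicit $dst operand?
  static bool op32HasExplicitDst(unsigned Op32) {
    return AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::dst) != -1;
  }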
diff --git a/contrib/llvm/lib/Target/AMDGPU/SITypeRewriter.cpp b/contrib/llvm/lib/Target/AMDGPU/SITypeRewriter.cpp
index 591ce85..dbdc76b 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SITypeRewriter.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/SITypeRewriter.cpp
@@ -22,6 +22,7 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
+#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
@@ -61,14 +62,7 @@ bool SITypeRewriter::doInitialization(Module &M) {
}
bool SITypeRewriter::runOnFunction(Function &F) {
- Attribute A = F.getFnAttribute("ShaderType");
-
- unsigned ShaderType = ShaderType::COMPUTE;
- if (A.isStringAttribute()) {
- StringRef Str = A.getValueAsString();
- Str.getAsInteger(0, ShaderType);
- }
- if (ShaderType == ShaderType::COMPUTE)
+ if (AMDGPU::getShaderType(F) == ShaderType::COMPUTE)
return false;
visit(F);
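The "ShaderType" string attribute read here is attached to the IR function by the frontend, and getShaderType() (added in the next file) defaults to ShaderType::COMPUTE when the attribute is absent, which is why this pass bails out for plain compute kernels. A hedged sketch of the producer/consumer pairing (both helpers are hypothetical):

  // Hypothetical producer: a graphics frontend tags the entry point.
  void tagAsPixelShader(Function &F) {
    F.addFnAttr("ShaderType", utostr(ShaderType::PIXEL));
  }
  // Hypothetical consumer mirroring the check above.
  bool isGraphicsShader(const Function &F) {
    return AMDGPU::getShaderType(F) != ShaderType::COMPUTE;
  }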
diff --git a/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index b76b400..add415e 100644
--- a/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -7,12 +7,23 @@
//
//===----------------------------------------------------------------------===//
#include "AMDGPUBaseInfo.h"
+#include "AMDGPU.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/SubtargetFeature.h"
#define GET_SUBTARGETINFO_ENUM
#include "AMDGPUGenSubtargetInfo.inc"
#undef GET_SUBTARGETINFO_ENUM
+#define GET_REGINFO_ENUM
+#include "AMDGPUGenRegisterInfo.inc"
+#undef GET_REGINFO_ENUM
+
namespace llvm {
namespace AMDGPU {
@@ -56,5 +67,91 @@ void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header,
Header.private_segment_alignment = 4;
}
+MCSection *getHSATextSection(MCContext &Ctx) {
+ return Ctx.getELFSection(".hsatext", ELF::SHT_PROGBITS,
+ ELF::SHF_ALLOC | ELF::SHF_WRITE |
+ ELF::SHF_EXECINSTR |
+ ELF::SHF_AMDGPU_HSA_AGENT |
+ ELF::SHF_AMDGPU_HSA_CODE);
+}
+
+MCSection *getHSADataGlobalAgentSection(MCContext &Ctx) {
+ return Ctx.getELFSection(".hsadata_global_agent", ELF::SHT_PROGBITS,
+ ELF::SHF_ALLOC | ELF::SHF_WRITE |
+ ELF::SHF_AMDGPU_HSA_GLOBAL |
+ ELF::SHF_AMDGPU_HSA_AGENT);
+}
+
+MCSection *getHSADataGlobalProgramSection(MCContext &Ctx) {
+ return Ctx.getELFSection(".hsadata_global_program", ELF::SHT_PROGBITS,
+ ELF::SHF_ALLOC | ELF::SHF_WRITE |
+ ELF::SHF_AMDGPU_HSA_GLOBAL);
+}
+
+MCSection *getHSARodataReadonlyAgentSection(MCContext &Ctx) {
+ return Ctx.getELFSection(".hsarodata_readonly_agent", ELF::SHT_PROGBITS,
+ ELF::SHF_ALLOC | ELF::SHF_AMDGPU_HSA_READONLY |
+ ELF::SHF_AMDGPU_HSA_AGENT);
+}
+
+bool isGroupSegment(const GlobalValue *GV) {
+ return GV->getType()->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
+}
+
+bool isGlobalSegment(const GlobalValue *GV) {
+ return GV->getType()->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;
+}
+
+bool isReadOnlySegment(const GlobalValue *GV) {
+ return GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS;
+}
+
+static const char ShaderTypeAttribute[] = "ShaderType";
+
+unsigned getShaderType(const Function &F) {
+ Attribute A = F.getFnAttribute(ShaderTypeAttribute);
+ unsigned ShaderType = ShaderType::COMPUTE;
+
+ if (A.isStringAttribute()) {
+ StringRef Str = A.getValueAsString();
+ if (Str.getAsInteger(0, ShaderType)) {
+ LLVMContext &Ctx = F.getContext();
+ Ctx.emitError("can't parse shader type");
+ }
+ }
+ return ShaderType;
+}
+
+bool isSI(const MCSubtargetInfo &STI) {
+ return STI.getFeatureBits()[AMDGPU::FeatureSouthernIslands];
+}
+
+bool isCI(const MCSubtargetInfo &STI) {
+ return STI.getFeatureBits()[AMDGPU::FeatureSeaIslands];
+}
+
+bool isVI(const MCSubtargetInfo &STI) {
+ return STI.getFeatureBits()[AMDGPU::FeatureVolcanicIslands];
+}
+
+unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI) {
+
+ switch(Reg) {
+ default: break;
+ case AMDGPU::FLAT_SCR:
+ assert(!isSI(STI));
+ return isCI(STI) ? AMDGPU::FLAT_SCR_ci : AMDGPU::FLAT_SCR_vi;
+
+ case AMDGPU::FLAT_SCR_LO:
+ assert(!isSI(STI));
+ return isCI(STI) ? AMDGPU::FLAT_SCR_LO_ci : AMDGPU::FLAT_SCR_LO_vi;
+
+ case AMDGPU::FLAT_SCR_HI:
+ assert(!isSI(STI));
+ return isCI(STI) ? AMDGPU::FLAT_SCR_HI_ci : AMDGPU::FLAT_SCR_HI_vi;
+ }
+ return Reg;
+}
+
} // End namespace AMDGPU
} // End namespace llvm
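A short usage sketch for the pseudo-to-hardware mapping above (the call site is assumed; getMCReg itself is from this patch): MC-level printers and encoders resolve the generic FLAT_SCR definitions to the subtarget-specific ones before emission.

  // Sketch only: resolve a possibly-pseudo register before printing/encoding.
  unsigned HWReg = AMDGPU::getMCReg(AMDGPU::FLAT_SCR, STI);
  // On CI this yields FLAT_SCR_ci (encoded as SGPRs 104/105); on VI it yields
  // FLAT_SCR_vi (SGPRs 102/103). Any other register is returned unchanged.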
diff --git a/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index f57028c..19419a2 100644
--- a/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -15,6 +15,11 @@
namespace llvm {
class FeatureBitset;
+class Function;
+class GlobalValue;
+class MCContext;
+class MCSection;
+class MCSubtargetInfo;
namespace AMDGPU {
@@ -27,6 +32,27 @@ struct IsaVersion {
IsaVersion getIsaVersion(const FeatureBitset &Features);
void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header,
const FeatureBitset &Features);
+MCSection *getHSATextSection(MCContext &Ctx);
+
+MCSection *getHSADataGlobalAgentSection(MCContext &Ctx);
+
+MCSection *getHSADataGlobalProgramSection(MCContext &Ctx);
+
+MCSection *getHSARodataReadonlyAgentSection(MCContext &Ctx);
+
+bool isGroupSegment(const GlobalValue *GV);
+bool isGlobalSegment(const GlobalValue *GV);
+bool isReadOnlySegment(const GlobalValue *GV);
+
+unsigned getShaderType(const Function &F);
+
+bool isSI(const MCSubtargetInfo &STI);
+bool isCI(const MCSubtargetInfo &STI);
+bool isVI(const MCSubtargetInfo &STI);
+
+/// If \p Reg is a pseudo reg, return the correct hardware register given
+/// \p STI, otherwise return \p Reg.
+unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI);
} // end namespace AMDGPU
} // end namespace llvm
diff --git a/contrib/llvm/lib/Target/AMDGPU/VIInstructions.td b/contrib/llvm/lib/Target/AMDGPU/VIInstructions.td
index aca4673..20a026a 100644
--- a/contrib/llvm/lib/Target/AMDGPU/VIInstructions.td
+++ b/contrib/llvm/lib/Target/AMDGPU/VIInstructions.td
@@ -73,8 +73,8 @@ defm V_MIN_I16 : VOP2Inst <vop2<0,0x32>, "v_min_i16", VOP_I16_I16_I16>;
} // End isCommutable = 1
defm V_LDEXP_F16 : VOP2Inst <vop2<0,0x33>, "v_ldexp_f16", VOP_F16_F16_I16>;
-// Aliases to simplify matching of floating-pint instructions that are VOP2 on
-// SI and VOP3 on VI.
+// Aliases to simplify matching of floating-point instructions that
+// are VOP2 on SI and VOP3 on VI.
class SI2_VI3Alias <string name, Instruction inst> : InstAlias <
name#" $dst, $src0, $src1",
@@ -89,60 +89,15 @@ def : SI2_VI3Alias <"v_cvt_pknorm_i16_f32", V_CVT_PKNORM_I16_F32_e64_vi>;
def : SI2_VI3Alias <"v_cvt_pknorm_u16_f32", V_CVT_PKNORM_U16_F32_e64_vi>;
def : SI2_VI3Alias <"v_cvt_pkrtz_f16_f32", V_CVT_PKRTZ_F16_F32_e64_vi>;
-} // End SIAssemblerPredicate = DisableInst, SubtargetPredicate = isVI
-
//===----------------------------------------------------------------------===//
-// SMEM Patterns
+// SMEM Instructions
//===----------------------------------------------------------------------===//
-let Predicates = [isVI] in {
+def S_DCACHE_WB : SMEM_Inval <0x21,
+ "s_dcache_wb", int_amdgcn_s_dcache_wb>;
-// 1. Offset as 20bit DWORD immediate
-def : Pat <
- (SIload_constant v4i32:$sbase, IMM20bit:$offset),
- (S_BUFFER_LOAD_DWORD_IMM $sbase, (as_i32imm $offset))
->;
-
-// Patterns for global loads with no offset
-class FlatLoadPat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat <
- (vt (node i64:$addr)),
- (inst $addr, 0, 0, 0)
->;
-
-def : FlatLoadPat <FLAT_LOAD_UBYTE, az_extloadi8_global, i32>;
-def : FlatLoadPat <FLAT_LOAD_SBYTE, sextloadi8_global, i32>;
-def : FlatLoadPat <FLAT_LOAD_USHORT, az_extloadi16_global, i32>;
-def : FlatLoadPat <FLAT_LOAD_SSHORT, sextloadi16_global, i32>;
-def : FlatLoadPat <FLAT_LOAD_DWORD, global_load, i32>;
-def : FlatLoadPat <FLAT_LOAD_DWORDX2, global_load, v2i32>;
-def : FlatLoadPat <FLAT_LOAD_DWORDX4, global_load, v4i32>;
-
-class FlatStorePat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat <
- (node vt:$data, i64:$addr),
- (inst $data, $addr, 0, 0, 0)
->;
-
-def : FlatStorePat <FLAT_STORE_BYTE, truncstorei8_global, i32>;
-def : FlatStorePat <FLAT_STORE_SHORT, truncstorei16_global, i32>;
-def : FlatStorePat <FLAT_STORE_DWORD, global_store, i32>;
-def : FlatStorePat <FLAT_STORE_DWORDX2, global_store, v2i32>;
-def : FlatStorePat <FLAT_STORE_DWORDX4, global_store, v4i32>;
-
-class FlatAtomicPat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat <
- (vt (node i64:$addr, vt:$data)),
- (inst $addr, $data, 0, 0)
->;
-
-def : FlatAtomicPat <FLAT_ATOMIC_ADD_RTN, atomic_add_global, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_AND_RTN, atomic_and_global, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_SUB_RTN, atomic_sub_global, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_SMAX_RTN, atomic_max_global, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_UMAX_RTN, atomic_umax_global, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_SMIN_RTN, atomic_min_global, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_UMIN_RTN, atomic_umin_global, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_OR_RTN, atomic_or_global, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_SWAP_RTN, atomic_swap_global, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_XOR_RTN, atomic_xor_global, i32>;
+def S_DCACHE_WB_VOL : SMEM_Inval <0x23,
+ "s_dcache_wb_vol", int_amdgcn_s_dcache_wb_vol>;
+} // End SIAssemblerPredicate = DisableInst, SubtargetPredicate = isVI
-} // End Predicates = [isVI]