Diffstat (limited to 'contrib/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp')
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp | 300
1 file changed, 198 insertions(+), 102 deletions(-)
diff --git a/contrib/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/contrib/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index 9e972a5..99fe96c 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -60,31 +60,35 @@ private:
const SIInstrInfo *TII;
const SIRegisterInfo *TRI;
MachineRegisterInfo *MRI;
- LiveIntervals *LIS;
+ AliasAnalysis *AA;
static bool offsetsCanBeCombined(unsigned Offset0,
unsigned Offset1,
unsigned EltSize);
- MachineBasicBlock::iterator findMatchingDSInst(MachineBasicBlock::iterator I,
- unsigned EltSize);
+ MachineBasicBlock::iterator findMatchingDSInst(
+ MachineBasicBlock::iterator I,
+ unsigned EltSize,
+ SmallVectorImpl<MachineInstr*> &InstsToMove);
MachineBasicBlock::iterator mergeRead2Pair(
MachineBasicBlock::iterator I,
MachineBasicBlock::iterator Paired,
- unsigned EltSize);
+ unsigned EltSize,
+ ArrayRef<MachineInstr*> InstsToMove);
MachineBasicBlock::iterator mergeWrite2Pair(
MachineBasicBlock::iterator I,
MachineBasicBlock::iterator Paired,
- unsigned EltSize);
+ unsigned EltSize,
+ ArrayRef<MachineInstr*> InstsToMove);
public:
static char ID;
SILoadStoreOptimizer()
: MachineFunctionPass(ID), TII(nullptr), TRI(nullptr), MRI(nullptr),
- LIS(nullptr) {}
+ AA(nullptr) {}
SILoadStoreOptimizer(const TargetMachine &TM_) : MachineFunctionPass(ID) {
initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
@@ -94,16 +98,11 @@ public:
bool runOnMachineFunction(MachineFunction &MF) override;
- const char *getPassName() const override {
- return "SI Load / Store Optimizer";
- }
+ StringRef getPassName() const override { return "SI Load / Store Optimizer"; }
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
- AU.addPreserved<SlotIndexes>();
- AU.addPreserved<LiveIntervals>();
- AU.addPreserved<LiveVariables>();
- AU.addRequired<LiveIntervals>();
+ AU.addRequired<AAResultsWrapperPass>();
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -113,9 +112,7 @@ public:
INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
"SI Load / Store Optimizer", false, false)
-INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
-INITIALIZE_PASS_DEPENDENCY(LiveVariables)
-INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE,
"SI Load / Store Optimizer", false, false)
@@ -127,6 +124,73 @@ FunctionPass *llvm::createSILoadStoreOptimizerPass(TargetMachine &TM) {
return new SILoadStoreOptimizer(TM);
}
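+// Move every instruction in InstsToMove to the point immediately after I,
+// preserving their original relative order.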
+static void moveInstsAfter(MachineBasicBlock::iterator I,
+ ArrayRef<MachineInstr*> InstsToMove) {
+ MachineBasicBlock *MBB = I->getParent();
+ ++I;
+ for (MachineInstr *MI : InstsToMove) {
+ MI->removeFromParent();
+ MBB->insert(I, MI);
+ }
+}
+
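+// Collect pointers to the register defs of MI so that the instructions that
+// follow it can be checked for dependences on them.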
+static void addDefsToList(const MachineInstr &MI,
+ SmallVectorImpl<const MachineOperand *> &Defs) {
+ for (const MachineOperand &Def : MI.defs()) {
+ Defs.push_back(&Def);
+ }
+}
+
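+// Two memory accesses may be reordered if the target can prove they access
+// disjoint memory, or if neither of them writes memory (RAR).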
+static bool memAccessesCanBeReordered(
+ MachineBasicBlock::iterator A,
+ MachineBasicBlock::iterator B,
+ const SIInstrInfo *TII,
+ llvm::AliasAnalysis *AA) {
+ return (TII->areMemAccessesTriviallyDisjoint(*A, *B, AA) ||
+ // RAW or WAR - cannot reorder
+ // WAW - cannot reorder
+ // RAR - safe to reorder
+ !(A->mayStore() || B->mayStore()));
+}
+
+// Add MI and its defs to the lists if MI reads one of the defs that are
+// already in the list. Returns true in that case.
+static bool
+addToListsIfDependent(MachineInstr &MI,
+ SmallVectorImpl<const MachineOperand *> &Defs,
+ SmallVectorImpl<MachineInstr*> &Insts) {
+ for (const MachineOperand *Def : Defs) {
+ bool ReadDef = MI.readsVirtualRegister(Def->getReg());
+ // If ReadDef is true, then there is a use of Def between I
+ // and the instruction that I will potentially be merged with. We
+ // will need to move this instruction after the merged instructions.
+ if (ReadDef) {
+ Insts.push_back(&MI);
+ addDefsToList(MI, Defs);
+ return true;
+ }
+ }
+
+ return false;
+}
+
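+// Check that every load or store in InstsToMove can be reordered with respect
+// to MemOp, i.e. that it is safe to sink all of them past the merged access.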
+static bool
+canMoveInstsAcrossMemOp(MachineInstr &MemOp,
+ ArrayRef<MachineInstr*> InstsToMove,
+ const SIInstrInfo *TII,
+ AliasAnalysis *AA) {
+
+ assert(MemOp.mayLoadOrStore());
+
+ for (MachineInstr *InstToMove : InstsToMove) {
+ if (!InstToMove->mayLoadOrStore())
+ continue;
+ if (!memAccessesCanBeReordered(MemOp, *InstToMove, TII, AA))
+ return false;
+ }
+ return true;
+}
+
bool SILoadStoreOptimizer::offsetsCanBeCombined(unsigned Offset0,
unsigned Offset1,
unsigned Size) {
@@ -156,43 +220,99 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(unsigned Offset0,
MachineBasicBlock::iterator
SILoadStoreOptimizer::findMatchingDSInst(MachineBasicBlock::iterator I,
- unsigned EltSize){
+ unsigned EltSize,
+ SmallVectorImpl<MachineInstr*> &InstsToMove) {
MachineBasicBlock::iterator E = I->getParent()->end();
MachineBasicBlock::iterator MBBI = I;
++MBBI;
- if (MBBI->getOpcode() != I->getOpcode())
- return E;
-
- // Don't merge volatiles.
- if (MBBI->hasOrderedMemoryRef())
- return E;
-
- int AddrIdx = AMDGPU::getNamedOperandIdx(I->getOpcode(), AMDGPU::OpName::addr);
- const MachineOperand &AddrReg0 = I->getOperand(AddrIdx);
- const MachineOperand &AddrReg1 = MBBI->getOperand(AddrIdx);
-
- // Check same base pointer. Be careful of subregisters, which can occur with
- // vectors of pointers.
- if (AddrReg0.getReg() == AddrReg1.getReg() &&
- AddrReg0.getSubReg() == AddrReg1.getSubReg()) {
- int OffsetIdx = AMDGPU::getNamedOperandIdx(I->getOpcode(),
- AMDGPU::OpName::offset);
- unsigned Offset0 = I->getOperand(OffsetIdx).getImm() & 0xffff;
- unsigned Offset1 = MBBI->getOperand(OffsetIdx).getImm() & 0xffff;
-
- // Check both offsets fit in the reduced range.
- if (offsetsCanBeCombined(Offset0, Offset1, EltSize))
- return MBBI;
- }
+ SmallVector<const MachineOperand *, 8> DefsToMove;
+ addDefsToList(*I, DefsToMove);
+ for ( ; MBBI != E; ++MBBI) {
+
+ if (MBBI->getOpcode() != I->getOpcode()) {
+
+ // This is not a matching DS instruction, but we can keep looking as
+ // long as one of these conditions is met:
+ // 1. It is safe to move I down past MBBI.
+ // 2. It is safe to move MBBI down past the instruction that I will
+ // be merged into.
+
+ if (MBBI->hasUnmodeledSideEffects())
+ // We can't re-order this instruction with respect to other memory
+ // operations, so we fail both conditions mentioned above.
+ return E;
+
+ if (MBBI->mayLoadOrStore() &&
+ !memAccessesCanBeReordered(*I, *MBBI, TII, AA)) {
+ // We fail condition #1, but we may still be able to satisfy condition
+ // #2. Add this instruction to the move list and then we will check
+ // if condition #2 holds once we have selected the matching instruction.
+ InstsToMove.push_back(&*MBBI);
+ addDefsToList(*MBBI, DefsToMove);
+ continue;
+ }
+
+ // When we match I with another DS instruction we will be moving I down
+ // to the location of the matched instruction, so any uses of I will need
+ // to be moved down as well.
+ addToListsIfDependent(*MBBI, DefsToMove, InstsToMove);
+ continue;
+ }
+
+ // Don't merge volatiles.
+ if (MBBI->hasOrderedMemoryRef())
+ return E;
+
+ // Handle a case like
+ // DS_WRITE_B32 addr, v, idx0
+ // w = DS_READ_B32 addr, idx0
+ // DS_WRITE_B32 addr, f(w), idx1
+ // where the DS_READ_B32 ends up in InstsToMove and therefore prevents
+ // merging of the two writes.
+ if (addToListsIfDependent(*MBBI, DefsToMove, InstsToMove))
+ continue;
+
+ int AddrIdx = AMDGPU::getNamedOperandIdx(I->getOpcode(), AMDGPU::OpName::addr);
+ const MachineOperand &AddrReg0 = I->getOperand(AddrIdx);
+ const MachineOperand &AddrReg1 = MBBI->getOperand(AddrIdx);
+
+ // Check same base pointer. Be careful of subregisters, which can occur with
+ // vectors of pointers.
+ if (AddrReg0.getReg() == AddrReg1.getReg() &&
+ AddrReg0.getSubReg() == AddrReg1.getSubReg()) {
+ int OffsetIdx = AMDGPU::getNamedOperandIdx(I->getOpcode(),
+ AMDGPU::OpName::offset);
+ unsigned Offset0 = I->getOperand(OffsetIdx).getImm() & 0xffff;
+ unsigned Offset1 = MBBI->getOperand(OffsetIdx).getImm() & 0xffff;
+
+ // Check both offsets fit in the reduced range.
+ // We also need to go through the list of instructions that we plan to
+ // move and make sure they are all safe to move down past the merged
+ // instruction.
+ if (offsetsCanBeCombined(Offset0, Offset1, EltSize) &&
+ canMoveInstsAcrossMemOp(*MBBI, InstsToMove, TII, AA))
+ return MBBI;
+ }
+
+ // We've found a load/store that we couldn't merge for some reason.
+ // We could potentially keep looking, but we'd need to make sure that
+ // it was safe to move I and also all the instructions in InstsToMove
+ // down past this instruction.
+ if (!memAccessesCanBeReordered(*I, *MBBI, TII, AA) || // check if we can move I across MBBI
+ !canMoveInstsAcrossMemOp(*MBBI, InstsToMove, TII, AA) // check if we can move all I's users
+ )
+ break;
+ }
return E;
}
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
MachineBasicBlock::iterator I,
MachineBasicBlock::iterator Paired,
- unsigned EltSize) {
+ unsigned EltSize,
+ ArrayRef<MachineInstr*> InstsToMove) {
MachineBasicBlock *MBB = I->getParent();
// Be careful, since the addresses could be subregisters themselves in weird
@@ -220,6 +340,15 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
Opc = (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
}
+ unsigned SubRegIdx0 = (EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
+ unsigned SubRegIdx1 = (EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;
+
+ if (NewOffset0 > NewOffset1) {
+ // Canonicalize the merged instruction so the smaller offset comes first.
+ std::swap(NewOffset0, NewOffset1);
+ std::swap(SubRegIdx0, SubRegIdx1);
+ }
+
assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
(NewOffset0 != NewOffset1) &&
"Computed offset doesn't fit");
@@ -232,62 +361,40 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
DebugLoc DL = I->getDebugLoc();
MachineInstrBuilder Read2
- = BuildMI(*MBB, I, DL, Read2Desc, DestReg)
+ = BuildMI(*MBB, Paired, DL, Read2Desc, DestReg)
.addOperand(*AddrReg) // addr
.addImm(NewOffset0) // offset0
.addImm(NewOffset1) // offset1
.addImm(0) // gds
.addMemOperand(*I->memoperands_begin())
.addMemOperand(*Paired->memoperands_begin());
-
- unsigned SubRegIdx0 = (EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
- unsigned SubRegIdx1 = (EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;
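+ // Read2 is only referenced by the DEBUG() dump below; the void cast avoids
+ // an unused-variable warning in builds where DEBUG() compiles away.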
+ (void)Read2;
const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
// Copy to the old destination registers.
- MachineInstr *Copy0 = BuildMI(*MBB, I, DL, CopyDesc)
+ BuildMI(*MBB, Paired, DL, CopyDesc)
.addOperand(*Dest0) // Copy to same destination including flags and sub reg.
.addReg(DestReg, 0, SubRegIdx0);
- MachineInstr *Copy1 = BuildMI(*MBB, I, DL, CopyDesc)
+ MachineInstr *Copy1 = BuildMI(*MBB, Paired, DL, CopyDesc)
.addOperand(*Dest1)
.addReg(DestReg, RegState::Kill, SubRegIdx1);
- LIS->InsertMachineInstrInMaps(*Read2);
-
- // repairLiveintervalsInRange() doesn't handle physical register, so we have
- // to update the M0 range manually.
- SlotIndex PairedIndex = LIS->getInstructionIndex(*Paired);
- LiveRange &M0Range = LIS->getRegUnit(*MCRegUnitIterator(AMDGPU::M0, TRI));
- LiveRange::Segment *M0Segment = M0Range.getSegmentContaining(PairedIndex);
- bool UpdateM0Range = M0Segment->end == PairedIndex.getRegSlot();
-
- // The new write to the original destination register is now the copy. Steal
- // the old SlotIndex.
- LIS->ReplaceMachineInstrInMaps(*I, *Copy0);
- LIS->ReplaceMachineInstrInMaps(*Paired, *Copy1);
+ moveInstsAfter(Copy1, InstsToMove);
+ MachineBasicBlock::iterator Next = std::next(I);
I->eraseFromParent();
Paired->eraseFromParent();
- LiveInterval &AddrRegLI = LIS->getInterval(AddrReg->getReg());
- LIS->shrinkToUses(&AddrRegLI);
-
- LIS->createAndComputeVirtRegInterval(DestReg);
-
- if (UpdateM0Range) {
- SlotIndex Read2Index = LIS->getInstructionIndex(*Read2);
- M0Segment->end = Read2Index.getRegSlot();
- }
-
DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
- return Read2.getInstr();
+ return Next;
}
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
MachineBasicBlock::iterator I,
MachineBasicBlock::iterator Paired,
- unsigned EltSize) {
+ unsigned EltSize,
+ ArrayRef<MachineInstr*> InstsToMove) {
MachineBasicBlock *MBB = I->getParent();
// Be sure to use .addOperand(), and not .addReg() with these. We want to be
@@ -316,6 +423,12 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
Opc = (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32 : AMDGPU::DS_WRITE2ST64_B64;
}
+ if (NewOffset0 > NewOffset1) {
+ // Canonicalize the merged instruction so the smaller offset comes first.
+ std::swap(NewOffset0, NewOffset1);
+ std::swap(Data0, Data1);
+ }
+
assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
(NewOffset0 != NewOffset1) &&
"Computed offset doesn't fit");
@@ -323,15 +436,8 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
const MCInstrDesc &Write2Desc = TII->get(Opc);
DebugLoc DL = I->getDebugLoc();
- // repairLiveintervalsInRange() doesn't handle physical register, so we have
- // to update the M0 range manually.
- SlotIndex PairedIndex = LIS->getInstructionIndex(*Paired);
- LiveRange &M0Range = LIS->getRegUnit(*MCRegUnitIterator(AMDGPU::M0, TRI));
- LiveRange::Segment *M0Segment = M0Range.getSegmentContaining(PairedIndex);
- bool UpdateM0Range = M0Segment->end == PairedIndex.getRegSlot();
-
MachineInstrBuilder Write2
- = BuildMI(*MBB, I, DL, Write2Desc)
+ = BuildMI(*MBB, Paired, DL, Write2Desc)
.addOperand(*Addr) // addr
.addOperand(*Data0) // data0
.addOperand(*Data1) // data1
@@ -341,24 +447,14 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
.addMemOperand(*I->memoperands_begin())
.addMemOperand(*Paired->memoperands_begin());
- // XXX - How do we express subregisters here?
- unsigned OrigRegs[] = { Data0->getReg(), Data1->getReg(), Addr->getReg() };
+ moveInstsAfter(Write2, InstsToMove);
- LIS->RemoveMachineInstrFromMaps(*I);
- LIS->RemoveMachineInstrFromMaps(*Paired);
+ MachineBasicBlock::iterator Next = std::next(I);
I->eraseFromParent();
Paired->eraseFromParent();
- // This doesn't handle physical registers like M0
- LIS->repairIntervalsInRange(MBB, Write2, Write2, OrigRegs);
-
- if (UpdateM0Range) {
- SlotIndex Write2Index = LIS->getInstructionIndex(*Write2);
- M0Segment->end = Write2Index.getRegSlot();
- }
-
DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
- return Write2.getInstr();
+ return Next;
}
// Scan through looking for adjacent LDS operations with constant offsets from
@@ -376,13 +472,15 @@ bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
continue;
}
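+ // Instructions found between a mergeable pair that must be moved below the
+ // merged instruction, either because they use its result or because they
+ // cannot be reordered with it.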
+ SmallVector<MachineInstr*, 8> InstsToMove;
unsigned Opc = MI.getOpcode();
if (Opc == AMDGPU::DS_READ_B32 || Opc == AMDGPU::DS_READ_B64) {
unsigned Size = (Opc == AMDGPU::DS_READ_B64) ? 8 : 4;
- MachineBasicBlock::iterator Match = findMatchingDSInst(I, Size);
+ MachineBasicBlock::iterator Match = findMatchingDSInst(I, Size,
+ InstsToMove);
if (Match != E) {
Modified = true;
- I = mergeRead2Pair(I, Match, Size);
+ I = mergeRead2Pair(I, Match, Size, InstsToMove);
} else {
++I;
}
@@ -390,10 +488,11 @@ bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
continue;
} else if (Opc == AMDGPU::DS_WRITE_B32 || Opc == AMDGPU::DS_WRITE_B64) {
unsigned Size = (Opc == AMDGPU::DS_WRITE_B64) ? 8 : 4;
- MachineBasicBlock::iterator Match = findMatchingDSInst(I, Size);
+ MachineBasicBlock::iterator Match = findMatchingDSInst(I, Size,
+ InstsToMove);
if (Match != E) {
Modified = true;
- I = mergeWrite2Pair(I, Match, Size);
+ I = mergeWrite2Pair(I, Match, Size, InstsToMove);
} else {
++I;
}
@@ -419,13 +518,10 @@ bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
TRI = &TII->getRegisterInfo();
MRI = &MF.getRegInfo();
-
- LIS = &getAnalysis<LiveIntervals>();
+ AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
- assert(!MRI->isSSA());
-
bool Modified = false;
for (MachineBasicBlock &MBB : MF)