Diffstat (limited to 'contrib/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp')
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp | 300
1 file changed, 198 insertions, 102 deletions
diff --git a/contrib/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/contrib/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index 9e972a5..99fe96c 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -60,31 +60,35 @@ private:
   const SIInstrInfo *TII;
   const SIRegisterInfo *TRI;
   MachineRegisterInfo *MRI;
-  LiveIntervals *LIS;
+  AliasAnalysis *AA;
 
   static bool offsetsCanBeCombined(unsigned Offset0,
                                    unsigned Offset1,
                                    unsigned EltSize);
 
-  MachineBasicBlock::iterator findMatchingDSInst(MachineBasicBlock::iterator I,
-                                                 unsigned EltSize);
+  MachineBasicBlock::iterator findMatchingDSInst(
+    MachineBasicBlock::iterator I,
+    unsigned EltSize,
+    SmallVectorImpl<MachineInstr*> &InstsToMove);
 
   MachineBasicBlock::iterator mergeRead2Pair(
     MachineBasicBlock::iterator I,
     MachineBasicBlock::iterator Paired,
-    unsigned EltSize);
+    unsigned EltSize,
+    ArrayRef<MachineInstr*> InstsToMove);
 
   MachineBasicBlock::iterator mergeWrite2Pair(
     MachineBasicBlock::iterator I,
     MachineBasicBlock::iterator Paired,
-    unsigned EltSize);
+    unsigned EltSize,
+    ArrayRef<MachineInstr*> InstsToMove);
 
 public:
   static char ID;
 
   SILoadStoreOptimizer()
       : MachineFunctionPass(ID), TII(nullptr), TRI(nullptr), MRI(nullptr),
-        LIS(nullptr) {}
+        AA(nullptr) {}
 
   SILoadStoreOptimizer(const TargetMachine &TM_) : MachineFunctionPass(ID) {
     initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
@@ -94,16 +98,11 @@ public:
 
   bool runOnMachineFunction(MachineFunction &MF) override;
 
-  const char *getPassName() const override {
-    return "SI Load / Store Optimizer";
-  }
+  StringRef getPassName() const override { return "SI Load / Store Optimizer"; }
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesCFG();
-    AU.addPreserved<SlotIndexes>();
-    AU.addPreserved<LiveIntervals>();
-    AU.addPreserved<LiveVariables>();
-    AU.addRequired<LiveIntervals>();
+    AU.addRequired<AAResultsWrapperPass>();
 
     MachineFunctionPass::getAnalysisUsage(AU);
   }
@@ -113,9 +112,7 @@ public:
 
 INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
                       "SI Load / Store Optimizer", false, false)
-INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
-INITIALIZE_PASS_DEPENDENCY(LiveVariables)
-INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
 INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE,
                     "SI Load / Store Optimizer", false, false)
 
@@ -127,6 +124,73 @@ FunctionPass *llvm::createSILoadStoreOptimizerPass(TargetMachine &TM) {
   return new SILoadStoreOptimizer(TM);
 }
 
+static void moveInstsAfter(MachineBasicBlock::iterator I,
+                           ArrayRef<MachineInstr*> InstsToMove) {
+  MachineBasicBlock *MBB = I->getParent();
+  ++I;
+  for (MachineInstr *MI : InstsToMove) {
+    MI->removeFromParent();
+    MBB->insert(I, MI);
+  }
+}
+
+static void addDefsToList(const MachineInstr &MI,
+                          SmallVectorImpl<const MachineOperand *> &Defs) {
+  for (const MachineOperand &Def : MI.defs()) {
+    Defs.push_back(&Def);
+  }
+}
+
+static bool memAccessesCanBeReordered(
+  MachineBasicBlock::iterator A,
+  MachineBasicBlock::iterator B,
+  const SIInstrInfo *TII,
+  llvm::AliasAnalysis * AA) {
+  return (TII->areMemAccessesTriviallyDisjoint(*A, *B, AA) ||
+    // RAW or WAR - cannot reorder
+    // WAW - cannot reorder
+    // RAR - safe to reorder
+    !(A->mayStore() || B->mayStore()));
+}
+
+// Add MI and its defs to the lists if MI reads one of the defs that are
+// already in the list. Returns true in that case.
+static bool
+addToListsIfDependent(MachineInstr &MI,
+                      SmallVectorImpl<const MachineOperand *> &Defs,
+                      SmallVectorImpl<MachineInstr*> &Insts) {
+  for (const MachineOperand *Def : Defs) {
+    bool ReadDef = MI.readsVirtualRegister(Def->getReg());
+    // If ReadDef is true, then there is a use of Def between I
+    // and the instruction that I will potentially be merged with. We
+    // will need to move this instruction after the merged instructions.
+    if (ReadDef) {
+      Insts.push_back(&MI);
+      addDefsToList(MI, Defs);
+      return true;
+    }
+  }
+
+  return false;
+}
+
+static bool
+canMoveInstsAcrossMemOp(MachineInstr &MemOp,
+                        ArrayRef<MachineInstr*> InstsToMove,
+                        const SIInstrInfo *TII,
+                        AliasAnalysis *AA) {
+
+  assert(MemOp.mayLoadOrStore());
+
+  for (MachineInstr *InstToMove : InstsToMove) {
+    if (!InstToMove->mayLoadOrStore())
+      continue;
+    if (!memAccessesCanBeReordered(MemOp, *InstToMove, TII, AA))
+      return false;
+  }
+  return true;
+}
+
 bool SILoadStoreOptimizer::offsetsCanBeCombined(unsigned Offset0,
                                                 unsigned Offset1,
                                                 unsigned Size) {
@@ -156,43 +220,99 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(unsigned Offset0,
 
 MachineBasicBlock::iterator
 SILoadStoreOptimizer::findMatchingDSInst(MachineBasicBlock::iterator I,
-                                         unsigned EltSize){
+                                         unsigned EltSize,
+                                         SmallVectorImpl<MachineInstr*> &InstsToMove) {
   MachineBasicBlock::iterator E = I->getParent()->end();
   MachineBasicBlock::iterator MBBI = I;
   ++MBBI;
 
-  if (MBBI->getOpcode() != I->getOpcode())
-    return E;
-
-  // Don't merge volatiles.
-  if (MBBI->hasOrderedMemoryRef())
-    return E;
-
-  int AddrIdx = AMDGPU::getNamedOperandIdx(I->getOpcode(), AMDGPU::OpName::addr);
-  const MachineOperand &AddrReg0 = I->getOperand(AddrIdx);
-  const MachineOperand &AddrReg1 = MBBI->getOperand(AddrIdx);
-
-  // Check same base pointer. Be careful of subregisters, which can occur with
-  // vectors of pointers.
-  if (AddrReg0.getReg() == AddrReg1.getReg() &&
-      AddrReg0.getSubReg() == AddrReg1.getSubReg()) {
-    int OffsetIdx = AMDGPU::getNamedOperandIdx(I->getOpcode(),
-                                               AMDGPU::OpName::offset);
-    unsigned Offset0 = I->getOperand(OffsetIdx).getImm() & 0xffff;
-    unsigned Offset1 = MBBI->getOperand(OffsetIdx).getImm() & 0xffff;
-
-    // Check both offsets fit in the reduced range.
-    if (offsetsCanBeCombined(Offset0, Offset1, EltSize))
-      return MBBI;
-  }
+  SmallVector<const MachineOperand *, 8> DefsToMove;
+  addDefsToList(*I, DefsToMove);
 
+  for ( ; MBBI != E; ++MBBI) {
+
+    if (MBBI->getOpcode() != I->getOpcode()) {
+
+      // This is not a matching DS instruction, but we can keep looking as
+      // long as one of these conditions are met:
+      // 1. It is safe to move I down past MBBI.
+      // 2. It is safe to move MBBI down past the instruction that I will
+      //    be merged into.
+
+      if (MBBI->hasUnmodeledSideEffects())
+        // We can't re-order this instruction with respect to other memory
+        // opeations, so we fail both conditions mentioned above.
+        return E;
+
+      if (MBBI->mayLoadOrStore() &&
+        !memAccessesCanBeReordered(*I, *MBBI, TII, AA)) {
+        // We fail condition #1, but we may still be able to satisfy condition
+        // #2.  Add this instruction to the move list and then we will check
+        // if condition #2 holds once we have selected the matching instruction.
+        InstsToMove.push_back(&*MBBI);
+        addDefsToList(*MBBI, DefsToMove);
+        continue;
+      }
+
+      // When we match I with another DS instruction we will be moving I down
+      // to the location of the matched instruction any uses of I will need to
+      // be moved down as well.
+      addToListsIfDependent(*MBBI, DefsToMove, InstsToMove);
+      continue;
+    }
+
+    // Don't merge volatiles.
+    if (MBBI->hasOrderedMemoryRef())
+      return E;
+
+    // Handle a case like
+    //   DS_WRITE_B32 addr, v, idx0
+    //   w = DS_READ_B32 addr, idx0
+    //   DS_WRITE_B32 addr, f(w), idx1
+    // where the DS_READ_B32 ends up in InstsToMove and therefore prevents
+    // merging of the two writes.
+    if (addToListsIfDependent(*MBBI, DefsToMove, InstsToMove))
+      continue;
+
+    int AddrIdx = AMDGPU::getNamedOperandIdx(I->getOpcode(), AMDGPU::OpName::addr);
+    const MachineOperand &AddrReg0 = I->getOperand(AddrIdx);
+    const MachineOperand &AddrReg1 = MBBI->getOperand(AddrIdx);
+
+    // Check same base pointer. Be careful of subregisters, which can occur with
+    // vectors of pointers.
+    if (AddrReg0.getReg() == AddrReg1.getReg() &&
+        AddrReg0.getSubReg() == AddrReg1.getSubReg()) {
+      int OffsetIdx = AMDGPU::getNamedOperandIdx(I->getOpcode(),
+                                                 AMDGPU::OpName::offset);
+      unsigned Offset0 = I->getOperand(OffsetIdx).getImm() & 0xffff;
+      unsigned Offset1 = MBBI->getOperand(OffsetIdx).getImm() & 0xffff;
+
+      // Check both offsets fit in the reduced range.
+      // We also need to go through the list of instructions that we plan to
+      // move and make sure they are all safe to move down past the merged
+      // instruction.
+      if (offsetsCanBeCombined(Offset0, Offset1, EltSize) &&
+          canMoveInstsAcrossMemOp(*MBBI, InstsToMove, TII, AA))
+        return MBBI;
+    }
+
+    // We've found a load/store that we couldn't merge for some reason.
+    // We could potentially keep looking, but we'd need to make sure that
+    // it was safe to move I and also all the instruction in InstsToMove
+    // down past this instruction.
+    if (!memAccessesCanBeReordered(*I, *MBBI, TII, AA) ||   // check if we can move I across MBBI
+        !canMoveInstsAcrossMemOp(*MBBI, InstsToMove, TII, AA) // check if we can move all I's users
+       )
+      break;
+  }
   return E;
 }
 
 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
   MachineBasicBlock::iterator I,
   MachineBasicBlock::iterator Paired,
-  unsigned EltSize) {
+  unsigned EltSize,
+  ArrayRef<MachineInstr*> InstsToMove) {
   MachineBasicBlock *MBB = I->getParent();
 
   // Be careful, since the addresses could be subregisters themselves in weird
@@ -220,6 +340,15 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
     Opc = (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
   }
 
+  unsigned SubRegIdx0 = (EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
+  unsigned SubRegIdx1 = (EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;
+
+  if (NewOffset0 > NewOffset1) {
+    // Canonicalize the merged instruction so the smaller offset comes first.
+    std::swap(NewOffset0, NewOffset1);
+    std::swap(SubRegIdx0, SubRegIdx1);
+  }
+
   assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
          (NewOffset0 != NewOffset1) &&
          "Computed offset doesn't fit");
@@ -232,62 +361,40 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
   DebugLoc DL = I->getDebugLoc();
 
   MachineInstrBuilder Read2
-    = BuildMI(*MBB, I, DL, Read2Desc, DestReg)
+    = BuildMI(*MBB, Paired, DL, Read2Desc, DestReg)
        .addOperand(*AddrReg) // addr
        .addImm(NewOffset0) // offset0
        .addImm(NewOffset1) // offset1
        .addImm(0) // gds
        .addMemOperand(*I->memoperands_begin())
        .addMemOperand(*Paired->memoperands_begin());
-
-  unsigned SubRegIdx0 = (EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
-  unsigned SubRegIdx1 = (EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;
+  (void)Read2;
 
   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
 
   // Copy to the old destination registers.
-  MachineInstr *Copy0 = BuildMI(*MBB, I, DL, CopyDesc)
+  BuildMI(*MBB, Paired, DL, CopyDesc)
     .addOperand(*Dest0) // Copy to same destination including flags and sub reg.
    .addReg(DestReg, 0, SubRegIdx0);
-  MachineInstr *Copy1 = BuildMI(*MBB, I, DL, CopyDesc)
+  MachineInstr *Copy1 = BuildMI(*MBB, Paired, DL, CopyDesc)
     .addOperand(*Dest1)
     .addReg(DestReg, RegState::Kill, SubRegIdx1);
 
-  LIS->InsertMachineInstrInMaps(*Read2);
-
-  // repairLiveintervalsInRange() doesn't handle physical register, so we have
-  // to update the M0 range manually.
-  SlotIndex PairedIndex = LIS->getInstructionIndex(*Paired);
-  LiveRange &M0Range = LIS->getRegUnit(*MCRegUnitIterator(AMDGPU::M0, TRI));
-  LiveRange::Segment *M0Segment = M0Range.getSegmentContaining(PairedIndex);
-  bool UpdateM0Range = M0Segment->end == PairedIndex.getRegSlot();
-
-  // The new write to the original destination register is now the copy. Steal
-  // the old SlotIndex.
-  LIS->ReplaceMachineInstrInMaps(*I, *Copy0);
-  LIS->ReplaceMachineInstrInMaps(*Paired, *Copy1);
+  moveInstsAfter(Copy1, InstsToMove);
 
+  MachineBasicBlock::iterator Next = std::next(I);
   I->eraseFromParent();
   Paired->eraseFromParent();
 
-  LiveInterval &AddrRegLI = LIS->getInterval(AddrReg->getReg());
-  LIS->shrinkToUses(&AddrRegLI);
-
-  LIS->createAndComputeVirtRegInterval(DestReg);
-
-  if (UpdateM0Range) {
-    SlotIndex Read2Index = LIS->getInstructionIndex(*Read2);
-    M0Segment->end = Read2Index.getRegSlot();
-  }
-
   DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
-  return Read2.getInstr();
+  return Next;
 }
 
 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
   MachineBasicBlock::iterator I,
   MachineBasicBlock::iterator Paired,
-  unsigned EltSize) {
+  unsigned EltSize,
+  ArrayRef<MachineInstr*> InstsToMove) {
   MachineBasicBlock *MBB = I->getParent();
 
   // Be sure to use .addOperand(), and not .addReg() with these. We want to be
@@ -316,6 +423,12 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
     Opc = (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32 : AMDGPU::DS_WRITE2ST64_B64;
   }
 
+  if (NewOffset0 > NewOffset1) {
+    // Canonicalize the merged instruction so the smaller offset comes first.
+    std::swap(NewOffset0, NewOffset1);
+    std::swap(Data0, Data1);
+  }
+
   assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
          (NewOffset0 != NewOffset1) &&
         "Computed offset doesn't fit");
@@ -323,15 +436,8 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
   const MCInstrDesc &Write2Desc = TII->get(Opc);
   DebugLoc DL = I->getDebugLoc();
 
-  // repairLiveintervalsInRange() doesn't handle physical register, so we have
-  // to update the M0 range manually.
-  SlotIndex PairedIndex = LIS->getInstructionIndex(*Paired);
-  LiveRange &M0Range = LIS->getRegUnit(*MCRegUnitIterator(AMDGPU::M0, TRI));
-  LiveRange::Segment *M0Segment = M0Range.getSegmentContaining(PairedIndex);
-  bool UpdateM0Range = M0Segment->end == PairedIndex.getRegSlot();
-
   MachineInstrBuilder Write2
-    = BuildMI(*MBB, I, DL, Write2Desc)
+    = BuildMI(*MBB, Paired, DL, Write2Desc)
        .addOperand(*Addr) // addr
        .addOperand(*Data0) // data0
        .addOperand(*Data1) // data1
@@ -341,24 +447,14 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
        .addMemOperand(*I->memoperands_begin())
        .addMemOperand(*Paired->memoperands_begin());
 
-  // XXX - How do we express subregisters here?
-  unsigned OrigRegs[] = { Data0->getReg(), Data1->getReg(), Addr->getReg() };
+  moveInstsAfter(Write2, InstsToMove);
 
-  LIS->RemoveMachineInstrFromMaps(*I);
-  LIS->RemoveMachineInstrFromMaps(*Paired);
+  MachineBasicBlock::iterator Next = std::next(I);
   I->eraseFromParent();
   Paired->eraseFromParent();
 
-  // This doesn't handle physical registers like M0
-  LIS->repairIntervalsInRange(MBB, Write2, Write2, OrigRegs);
-
-  if (UpdateM0Range) {
-    SlotIndex Write2Index = LIS->getInstructionIndex(*Write2);
-    M0Segment->end = Write2Index.getRegSlot();
-  }
-
   DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
-  return Write2.getInstr();
+  return Next;
 }
 
 // Scan through looking for adjacent LDS operations with constant offsets from
@@ -376,13 +472,15 @@ bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
       continue;
     }
 
+    SmallVector<MachineInstr*, 8> InstsToMove;
     unsigned Opc = MI.getOpcode();
     if (Opc == AMDGPU::DS_READ_B32 || Opc == AMDGPU::DS_READ_B64) {
       unsigned Size = (Opc == AMDGPU::DS_READ_B64) ? 8 : 4;
-      MachineBasicBlock::iterator Match = findMatchingDSInst(I, Size);
+      MachineBasicBlock::iterator Match = findMatchingDSInst(I, Size,
+                                                             InstsToMove);
       if (Match != E) {
         Modified = true;
-        I = mergeRead2Pair(I, Match, Size);
+        I = mergeRead2Pair(I, Match, Size, InstsToMove);
       } else {
         ++I;
       }
@@ -390,10 +488,11 @@ bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
       continue;
     } else if (Opc == AMDGPU::DS_WRITE_B32 || Opc == AMDGPU::DS_WRITE_B64) {
       unsigned Size = (Opc == AMDGPU::DS_WRITE_B64) ? 8 : 4;
-      MachineBasicBlock::iterator Match = findMatchingDSInst(I, Size);
+      MachineBasicBlock::iterator Match = findMatchingDSInst(I, Size,
+                                                             InstsToMove);
      if (Match != E) {
         Modified = true;
-        I = mergeWrite2Pair(I, Match, Size);
+        I = mergeWrite2Pair(I, Match, Size, InstsToMove);
       } else {
         ++I;
       }
@@ -419,13 +518,10 @@ bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
   TRI = &TII->getRegisterInfo();
 
   MRI = &MF.getRegInfo();
-
-  LIS = &getAnalysis<LiveIntervals>();
+  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
 
   DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
 
-  assert(!MRI->isSSA());
-
  bool Modified = false;
 
  for (MachineBasicBlock &MBB : MF)
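
The pivotal change in this patch is that reordering legality is now decided by alias analysis instead of by LiveIntervals bookkeeping: memAccessesCanBeReordered() lets two memory accesses swap places when TII->areMemAccessesTriviallyDisjoint() proves them disjoint, or when neither access stores (read-after-read). The following is a minimal standalone C++ sketch of that rule, not the LLVM API; the canReorder() helper and its boolean parameters are illustrative stand-ins, with a single flag standing in for the alias-analysis disjointness query.

// Sketch of the reordering rule used above (illustrative, not LLVM code).
#include <cassert>

// A pair of memory accesses may be reordered if alias analysis proves the
// pair disjoint, or if neither access writes memory.
static bool canReorder(bool ProvablyDisjoint, bool AStores, bool BStores) {
  // RAW or WAR - cannot reorder; WAW - cannot reorder; RAR - safe to reorder.
  return ProvablyDisjoint || !(AStores || BStores);
}

int main() {
  assert(canReorder(false, false, false)); // RAR: always safe
  assert(!canReorder(false, false, true)); // possible RAW/WAR: blocked
  assert(canReorder(true, true, true));    // AA-proven disjoint: safe
  return 0;
}

This predicate is what allows findMatchingDSInst() to collect intervening instructions into InstsToMove and re-check them once a merge candidate is found, instead of giving up unless the two DS operations are immediately adjacent.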