diff options
Diffstat (limited to 'contrib/llvm/lib/Target/X86/X86VZeroUpper.cpp')
-rw-r--r-- | contrib/llvm/lib/Target/X86/X86VZeroUpper.cpp | 350 |
1 files changed, 170 insertions, 180 deletions
diff --git a/contrib/llvm/lib/Target/X86/X86VZeroUpper.cpp b/contrib/llvm/lib/Target/X86/X86VZeroUpper.cpp index 66ae9c2..0bb5f99 100644 --- a/contrib/llvm/lib/Target/X86/X86VZeroUpper.cpp +++ b/contrib/llvm/lib/Target/X86/X86VZeroUpper.cpp @@ -14,9 +14,9 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "x86-vzeroupper" #include "X86.h" #include "X86InstrInfo.h" +#include "X86Subtarget.h" #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -27,76 +27,64 @@ #include "llvm/Target/TargetInstrInfo.h" using namespace llvm; +#define DEBUG_TYPE "x86-vzeroupper" + STATISTIC(NumVZU, "Number of vzeroupper instructions inserted"); namespace { - struct VZeroUpperInserter : public MachineFunctionPass { - static char ID; - VZeroUpperInserter() : MachineFunctionPass(ID) {} - - virtual bool runOnMachineFunction(MachineFunction &MF); - bool processBasicBlock(MachineFunction &MF, MachineBasicBlock &MBB); + class VZeroUpperInserter : public MachineFunctionPass { + public: - virtual const char *getPassName() const { return "X86 vzeroupper inserter";} + VZeroUpperInserter() : MachineFunctionPass(ID) {} + bool runOnMachineFunction(MachineFunction &MF) override; + const char *getPassName() const override {return "X86 vzeroupper inserter";} private: - const TargetInstrInfo *TII; // Machine instruction info. - - // Any YMM register live-in to this function? - bool FnHasLiveInYmm; - - // BBState - Contains the state of each MBB: unknown, clean, dirty - SmallVector<uint8_t, 8> BBState; - // BBSolved - Keep track of all MBB which had been already analyzed - // and there is no further processing required. 
- BitVector BBSolved; - - // Machine Basic Blocks are classified according to this pass: - // - // ST_UNKNOWN - The MBB state is unknown, meaning from the entry state - // until the MBB exit there isn't an instruction using YMM to change - // the state to dirty, or one of the incoming predecessors is unknown - // and there's not a dirty predecessor between them. + void processBasicBlock(MachineBasicBlock &MBB); + void insertVZeroUpper(MachineBasicBlock::iterator I, + MachineBasicBlock &MBB); + void addDirtySuccessor(MachineBasicBlock &MBB); + + typedef enum { PASS_THROUGH, EXITS_CLEAN, EXITS_DIRTY } BlockExitState; + static const char* getBlockExitStateName(BlockExitState ST); + + // Core algorithm state: + // BlockState - Each block is either: + // - PASS_THROUGH: There are neither YMM dirtying instructions nor + // vzeroupper instructions in this block. + // - EXITS_CLEAN: There is (or will be) a vzeroupper instruction in this + // block that will ensure that YMM is clean on exit. + // - EXITS_DIRTY: An instruction in the block dirties YMM and no + // subsequent vzeroupper in the block clears it. // - // ST_CLEAN - No YMM usage in the end of the MBB. A MBB could have - // instructions using YMM and be marked ST_CLEAN, as long as the state - // is cleaned by a vzeroupper before any call. + // AddedToDirtySuccessors - This flag is raised when a block is added to the + // DirtySuccessors list to ensure that it's not + // added multiple times. // - // ST_DIRTY - Any MBB ending with a YMM usage not cleaned up by a - // vzeroupper instruction. - // - // ST_INIT - Placeholder for an empty state set - // - enum { - ST_UNKNOWN = 0, - ST_CLEAN = 1, - ST_DIRTY = 2, - ST_INIT = 3 + // FirstUnguardedCall - Records the location of the first unguarded call in + // each basic block that may need to be guarded by a + // vzeroupper. We won't know whether it actually needs + // to be guarded until we discover a predecessor that + // is EXITS_DIRTY.
+ struct BlockState { + BlockState() : ExitState(PASS_THROUGH), AddedToDirtySuccessors(false) {} + BlockExitState ExitState; + bool AddedToDirtySuccessors; + MachineBasicBlock::iterator FirstUnguardedCall; }; + typedef SmallVector<BlockState, 8> BlockStateMap; + typedef SmallVector<MachineBasicBlock*, 8> DirtySuccessorsWorkList; - // computeState - Given two states, compute the resulting state, in - // the following way - // - // 1) One dirty state yields another dirty state - // 2) All states must be clean for the result to be clean - // 3) If none above and one unknown, the result state is also unknown - // - static unsigned computeState(unsigned PrevState, unsigned CurState) { - if (PrevState == ST_INIT) - return CurState; - - if (PrevState == ST_DIRTY || CurState == ST_DIRTY) - return ST_DIRTY; - - if (PrevState == ST_CLEAN && CurState == ST_CLEAN) - return ST_CLEAN; - - return ST_UNKNOWN; - } + BlockStateMap BlockStates; + DirtySuccessorsWorkList DirtySuccessors; + bool EverMadeChange; + const TargetInstrInfo *TII; + static char ID; }; + char VZeroUpperInserter::ID = 0; } @@ -104,29 +92,30 @@ FunctionPass *llvm::createX86IssueVZeroUpperPass() { return new VZeroUpperInserter(); } -static bool isYmmReg(unsigned Reg) { - return (Reg >= X86::YMM0 && Reg <= X86::YMM31); +const char* VZeroUpperInserter::getBlockExitStateName(BlockExitState ST) { + switch (ST) { + case PASS_THROUGH: return "Pass-through"; + case EXITS_DIRTY: return "Exits-dirty"; + case EXITS_CLEAN: return "Exits-clean"; + } + llvm_unreachable("Invalid block exit state."); } -static bool isZmmReg(unsigned Reg) { - return (Reg >= X86::ZMM0 && Reg <= X86::ZMM31); +static bool isYmmReg(unsigned Reg) { + return (Reg >= X86::YMM0 && Reg <= X86::YMM15); } static bool checkFnHasLiveInYmm(MachineRegisterInfo &MRI) { for (MachineRegisterInfo::livein_iterator I = MRI.livein_begin(), E = MRI.livein_end(); I != E; ++I) - if (isYmmReg(I->first) || isZmmReg(I->first)) + if (isYmmReg(I->first)) return true; return 
false; } static bool clobbersAllYmmRegs(const MachineOperand &MO) { - for (unsigned reg = X86::YMM0; reg <= X86::YMM31; ++reg) { - if (!MO.clobbersPhysReg(reg)) - return false; - } - for (unsigned reg = X86::ZMM0; reg <= X86::ZMM31; ++reg) { + for (unsigned reg = X86::YMM0; reg <= X86::YMM15; ++reg) { if (!MO.clobbersPhysReg(reg)) return false; } @@ -150,16 +139,13 @@ static bool hasYmmReg(MachineInstr *MI) { /// clobbersAnyYmmReg() - Check if any YMM register will be clobbered by this /// instruction. -static bool clobbersAnyYmmReg(MachineInstr *MI) { +static bool callClobbersAnyYmmReg(MachineInstr *MI) { + assert(MI->isCall() && "Can only be called on call instructions."); for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { const MachineOperand &MO = MI->getOperand(i); if (!MO.isRegMask()) continue; - for (unsigned reg = X86::YMM0; reg <= X86::YMM31; ++reg) { - if (MO.clobbersPhysReg(reg)) - return true; - } - for (unsigned reg = X86::ZMM0; reg <= X86::ZMM31; ++reg) { + for (unsigned reg = X86::YMM0; reg <= X86::YMM15; ++reg) { if (MO.clobbersPhysReg(reg)) return true; } @@ -167,102 +153,44 @@ static bool clobbersAnyYmmReg(MachineInstr *MI) { return false; } -/// runOnMachineFunction - Loop over all of the basic blocks, inserting -/// vzero upper instructions before function calls. -bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) { - TII = MF.getTarget().getInstrInfo(); - MachineRegisterInfo &MRI = MF.getRegInfo(); - bool EverMadeChange = false; - - // Fast check: if the function doesn't use any ymm registers, we don't need - // to insert any VZEROUPPER instructions. This is constant-time, so it is - // cheap in the common case of no ymm use. 
- bool YMMUsed = false; - const TargetRegisterClass *RC = &X86::VR256RegClass; - for (TargetRegisterClass::iterator i = RC->begin(), e = RC->end(); - i != e; i++) { - if (!MRI.reg_nodbg_empty(*i)) { - YMMUsed = true; - break; - } - } - if (!YMMUsed) - return EverMadeChange; - - // Pre-compute the existence of any live-in YMM registers to this function - FnHasLiveInYmm = checkFnHasLiveInYmm(MRI); - - assert(BBState.empty()); - BBState.resize(MF.getNumBlockIDs(), 0); - BBSolved.resize(MF.getNumBlockIDs(), 0); - - // Each BB state depends on all predecessors, loop over until everything - // converges. (Once we converge, we can implicitly mark everything that is - // still ST_UNKNOWN as ST_CLEAN.) - while (1) { - bool MadeChange = false; - - // Process all basic blocks. - for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) - MadeChange |= processBasicBlock(MF, *I); +// Insert a vzeroupper instruction before I. +void VZeroUpperInserter::insertVZeroUpper(MachineBasicBlock::iterator I, + MachineBasicBlock &MBB) { + DebugLoc dl = I->getDebugLoc(); + BuildMI(MBB, I, dl, TII->get(X86::VZEROUPPER)); + ++NumVZU; + EverMadeChange = true; +} - // If this iteration over the code changed anything, keep iterating. - if (!MadeChange) break; - EverMadeChange = true; +// Add MBB to the DirtySuccessors list if it hasn't already been added. +void VZeroUpperInserter::addDirtySuccessor(MachineBasicBlock &MBB) { + if (!BlockStates[MBB.getNumber()].AddedToDirtySuccessors) { + DirtySuccessors.push_back(&MBB); + BlockStates[MBB.getNumber()].AddedToDirtySuccessors = true; } - - BBState.clear(); - BBSolved.clear(); - return EverMadeChange; } /// processBasicBlock - Loop over all of the instructions in the basic block, /// inserting vzero upper instructions before function calls. 
-bool VZeroUpperInserter::processBasicBlock(MachineFunction &MF, - MachineBasicBlock &BB) { - bool Changed = false; - unsigned BBNum = BB.getNumber(); - - // Don't process already solved BBs - if (BBSolved[BBNum]) - return false; // No changes - - // Check the state of all predecessors - unsigned EntryState = ST_INIT; - for (MachineBasicBlock::const_pred_iterator PI = BB.pred_begin(), - PE = BB.pred_end(); PI != PE; ++PI) { - EntryState = computeState(EntryState, BBState[(*PI)->getNumber()]); - if (EntryState == ST_DIRTY) - break; - } - +void VZeroUpperInserter::processBasicBlock(MachineBasicBlock &MBB) { - // The entry MBB for the function may set the initial state to dirty if - // the function receives any YMM incoming arguments - if (&BB == MF.begin()) { - EntryState = ST_CLEAN; - if (FnHasLiveInYmm) - EntryState = ST_DIRTY; - } - - // The current state is initialized according to the predecessors - unsigned CurState = EntryState; - bool BBHasCall = false; + // Start by assuming that the block is PASS_THROUGH, which implies no unguarded + // calls. + BlockExitState CurState = PASS_THROUGH; + BlockStates[MBB.getNumber()].FirstUnguardedCall = MBB.end(); - for (MachineBasicBlock::iterator I = BB.begin(); I != BB.end(); ++I) { - DebugLoc dl = I->getDebugLoc(); + for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ++I) { MachineInstr *MI = I; - bool isControlFlow = MI->isCall() || MI->isReturn(); // Shortcut: don't need to check regular instructions in dirty state. - if (!isControlFlow && CurState == ST_DIRTY) + if (!isControlFlow && CurState == EXITS_DIRTY) continue; if (hasYmmReg(MI)) { // We found a ymm-using instruction; this could be an AVX instruction, // or it could be control flow.
- CurState = ST_DIRTY; + CurState = EXITS_DIRTY; continue; } @@ -276,11 +204,9 @@ bool VZeroUpperInserter::processBasicBlock(MachineFunction &MF, // standard calling convention is not used (RegMask is not used to mark // register clobbered and register usage (def/imp-def/use) is well-defined // and explicitly specified). - if (MI->isCall() && !clobbersAnyYmmReg(MI)) + if (MI->isCall() && !callClobbersAnyYmmReg(MI)) continue; - BBHasCall = true; - // The VZEROUPPER instruction resets the upper 128 bits of all Intel AVX // registers. This instruction has zero latency. In addition, the processor // changes back to Clean state, after which execution of Intel SSE @@ -289,38 +215,102 @@ bool VZeroUpperInserter::processBasicBlock(MachineFunction &MF, // execute SSE code. // FIXME: In some cases, we may want to move the VZEROUPPER into a // predecessor block. - if (CurState == ST_DIRTY) { - // Only insert the VZEROUPPER in case the entry state isn't unknown. - // When unknown, only compute the information within the block to have - // it available in the exit if possible, but don't change the block.
+ BlockStates[MBB.getNumber()].FirstUnguardedCall = I; + CurState = EXITS_CLEAN; } } - DEBUG(dbgs() << "MBB #" << BBNum - << ", current state: " << CurState << '\n'); + DEBUG(dbgs() << "MBB #" << MBB.getNumber() << " exit state: " + << getBlockExitStateName(CurState) << '\n'); + + if (CurState == EXITS_DIRTY) + for (MachineBasicBlock::succ_iterator SI = MBB.succ_begin(), + SE = MBB.succ_end(); + SI != SE; ++SI) + addDirtySuccessor(**SI); + + BlockStates[MBB.getNumber()].ExitState = CurState; +} + +/// runOnMachineFunction - Loop over all of the basic blocks, inserting +/// vzero upper instructions before function calls. +bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) { + const X86Subtarget &ST = MF.getTarget().getSubtarget<X86Subtarget>(); + if (!ST.hasAVX() || ST.hasAVX512()) + return false; + TII = MF.getTarget().getInstrInfo(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + EverMadeChange = false; - // A BB can only be considered solved when we both have done all the - // necessary transformations, and have computed the exit state. This happens - // in two cases: - // 1) We know the entry state: this immediately implies the exit state and - // all the necessary transformations. - // 2) There are no calls, and and a non-call instruction marks this block: - // no transformations are necessary, and we know the exit state. - if (EntryState != ST_UNKNOWN || (!BBHasCall && CurState != ST_UNKNOWN)) - BBSolved[BBNum] = true; + // Fast check: if the function doesn't use any ymm registers, we don't need + // to insert any VZEROUPPER instructions. This is constant-time, so it is + // cheap in the common case of no ymm use. 
+ bool YMMUsed = false; + const TargetRegisterClass *RC = &X86::VR256RegClass; + for (TargetRegisterClass::iterator i = RC->begin(), e = RC->end(); + i != e; i++) { + if (!MRI.reg_nodbg_empty(*i)) { + YMMUsed = true; + break; + } + } + if (!YMMUsed) { + return false; + } - if (CurState != BBState[BBNum]) - Changed = true; + assert(BlockStates.empty() && DirtySuccessors.empty() && + "X86VZeroUpper state should be clear"); + BlockStates.resize(MF.getNumBlockIDs()); + + // Process all blocks. This will compute block exit states, record the first + // unguarded call in each block, and add successors of dirty blocks to the + // DirtySuccessors list. + for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) + processBasicBlock(*I); + + // If any YMM regs are live in to this function, add the entry block to the + // DirtySuccessors list + if (checkFnHasLiveInYmm(MRI)) + addDirtySuccessor(MF.front()); + + // Re-visit all blocks that are successors of EXITS_DIRTY blocks. Add + // vzeroupper instructions to unguarded calls, and propagate EXITS_DIRTY + // through PASS_THROUGH blocks. + while (!DirtySuccessors.empty()) { + MachineBasicBlock &MBB = *DirtySuccessors.back(); + DirtySuccessors.pop_back(); + BlockState &BBState = BlockStates[MBB.getNumber()]; + + // MBB is a successor of a dirty block, so its first call needs to be + // guarded. + if (BBState.FirstUnguardedCall != MBB.end()) + insertVZeroUpper(BBState.FirstUnguardedCall, MBB); + + // If this successor was a pass-through block then it is now dirty, and its + // successors need to be added to the worklist (if they haven't been + // already).
+ if (BBState.ExitState == PASS_THROUGH) { + DEBUG(dbgs() << "MBB #" << MBB.getNumber() + << " was Pass-through, is now Dirty-out.\n"); + for (MachineBasicBlock::succ_iterator SI = MBB.succ_begin(), + SE = MBB.succ_end(); + SI != SE; ++SI) + addDirtySuccessor(**SI); + } + } - BBState[BBNum] = CurState; - return Changed; + BlockStates.clear(); + return EverMadeChange; } |