summaryrefslogtreecommitdiffstats
path: root/contrib/llvm/lib/Target
diff options
context:
space:
mode:
authordim <dim@FreeBSD.org>2015-02-02 20:34:40 +0000
committerdim <dim@FreeBSD.org>2015-02-02 20:34:40 +0000
commitfe14cf7eedbcc7ccc867ba5116c3553718d22ecf (patch)
tree9c796001015fe17ab4322360b2c8a2ba835b9d76 /contrib/llvm/lib/Target
parent8dc847406524920db5b69acab7d3f942c5ea65d3 (diff)
downloadFreeBSD-src-fe14cf7eedbcc7ccc867ba5116c3553718d22ecf.zip
FreeBSD-src-fe14cf7eedbcc7ccc867ba5116c3553718d22ecf.tar.gz
Pull in r227752 from upstream llvm trunk (by Michael Kuperstein):
[X86] Convert esp-relative movs of function arguments to pushes, step 2 This moves the transformation introduced in r223757 into a separate MI pass. This allows it to cover many more cases (not only cases where there must be a reserved call frame), and perform rudimentary call folding. It still doesn't have a heuristic, so it is enabled only for optsize/minsize, with stack alignment <= 8, where it ought to be a fairly clear win. (Re-commit of r227728) Differential Revision: http://reviews.llvm.org/D6789 This helps to get sys/boot/i386/boot2 below the required size again, when optimizing with -Oz.
Diffstat (limited to 'contrib/llvm/lib/Target')
-rw-r--r--contrib/llvm/lib/Target/X86/X86.h5
-rw-r--r--contrib/llvm/lib/Target/X86/X86CallFrameOptimization.cpp400
-rw-r--r--contrib/llvm/lib/Target/X86/X86FastISel.cpp2
-rw-r--r--contrib/llvm/lib/Target/X86/X86FrameLowering.cpp167
-rw-r--r--contrib/llvm/lib/Target/X86/X86FrameLowering.h2
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstrCompiler.td14
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstrInfo.cpp52
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstrInfo.h5
-rw-r--r--contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.h12
-rw-r--r--contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp5
-rw-r--r--contrib/llvm/lib/Target/X86/X86TargetMachine.cpp5
11 files changed, 534 insertions, 135 deletions
diff --git a/contrib/llvm/lib/Target/X86/X86.h b/contrib/llvm/lib/Target/X86/X86.h
index 8bd5817..219b64d 100644
--- a/contrib/llvm/lib/Target/X86/X86.h
+++ b/contrib/llvm/lib/Target/X86/X86.h
@@ -67,6 +67,11 @@ FunctionPass *createX86PadShortFunctions();
/// to eliminate execution delays in some Atom processors.
FunctionPass *createX86FixupLEAs();
+/// createX86CallFrameOptimization - Return a pass that optimizes
+/// the code-size of x86 call sequences. This is done by replacing
+/// esp-relative movs with pushes.
+FunctionPass *createX86CallFrameOptimization();
+
} // End llvm namespace
#endif
diff --git a/contrib/llvm/lib/Target/X86/X86CallFrameOptimization.cpp b/contrib/llvm/lib/Target/X86/X86CallFrameOptimization.cpp
new file mode 100644
index 0000000..fae489e
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86CallFrameOptimization.cpp
@@ -0,0 +1,400 @@
+//===----- X86CallFrameOptimization.cpp - Optimize x86 call sequences -----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a pass that optimizes call sequences on x86.
+// Currently, it converts movs of function parameters onto the stack into
+// pushes. This is beneficial for two main reasons:
+// 1) The push instruction encoding is much smaller than an esp-relative mov
+// 2) It is possible to push memory arguments directly. So, if the
+// the transformation is preformed pre-reg-alloc, it can help relieve
+// register pressure.
+//
+//===----------------------------------------------------------------------===//
+
+#include <algorithm>
+
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "X86Subtarget.h"
+#include "X86MachineFunctionInfo.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-cf-opt"
+
+cl::opt<bool> NoX86CFOpt("no-x86-call-frame-opt",
+ cl::desc("Avoid optimizing x86 call frames for size"),
+ cl::init(false), cl::Hidden);
+
+namespace {
+class X86CallFrameOptimization : public MachineFunctionPass {
+public:
+ X86CallFrameOptimization() : MachineFunctionPass(ID) {}
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+private:
+ bool shouldPerformTransformation(MachineFunction &MF);
+
+ bool adjustCallSequence(MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I);
+
+ MachineInstr *canFoldIntoRegPush(MachineBasicBlock::iterator FrameSetup,
+ unsigned Reg);
+
+ const char *getPassName() const override {
+ return "X86 Optimize Call Frame";
+ }
+
+ const TargetInstrInfo *TII;
+ const TargetFrameLowering *TFL;
+ const MachineRegisterInfo *MRI;
+ static char ID;
+};
+
+char X86CallFrameOptimization::ID = 0;
+}
+
+FunctionPass *llvm::createX86CallFrameOptimization() {
+ return new X86CallFrameOptimization();
+}
+
+// This checks whether the transformation is legal and profitable
+bool X86CallFrameOptimization::shouldPerformTransformation(MachineFunction &MF) {
+ if (NoX86CFOpt.getValue())
+ return false;
+
+ // We currently only support call sequences where *all* parameters.
+ // are passed on the stack.
+ // No point in running this in 64-bit mode, since some arguments are
+ // passed in-register in all common calling conventions, so the pattern
+ // we're looking for will never match.
+ const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>();
+ if (STI.is64Bit())
+ return false;
+
+ // You would expect straight-line code between call-frame setup and
+ // call-frame destroy. You would be wrong. There are circumstances (e.g.
+ // CMOV_GR8 expansion of a select that feeds a function call!) where we can
+ // end up with the setup and the destroy in different basic blocks.
+ // This is bad, and breaks SP adjustment.
+ // So, check that all of the frames in the function are closed inside
+ // the same block, and, for good measure, that there are no nested frames.
+ int FrameSetupOpcode = TII->getCallFrameSetupOpcode();
+ int FrameDestroyOpcode = TII->getCallFrameDestroyOpcode();
+ for (MachineBasicBlock &BB : MF) {
+ bool InsideFrameSequence = false;
+ for (MachineInstr &MI : BB) {
+ if (MI.getOpcode() == FrameSetupOpcode) {
+ if (InsideFrameSequence)
+ return false;
+ InsideFrameSequence = true;
+ }
+ else if (MI.getOpcode() == FrameDestroyOpcode) {
+ if (!InsideFrameSequence)
+ return false;
+ InsideFrameSequence = false;
+ }
+ }
+
+ if (InsideFrameSequence)
+ return false;
+ }
+
+ // Now that we know the transformation is legal, check if it is
+ // profitable.
+ // TODO: Add a heuristic that actually looks at the function,
+ // and enable this for more cases.
+
+ // This transformation is always a win when we expected to have
+ // a reserved call frame. Under other circumstances, it may be either
+ // a win or a loss, and requires a heuristic.
+ // For now, enable it only for the relatively clear win cases.
+ bool CannotReserveFrame = MF.getFrameInfo()->hasVarSizedObjects();
+ if (CannotReserveFrame)
+ return true;
+
+ // For now, don't even try to evaluate the profitability when
+ // not optimizing for size.
+ AttributeSet FnAttrs = MF.getFunction()->getAttributes();
+ bool OptForSize =
+ FnAttrs.hasAttribute(AttributeSet::FunctionIndex,
+ Attribute::OptimizeForSize) ||
+ FnAttrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::MinSize);
+
+ if (!OptForSize)
+ return false;
+
+ // Stack re-alignment can make this unprofitable even in terms of size.
+ // As mentioned above, a better heuristic is needed. For now, don't do this
+ // when the required alignment is above 8. (4 would be the safe choice, but
+ // some experimentation showed 8 is generally good).
+ if (TFL->getStackAlignment() > 8)
+ return false;
+
+ return true;
+}
+
+bool X86CallFrameOptimization::runOnMachineFunction(MachineFunction &MF) {
+ TII = MF.getSubtarget().getInstrInfo();
+ TFL = MF.getSubtarget().getFrameLowering();
+ MRI = &MF.getRegInfo();
+
+ if (!shouldPerformTransformation(MF))
+ return false;
+
+ int FrameSetupOpcode = TII->getCallFrameSetupOpcode();
+
+ bool Changed = false;
+
+ for (MachineFunction::iterator BB = MF.begin(), E = MF.end(); BB != E; ++BB)
+ for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ++I)
+ if (I->getOpcode() == FrameSetupOpcode)
+ Changed |= adjustCallSequence(MF, *BB, I);
+
+ return Changed;
+}
+
+bool X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) {
+
+ // Check that this particular call sequence is amenable to the
+ // transformation.
+ const X86RegisterInfo &RegInfo = *static_cast<const X86RegisterInfo *>(
+ MF.getSubtarget().getRegisterInfo());
+ unsigned StackPtr = RegInfo.getStackRegister();
+ int FrameDestroyOpcode = TII->getCallFrameDestroyOpcode();
+
+ // We expect to enter this at the beginning of a call sequence
+ assert(I->getOpcode() == TII->getCallFrameSetupOpcode());
+ MachineBasicBlock::iterator FrameSetup = I++;
+
+
+ // For globals in PIC mode, we can have some LEAs here.
+ // Ignore them, they don't bother us.
+ // TODO: Extend this to something that covers more cases.
+ while (I->getOpcode() == X86::LEA32r)
+ ++I;
+
+ // We expect a copy instruction here.
+ // TODO: The copy instruction is a lowering artifact.
+ // We should also support a copy-less version, where the stack
+ // pointer is used directly.
+ if (!I->isCopy() || !I->getOperand(0).isReg())
+ return false;
+ MachineBasicBlock::iterator SPCopy = I++;
+ StackPtr = SPCopy->getOperand(0).getReg();
+
+ // Scan the call setup sequence for the pattern we're looking for.
+ // We only handle a simple case - a sequence of MOV32mi or MOV32mr
+ // instructions, that push a sequence of 32-bit values onto the stack, with
+ // no gaps between them.
+ SmallVector<MachineInstr*, 4> MovVector(4, nullptr);
+ unsigned int MaxAdjust = FrameSetup->getOperand(0).getImm() / 4;
+ if (MaxAdjust > 4)
+ MovVector.resize(MaxAdjust, nullptr);
+
+ do {
+ int Opcode = I->getOpcode();
+ if (Opcode != X86::MOV32mi && Opcode != X86::MOV32mr)
+ break;
+
+ // We only want movs of the form:
+ // movl imm/r32, k(%esp)
+ // If we run into something else, bail.
+ // Note that AddrBaseReg may, counter to its name, not be a register,
+ // but rather a frame index.
+ // TODO: Support the fi case. This should probably work now that we
+ // have the infrastructure to track the stack pointer within a call
+ // sequence.
+ if (!I->getOperand(X86::AddrBaseReg).isReg() ||
+ (I->getOperand(X86::AddrBaseReg).getReg() != StackPtr) ||
+ !I->getOperand(X86::AddrScaleAmt).isImm() ||
+ (I->getOperand(X86::AddrScaleAmt).getImm() != 1) ||
+ (I->getOperand(X86::AddrIndexReg).getReg() != X86::NoRegister) ||
+ (I->getOperand(X86::AddrSegmentReg).getReg() != X86::NoRegister) ||
+ !I->getOperand(X86::AddrDisp).isImm())
+ return false;
+
+ int64_t StackDisp = I->getOperand(X86::AddrDisp).getImm();
+ assert(StackDisp >= 0 && "Negative stack displacement when passing parameters");
+
+ // We really don't want to consider the unaligned case.
+ if (StackDisp % 4)
+ return false;
+ StackDisp /= 4;
+
+ assert((size_t)StackDisp < MovVector.size() &&
+ "Function call has more parameters than the stack is adjusted for.");
+
+ // If the same stack slot is being filled twice, something's fishy.
+ if (MovVector[StackDisp] != nullptr)
+ return false;
+ MovVector[StackDisp] = I;
+
+ ++I;
+ } while (I != MBB.end());
+
+ // We now expect the end of the sequence - a call and a stack adjust.
+ if (I == MBB.end())
+ return false;
+
+ // For PCrel calls, we expect an additional COPY of the basereg.
+ // If we find one, skip it.
+ if (I->isCopy()) {
+ if (I->getOperand(1).getReg() ==
+ MF.getInfo<X86MachineFunctionInfo>()->getGlobalBaseReg())
+ ++I;
+ else
+ return false;
+ }
+
+ if (!I->isCall())
+ return false;
+ MachineBasicBlock::iterator Call = I;
+ if ((++I)->getOpcode() != FrameDestroyOpcode)
+ return false;
+
+ // Now, go through the vector, and see that we don't have any gaps,
+ // but only a series of 32-bit MOVs.
+
+ int64_t ExpectedDist = 0;
+ auto MMI = MovVector.begin(), MME = MovVector.end();
+ for (; MMI != MME; ++MMI, ExpectedDist += 4)
+ if (*MMI == nullptr)
+ break;
+
+ // If the call had no parameters, do nothing
+ if (!ExpectedDist)
+ return false;
+
+ // We are either at the last parameter, or a gap.
+ // Make sure it's not a gap
+ for (; MMI != MME; ++MMI)
+ if (*MMI != nullptr)
+ return false;
+
+ // Ok, we can in fact do the transformation for this call.
+ // Do not remove the FrameSetup instruction, but adjust the parameters.
+ // PEI will end up finalizing the handling of this.
+ FrameSetup->getOperand(1).setImm(ExpectedDist);
+
+ DebugLoc DL = I->getDebugLoc();
+ // Now, iterate through the vector in reverse order, and replace the movs
+ // with pushes. MOVmi/MOVmr doesn't have any defs, so no need to
+ // replace uses.
+ for (int Idx = (ExpectedDist / 4) - 1; Idx >= 0; --Idx) {
+ MachineBasicBlock::iterator MOV = *MovVector[Idx];
+ MachineOperand PushOp = MOV->getOperand(X86::AddrNumOperands);
+ if (MOV->getOpcode() == X86::MOV32mi) {
+ unsigned PushOpcode = X86::PUSHi32;
+ // If the operand is a small (8-bit) immediate, we can use a
+ // PUSH instruction with a shorter encoding.
+ // Note that isImm() may fail even though this is a MOVmi, because
+ // the operand can also be a symbol.
+ if (PushOp.isImm()) {
+ int64_t Val = PushOp.getImm();
+ if (isInt<8>(Val))
+ PushOpcode = X86::PUSH32i8;
+ }
+ BuildMI(MBB, Call, DL, TII->get(PushOpcode)).addOperand(PushOp);
+ } else {
+ unsigned int Reg = PushOp.getReg();
+
+ // If PUSHrmm is not slow on this target, try to fold the source of the
+ // push into the instruction.
+ const X86Subtarget &ST = MF.getTarget().getSubtarget<X86Subtarget>();
+ bool SlowPUSHrmm = ST.isAtom() || ST.isSLM();
+
+ // Check that this is legal to fold. Right now, we're extremely
+ // conservative about that.
+ MachineInstr *DefMov = nullptr;
+ if (!SlowPUSHrmm && (DefMov = canFoldIntoRegPush(FrameSetup, Reg))) {
+ MachineInstr *Push = BuildMI(MBB, Call, DL, TII->get(X86::PUSH32rmm));
+
+ unsigned NumOps = DefMov->getDesc().getNumOperands();
+ for (unsigned i = NumOps - X86::AddrNumOperands; i != NumOps; ++i)
+ Push->addOperand(DefMov->getOperand(i));
+
+ DefMov->eraseFromParent();
+ } else {
+ BuildMI(MBB, Call, DL, TII->get(X86::PUSH32r)).addReg(Reg).getInstr();
+ }
+ }
+
+ MBB.erase(MOV);
+ }
+
+ // The stack-pointer copy is no longer used in the call sequences.
+ // There should not be any other users, but we can't commit to that, so:
+ if (MRI->use_empty(SPCopy->getOperand(0).getReg()))
+ SPCopy->eraseFromParent();
+
+ // Once we've done this, we need to make sure PEI doesn't assume a reserved
+ // frame.
+ X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
+ FuncInfo->setHasPushSequences(true);
+
+ return true;
+}
+
+MachineInstr *X86CallFrameOptimization::canFoldIntoRegPush(
+ MachineBasicBlock::iterator FrameSetup, unsigned Reg) {
+ // Do an extremely restricted form of load folding.
+ // ISel will often create patterns like:
+ // movl 4(%edi), %eax
+ // movl 8(%edi), %ecx
+ // movl 12(%edi), %edx
+ // movl %edx, 8(%esp)
+ // movl %ecx, 4(%esp)
+ // movl %eax, (%esp)
+ // call
+ // Get rid of those with prejudice.
+ if (!TargetRegisterInfo::isVirtualRegister(Reg))
+ return nullptr;
+
+ // Make sure this is the only use of Reg.
+ if (!MRI->hasOneNonDBGUse(Reg))
+ return nullptr;
+
+ MachineBasicBlock::iterator DefMI = MRI->getVRegDef(Reg);
+
+ // Make sure the def is a MOV from memory.
+ // If the def is an another block, give up.
+ if (DefMI->getOpcode() != X86::MOV32rm ||
+ DefMI->getParent() != FrameSetup->getParent())
+ return nullptr;
+
+ // Be careful with movs that load from a stack slot, since it may get
+ // resolved incorrectly.
+ // TODO: Again, we already have the infrastructure, so this should work.
+ if (!DefMI->getOperand(1).isReg())
+ return nullptr;
+
+ // Now, make sure everything else up until the ADJCALLSTACK is a sequence
+ // of MOVs. To be less conservative would require duplicating a lot of the
+ // logic from PeepholeOptimizer.
+ // FIXME: A possibly better approach would be to teach the PeepholeOptimizer
+ // to be smarter about folding into pushes.
+ for (auto I = DefMI; I != FrameSetup; ++I)
+ if (I->getOpcode() != X86::MOV32rm)
+ return nullptr;
+
+ return DefMI;
+}
diff --git a/contrib/llvm/lib/Target/X86/X86FastISel.cpp b/contrib/llvm/lib/Target/X86/X86FastISel.cpp
index 5d71eac..688a544 100644
--- a/contrib/llvm/lib/Target/X86/X86FastISel.cpp
+++ b/contrib/llvm/lib/Target/X86/X86FastISel.cpp
@@ -2735,7 +2735,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
// Issue CALLSEQ_START
unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackDown))
- .addImm(NumBytes);
+ .addImm(NumBytes).addImm(0);
// Walk the register/memloc assignments, inserting copies/loads.
const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
diff --git a/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp b/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp
index 16aab16..f2eb6a8 100644
--- a/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp
+++ b/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp
@@ -38,7 +38,34 @@ using namespace llvm;
extern cl::opt<bool> ForceStackAlign;
bool X86FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
- return !MF.getFrameInfo()->hasVarSizedObjects();
+ return !MF.getFrameInfo()->hasVarSizedObjects() &&
+ !MF.getInfo<X86MachineFunctionInfo>()->getHasPushSequences();
+}
+
+/// canSimplifyCallFramePseudos - If there is a reserved call frame, the
+/// call frame pseudos can be simplified. Having a FP, as in the default
+/// implementation, is not sufficient here since we can't always use it.
+/// Use a more nuanced condition.
+bool
+X86FrameLowering::canSimplifyCallFramePseudos(const MachineFunction &MF) const {
+ const X86RegisterInfo *TRI = static_cast<const X86RegisterInfo *>
+ (MF.getSubtarget().getRegisterInfo());
+ return hasReservedCallFrame(MF) ||
+ (hasFP(MF) && !TRI->needsStackRealignment(MF))
+ || TRI->hasBasePointer(MF);
+}
+
+// needsFrameIndexResolution - Do we need to perform FI resolution for
+// this function. Normally, this is required only when the function
+// has any stack objects. However, FI resolution actually has another job,
+// not apparent from the title - it resolves callframesetup/destroy
+// that were not simplified earlier.
+// So, this is required for x86 functions that have push sequences even
+// when there are no stack objects.
+bool
+X86FrameLowering::needsFrameIndexResolution(const MachineFunction &MF) const {
+ return MF.getFrameInfo()->hasStackObjects() ||
+ MF.getInfo<X86MachineFunctionInfo>()->getHasPushSequences();
}
/// hasFP - Return true if the specified function should have a dedicated frame
@@ -93,16 +120,6 @@ static unsigned getANDriOpcode(bool IsLP64, int64_t Imm) {
return X86::AND32ri;
}
-static unsigned getPUSHiOpcode(bool IsLP64, MachineOperand MO) {
- // We don't support LP64 for now.
- assert(!IsLP64);
-
- if (MO.isImm() && isInt<8>(MO.getImm()))
- return X86::PUSH32i8;
-
- return X86::PUSHi32;;
-}
-
static unsigned getLEArOpcode(unsigned IsLP64) {
return IsLP64 ? X86::LEA64r : X86::LEA32r;
}
@@ -1848,100 +1865,6 @@ void X86FrameLowering::adjustForHiPEPrologue(MachineFunction &MF) const {
#endif
}
-bool X86FrameLowering::
-convertArgMovsToPushes(MachineFunction &MF, MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I, uint64_t Amount) const {
- const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
- const X86RegisterInfo &RegInfo = *static_cast<const X86RegisterInfo *>(
- MF.getSubtarget().getRegisterInfo());
- unsigned StackPtr = RegInfo.getStackRegister();
-
- // Scan the call setup sequence for the pattern we're looking for.
- // We only handle a simple case now - a sequence of MOV32mi or MOV32mr
- // instructions, that push a sequence of 32-bit values onto the stack, with
- // no gaps.
- std::map<int64_t, MachineBasicBlock::iterator> MovMap;
- do {
- int Opcode = I->getOpcode();
- if (Opcode != X86::MOV32mi && Opcode != X86::MOV32mr)
- break;
-
- // We only want movs of the form:
- // movl imm/r32, k(%ecx)
- // If we run into something else, bail
- // Note that AddrBaseReg may, counterintuitively, not be a register...
- if (!I->getOperand(X86::AddrBaseReg).isReg() ||
- (I->getOperand(X86::AddrBaseReg).getReg() != StackPtr) ||
- !I->getOperand(X86::AddrScaleAmt).isImm() ||
- (I->getOperand(X86::AddrScaleAmt).getImm() != 1) ||
- (I->getOperand(X86::AddrIndexReg).getReg() != X86::NoRegister) ||
- (I->getOperand(X86::AddrSegmentReg).getReg() != X86::NoRegister) ||
- !I->getOperand(X86::AddrDisp).isImm())
- return false;
-
- int64_t StackDisp = I->getOperand(X86::AddrDisp).getImm();
-
- // We don't want to consider the unaligned case.
- if (StackDisp % 4)
- return false;
-
- // If the same stack slot is being filled twice, something's fishy.
- if (!MovMap.insert(std::pair<int64_t, MachineInstr*>(StackDisp, I)).second)
- return false;
-
- ++I;
- } while (I != MBB.end());
-
- // We now expect the end of the sequence - a call and a stack adjust.
- if (I == MBB.end())
- return false;
- if (!I->isCall())
- return false;
- MachineBasicBlock::iterator Call = I;
- if ((++I)->getOpcode() != TII.getCallFrameDestroyOpcode())
- return false;
-
- // Now, go through the map, and see that we don't have any gaps,
- // but only a series of 32-bit MOVs.
- // Since std::map provides ordered iteration, the original order
- // of the MOVs doesn't matter.
- int64_t ExpectedDist = 0;
- for (auto MMI = MovMap.begin(), MME = MovMap.end(); MMI != MME;
- ++MMI, ExpectedDist += 4)
- if (MMI->first != ExpectedDist)
- return false;
-
- // Ok, everything looks fine. Do the transformation.
- DebugLoc DL = I->getDebugLoc();
-
- // It's possible the original stack adjustment amount was larger than
- // that done by the pushes. If so, we still need a SUB.
- Amount -= ExpectedDist;
- if (Amount) {
- MachineInstr* Sub = BuildMI(MBB, Call, DL,
- TII.get(getSUBriOpcode(false, Amount)), StackPtr)
- .addReg(StackPtr).addImm(Amount);
- Sub->getOperand(3).setIsDead();
- }
-
- // Now, iterate through the map in reverse order, and replace the movs
- // with pushes. MOVmi/MOVmr doesn't have any defs, so need to replace uses.
- for (auto MMI = MovMap.rbegin(), MME = MovMap.rend(); MMI != MME; ++MMI) {
- MachineBasicBlock::iterator MOV = MMI->second;
- MachineOperand PushOp = MOV->getOperand(X86::AddrNumOperands);
-
- // Replace MOVmr with PUSH32r, and MOVmi with PUSHi of appropriate size
- int PushOpcode = X86::PUSH32r;
- if (MOV->getOpcode() == X86::MOV32mi)
- PushOpcode = getPUSHiOpcode(false, PushOp);
-
- BuildMI(MBB, Call, DL, TII.get(PushOpcode)).addOperand(PushOp);
- MBB.erase(MOV);
- }
-
- return true;
-}
-
void X86FrameLowering::
eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
MachineBasicBlock::iterator I) const {
@@ -1956,7 +1879,7 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
bool IsLP64 = STI.isTarget64BitLP64();
DebugLoc DL = I->getDebugLoc();
uint64_t Amount = !reserveCallFrame ? I->getOperand(0).getImm() : 0;
- uint64_t CalleeAmt = isDestroy ? I->getOperand(1).getImm() : 0;
+ uint64_t InternalAmt = (isDestroy || Amount) ? I->getOperand(1).getImm() : 0;
I = MBB.erase(I);
if (!reserveCallFrame) {
@@ -1976,24 +1899,18 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
Amount = (Amount + StackAlign - 1) / StackAlign * StackAlign;
MachineInstr *New = nullptr;
- if (Opcode == TII.getCallFrameSetupOpcode()) {
- // Try to convert movs to the stack into pushes.
- // We currently only look for a pattern that appears in 32-bit
- // calling conventions.
- if (!IsLP64 && convertArgMovsToPushes(MF, MBB, I, Amount))
- return;
-
- New = BuildMI(MF, DL, TII.get(getSUBriOpcode(IsLP64, Amount)),
- StackPtr)
- .addReg(StackPtr)
- .addImm(Amount);
- } else {
- assert(Opcode == TII.getCallFrameDestroyOpcode());
- // Factor out the amount the callee already popped.
- Amount -= CalleeAmt;
+ // Factor out the amount that gets handled inside the sequence
+ // (Pushes of argument for frame setup, callee pops for frame destroy)
+ Amount -= InternalAmt;
+
+ if (Amount) {
+ if (Opcode == TII.getCallFrameSetupOpcode()) {
+ New = BuildMI(MF, DL, TII.get(getSUBriOpcode(IsLP64, Amount)), StackPtr)
+ .addReg(StackPtr).addImm(Amount);
+ } else {
+ assert(Opcode == TII.getCallFrameDestroyOpcode());
- if (Amount) {
unsigned Opc = getADDriOpcode(IsLP64, Amount);
New = BuildMI(MF, DL, TII.get(Opc), StackPtr)
.addReg(StackPtr).addImm(Amount);
@@ -2011,13 +1928,13 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
return;
}
- if (Opcode == TII.getCallFrameDestroyOpcode() && CalleeAmt) {
+ if (Opcode == TII.getCallFrameDestroyOpcode() && InternalAmt) {
// If we are performing frame pointer elimination and if the callee pops
// something off the stack pointer, add it back. We do this until we have
// more advanced stack pointer tracking ability.
- unsigned Opc = getSUBriOpcode(IsLP64, CalleeAmt);
+ unsigned Opc = getSUBriOpcode(IsLP64, InternalAmt);
MachineInstr *New = BuildMI(MF, DL, TII.get(Opc), StackPtr)
- .addReg(StackPtr).addImm(CalleeAmt);
+ .addReg(StackPtr).addImm(InternalAmt);
// The EFLAGS implicit def is dead.
New->getOperand(3).setIsDead();
diff --git a/contrib/llvm/lib/Target/X86/X86FrameLowering.h b/contrib/llvm/lib/Target/X86/X86FrameLowering.h
index ee0ee22..9cb887a 100644
--- a/contrib/llvm/lib/Target/X86/X86FrameLowering.h
+++ b/contrib/llvm/lib/Target/X86/X86FrameLowering.h
@@ -64,6 +64,8 @@ public:
bool hasFP(const MachineFunction &MF) const override;
bool hasReservedCallFrame(const MachineFunction &MF) const override;
+ bool canSimplifyCallFramePseudos(const MachineFunction &MF) const override;
+ bool needsFrameIndexResolution(const MachineFunction &MF) const override;
int getFrameIndexOffset(const MachineFunction &MF, int FI) const override;
int getFrameIndexReference(const MachineFunction &MF, int FI,
diff --git a/contrib/llvm/lib/Target/X86/X86InstrCompiler.td b/contrib/llvm/lib/Target/X86/X86InstrCompiler.td
index ed0a634..880c982 100644
--- a/contrib/llvm/lib/Target/X86/X86InstrCompiler.td
+++ b/contrib/llvm/lib/Target/X86/X86InstrCompiler.td
@@ -43,15 +43,18 @@ let hasSideEffects = 0, isNotDuplicable = 1, Uses = [ESP] in
// Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become
// sub / add which can clobber EFLAGS.
let Defs = [ESP, EFLAGS], Uses = [ESP] in {
-def ADJCALLSTACKDOWN32 : I<0, Pseudo, (outs), (ins i32imm:$amt),
+def ADJCALLSTACKDOWN32 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
"#ADJCALLSTACKDOWN",
- [(X86callseq_start timm:$amt)]>,
+ []>,
Requires<[NotLP64]>;
def ADJCALLSTACKUP32 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
"#ADJCALLSTACKUP",
[(X86callseq_end timm:$amt1, timm:$amt2)]>,
Requires<[NotLP64]>;
}
+def : Pat<(X86callseq_start timm:$amt1),
+ (ADJCALLSTACKDOWN32 i32imm:$amt1, 0)>, Requires<[NotLP64]>;
+
// ADJCALLSTACKDOWN/UP implicitly use/def RSP because they may be expanded into
// a stack adjustment and the codegen must know that they may modify the stack
@@ -59,16 +62,17 @@ def ADJCALLSTACKUP32 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
// Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become
// sub / add which can clobber EFLAGS.
let Defs = [RSP, EFLAGS], Uses = [RSP] in {
-def ADJCALLSTACKDOWN64 : I<0, Pseudo, (outs), (ins i32imm:$amt),
+def ADJCALLSTACKDOWN64 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
"#ADJCALLSTACKDOWN",
- [(X86callseq_start timm:$amt)]>,
+ []>,
Requires<[IsLP64]>;
def ADJCALLSTACKUP64 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
"#ADJCALLSTACKUP",
[(X86callseq_end timm:$amt1, timm:$amt2)]>,
Requires<[IsLP64]>;
}
-
+def : Pat<(X86callseq_start timm:$amt1),
+ (ADJCALLSTACKDOWN64 i32imm:$amt1, 0)>, Requires<[IsLP64]>;
// x86-64 va_start lowering magic.
diff --git a/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp b/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp
index 4615693..6b6b8ae 100644
--- a/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -1692,6 +1692,58 @@ X86InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
return false;
}
+int X86InstrInfo::getSPAdjust(const MachineInstr *MI) const {
+ const MachineFunction *MF = MI->getParent()->getParent();
+ const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering();
+
+ if (MI->getOpcode() == getCallFrameSetupOpcode() ||
+ MI->getOpcode() == getCallFrameDestroyOpcode()) {
+ unsigned StackAlign = TFI->getStackAlignment();
+ int SPAdj = (MI->getOperand(0).getImm() + StackAlign - 1) / StackAlign *
+ StackAlign;
+
+ SPAdj -= MI->getOperand(1).getImm();
+
+ if (MI->getOpcode() == getCallFrameSetupOpcode())
+ return SPAdj;
+ else
+ return -SPAdj;
+ }
+
+ // To know whether a call adjusts the stack, we need information
+ // that is bound to the following ADJCALLSTACKUP pseudo.
+ // Look for the next ADJCALLSTACKUP that follows the call.
+ if (MI->isCall()) {
+ const MachineBasicBlock* MBB = MI->getParent();
+ auto I = ++MachineBasicBlock::const_iterator(MI);
+ for (auto E = MBB->end(); I != E; ++I) {
+ if (I->getOpcode() == getCallFrameDestroyOpcode() ||
+ I->isCall())
+ break;
+ }
+
+ // If we could not find a frame destroy opcode, then it has already
+ // been simplified, so we don't care.
+ if (I->getOpcode() != getCallFrameDestroyOpcode())
+ return 0;
+
+ return -(I->getOperand(1).getImm());
+ }
+
+ // Currently handle only PUSHes we can reasonably expect to see
+ // in call sequences
+ switch (MI->getOpcode()) {
+ default:
+ return 0;
+ case X86::PUSH32i8:
+ case X86::PUSH32r:
+ case X86::PUSH32rmm:
+ case X86::PUSH32rmr:
+ case X86::PUSHi32:
+ return 4;
+ }
+}
+
/// isFrameOperand - Return true and the FrameIndex if the specified
/// operand and follow operands form a reference to the stack frame.
bool X86InstrInfo::isFrameOperand(const MachineInstr *MI, unsigned int Op,
diff --git a/contrib/llvm/lib/Target/X86/X86InstrInfo.h b/contrib/llvm/lib/Target/X86/X86InstrInfo.h
index 5662e86..4d15467 100644
--- a/contrib/llvm/lib/Target/X86/X86InstrInfo.h
+++ b/contrib/llvm/lib/Target/X86/X86InstrInfo.h
@@ -175,6 +175,11 @@ public:
///
const X86RegisterInfo &getRegisterInfo() const { return RI; }
+ /// getSPAdjust - This returns the stack pointer adjustment made by
+ /// this instruction. For x86, we need to handle more complex call
+ /// sequences involving PUSHes.
+ int getSPAdjust(const MachineInstr *MI) const override;
+
/// isCoalescableExtInstr - Return true if the instruction is a "coalescable"
/// extension instruction. That is, it's like a copy where it's legal for the
/// source to overlap the destination. e.g. X86::MOVSX64rr32. If this returns
diff --git a/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.h b/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.h
index b23a744..9fd03a7 100644
--- a/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.h
+++ b/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.h
@@ -77,6 +77,9 @@ class X86MachineFunctionInfo : public MachineFunctionInfo {
unsigned ArgumentStackSize;
/// NumLocalDynamics - Number of local-dynamic TLS accesses.
unsigned NumLocalDynamics;
+ /// HasPushSequences - Keeps track of whether this function uses sequences
+ /// of pushes to pass function parameters.
+ bool HasPushSequences;
private:
/// ForwardedMustTailRegParms - A list of virtual and physical registers
@@ -97,7 +100,8 @@ public:
VarArgsGPOffset(0),
VarArgsFPOffset(0),
ArgumentStackSize(0),
- NumLocalDynamics(0) {}
+ NumLocalDynamics(0),
+ HasPushSequences(false) {}
explicit X86MachineFunctionInfo(MachineFunction &MF)
: ForceFramePointer(false),
@@ -113,11 +117,15 @@ public:
VarArgsGPOffset(0),
VarArgsFPOffset(0),
ArgumentStackSize(0),
- NumLocalDynamics(0) {}
+ NumLocalDynamics(0),
+ HasPushSequences(false) {}
bool getForceFramePointer() const { return ForceFramePointer;}
void setForceFramePointer(bool forceFP) { ForceFramePointer = forceFP; }
+ bool getHasPushSequences() const { return HasPushSequences; }
+ void setHasPushSequences(bool HasPush) { HasPushSequences = HasPush; }
+
bool getRestoreBasePointer() const { return RestoreBasePointerOffset!=0; }
void setRestoreBasePointer(const MachineFunction *MF);
int getRestoreBasePointerOffset() const {return RestoreBasePointerOffset; }
diff --git a/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp b/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp
index 09e651c..0fa38f4 100644
--- a/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp
+++ b/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp
@@ -468,8 +468,6 @@ void
X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
int SPAdj, unsigned FIOperandNum,
RegScavenger *RS) const {
- assert(SPAdj == 0 && "Unexpected");
-
MachineInstr &MI = *II;
MachineFunction &MF = *MI.getParent()->getParent();
const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
@@ -506,6 +504,9 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
} else
FIOffset = TFI->getFrameIndexOffset(MF, FrameIndex);
+ if (BasePtr == StackPtr)
+ FIOffset += SPAdj;
+
// The frame index format for stackmaps and patchpoints is different from the
// X86 format. It only has a FI and an offset.
if (Opc == TargetOpcode::STACKMAP || Opc == TargetOpcode::PATCHPOINT) {
diff --git a/contrib/llvm/lib/Target/X86/X86TargetMachine.cpp b/contrib/llvm/lib/Target/X86/X86TargetMachine.cpp
index 5e6aa7d..1fc6b20 100644
--- a/contrib/llvm/lib/Target/X86/X86TargetMachine.cpp
+++ b/contrib/llvm/lib/Target/X86/X86TargetMachine.cpp
@@ -154,6 +154,7 @@ public:
void addIRPasses() override;
bool addInstSelector() override;
bool addILPOpts() override;
+ void addPreRegAlloc() override;
void addPostRegAlloc() override;
void addPreEmitPass() override;
};
@@ -187,6 +188,10 @@ bool X86PassConfig::addILPOpts() {
return true;
}
+void X86PassConfig::addPreRegAlloc() {
+ addPass(createX86CallFrameOptimization());
+}
+
void X86PassConfig::addPostRegAlloc() {
addPass(createX86FloatingPointStackifierPass());
}
OpenPOWER on IntegriCloud