Diffstat (limited to 'contrib/llvm/patches/patch-07-llvm-r227752-boot2-shrink.diff')
-rw-r--r-- | contrib/llvm/patches/patch-07-llvm-r227752-boot2-shrink.diff | 1271
1 file changed, 0 insertions, 1271 deletions
diff --git a/contrib/llvm/patches/patch-07-llvm-r227752-boot2-shrink.diff b/contrib/llvm/patches/patch-07-llvm-r227752-boot2-shrink.diff
deleted file mode 100644
index 57e16d7..0000000
--- a/contrib/llvm/patches/patch-07-llvm-r227752-boot2-shrink.diff
+++ /dev/null
@@ -1,1271 +0,0 @@
-Pull in r227752 from upstream llvm trunk (by Michael Kuperstein):
-
-  [X86] Convert esp-relative movs of function arguments to pushes, step 2
-
-  This moves the transformation introduced in r223757 into a separate MI pass.
-  This allows it to cover many more cases (not only cases where there must be a
-  reserved call frame), and perform rudimentary call folding. It still doesn't
-  have a heuristic, so it is enabled only for optsize/minsize, with stack
-  alignment <= 8, where it ought to be a fairly clear win.
-
-  (Re-commit of r227728)
-
-  Differential Revision: http://reviews.llvm.org/D6789
-
-This helps to get sys/boot/i386/boot2 below the required size again,
-when optimizing with -Oz.
-
-Introduced here: http://svnweb.freebsd.org/changeset/base/278112
-
-Index: include/llvm/Target/TargetFrameLowering.h
-===================================================================
---- include/llvm/Target/TargetFrameLowering.h
-+++ include/llvm/Target/TargetFrameLowering.h
-@@ -193,6 +193,11 @@ class TargetFrameLowering {
-     return hasReservedCallFrame(MF) || hasFP(MF);
-   }
-
-+  // needsFrameIndexResolution - Do we need to perform FI resolution for
-+  // this function. Normally, this is required only when the function
-+  // has any stack objects. However, targets may want to override this.
-+  virtual bool needsFrameIndexResolution(const MachineFunction &MF) const;
-+
-   /// getFrameIndexOffset - Returns the displacement from the frame register to
-   /// the stack frame of the specified index.
-   virtual int getFrameIndexOffset(const MachineFunction &MF, int FI) const;
-Index: lib/CodeGen/PrologEpilogInserter.cpp
-===================================================================
---- lib/CodeGen/PrologEpilogInserter.cpp
-+++ lib/CodeGen/PrologEpilogInserter.cpp
-@@ -703,7 +703,8 @@ void PEI::insertPrologEpilogCode(MachineFunction &
- /// register references and actual offsets.
- ///
- void PEI::replaceFrameIndices(MachineFunction &Fn) {
--  if (!Fn.getFrameInfo()->hasStackObjects()) return; // Nothing to do?
-+  const TargetFrameLowering &TFI = *Fn.getSubtarget().getFrameLowering();
-+  if (!TFI.needsFrameIndexResolution(Fn)) return;
-
-   // Store SPAdj at exit of a basic block.
-   SmallVector<int, 8> SPState;
-@@ -769,13 +770,6 @@ void PEI::replaceFrameIndices(MachineBasicBlock *B
-       continue;
-     }
-
--    // If we are looking at a call sequence, we need to keep track of
--    // the SP adjustment made by each instruction in the sequence.
--    // This includes both the frame setup/destroy pseudos (handled above),
--    // as well as other instructions that have side effects w.r.t the SP.
--    if (InsideCallSequence)
--      SPAdj += TII.getSPAdjust(I);
--
-     MachineInstr *MI = I;
-     bool DoIncr = true;
-     for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
-@@ -854,6 +848,16 @@ void PEI::replaceFrameIndices(MachineBasicBlock *B
-       break;
-     }
-
-+    // If we are looking at a call sequence, we need to keep track of
-+    // the SP adjustment made by each instruction in the sequence.
-+    // This includes both the frame setup/destroy pseudos (handled above),
-+    // as well as other instructions that have side effects w.r.t the SP.
-+ // Note that this must come after eliminateFrameIndex, because -+ // if I itself referred to a frame index, we shouldn't count its own -+ // adjustment. -+ if (MI && InsideCallSequence) -+ SPAdj += TII.getSPAdjust(MI); -+ - if (DoIncr && I != BB->end()) ++I; - - // Update register states. -Index: lib/CodeGen/TargetFrameLoweringImpl.cpp -=================================================================== ---- lib/CodeGen/TargetFrameLoweringImpl.cpp -+++ lib/CodeGen/TargetFrameLoweringImpl.cpp -@@ -42,3 +42,8 @@ int TargetFrameLowering::getFrameIndexReference(co - FrameReg = RI->getFrameRegister(MF); - return getFrameIndexOffset(MF, FI); - } -+ -+bool TargetFrameLowering::needsFrameIndexResolution( -+ const MachineFunction &MF) const { -+ return MF.getFrameInfo()->hasStackObjects(); -+} -Index: lib/Target/X86/CMakeLists.txt -=================================================================== ---- lib/Target/X86/CMakeLists.txt -+++ lib/Target/X86/CMakeLists.txt -@@ -14,6 +14,7 @@ add_public_tablegen_target(X86CommonTableGen) - - set(sources - X86AsmPrinter.cpp -+ X86CallFrameOptimization.cpp - X86FastISel.cpp - X86FloatingPoint.cpp - X86FrameLowering.cpp -Index: lib/Target/X86/X86.h -=================================================================== ---- lib/Target/X86/X86.h -+++ lib/Target/X86/X86.h -@@ -67,6 +67,11 @@ FunctionPass *createX86PadShortFunctions(); - /// to eliminate execution delays in some Atom processors. - FunctionPass *createX86FixupLEAs(); - -+/// createX86CallFrameOptimization - Return a pass that optimizes -+/// the code-size of x86 call sequences. This is done by replacing -+/// esp-relative movs with pushes. -+FunctionPass *createX86CallFrameOptimization(); -+ - } // End llvm namespace - - #endif -Index: lib/Target/X86/X86CallFrameOptimization.cpp -=================================================================== ---- lib/Target/X86/X86CallFrameOptimization.cpp -+++ lib/Target/X86/X86CallFrameOptimization.cpp -@@ -0,0 +1,400 @@ -+//===----- X86CallFrameOptimization.cpp - Optimize x86 call sequences -----===// -+// -+// The LLVM Compiler Infrastructure -+// -+// This file is distributed under the University of Illinois Open Source -+// License. See LICENSE.TXT for details. -+// -+//===----------------------------------------------------------------------===// -+// -+// This file defines a pass that optimizes call sequences on x86. -+// Currently, it converts movs of function parameters onto the stack into -+// pushes. This is beneficial for two main reasons: -+// 1) The push instruction encoding is much smaller than an esp-relative mov -+// 2) It is possible to push memory arguments directly. So, if the -+// the transformation is preformed pre-reg-alloc, it can help relieve -+// register pressure. 
-+// -+//===----------------------------------------------------------------------===// -+ -+#include <algorithm> -+ -+#include "X86.h" -+#include "X86InstrInfo.h" -+#include "X86Subtarget.h" -+#include "X86MachineFunctionInfo.h" -+#include "llvm/ADT/Statistic.h" -+#include "llvm/CodeGen/MachineFunctionPass.h" -+#include "llvm/CodeGen/MachineInstrBuilder.h" -+#include "llvm/CodeGen/MachineRegisterInfo.h" -+#include "llvm/CodeGen/Passes.h" -+#include "llvm/IR/Function.h" -+#include "llvm/Support/Debug.h" -+#include "llvm/Support/raw_ostream.h" -+#include "llvm/Target/TargetInstrInfo.h" -+ -+using namespace llvm; -+ -+#define DEBUG_TYPE "x86-cf-opt" -+ -+cl::opt<bool> NoX86CFOpt("no-x86-call-frame-opt", -+ cl::desc("Avoid optimizing x86 call frames for size"), -+ cl::init(false), cl::Hidden); -+ -+namespace { -+class X86CallFrameOptimization : public MachineFunctionPass { -+public: -+ X86CallFrameOptimization() : MachineFunctionPass(ID) {} -+ -+ bool runOnMachineFunction(MachineFunction &MF) override; -+ -+private: -+ bool shouldPerformTransformation(MachineFunction &MF); -+ -+ bool adjustCallSequence(MachineFunction &MF, MachineBasicBlock &MBB, -+ MachineBasicBlock::iterator I); -+ -+ MachineInstr *canFoldIntoRegPush(MachineBasicBlock::iterator FrameSetup, -+ unsigned Reg); -+ -+ const char *getPassName() const override { -+ return "X86 Optimize Call Frame"; -+ } -+ -+ const TargetInstrInfo *TII; -+ const TargetFrameLowering *TFL; -+ const MachineRegisterInfo *MRI; -+ static char ID; -+}; -+ -+char X86CallFrameOptimization::ID = 0; -+} -+ -+FunctionPass *llvm::createX86CallFrameOptimization() { -+ return new X86CallFrameOptimization(); -+} -+ -+// This checks whether the transformation is legal and profitable -+bool X86CallFrameOptimization::shouldPerformTransformation(MachineFunction &MF) { -+ if (NoX86CFOpt.getValue()) -+ return false; -+ -+ // We currently only support call sequences where *all* parameters. -+ // are passed on the stack. -+ // No point in running this in 64-bit mode, since some arguments are -+ // passed in-register in all common calling conventions, so the pattern -+ // we're looking for will never match. -+ const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>(); -+ if (STI.is64Bit()) -+ return false; -+ -+ // You would expect straight-line code between call-frame setup and -+ // call-frame destroy. You would be wrong. There are circumstances (e.g. -+ // CMOV_GR8 expansion of a select that feeds a function call!) where we can -+ // end up with the setup and the destroy in different basic blocks. -+ // This is bad, and breaks SP adjustment. -+ // So, check that all of the frames in the function are closed inside -+ // the same block, and, for good measure, that there are no nested frames. -+ int FrameSetupOpcode = TII->getCallFrameSetupOpcode(); -+ int FrameDestroyOpcode = TII->getCallFrameDestroyOpcode(); -+ for (MachineBasicBlock &BB : MF) { -+ bool InsideFrameSequence = false; -+ for (MachineInstr &MI : BB) { -+ if (MI.getOpcode() == FrameSetupOpcode) { -+ if (InsideFrameSequence) -+ return false; -+ InsideFrameSequence = true; -+ } -+ else if (MI.getOpcode() == FrameDestroyOpcode) { -+ if (!InsideFrameSequence) -+ return false; -+ InsideFrameSequence = false; -+ } -+ } -+ -+ if (InsideFrameSequence) -+ return false; -+ } -+ -+ // Now that we know the transformation is legal, check if it is -+ // profitable. -+ // TODO: Add a heuristic that actually looks at the function, -+ // and enable this for more cases. 
-+ -+ // This transformation is always a win when we expected to have -+ // a reserved call frame. Under other circumstances, it may be either -+ // a win or a loss, and requires a heuristic. -+ // For now, enable it only for the relatively clear win cases. -+ bool CannotReserveFrame = MF.getFrameInfo()->hasVarSizedObjects(); -+ if (CannotReserveFrame) -+ return true; -+ -+ // For now, don't even try to evaluate the profitability when -+ // not optimizing for size. -+ AttributeSet FnAttrs = MF.getFunction()->getAttributes(); -+ bool OptForSize = -+ FnAttrs.hasAttribute(AttributeSet::FunctionIndex, -+ Attribute::OptimizeForSize) || -+ FnAttrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::MinSize); -+ -+ if (!OptForSize) -+ return false; -+ -+ // Stack re-alignment can make this unprofitable even in terms of size. -+ // As mentioned above, a better heuristic is needed. For now, don't do this -+ // when the required alignment is above 8. (4 would be the safe choice, but -+ // some experimentation showed 8 is generally good). -+ if (TFL->getStackAlignment() > 8) -+ return false; -+ -+ return true; -+} -+ -+bool X86CallFrameOptimization::runOnMachineFunction(MachineFunction &MF) { -+ TII = MF.getSubtarget().getInstrInfo(); -+ TFL = MF.getSubtarget().getFrameLowering(); -+ MRI = &MF.getRegInfo(); -+ -+ if (!shouldPerformTransformation(MF)) -+ return false; -+ -+ int FrameSetupOpcode = TII->getCallFrameSetupOpcode(); -+ -+ bool Changed = false; -+ -+ for (MachineFunction::iterator BB = MF.begin(), E = MF.end(); BB != E; ++BB) -+ for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ++I) -+ if (I->getOpcode() == FrameSetupOpcode) -+ Changed |= adjustCallSequence(MF, *BB, I); -+ -+ return Changed; -+} -+ -+bool X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF, -+ MachineBasicBlock &MBB, -+ MachineBasicBlock::iterator I) { -+ -+ // Check that this particular call sequence is amenable to the -+ // transformation. -+ const X86RegisterInfo &RegInfo = *static_cast<const X86RegisterInfo *>( -+ MF.getSubtarget().getRegisterInfo()); -+ unsigned StackPtr = RegInfo.getStackRegister(); -+ int FrameDestroyOpcode = TII->getCallFrameDestroyOpcode(); -+ -+ // We expect to enter this at the beginning of a call sequence -+ assert(I->getOpcode() == TII->getCallFrameSetupOpcode()); -+ MachineBasicBlock::iterator FrameSetup = I++; -+ -+ -+ // For globals in PIC mode, we can have some LEAs here. -+ // Ignore them, they don't bother us. -+ // TODO: Extend this to something that covers more cases. -+ while (I->getOpcode() == X86::LEA32r) -+ ++I; -+ -+ // We expect a copy instruction here. -+ // TODO: The copy instruction is a lowering artifact. -+ // We should also support a copy-less version, where the stack -+ // pointer is used directly. -+ if (!I->isCopy() || !I->getOperand(0).isReg()) -+ return false; -+ MachineBasicBlock::iterator SPCopy = I++; -+ StackPtr = SPCopy->getOperand(0).getReg(); -+ -+ // Scan the call setup sequence for the pattern we're looking for. -+ // We only handle a simple case - a sequence of MOV32mi or MOV32mr -+ // instructions, that push a sequence of 32-bit values onto the stack, with -+ // no gaps between them. 
-+ SmallVector<MachineInstr*, 4> MovVector(4, nullptr); -+ unsigned int MaxAdjust = FrameSetup->getOperand(0).getImm() / 4; -+ if (MaxAdjust > 4) -+ MovVector.resize(MaxAdjust, nullptr); -+ -+ do { -+ int Opcode = I->getOpcode(); -+ if (Opcode != X86::MOV32mi && Opcode != X86::MOV32mr) -+ break; -+ -+ // We only want movs of the form: -+ // movl imm/r32, k(%esp) -+ // If we run into something else, bail. -+ // Note that AddrBaseReg may, counter to its name, not be a register, -+ // but rather a frame index. -+ // TODO: Support the fi case. This should probably work now that we -+ // have the infrastructure to track the stack pointer within a call -+ // sequence. -+ if (!I->getOperand(X86::AddrBaseReg).isReg() || -+ (I->getOperand(X86::AddrBaseReg).getReg() != StackPtr) || -+ !I->getOperand(X86::AddrScaleAmt).isImm() || -+ (I->getOperand(X86::AddrScaleAmt).getImm() != 1) || -+ (I->getOperand(X86::AddrIndexReg).getReg() != X86::NoRegister) || -+ (I->getOperand(X86::AddrSegmentReg).getReg() != X86::NoRegister) || -+ !I->getOperand(X86::AddrDisp).isImm()) -+ return false; -+ -+ int64_t StackDisp = I->getOperand(X86::AddrDisp).getImm(); -+ assert(StackDisp >= 0 && "Negative stack displacement when passing parameters"); -+ -+ // We really don't want to consider the unaligned case. -+ if (StackDisp % 4) -+ return false; -+ StackDisp /= 4; -+ -+ assert((size_t)StackDisp < MovVector.size() && -+ "Function call has more parameters than the stack is adjusted for."); -+ -+ // If the same stack slot is being filled twice, something's fishy. -+ if (MovVector[StackDisp] != nullptr) -+ return false; -+ MovVector[StackDisp] = I; -+ -+ ++I; -+ } while (I != MBB.end()); -+ -+ // We now expect the end of the sequence - a call and a stack adjust. -+ if (I == MBB.end()) -+ return false; -+ -+ // For PCrel calls, we expect an additional COPY of the basereg. -+ // If we find one, skip it. -+ if (I->isCopy()) { -+ if (I->getOperand(1).getReg() == -+ MF.getInfo<X86MachineFunctionInfo>()->getGlobalBaseReg()) -+ ++I; -+ else -+ return false; -+ } -+ -+ if (!I->isCall()) -+ return false; -+ MachineBasicBlock::iterator Call = I; -+ if ((++I)->getOpcode() != FrameDestroyOpcode) -+ return false; -+ -+ // Now, go through the vector, and see that we don't have any gaps, -+ // but only a series of 32-bit MOVs. -+ -+ int64_t ExpectedDist = 0; -+ auto MMI = MovVector.begin(), MME = MovVector.end(); -+ for (; MMI != MME; ++MMI, ExpectedDist += 4) -+ if (*MMI == nullptr) -+ break; -+ -+ // If the call had no parameters, do nothing -+ if (!ExpectedDist) -+ return false; -+ -+ // We are either at the last parameter, or a gap. -+ // Make sure it's not a gap -+ for (; MMI != MME; ++MMI) -+ if (*MMI != nullptr) -+ return false; -+ -+ // Ok, we can in fact do the transformation for this call. -+ // Do not remove the FrameSetup instruction, but adjust the parameters. -+ // PEI will end up finalizing the handling of this. -+ FrameSetup->getOperand(1).setImm(ExpectedDist); -+ -+ DebugLoc DL = I->getDebugLoc(); -+ // Now, iterate through the vector in reverse order, and replace the movs -+ // with pushes. MOVmi/MOVmr doesn't have any defs, so no need to -+ // replace uses. 
-+ for (int Idx = (ExpectedDist / 4) - 1; Idx >= 0; --Idx) { -+ MachineBasicBlock::iterator MOV = *MovVector[Idx]; -+ MachineOperand PushOp = MOV->getOperand(X86::AddrNumOperands); -+ if (MOV->getOpcode() == X86::MOV32mi) { -+ unsigned PushOpcode = X86::PUSHi32; -+ // If the operand is a small (8-bit) immediate, we can use a -+ // PUSH instruction with a shorter encoding. -+ // Note that isImm() may fail even though this is a MOVmi, because -+ // the operand can also be a symbol. -+ if (PushOp.isImm()) { -+ int64_t Val = PushOp.getImm(); -+ if (isInt<8>(Val)) -+ PushOpcode = X86::PUSH32i8; -+ } -+ BuildMI(MBB, Call, DL, TII->get(PushOpcode)).addOperand(PushOp); -+ } else { -+ unsigned int Reg = PushOp.getReg(); -+ -+ // If PUSHrmm is not slow on this target, try to fold the source of the -+ // push into the instruction. -+ const X86Subtarget &ST = MF.getTarget().getSubtarget<X86Subtarget>(); -+ bool SlowPUSHrmm = ST.isAtom() || ST.isSLM(); -+ -+ // Check that this is legal to fold. Right now, we're extremely -+ // conservative about that. -+ MachineInstr *DefMov = nullptr; -+ if (!SlowPUSHrmm && (DefMov = canFoldIntoRegPush(FrameSetup, Reg))) { -+ MachineInstr *Push = BuildMI(MBB, Call, DL, TII->get(X86::PUSH32rmm)); -+ -+ unsigned NumOps = DefMov->getDesc().getNumOperands(); -+ for (unsigned i = NumOps - X86::AddrNumOperands; i != NumOps; ++i) -+ Push->addOperand(DefMov->getOperand(i)); -+ -+ DefMov->eraseFromParent(); -+ } else { -+ BuildMI(MBB, Call, DL, TII->get(X86::PUSH32r)).addReg(Reg).getInstr(); -+ } -+ } -+ -+ MBB.erase(MOV); -+ } -+ -+ // The stack-pointer copy is no longer used in the call sequences. -+ // There should not be any other users, but we can't commit to that, so: -+ if (MRI->use_empty(SPCopy->getOperand(0).getReg())) -+ SPCopy->eraseFromParent(); -+ -+ // Once we've done this, we need to make sure PEI doesn't assume a reserved -+ // frame. -+ X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); -+ FuncInfo->setHasPushSequences(true); -+ -+ return true; -+} -+ -+MachineInstr *X86CallFrameOptimization::canFoldIntoRegPush( -+ MachineBasicBlock::iterator FrameSetup, unsigned Reg) { -+ // Do an extremely restricted form of load folding. -+ // ISel will often create patterns like: -+ // movl 4(%edi), %eax -+ // movl 8(%edi), %ecx -+ // movl 12(%edi), %edx -+ // movl %edx, 8(%esp) -+ // movl %ecx, 4(%esp) -+ // movl %eax, (%esp) -+ // call -+ // Get rid of those with prejudice. -+ if (!TargetRegisterInfo::isVirtualRegister(Reg)) -+ return nullptr; -+ -+ // Make sure this is the only use of Reg. -+ if (!MRI->hasOneNonDBGUse(Reg)) -+ return nullptr; -+ -+ MachineBasicBlock::iterator DefMI = MRI->getVRegDef(Reg); -+ -+ // Make sure the def is a MOV from memory. -+ // If the def is an another block, give up. -+ if (DefMI->getOpcode() != X86::MOV32rm || -+ DefMI->getParent() != FrameSetup->getParent()) -+ return nullptr; -+ -+ // Be careful with movs that load from a stack slot, since it may get -+ // resolved incorrectly. -+ // TODO: Again, we already have the infrastructure, so this should work. -+ if (!DefMI->getOperand(1).isReg()) -+ return nullptr; -+ -+ // Now, make sure everything else up until the ADJCALLSTACK is a sequence -+ // of MOVs. To be less conservative would require duplicating a lot of the -+ // logic from PeepholeOptimizer. -+ // FIXME: A possibly better approach would be to teach the PeepholeOptimizer -+ // to be smarter about folding into pushes. 
-+ for (auto I = DefMI; I != FrameSetup; ++I) -+ if (I->getOpcode() != X86::MOV32rm) -+ return nullptr; -+ -+ return DefMI; -+} -Index: lib/Target/X86/X86FastISel.cpp -=================================================================== ---- lib/Target/X86/X86FastISel.cpp -+++ lib/Target/X86/X86FastISel.cpp -@@ -2735,7 +2735,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo & - // Issue CALLSEQ_START - unsigned AdjStackDown = TII.getCallFrameSetupOpcode(); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackDown)) -- .addImm(NumBytes); -+ .addImm(NumBytes).addImm(0); - - // Walk the register/memloc assignments, inserting copies/loads. - const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>( -Index: lib/Target/X86/X86FrameLowering.cpp -=================================================================== ---- lib/Target/X86/X86FrameLowering.cpp -+++ lib/Target/X86/X86FrameLowering.cpp -@@ -38,9 +38,36 @@ using namespace llvm; - extern cl::opt<bool> ForceStackAlign; - - bool X86FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const { -- return !MF.getFrameInfo()->hasVarSizedObjects(); -+ return !MF.getFrameInfo()->hasVarSizedObjects() && -+ !MF.getInfo<X86MachineFunctionInfo>()->getHasPushSequences(); - } - -+/// canSimplifyCallFramePseudos - If there is a reserved call frame, the -+/// call frame pseudos can be simplified. Having a FP, as in the default -+/// implementation, is not sufficient here since we can't always use it. -+/// Use a more nuanced condition. -+bool -+X86FrameLowering::canSimplifyCallFramePseudos(const MachineFunction &MF) const { -+ const X86RegisterInfo *TRI = static_cast<const X86RegisterInfo *> -+ (MF.getSubtarget().getRegisterInfo()); -+ return hasReservedCallFrame(MF) || -+ (hasFP(MF) && !TRI->needsStackRealignment(MF)) -+ || TRI->hasBasePointer(MF); -+} -+ -+// needsFrameIndexResolution - Do we need to perform FI resolution for -+// this function. Normally, this is required only when the function -+// has any stack objects. However, FI resolution actually has another job, -+// not apparent from the title - it resolves callframesetup/destroy -+// that were not simplified earlier. -+// So, this is required for x86 functions that have push sequences even -+// when there are no stack objects. -+bool -+X86FrameLowering::needsFrameIndexResolution(const MachineFunction &MF) const { -+ return MF.getFrameInfo()->hasStackObjects() || -+ MF.getInfo<X86MachineFunctionInfo>()->getHasPushSequences(); -+} -+ - /// hasFP - Return true if the specified function should have a dedicated frame - /// pointer register. This is true if the function has variable sized allocas - /// or if frame pointer elimination is disabled. -@@ -93,16 +120,6 @@ static unsigned getANDriOpcode(bool IsLP64, int64_ - return X86::AND32ri; - } - --static unsigned getPUSHiOpcode(bool IsLP64, MachineOperand MO) { -- // We don't support LP64 for now. -- assert(!IsLP64); -- -- if (MO.isImm() && isInt<8>(MO.getImm())) -- return X86::PUSH32i8; -- -- return X86::PUSHi32;; --} -- - static unsigned getLEArOpcode(unsigned IsLP64) { - return IsLP64 ? 
X86::LEA64r : X86::LEA32r; - } -@@ -1882,100 +1899,6 @@ void X86FrameLowering::adjustForHiPEPrologue(Machi - #endif - } - --bool X86FrameLowering:: --convertArgMovsToPushes(MachineFunction &MF, MachineBasicBlock &MBB, -- MachineBasicBlock::iterator I, uint64_t Amount) const { -- const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); -- const X86RegisterInfo &RegInfo = *static_cast<const X86RegisterInfo *>( -- MF.getSubtarget().getRegisterInfo()); -- unsigned StackPtr = RegInfo.getStackRegister(); -- -- // Scan the call setup sequence for the pattern we're looking for. -- // We only handle a simple case now - a sequence of MOV32mi or MOV32mr -- // instructions, that push a sequence of 32-bit values onto the stack, with -- // no gaps. -- std::map<int64_t, MachineBasicBlock::iterator> MovMap; -- do { -- int Opcode = I->getOpcode(); -- if (Opcode != X86::MOV32mi && Opcode != X86::MOV32mr) -- break; -- -- // We only want movs of the form: -- // movl imm/r32, k(%ecx) -- // If we run into something else, bail -- // Note that AddrBaseReg may, counterintuitively, not be a register... -- if (!I->getOperand(X86::AddrBaseReg).isReg() || -- (I->getOperand(X86::AddrBaseReg).getReg() != StackPtr) || -- !I->getOperand(X86::AddrScaleAmt).isImm() || -- (I->getOperand(X86::AddrScaleAmt).getImm() != 1) || -- (I->getOperand(X86::AddrIndexReg).getReg() != X86::NoRegister) || -- (I->getOperand(X86::AddrSegmentReg).getReg() != X86::NoRegister) || -- !I->getOperand(X86::AddrDisp).isImm()) -- return false; -- -- int64_t StackDisp = I->getOperand(X86::AddrDisp).getImm(); -- -- // We don't want to consider the unaligned case. -- if (StackDisp % 4) -- return false; -- -- // If the same stack slot is being filled twice, something's fishy. -- if (!MovMap.insert(std::pair<int64_t, MachineInstr*>(StackDisp, I)).second) -- return false; -- -- ++I; -- } while (I != MBB.end()); -- -- // We now expect the end of the sequence - a call and a stack adjust. -- if (I == MBB.end()) -- return false; -- if (!I->isCall()) -- return false; -- MachineBasicBlock::iterator Call = I; -- if ((++I)->getOpcode() != TII.getCallFrameDestroyOpcode()) -- return false; -- -- // Now, go through the map, and see that we don't have any gaps, -- // but only a series of 32-bit MOVs. -- // Since std::map provides ordered iteration, the original order -- // of the MOVs doesn't matter. -- int64_t ExpectedDist = 0; -- for (auto MMI = MovMap.begin(), MME = MovMap.end(); MMI != MME; -- ++MMI, ExpectedDist += 4) -- if (MMI->first != ExpectedDist) -- return false; -- -- // Ok, everything looks fine. Do the transformation. -- DebugLoc DL = I->getDebugLoc(); -- -- // It's possible the original stack adjustment amount was larger than -- // that done by the pushes. If so, we still need a SUB. -- Amount -= ExpectedDist; -- if (Amount) { -- MachineInstr* Sub = BuildMI(MBB, Call, DL, -- TII.get(getSUBriOpcode(false, Amount)), StackPtr) -- .addReg(StackPtr).addImm(Amount); -- Sub->getOperand(3).setIsDead(); -- } -- -- // Now, iterate through the map in reverse order, and replace the movs -- // with pushes. MOVmi/MOVmr doesn't have any defs, so need to replace uses. 
-- for (auto MMI = MovMap.rbegin(), MME = MovMap.rend(); MMI != MME; ++MMI) { -- MachineBasicBlock::iterator MOV = MMI->second; -- MachineOperand PushOp = MOV->getOperand(X86::AddrNumOperands); -- -- // Replace MOVmr with PUSH32r, and MOVmi with PUSHi of appropriate size -- int PushOpcode = X86::PUSH32r; -- if (MOV->getOpcode() == X86::MOV32mi) -- PushOpcode = getPUSHiOpcode(false, PushOp); -- -- BuildMI(MBB, Call, DL, TII.get(PushOpcode)).addOperand(PushOp); -- MBB.erase(MOV); -- } -- -- return true; --} -- - void X86FrameLowering:: - eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, - MachineBasicBlock::iterator I) const { -@@ -1990,7 +1913,7 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, - bool IsLP64 = STI.isTarget64BitLP64(); - DebugLoc DL = I->getDebugLoc(); - uint64_t Amount = !reserveCallFrame ? I->getOperand(0).getImm() : 0; -- uint64_t CalleeAmt = isDestroy ? I->getOperand(1).getImm() : 0; -+ uint64_t InternalAmt = (isDestroy || Amount) ? I->getOperand(1).getImm() : 0; - I = MBB.erase(I); - - if (!reserveCallFrame) { -@@ -2010,24 +1933,18 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, - Amount = (Amount + StackAlign - 1) / StackAlign * StackAlign; - - MachineInstr *New = nullptr; -- if (Opcode == TII.getCallFrameSetupOpcode()) { -- // Try to convert movs to the stack into pushes. -- // We currently only look for a pattern that appears in 32-bit -- // calling conventions. -- if (!IsLP64 && convertArgMovsToPushes(MF, MBB, I, Amount)) -- return; - -- New = BuildMI(MF, DL, TII.get(getSUBriOpcode(IsLP64, Amount)), -- StackPtr) -- .addReg(StackPtr) -- .addImm(Amount); -- } else { -- assert(Opcode == TII.getCallFrameDestroyOpcode()); -+ // Factor out the amount that gets handled inside the sequence -+ // (Pushes of argument for frame setup, callee pops for frame destroy) -+ Amount -= InternalAmt; - -- // Factor out the amount the callee already popped. -- Amount -= CalleeAmt; -+ if (Amount) { -+ if (Opcode == TII.getCallFrameSetupOpcode()) { -+ New = BuildMI(MF, DL, TII.get(getSUBriOpcode(IsLP64, Amount)), StackPtr) -+ .addReg(StackPtr).addImm(Amount); -+ } else { -+ assert(Opcode == TII.getCallFrameDestroyOpcode()); - -- if (Amount) { - unsigned Opc = getADDriOpcode(IsLP64, Amount); - New = BuildMI(MF, DL, TII.get(Opc), StackPtr) - .addReg(StackPtr).addImm(Amount); -@@ -2045,13 +1962,13 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, - return; - } - -- if (Opcode == TII.getCallFrameDestroyOpcode() && CalleeAmt) { -+ if (Opcode == TII.getCallFrameDestroyOpcode() && InternalAmt) { - // If we are performing frame pointer elimination and if the callee pops - // something off the stack pointer, add it back. We do this until we have - // more advanced stack pointer tracking ability. -- unsigned Opc = getSUBriOpcode(IsLP64, CalleeAmt); -+ unsigned Opc = getSUBriOpcode(IsLP64, InternalAmt); - MachineInstr *New = BuildMI(MF, DL, TII.get(Opc), StackPtr) -- .addReg(StackPtr).addImm(CalleeAmt); -+ .addReg(StackPtr).addImm(InternalAmt); - - // The EFLAGS implicit def is dead. 
- New->getOperand(3).setIsDead(); -Index: lib/Target/X86/X86FrameLowering.h -=================================================================== ---- lib/Target/X86/X86FrameLowering.h -+++ lib/Target/X86/X86FrameLowering.h -@@ -66,6 +66,8 @@ class X86FrameLowering : public TargetFrameLowerin - - bool hasFP(const MachineFunction &MF) const override; - bool hasReservedCallFrame(const MachineFunction &MF) const override; -+ bool canSimplifyCallFramePseudos(const MachineFunction &MF) const override; -+ bool needsFrameIndexResolution(const MachineFunction &MF) const override; - - int getFrameIndexOffset(const MachineFunction &MF, int FI) const override; - int getFrameIndexReference(const MachineFunction &MF, int FI, -Index: lib/Target/X86/X86InstrCompiler.td -=================================================================== ---- lib/Target/X86/X86InstrCompiler.td -+++ lib/Target/X86/X86InstrCompiler.td -@@ -43,9 +43,9 @@ let hasSideEffects = 0, isNotDuplicable = 1, Uses - // Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become - // sub / add which can clobber EFLAGS. - let Defs = [ESP, EFLAGS], Uses = [ESP] in { --def ADJCALLSTACKDOWN32 : I<0, Pseudo, (outs), (ins i32imm:$amt), -+def ADJCALLSTACKDOWN32 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2), - "#ADJCALLSTACKDOWN", -- [(X86callseq_start timm:$amt)]>, -+ []>, - Requires<[NotLP64]>; - def ADJCALLSTACKUP32 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2), - "#ADJCALLSTACKUP", -@@ -52,7 +52,10 @@ def ADJCALLSTACKUP32 : I<0, Pseudo, (outs), (ins - [(X86callseq_end timm:$amt1, timm:$amt2)]>, - Requires<[NotLP64]>; - } -+def : Pat<(X86callseq_start timm:$amt1), -+ (ADJCALLSTACKDOWN32 i32imm:$amt1, 0)>, Requires<[NotLP64]>; - -+ - // ADJCALLSTACKDOWN/UP implicitly use/def RSP because they may be expanded into - // a stack adjustment and the codegen must know that they may modify the stack - // pointer before prolog-epilog rewriting occurs. -@@ -59,9 +62,9 @@ def ADJCALLSTACKUP32 : I<0, Pseudo, (outs), (ins - // Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become - // sub / add which can clobber EFLAGS. - let Defs = [RSP, EFLAGS], Uses = [RSP] in { --def ADJCALLSTACKDOWN64 : I<0, Pseudo, (outs), (ins i32imm:$amt), -+def ADJCALLSTACKDOWN64 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2), - "#ADJCALLSTACKDOWN", -- [(X86callseq_start timm:$amt)]>, -+ []>, - Requires<[IsLP64]>; - def ADJCALLSTACKUP64 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2), - "#ADJCALLSTACKUP", -@@ -68,9 +71,10 @@ def ADJCALLSTACKUP64 : I<0, Pseudo, (outs), (ins - [(X86callseq_end timm:$amt1, timm:$amt2)]>, - Requires<[IsLP64]>; - } -+def : Pat<(X86callseq_start timm:$amt1), -+ (ADJCALLSTACKDOWN64 i32imm:$amt1, 0)>, Requires<[IsLP64]>; - - -- - // x86-64 va_start lowering magic. 
- let usesCustomInserter = 1, Defs = [EFLAGS] in { - def VASTART_SAVE_XMM_REGS : I<0, Pseudo, -Index: lib/Target/X86/X86InstrInfo.cpp -=================================================================== ---- lib/Target/X86/X86InstrInfo.cpp -+++ lib/Target/X86/X86InstrInfo.cpp -@@ -1692,6 +1692,58 @@ X86InstrInfo::isCoalescableExtInstr(const MachineI - return false; - } - -+int X86InstrInfo::getSPAdjust(const MachineInstr *MI) const { -+ const MachineFunction *MF = MI->getParent()->getParent(); -+ const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering(); -+ -+ if (MI->getOpcode() == getCallFrameSetupOpcode() || -+ MI->getOpcode() == getCallFrameDestroyOpcode()) { -+ unsigned StackAlign = TFI->getStackAlignment(); -+ int SPAdj = (MI->getOperand(0).getImm() + StackAlign - 1) / StackAlign * -+ StackAlign; -+ -+ SPAdj -= MI->getOperand(1).getImm(); -+ -+ if (MI->getOpcode() == getCallFrameSetupOpcode()) -+ return SPAdj; -+ else -+ return -SPAdj; -+ } -+ -+ // To know whether a call adjusts the stack, we need information -+ // that is bound to the following ADJCALLSTACKUP pseudo. -+ // Look for the next ADJCALLSTACKUP that follows the call. -+ if (MI->isCall()) { -+ const MachineBasicBlock* MBB = MI->getParent(); -+ auto I = ++MachineBasicBlock::const_iterator(MI); -+ for (auto E = MBB->end(); I != E; ++I) { -+ if (I->getOpcode() == getCallFrameDestroyOpcode() || -+ I->isCall()) -+ break; -+ } -+ -+ // If we could not find a frame destroy opcode, then it has already -+ // been simplified, so we don't care. -+ if (I->getOpcode() != getCallFrameDestroyOpcode()) -+ return 0; -+ -+ return -(I->getOperand(1).getImm()); -+ } -+ -+ // Currently handle only PUSHes we can reasonably expect to see -+ // in call sequences -+ switch (MI->getOpcode()) { -+ default: -+ return 0; -+ case X86::PUSH32i8: -+ case X86::PUSH32r: -+ case X86::PUSH32rmm: -+ case X86::PUSH32rmr: -+ case X86::PUSHi32: -+ return 4; -+ } -+} -+ - /// isFrameOperand - Return true and the FrameIndex if the specified - /// operand and follow operands form a reference to the stack frame. - bool X86InstrInfo::isFrameOperand(const MachineInstr *MI, unsigned int Op, -Index: lib/Target/X86/X86InstrInfo.h -=================================================================== ---- lib/Target/X86/X86InstrInfo.h -+++ lib/Target/X86/X86InstrInfo.h -@@ -175,6 +175,11 @@ class X86InstrInfo final : public X86GenInstrInfo - /// - const X86RegisterInfo &getRegisterInfo() const { return RI; } - -+ /// getSPAdjust - This returns the stack pointer adjustment made by -+ /// this instruction. For x86, we need to handle more complex call -+ /// sequences involving PUSHes. -+ int getSPAdjust(const MachineInstr *MI) const override; -+ - /// isCoalescableExtInstr - Return true if the instruction is a "coalescable" - /// extension instruction. That is, it's like a copy where it's legal for the - /// source to overlap the destination. e.g. X86::MOVSX64rr32. If this returns -Index: lib/Target/X86/X86MachineFunctionInfo.h -=================================================================== ---- lib/Target/X86/X86MachineFunctionInfo.h -+++ lib/Target/X86/X86MachineFunctionInfo.h -@@ -77,6 +77,9 @@ class X86MachineFunctionInfo : public MachineFunct - unsigned ArgumentStackSize; - /// NumLocalDynamics - Number of local-dynamic TLS accesses. - unsigned NumLocalDynamics; -+ /// HasPushSequences - Keeps track of whether this function uses sequences -+ /// of pushes to pass function parameters. 
-+ bool HasPushSequences; - - private: - /// ForwardedMustTailRegParms - A list of virtual and physical registers -@@ -97,7 +100,8 @@ class X86MachineFunctionInfo : public MachineFunct - VarArgsGPOffset(0), - VarArgsFPOffset(0), - ArgumentStackSize(0), -- NumLocalDynamics(0) {} -+ NumLocalDynamics(0), -+ HasPushSequences(false) {} - - explicit X86MachineFunctionInfo(MachineFunction &MF) - : ForceFramePointer(false), -@@ -113,11 +117,15 @@ class X86MachineFunctionInfo : public MachineFunct - VarArgsGPOffset(0), - VarArgsFPOffset(0), - ArgumentStackSize(0), -- NumLocalDynamics(0) {} -+ NumLocalDynamics(0), -+ HasPushSequences(false) {} - - bool getForceFramePointer() const { return ForceFramePointer;} - void setForceFramePointer(bool forceFP) { ForceFramePointer = forceFP; } - -+ bool getHasPushSequences() const { return HasPushSequences; } -+ void setHasPushSequences(bool HasPush) { HasPushSequences = HasPush; } -+ - bool getRestoreBasePointer() const { return RestoreBasePointerOffset!=0; } - void setRestoreBasePointer(const MachineFunction *MF); - int getRestoreBasePointerOffset() const {return RestoreBasePointerOffset; } -Index: lib/Target/X86/X86RegisterInfo.cpp -=================================================================== ---- lib/Target/X86/X86RegisterInfo.cpp -+++ lib/Target/X86/X86RegisterInfo.cpp -@@ -468,8 +468,6 @@ void - X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, - int SPAdj, unsigned FIOperandNum, - RegScavenger *RS) const { -- assert(SPAdj == 0 && "Unexpected"); -- - MachineInstr &MI = *II; - MachineFunction &MF = *MI.getParent()->getParent(); - const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); -@@ -506,6 +504,9 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicB - } else - FIOffset = TFI->getFrameIndexOffset(MF, FrameIndex); - -+ if (BasePtr == StackPtr) -+ FIOffset += SPAdj; -+ - // The frame index format for stackmaps and patchpoints is different from the - // X86 format. It only has a FI and an offset. - if (Opc == TargetOpcode::STACKMAP || Opc == TargetOpcode::PATCHPOINT) { -Index: lib/Target/X86/X86TargetMachine.cpp -=================================================================== ---- lib/Target/X86/X86TargetMachine.cpp -+++ lib/Target/X86/X86TargetMachine.cpp -@@ -154,6 +154,7 @@ class X86PassConfig : public TargetPassConfig { - void addIRPasses() override; - bool addInstSelector() override; - bool addILPOpts() override; -+ void addPreRegAlloc() override; - void addPostRegAlloc() override; - void addPreEmitPass() override; - }; -@@ -187,6 +188,10 @@ bool X86PassConfig::addILPOpts() { - return true; - } - -+void X86PassConfig::addPreRegAlloc() { -+ addPass(createX86CallFrameOptimization()); -+} -+ - void X86PassConfig::addPostRegAlloc() { - addPass(createX86FloatingPointStackifierPass()); - } -Index: test/CodeGen/X86/inalloca-invoke.ll -=================================================================== ---- test/CodeGen/X86/inalloca-invoke.ll -+++ test/CodeGen/X86/inalloca-invoke.ll -@@ -31,7 +31,7 @@ blah: - to label %invoke.cont unwind label %lpad - - ; Uses end as sret param. 
--; CHECK: movl %[[end]], (%esp) -+; CHECK: pushl %[[end]] - ; CHECK: calll _plus - - invoke.cont: -Index: test/CodeGen/X86/movtopush.ll -=================================================================== ---- test/CodeGen/X86/movtopush.ll -+++ test/CodeGen/X86/movtopush.ll -@@ -1,10 +1,12 @@ - ; RUN: llc < %s -mtriple=i686-windows | FileCheck %s -check-prefix=NORMAL -+; RUN: llc < %s -mtriple=x86_64-windows | FileCheck %s -check-prefix=X64 - ; RUN: llc < %s -mtriple=i686-windows -force-align-stack -stack-alignment=32 | FileCheck %s -check-prefix=ALIGNED -+ - declare void @good(i32 %a, i32 %b, i32 %c, i32 %d) - declare void @inreg(i32 %a, i32 inreg %b, i32 %c, i32 %d) - - ; Here, we should have a reserved frame, so we don't expect pushes --; NORMAL-LABEL: test1 -+; NORMAL-LABEL: test1: - ; NORMAL: subl $16, %esp - ; NORMAL-NEXT: movl $4, 12(%esp) - ; NORMAL-NEXT: movl $3, 8(%esp) -@@ -11,6 +13,7 @@ declare void @inreg(i32 %a, i32 inreg %b, i32 %c, - ; NORMAL-NEXT: movl $2, 4(%esp) - ; NORMAL-NEXT: movl $1, (%esp) - ; NORMAL-NEXT: call -+; NORMAL-NEXT: addl $16, %esp - define void @test1() { - entry: - call void @good(i32 1, i32 2, i32 3, i32 4) -@@ -17,8 +20,10 @@ entry: - ret void - } - --; Here, we expect a sequence of 4 immediate pushes --; NORMAL-LABEL: test2 -+; We're optimizing for code size, so we should get pushes for x86, -+; even though there is a reserved call frame. -+; Make sure we don't touch x86-64 -+; NORMAL-LABEL: test1b: - ; NORMAL-NOT: subl {{.*}} %esp - ; NORMAL: pushl $4 - ; NORMAL-NEXT: pushl $3 -@@ -25,6 +30,42 @@ entry: - ; NORMAL-NEXT: pushl $2 - ; NORMAL-NEXT: pushl $1 - ; NORMAL-NEXT: call -+; NORMAL-NEXT: addl $16, %esp -+; X64-LABEL: test1b: -+; X64: movl $1, %ecx -+; X64-NEXT: movl $2, %edx -+; X64-NEXT: movl $3, %r8d -+; X64-NEXT: movl $4, %r9d -+; X64-NEXT: callq good -+define void @test1b() optsize { -+entry: -+ call void @good(i32 1, i32 2, i32 3, i32 4) -+ ret void -+} -+ -+; Same as above, but for minsize -+; NORMAL-LABEL: test1c: -+; NORMAL-NOT: subl {{.*}} %esp -+; NORMAL: pushl $4 -+; NORMAL-NEXT: pushl $3 -+; NORMAL-NEXT: pushl $2 -+; NORMAL-NEXT: pushl $1 -+; NORMAL-NEXT: call -+; NORMAL-NEXT: addl $16, %esp -+define void @test1c() minsize { -+entry: -+ call void @good(i32 1, i32 2, i32 3, i32 4) -+ ret void -+} -+ -+; If we have a reserved frame, we should have pushes -+; NORMAL-LABEL: test2: -+; NORMAL-NOT: subl {{.*}} %esp -+; NORMAL: pushl $4 -+; NORMAL-NEXT: pushl $3 -+; NORMAL-NEXT: pushl $2 -+; NORMAL-NEXT: pushl $1 -+; NORMAL-NEXT: call - define void @test2(i32 %k) { - entry: - %a = alloca i32, i32 %k -@@ -34,7 +75,7 @@ entry: - - ; Again, we expect a sequence of 4 immediate pushes - ; Checks that we generate the right pushes for >8bit immediates --; NORMAL-LABEL: test2b -+; NORMAL-LABEL: test2b: - ; NORMAL-NOT: subl {{.*}} %esp - ; NORMAL: pushl $4096 - ; NORMAL-NEXT: pushl $3072 -@@ -41,15 +82,15 @@ entry: - ; NORMAL-NEXT: pushl $2048 - ; NORMAL-NEXT: pushl $1024 - ; NORMAL-NEXT: call --define void @test2b(i32 %k) { -+; NORMAL-NEXT: addl $16, %esp -+define void @test2b() optsize { - entry: -- %a = alloca i32, i32 %k - call void @good(i32 1024, i32 2048, i32 3072, i32 4096) - ret void - } - - ; The first push should push a register --; NORMAL-LABEL: test3 -+; NORMAL-LABEL: test3: - ; NORMAL-NOT: subl {{.*}} %esp - ; NORMAL: pushl $4 - ; NORMAL-NEXT: pushl $3 -@@ -56,15 +97,15 @@ entry: - ; NORMAL-NEXT: pushl $2 - ; NORMAL-NEXT: pushl %e{{..}} - ; NORMAL-NEXT: call --define void @test3(i32 %k) { -+; NORMAL-NEXT: addl $16, %esp -+define void 
@test3(i32 %k) optsize { - entry: -- %a = alloca i32, i32 %k - call void @good(i32 %k, i32 2, i32 3, i32 4) - ret void - } - - ; We don't support weird calling conventions --; NORMAL-LABEL: test4 -+; NORMAL-LABEL: test4: - ; NORMAL: subl $12, %esp - ; NORMAL-NEXT: movl $4, 8(%esp) - ; NORMAL-NEXT: movl $3, 4(%esp) -@@ -71,16 +112,16 @@ entry: - ; NORMAL-NEXT: movl $1, (%esp) - ; NORMAL-NEXT: movl $2, %eax - ; NORMAL-NEXT: call --define void @test4(i32 %k) { -+; NORMAL-NEXT: addl $12, %esp -+define void @test4() optsize { - entry: -- %a = alloca i32, i32 %k - call void @inreg(i32 1, i32 2, i32 3, i32 4) - ret void - } - --; Check that additional alignment is added when the pushes --; don't add up to the required alignment. --; ALIGNED-LABEL: test5 -+; When there is no reserved call frame, check that additional alignment -+; is added when the pushes don't add up to the required alignment. -+; ALIGNED-LABEL: test5: - ; ALIGNED: subl $16, %esp - ; ALIGNED-NEXT: pushl $4 - ; ALIGNED-NEXT: pushl $3 -@@ -97,7 +138,7 @@ entry: - ; Check that pushing the addresses of globals (Or generally, things that - ; aren't exactly immediates) isn't broken. - ; Fixes PR21878. --; NORMAL-LABEL: test6 -+; NORMAL-LABEL: test6: - ; NORMAL: pushl $_ext - ; NORMAL-NEXT: call - declare void @f(i8*) -@@ -110,3 +151,108 @@ bb: - alloca i32 - ret void - } -+ -+; Check that we fold simple cases into the push -+; NORMAL-LABEL: test7: -+; NORMAL-NOT: subl {{.*}} %esp -+; NORMAL: movl 4(%esp), [[EAX:%e..]] -+; NORMAL-NEXT: pushl $4 -+; NORMAL-NEXT: pushl ([[EAX]]) -+; NORMAL-NEXT: pushl $2 -+; NORMAL-NEXT: pushl $1 -+; NORMAL-NEXT: call -+; NORMAL-NEXT: addl $16, %esp -+define void @test7(i32* %ptr) optsize { -+entry: -+ %val = load i32* %ptr -+ call void @good(i32 1, i32 2, i32 %val, i32 4) -+ ret void -+} -+ -+; But we don't want to fold stack-relative loads into the push, -+; because the offset will be wrong -+; NORMAL-LABEL: test8: -+; NORMAL-NOT: subl {{.*}} %esp -+; NORMAL: movl 4(%esp), [[EAX:%e..]] -+; NORMAL-NEXT: pushl $4 -+; NORMAL-NEXT: pushl [[EAX]] -+; NORMAL-NEXT: pushl $2 -+; NORMAL-NEXT: pushl $1 -+; NORMAL-NEXT: call -+; NORMAL-NEXT: addl $16, %esp -+define void @test8(i32* %ptr) optsize { -+entry: -+ %val = ptrtoint i32* %ptr to i32 -+ call void @good(i32 1, i32 2, i32 %val, i32 4) -+ ret void -+} -+ -+; If one function is using push instructions, and the other isn't -+; (because it has frame-index references), then we must resolve -+; these references correctly. -+; NORMAL-LABEL: test9: -+; NORMAL-NOT: leal (%esp), -+; NORMAL: pushl $4 -+; NORMAL-NEXT: pushl $3 -+; NORMAL-NEXT: pushl $2 -+; NORMAL-NEXT: pushl $1 -+; NORMAL-NEXT: call -+; NORMAL-NEXT: addl $16, %esp -+; NORMAL-NEXT: subl $16, %esp -+; NORMAL-NEXT: leal 16(%esp), [[EAX:%e..]] -+; NORMAL-NEXT: movl [[EAX]], 12(%esp) -+; NORMAL-NEXT: movl $7, 8(%esp) -+; NORMAL-NEXT: movl $6, 4(%esp) -+; NORMAL-NEXT: movl $5, (%esp) -+; NORMAL-NEXT: call -+; NORMAL-NEXT: addl $16, %esp -+define void @test9() optsize { -+entry: -+ %p = alloca i32, align 4 -+ call void @good(i32 1, i32 2, i32 3, i32 4) -+ %0 = ptrtoint i32* %p to i32 -+ call void @good(i32 5, i32 6, i32 7, i32 %0) -+ ret void -+} -+ -+; We can end up with an indirect call which gets reloaded on the spot. -+; Make sure we reference the correct stack slot - we spill into (%esp) -+; and reload from 16(%esp) due to the pushes. 
-+; NORMAL-LABEL: test10: -+; NORMAL: movl $_good, [[ALLOC:.*]] -+; NORMAL-NEXT: movl [[ALLOC]], [[EAX:%e..]] -+; NORMAL-NEXT: movl [[EAX]], (%esp) # 4-byte Spill -+; NORMAL: nop -+; NORMAL: pushl $4 -+; NORMAL-NEXT: pushl $3 -+; NORMAL-NEXT: pushl $2 -+; NORMAL-NEXT: pushl $1 -+; NORMAL-NEXT: calll *16(%esp) -+; NORMAL-NEXT: addl $16, %esp -+define void @test10() optsize { -+ %stack_fptr = alloca void (i32, i32, i32, i32)* -+ store void (i32, i32, i32, i32)* @good, void (i32, i32, i32, i32)** %stack_fptr -+ %good_ptr = load volatile void (i32, i32, i32, i32)** %stack_fptr -+ call void asm sideeffect "nop", "~{ax},~{bx},~{cx},~{dx},~{bp},~{si},~{di}"() -+ call void (i32, i32, i32, i32)* %good_ptr(i32 1, i32 2, i32 3, i32 4) -+ ret void -+} -+ -+; We can't fold the load from the global into the push because of -+; interference from the store -+; NORMAL-LABEL: test11: -+; NORMAL: movl _the_global, [[EAX:%e..]] -+; NORMAL-NEXT: movl $42, _the_global -+; NORMAL-NEXT: pushl $4 -+; NORMAL-NEXT: pushl $3 -+; NORMAL-NEXT: pushl $2 -+; NORMAL-NEXT: pushl [[EAX]] -+; NORMAL-NEXT: call -+; NORMAL-NEXT: addl $16, %esp -+@the_global = external global i32 -+define void @test11() optsize { -+ %myload = load i32* @the_global -+ store i32 42, i32* @the_global -+ call void @good(i32 %myload, i32 2, i32 3, i32 4) -+ ret void -+} |
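For context, the code-size win described in the upstream commit message (and exercised by the test1/test1b cases in test/CodeGen/X86/movtopush.ll above) comes from the i386 instruction encodings: a 32-bit immediate store relative to %esp needs a ModRM byte, a SIB byte, an optional displacement, and a 4-byte immediate, while a push of a small immediate is two bytes. The sketch below is illustrative only and is not part of the deleted patch; the names caller/callee are invented, and the byte counts assume the common encodings for a plain cdecl call on i386 built with -Oz.

/* Illustrative sketch (not part of the patch): four stack arguments
 * passed to a cdecl callee on i386, compiled with -Oz. */
extern void callee(int a, int b, int c, int d);

void caller(void) {
    /* With a reserved call frame (the old behaviour), each argument is
     * stored through %esp:
     *     movl  $4, 12(%esp)     # 8 bytes (opcode, ModRM, SIB, disp8, imm32)
     *     movl  $3, 8(%esp)
     *     movl  $2, 4(%esp)
     *     movl  $1, (%esp)       # 7 bytes (no displacement byte)
     *     calll callee
     *
     * With the X86CallFrameOptimization pass, the same arguments become
     * pushes, and the stack is cleaned up after the call:
     *     pushl $4               # 2 bytes per small immediate
     *     pushl $3
     *     pushl $2
     *     pushl $1
     *     calll callee
     *     addl  $16, %esp
     */
    callee(1, 2, 3, 4);
}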