Diffstat (limited to 'contrib/llvm/lib/Target/X86')
103 files changed, 113125 insertions, 0 deletions
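The first file in the diff, X86AsmInstrumentation.cpp, rewrites memory-touching instructions in inline assembly so that each access is preceded by an AddressSanitizer shadow check; the long comment at the top of the file walks through the assembly it emits. As a rough sketch, the emitted check is equivalent to the C++ below (the function name is illustrative, the constant is the 64-bit kShadowOffset from the file, the 16-byte case really tests two shadow bytes, and the real code does all of this in spilled registers and calls __asan_report_loadN/storeN instead of returning a bool):

#include <cstdint>

static bool asanShouldReport(uintptr_t Addr, unsigned AccessSize) {
  const uintptr_t kShadowOffset = 0x7fff8000;  // X86AddressSanitizer64::kShadowOffset
  int8_t Shadow = *reinterpret_cast<int8_t *>((Addr >> 3) + kShadowOffset);
  if (Shadow == 0)
    return false;                // the whole 8-byte granule is addressable
  if (AccessSize < 8)            // 1-, 2- and 4-byte accesses take the slow path
    return int8_t((Addr & 7) + AccessSize - 1) >= Shadow;
  return true;                   // 8- and 16-byte accesses: nonzero shadow is a bug
}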
diff --git a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp new file mode 100644 index 0000000..09cc53a --- /dev/null +++ b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp @@ -0,0 +1,1077 @@ +//===-- X86AsmInstrumentation.cpp - Instrument X86 inline assembly C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/X86BaseInfo.h" +#include "X86AsmInstrumentation.h" +#include "X86Operand.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/Triple.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstBuilder.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCParser/MCParsedAsmOperand.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/MCTargetAsmParser.h" +#include "llvm/MC/MCTargetOptions.h" +#include "llvm/Support/CommandLine.h" +#include <algorithm> +#include <cassert> +#include <vector> + +// Following comment describes how assembly instrumentation works. +// Currently we have only AddressSanitizer instrumentation, but we're +// planning to implement MemorySanitizer for inline assembly too. If +// you're not familiar with AddressSanitizer algorithm, please, read +// https://code.google.com/p/address-sanitizer/wiki/AddressSanitizerAlgorithm. +// +// When inline assembly is parsed by an instance of X86AsmParser, all +// instructions are emitted via EmitInstruction method. That's the +// place where X86AsmInstrumentation analyzes an instruction and +// decides, whether the instruction should be emitted as is or +// instrumentation is required. The latter case happens when an +// instruction reads from or writes to memory. Now instruction opcode +// is explicitly checked, and if an instruction has a memory operand +// (for instance, movq (%rsi, %rcx, 8), %rax) - it should be +// instrumented. There're also exist instructions that modify +// memory but don't have an explicit memory operands, for instance, +// movs. +// +// Let's consider at first 8-byte memory accesses when an instruction +// has an explicit memory operand. In this case we need two registers - +// AddressReg to compute address of a memory cells which are accessed +// and ShadowReg to compute corresponding shadow address. So, we need +// to spill both registers before instrumentation code and restore them +// after instrumentation. Thus, in general, instrumentation code will +// look like this: +// PUSHF # Store flags, otherwise they will be overwritten +// PUSH AddressReg # spill AddressReg +// PUSH ShadowReg # spill ShadowReg +// LEA MemOp, AddressReg # compute address of the memory operand +// MOV AddressReg, ShadowReg +// SHR ShadowReg, 3 +// # ShadowOffset(AddressReg >> 3) contains address of a shadow +// # corresponding to MemOp. 
+// CMP ShadowOffset(ShadowReg), 0 # test shadow value +// JZ .Done # when shadow equals to zero, everything is fine +// MOV AddressReg, RDI +// # Call __asan_report function with AddressReg as an argument +// CALL __asan_report +// .Done: +// POP ShadowReg # Restore ShadowReg +// POP AddressReg # Restore AddressReg +// POPF # Restore flags +// +// Memory accesses with different size (1-, 2-, 4- and 16-byte) are +// handled in a similar manner, but small memory accesses (less than 8 +// byte) require an additional ScratchReg, which is used for shadow value. +// +// If, suppose, we're instrumenting an instruction like movs, only +// contents of RDI, RDI + AccessSize * RCX, RSI, RSI + AccessSize * +// RCX are checked. In this case there're no need to spill and restore +// AddressReg , ShadowReg or flags four times, they're saved on stack +// just once, before instrumentation of these four addresses, and restored +// at the end of the instrumentation. +// +// There exist several things which complicate this simple algorithm. +// * Instrumented memory operand can have RSP as a base or an index +// register. So we need to add a constant offset before computation +// of memory address, since flags, AddressReg, ShadowReg, etc. were +// already stored on stack and RSP was modified. +// * Debug info (usually, DWARF) should be adjusted, because sometimes +// RSP is used as a frame register. So, we need to select some +// register as a frame register and temprorary override current CFA +// register. + +namespace llvm { +namespace { + +static cl::opt<bool> ClAsanInstrumentAssembly( + "asan-instrument-assembly", + cl::desc("instrument assembly with AddressSanitizer checks"), cl::Hidden, + cl::init(false)); + +const int64_t MinAllowedDisplacement = std::numeric_limits<int32_t>::min(); +const int64_t MaxAllowedDisplacement = std::numeric_limits<int32_t>::max(); + +int64_t ApplyDisplacementBounds(int64_t Displacement) { + return std::max(std::min(MaxAllowedDisplacement, Displacement), + MinAllowedDisplacement); +} + +void CheckDisplacementBounds(int64_t Displacement) { + assert(Displacement >= MinAllowedDisplacement && + Displacement <= MaxAllowedDisplacement); +} + +bool IsStackReg(unsigned Reg) { return Reg == X86::RSP || Reg == X86::ESP; } + +bool IsSmallMemAccess(unsigned AccessSize) { return AccessSize < 8; } + +class X86AddressSanitizer : public X86AsmInstrumentation { +public: + struct RegisterContext { + private: + enum RegOffset { + REG_OFFSET_ADDRESS = 0, + REG_OFFSET_SHADOW, + REG_OFFSET_SCRATCH + }; + + public: + RegisterContext(unsigned AddressReg, unsigned ShadowReg, + unsigned ScratchReg) { + BusyRegs.push_back(convReg(AddressReg, 64)); + BusyRegs.push_back(convReg(ShadowReg, 64)); + BusyRegs.push_back(convReg(ScratchReg, 64)); + } + + unsigned AddressReg(unsigned Size) const { + return convReg(BusyRegs[REG_OFFSET_ADDRESS], Size); + } + + unsigned ShadowReg(unsigned Size) const { + return convReg(BusyRegs[REG_OFFSET_SHADOW], Size); + } + + unsigned ScratchReg(unsigned Size) const { + return convReg(BusyRegs[REG_OFFSET_SCRATCH], Size); + } + + void AddBusyReg(unsigned Reg) { + if (Reg != X86::NoRegister) + BusyRegs.push_back(convReg(Reg, 64)); + } + + void AddBusyRegs(const X86Operand &Op) { + AddBusyReg(Op.getMemBaseReg()); + AddBusyReg(Op.getMemIndexReg()); + } + + unsigned ChooseFrameReg(unsigned Size) const { + static const MCPhysReg Candidates[] = { X86::RBP, X86::RAX, X86::RBX, + X86::RCX, X86::RDX, X86::RDI, + X86::RSI }; + for (unsigned Reg : Candidates) { + if 
(!std::count(BusyRegs.begin(), BusyRegs.end(), Reg)) + return convReg(Reg, Size); + } + return X86::NoRegister; + } + + private: + unsigned convReg(unsigned Reg, unsigned Size) const { + return Reg == X86::NoRegister ? Reg : getX86SubSuperRegister(Reg, Size); + } + + std::vector<unsigned> BusyRegs; + }; + + X86AddressSanitizer(const MCSubtargetInfo *&STI) + : X86AsmInstrumentation(STI), RepPrefix(false), OrigSPOffset(0) {} + + ~X86AddressSanitizer() override {} + + // X86AsmInstrumentation implementation: + void InstrumentAndEmitInstruction(const MCInst &Inst, + OperandVector &Operands, + MCContext &Ctx, + const MCInstrInfo &MII, + MCStreamer &Out) override { + InstrumentMOVS(Inst, Operands, Ctx, MII, Out); + if (RepPrefix) + EmitInstruction(Out, MCInstBuilder(X86::REP_PREFIX)); + + InstrumentMOV(Inst, Operands, Ctx, MII, Out); + + RepPrefix = (Inst.getOpcode() == X86::REP_PREFIX); + if (!RepPrefix) + EmitInstruction(Out, Inst); + } + + // Adjusts up stack and saves all registers used in instrumentation. + virtual void InstrumentMemOperandPrologue(const RegisterContext &RegCtx, + MCContext &Ctx, + MCStreamer &Out) = 0; + + // Restores all registers used in instrumentation and adjusts stack. + virtual void InstrumentMemOperandEpilogue(const RegisterContext &RegCtx, + MCContext &Ctx, + MCStreamer &Out) = 0; + + virtual void InstrumentMemOperandSmall(X86Operand &Op, unsigned AccessSize, + bool IsWrite, + const RegisterContext &RegCtx, + MCContext &Ctx, MCStreamer &Out) = 0; + virtual void InstrumentMemOperandLarge(X86Operand &Op, unsigned AccessSize, + bool IsWrite, + const RegisterContext &RegCtx, + MCContext &Ctx, MCStreamer &Out) = 0; + + virtual void InstrumentMOVSImpl(unsigned AccessSize, MCContext &Ctx, + MCStreamer &Out) = 0; + + void InstrumentMemOperand(X86Operand &Op, unsigned AccessSize, bool IsWrite, + const RegisterContext &RegCtx, MCContext &Ctx, + MCStreamer &Out); + void InstrumentMOVSBase(unsigned DstReg, unsigned SrcReg, unsigned CntReg, + unsigned AccessSize, MCContext &Ctx, MCStreamer &Out); + + void InstrumentMOVS(const MCInst &Inst, OperandVector &Operands, + MCContext &Ctx, const MCInstrInfo &MII, MCStreamer &Out); + void InstrumentMOV(const MCInst &Inst, OperandVector &Operands, + MCContext &Ctx, const MCInstrInfo &MII, MCStreamer &Out); + +protected: + void EmitLabel(MCStreamer &Out, MCSymbol *Label) { Out.EmitLabel(Label); } + + void EmitLEA(X86Operand &Op, unsigned Size, unsigned Reg, MCStreamer &Out) { + assert(Size == 32 || Size == 64); + MCInst Inst; + Inst.setOpcode(Size == 32 ? X86::LEA32r : X86::LEA64r); + Inst.addOperand(MCOperand::createReg(getX86SubSuperRegister(Reg, Size))); + Op.addMemOperands(Inst, 5); + EmitInstruction(Out, Inst); + } + + void ComputeMemOperandAddress(X86Operand &Op, unsigned Size, + unsigned Reg, MCContext &Ctx, MCStreamer &Out); + + // Creates new memory operand with Displacement added to an original + // displacement. Residue will contain a residue which could happen when the + // total displacement exceeds 32-bit limitation. 
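// For example, if Op's constant displacement is 0x7ffffff0 and Displacement
// is 0x100, the returned operand is clamped to INT32_MAX (0x7fffffff) and
// *Residue becomes 0xf1, which ComputeMemOperandAddress folds back in with an
// extra LEA.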
+ std::unique_ptr<X86Operand> AddDisplacement(X86Operand &Op, + int64_t Displacement, + MCContext &Ctx, int64_t *Residue); + + bool is64BitMode() const { + return STI->getFeatureBits()[X86::Mode64Bit]; + } + bool is32BitMode() const { + return STI->getFeatureBits()[X86::Mode32Bit]; + } + bool is16BitMode() const { + return STI->getFeatureBits()[X86::Mode16Bit]; + } + + unsigned getPointerWidth() { + if (is16BitMode()) return 16; + if (is32BitMode()) return 32; + if (is64BitMode()) return 64; + llvm_unreachable("invalid mode"); + } + + // True when previous instruction was actually REP prefix. + bool RepPrefix; + + // Offset from the original SP register. + int64_t OrigSPOffset; +}; + +void X86AddressSanitizer::InstrumentMemOperand( + X86Operand &Op, unsigned AccessSize, bool IsWrite, + const RegisterContext &RegCtx, MCContext &Ctx, MCStreamer &Out) { + assert(Op.isMem() && "Op should be a memory operand."); + assert((AccessSize & (AccessSize - 1)) == 0 && AccessSize <= 16 && + "AccessSize should be a power of two, less or equal than 16."); + // FIXME: take into account load/store alignment. + if (IsSmallMemAccess(AccessSize)) + InstrumentMemOperandSmall(Op, AccessSize, IsWrite, RegCtx, Ctx, Out); + else + InstrumentMemOperandLarge(Op, AccessSize, IsWrite, RegCtx, Ctx, Out); +} + +void X86AddressSanitizer::InstrumentMOVSBase(unsigned DstReg, unsigned SrcReg, + unsigned CntReg, + unsigned AccessSize, + MCContext &Ctx, MCStreamer &Out) { + // FIXME: check whole ranges [DstReg .. DstReg + AccessSize * (CntReg - 1)] + // and [SrcReg .. SrcReg + AccessSize * (CntReg - 1)]. + RegisterContext RegCtx(X86::RDX /* AddressReg */, X86::RAX /* ShadowReg */, + IsSmallMemAccess(AccessSize) + ? X86::RBX + : X86::NoRegister /* ScratchReg */); + RegCtx.AddBusyReg(DstReg); + RegCtx.AddBusyReg(SrcReg); + RegCtx.AddBusyReg(CntReg); + + InstrumentMemOperandPrologue(RegCtx, Ctx, Out); + + // Test (%SrcReg) + { + const MCExpr *Disp = MCConstantExpr::create(0, Ctx); + std::unique_ptr<X86Operand> Op(X86Operand::CreateMem( + getPointerWidth(), 0, Disp, SrcReg, 0, AccessSize, SMLoc(), SMLoc())); + InstrumentMemOperand(*Op, AccessSize, false /* IsWrite */, RegCtx, Ctx, + Out); + } + + // Test -1(%SrcReg, %CntReg, AccessSize) + { + const MCExpr *Disp = MCConstantExpr::create(-1, Ctx); + std::unique_ptr<X86Operand> Op(X86Operand::CreateMem( + getPointerWidth(), 0, Disp, SrcReg, CntReg, AccessSize, SMLoc(), + SMLoc())); + InstrumentMemOperand(*Op, AccessSize, false /* IsWrite */, RegCtx, Ctx, + Out); + } + + // Test (%DstReg) + { + const MCExpr *Disp = MCConstantExpr::create(0, Ctx); + std::unique_ptr<X86Operand> Op(X86Operand::CreateMem( + getPointerWidth(), 0, Disp, DstReg, 0, AccessSize, SMLoc(), SMLoc())); + InstrumentMemOperand(*Op, AccessSize, true /* IsWrite */, RegCtx, Ctx, Out); + } + + // Test -1(%DstReg, %CntReg, AccessSize) + { + const MCExpr *Disp = MCConstantExpr::create(-1, Ctx); + std::unique_ptr<X86Operand> Op(X86Operand::CreateMem( + getPointerWidth(), 0, Disp, DstReg, CntReg, AccessSize, SMLoc(), + SMLoc())); + InstrumentMemOperand(*Op, AccessSize, true /* IsWrite */, RegCtx, Ctx, Out); + } + + InstrumentMemOperandEpilogue(RegCtx, Ctx, Out); +} + +void X86AddressSanitizer::InstrumentMOVS(const MCInst &Inst, + OperandVector &Operands, + MCContext &Ctx, const MCInstrInfo &MII, + MCStreamer &Out) { + // Access size in bytes. 
+ unsigned AccessSize = 0; + + switch (Inst.getOpcode()) { + case X86::MOVSB: + AccessSize = 1; + break; + case X86::MOVSW: + AccessSize = 2; + break; + case X86::MOVSL: + AccessSize = 4; + break; + case X86::MOVSQ: + AccessSize = 8; + break; + default: + return; + } + + InstrumentMOVSImpl(AccessSize, Ctx, Out); +} + +void X86AddressSanitizer::InstrumentMOV(const MCInst &Inst, + OperandVector &Operands, MCContext &Ctx, + const MCInstrInfo &MII, + MCStreamer &Out) { + // Access size in bytes. + unsigned AccessSize = 0; + + switch (Inst.getOpcode()) { + case X86::MOV8mi: + case X86::MOV8mr: + case X86::MOV8rm: + AccessSize = 1; + break; + case X86::MOV16mi: + case X86::MOV16mr: + case X86::MOV16rm: + AccessSize = 2; + break; + case X86::MOV32mi: + case X86::MOV32mr: + case X86::MOV32rm: + AccessSize = 4; + break; + case X86::MOV64mi32: + case X86::MOV64mr: + case X86::MOV64rm: + AccessSize = 8; + break; + case X86::MOVAPDmr: + case X86::MOVAPSmr: + case X86::MOVAPDrm: + case X86::MOVAPSrm: + AccessSize = 16; + break; + default: + return; + } + + const bool IsWrite = MII.get(Inst.getOpcode()).mayStore(); + + for (unsigned Ix = 0; Ix < Operands.size(); ++Ix) { + assert(Operands[Ix]); + MCParsedAsmOperand &Op = *Operands[Ix]; + if (Op.isMem()) { + X86Operand &MemOp = static_cast<X86Operand &>(Op); + RegisterContext RegCtx( + X86::RDI /* AddressReg */, X86::RAX /* ShadowReg */, + IsSmallMemAccess(AccessSize) ? X86::RCX + : X86::NoRegister /* ScratchReg */); + RegCtx.AddBusyRegs(MemOp); + InstrumentMemOperandPrologue(RegCtx, Ctx, Out); + InstrumentMemOperand(MemOp, AccessSize, IsWrite, RegCtx, Ctx, Out); + InstrumentMemOperandEpilogue(RegCtx, Ctx, Out); + } + } +} + +void X86AddressSanitizer::ComputeMemOperandAddress(X86Operand &Op, + unsigned Size, + unsigned Reg, MCContext &Ctx, + MCStreamer &Out) { + int64_t Displacement = 0; + if (IsStackReg(Op.getMemBaseReg())) + Displacement -= OrigSPOffset; + if (IsStackReg(Op.getMemIndexReg())) + Displacement -= OrigSPOffset * Op.getMemScale(); + + assert(Displacement >= 0); + + // Emit Op as is. 
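// (Displacement is nonzero only when Op uses %esp/%rsp as a base or index
// register; it compensates for the bytes pushed above, e.g. with
// OrigSPOffset == -12 the operand 8(%esp) ends up addressed as 20(%esp).)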
+ if (Displacement == 0) { + EmitLEA(Op, Size, Reg, Out); + return; + } + + int64_t Residue; + std::unique_ptr<X86Operand> NewOp = + AddDisplacement(Op, Displacement, Ctx, &Residue); + EmitLEA(*NewOp, Size, Reg, Out); + + while (Residue != 0) { + const MCConstantExpr *Disp = + MCConstantExpr::create(ApplyDisplacementBounds(Residue), Ctx); + std::unique_ptr<X86Operand> DispOp = + X86Operand::CreateMem(getPointerWidth(), 0, Disp, Reg, 0, 1, SMLoc(), + SMLoc()); + EmitLEA(*DispOp, Size, Reg, Out); + Residue -= Disp->getValue(); + } +} + +std::unique_ptr<X86Operand> +X86AddressSanitizer::AddDisplacement(X86Operand &Op, int64_t Displacement, + MCContext &Ctx, int64_t *Residue) { + assert(Displacement >= 0); + + if (Displacement == 0 || + (Op.getMemDisp() && Op.getMemDisp()->getKind() != MCExpr::Constant)) { + *Residue = Displacement; + return X86Operand::CreateMem(Op.getMemModeSize(), Op.getMemSegReg(), + Op.getMemDisp(), Op.getMemBaseReg(), + Op.getMemIndexReg(), Op.getMemScale(), + SMLoc(), SMLoc()); + } + + int64_t OrigDisplacement = + static_cast<const MCConstantExpr *>(Op.getMemDisp())->getValue(); + CheckDisplacementBounds(OrigDisplacement); + Displacement += OrigDisplacement; + + int64_t NewDisplacement = ApplyDisplacementBounds(Displacement); + CheckDisplacementBounds(NewDisplacement); + + *Residue = Displacement - NewDisplacement; + const MCExpr *Disp = MCConstantExpr::create(NewDisplacement, Ctx); + return X86Operand::CreateMem(Op.getMemModeSize(), Op.getMemSegReg(), Disp, + Op.getMemBaseReg(), Op.getMemIndexReg(), + Op.getMemScale(), SMLoc(), SMLoc()); +} + +class X86AddressSanitizer32 : public X86AddressSanitizer { +public: + static const long kShadowOffset = 0x20000000; + + X86AddressSanitizer32(const MCSubtargetInfo *&STI) + : X86AddressSanitizer(STI) {} + + ~X86AddressSanitizer32() override {} + + unsigned GetFrameReg(const MCContext &Ctx, MCStreamer &Out) { + unsigned FrameReg = GetFrameRegGeneric(Ctx, Out); + if (FrameReg == X86::NoRegister) + return FrameReg; + return getX86SubSuperRegister(FrameReg, 32); + } + + void SpillReg(MCStreamer &Out, unsigned Reg) { + EmitInstruction(Out, MCInstBuilder(X86::PUSH32r).addReg(Reg)); + OrigSPOffset -= 4; + } + + void RestoreReg(MCStreamer &Out, unsigned Reg) { + EmitInstruction(Out, MCInstBuilder(X86::POP32r).addReg(Reg)); + OrigSPOffset += 4; + } + + void StoreFlags(MCStreamer &Out) { + EmitInstruction(Out, MCInstBuilder(X86::PUSHF32)); + OrigSPOffset -= 4; + } + + void RestoreFlags(MCStreamer &Out) { + EmitInstruction(Out, MCInstBuilder(X86::POPF32)); + OrigSPOffset += 4; + } + + void InstrumentMemOperandPrologue(const RegisterContext &RegCtx, + MCContext &Ctx, + MCStreamer &Out) override { + unsigned LocalFrameReg = RegCtx.ChooseFrameReg(32); + assert(LocalFrameReg != X86::NoRegister); + + const MCRegisterInfo *MRI = Ctx.getRegisterInfo(); + unsigned FrameReg = GetFrameReg(Ctx, Out); + if (MRI && FrameReg != X86::NoRegister) { + SpillReg(Out, LocalFrameReg); + if (FrameReg == X86::ESP) { + Out.EmitCFIAdjustCfaOffset(4 /* byte size of the LocalFrameReg */); + Out.EmitCFIRelOffset( + MRI->getDwarfRegNum(LocalFrameReg, true /* IsEH */), 0); + } + EmitInstruction( + Out, + MCInstBuilder(X86::MOV32rr).addReg(LocalFrameReg).addReg(FrameReg)); + Out.EmitCFIRememberState(); + Out.EmitCFIDefCfaRegister( + MRI->getDwarfRegNum(LocalFrameReg, true /* IsEH */)); + } + + SpillReg(Out, RegCtx.AddressReg(32)); + SpillReg(Out, RegCtx.ShadowReg(32)); + if (RegCtx.ScratchReg(32) != X86::NoRegister) + SpillReg(Out, RegCtx.ScratchReg(32)); + 
StoreFlags(Out); + } + + void InstrumentMemOperandEpilogue(const RegisterContext &RegCtx, + MCContext &Ctx, + MCStreamer &Out) override { + unsigned LocalFrameReg = RegCtx.ChooseFrameReg(32); + assert(LocalFrameReg != X86::NoRegister); + + RestoreFlags(Out); + if (RegCtx.ScratchReg(32) != X86::NoRegister) + RestoreReg(Out, RegCtx.ScratchReg(32)); + RestoreReg(Out, RegCtx.ShadowReg(32)); + RestoreReg(Out, RegCtx.AddressReg(32)); + + unsigned FrameReg = GetFrameReg(Ctx, Out); + if (Ctx.getRegisterInfo() && FrameReg != X86::NoRegister) { + RestoreReg(Out, LocalFrameReg); + Out.EmitCFIRestoreState(); + if (FrameReg == X86::ESP) + Out.EmitCFIAdjustCfaOffset(-4 /* byte size of the LocalFrameReg */); + } + } + + void InstrumentMemOperandSmall(X86Operand &Op, unsigned AccessSize, + bool IsWrite, + const RegisterContext &RegCtx, + MCContext &Ctx, + MCStreamer &Out) override; + void InstrumentMemOperandLarge(X86Operand &Op, unsigned AccessSize, + bool IsWrite, + const RegisterContext &RegCtx, + MCContext &Ctx, + MCStreamer &Out) override; + void InstrumentMOVSImpl(unsigned AccessSize, MCContext &Ctx, + MCStreamer &Out) override; + +private: + void EmitCallAsanReport(unsigned AccessSize, bool IsWrite, MCContext &Ctx, + MCStreamer &Out, const RegisterContext &RegCtx) { + EmitInstruction(Out, MCInstBuilder(X86::CLD)); + EmitInstruction(Out, MCInstBuilder(X86::MMX_EMMS)); + + EmitInstruction(Out, MCInstBuilder(X86::AND64ri8) + .addReg(X86::ESP) + .addReg(X86::ESP) + .addImm(-16)); + EmitInstruction( + Out, MCInstBuilder(X86::PUSH32r).addReg(RegCtx.AddressReg(32))); + + MCSymbol *FnSym = Ctx.getOrCreateSymbol(llvm::Twine("__asan_report_") + + (IsWrite ? "store" : "load") + + llvm::Twine(AccessSize)); + const MCSymbolRefExpr *FnExpr = + MCSymbolRefExpr::create(FnSym, MCSymbolRefExpr::VK_PLT, Ctx); + EmitInstruction(Out, MCInstBuilder(X86::CALLpcrel32).addExpr(FnExpr)); + } +}; + +void X86AddressSanitizer32::InstrumentMemOperandSmall( + X86Operand &Op, unsigned AccessSize, bool IsWrite, + const RegisterContext &RegCtx, MCContext &Ctx, MCStreamer &Out) { + unsigned AddressRegI32 = RegCtx.AddressReg(32); + unsigned ShadowRegI32 = RegCtx.ShadowReg(32); + unsigned ShadowRegI8 = RegCtx.ShadowReg(8); + + assert(RegCtx.ScratchReg(32) != X86::NoRegister); + unsigned ScratchRegI32 = RegCtx.ScratchReg(32); + + ComputeMemOperandAddress(Op, 32, AddressRegI32, Ctx, Out); + + EmitInstruction(Out, MCInstBuilder(X86::MOV32rr).addReg(ShadowRegI32).addReg( + AddressRegI32)); + EmitInstruction(Out, MCInstBuilder(X86::SHR32ri) + .addReg(ShadowRegI32) + .addReg(ShadowRegI32) + .addImm(3)); + + { + MCInst Inst; + Inst.setOpcode(X86::MOV8rm); + Inst.addOperand(MCOperand::createReg(ShadowRegI8)); + const MCExpr *Disp = MCConstantExpr::create(kShadowOffset, Ctx); + std::unique_ptr<X86Operand> Op( + X86Operand::CreateMem(getPointerWidth(), 0, Disp, ShadowRegI32, 0, 1, + SMLoc(), SMLoc())); + Op->addMemOperands(Inst, 5); + EmitInstruction(Out, Inst); + } + + EmitInstruction( + Out, MCInstBuilder(X86::TEST8rr).addReg(ShadowRegI8).addReg(ShadowRegI8)); + MCSymbol *DoneSym = Ctx.createTempSymbol(); + const MCExpr *DoneExpr = MCSymbolRefExpr::create(DoneSym, Ctx); + EmitInstruction(Out, MCInstBuilder(X86::JE_1).addExpr(DoneExpr)); + + EmitInstruction(Out, MCInstBuilder(X86::MOV32rr).addReg(ScratchRegI32).addReg( + AddressRegI32)); + EmitInstruction(Out, MCInstBuilder(X86::AND32ri) + .addReg(ScratchRegI32) + .addReg(ScratchRegI32) + .addImm(7)); + + switch (AccessSize) { + default: llvm_unreachable("Incorrect access size"); + case 1: + 
break; + case 2: { + const MCExpr *Disp = MCConstantExpr::create(1, Ctx); + std::unique_ptr<X86Operand> Op( + X86Operand::CreateMem(getPointerWidth(), 0, Disp, ScratchRegI32, 0, 1, + SMLoc(), SMLoc())); + EmitLEA(*Op, 32, ScratchRegI32, Out); + break; + } + case 4: + EmitInstruction(Out, MCInstBuilder(X86::ADD32ri8) + .addReg(ScratchRegI32) + .addReg(ScratchRegI32) + .addImm(3)); + break; + } + + EmitInstruction( + Out, + MCInstBuilder(X86::MOVSX32rr8).addReg(ShadowRegI32).addReg(ShadowRegI8)); + EmitInstruction(Out, MCInstBuilder(X86::CMP32rr).addReg(ScratchRegI32).addReg( + ShadowRegI32)); + EmitInstruction(Out, MCInstBuilder(X86::JL_1).addExpr(DoneExpr)); + + EmitCallAsanReport(AccessSize, IsWrite, Ctx, Out, RegCtx); + EmitLabel(Out, DoneSym); +} + +void X86AddressSanitizer32::InstrumentMemOperandLarge( + X86Operand &Op, unsigned AccessSize, bool IsWrite, + const RegisterContext &RegCtx, MCContext &Ctx, MCStreamer &Out) { + unsigned AddressRegI32 = RegCtx.AddressReg(32); + unsigned ShadowRegI32 = RegCtx.ShadowReg(32); + + ComputeMemOperandAddress(Op, 32, AddressRegI32, Ctx, Out); + + EmitInstruction(Out, MCInstBuilder(X86::MOV32rr).addReg(ShadowRegI32).addReg( + AddressRegI32)); + EmitInstruction(Out, MCInstBuilder(X86::SHR32ri) + .addReg(ShadowRegI32) + .addReg(ShadowRegI32) + .addImm(3)); + { + MCInst Inst; + switch (AccessSize) { + default: llvm_unreachable("Incorrect access size"); + case 8: + Inst.setOpcode(X86::CMP8mi); + break; + case 16: + Inst.setOpcode(X86::CMP16mi); + break; + } + const MCExpr *Disp = MCConstantExpr::create(kShadowOffset, Ctx); + std::unique_ptr<X86Operand> Op( + X86Operand::CreateMem(getPointerWidth(), 0, Disp, ShadowRegI32, 0, 1, + SMLoc(), SMLoc())); + Op->addMemOperands(Inst, 5); + Inst.addOperand(MCOperand::createImm(0)); + EmitInstruction(Out, Inst); + } + MCSymbol *DoneSym = Ctx.createTempSymbol(); + const MCExpr *DoneExpr = MCSymbolRefExpr::create(DoneSym, Ctx); + EmitInstruction(Out, MCInstBuilder(X86::JE_1).addExpr(DoneExpr)); + + EmitCallAsanReport(AccessSize, IsWrite, Ctx, Out, RegCtx); + EmitLabel(Out, DoneSym); +} + +void X86AddressSanitizer32::InstrumentMOVSImpl(unsigned AccessSize, + MCContext &Ctx, + MCStreamer &Out) { + StoreFlags(Out); + + // No need to test when ECX is equals to zero. + MCSymbol *DoneSym = Ctx.createTempSymbol(); + const MCExpr *DoneExpr = MCSymbolRefExpr::create(DoneSym, Ctx); + EmitInstruction( + Out, MCInstBuilder(X86::TEST32rr).addReg(X86::ECX).addReg(X86::ECX)); + EmitInstruction(Out, MCInstBuilder(X86::JE_1).addExpr(DoneExpr)); + + // Instrument first and last elements in src and dst range. 
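// (InstrumentMOVSBase checks (%esi), -1(%esi,%ecx,AccessSize), (%edi) and
// -1(%edi,%ecx,AccessSize), i.e. the first byte of the first element and the
// last byte of the last element of each range.)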
+ InstrumentMOVSBase(X86::EDI /* DstReg */, X86::ESI /* SrcReg */, + X86::ECX /* CntReg */, AccessSize, Ctx, Out); + + EmitLabel(Out, DoneSym); + RestoreFlags(Out); +} + +class X86AddressSanitizer64 : public X86AddressSanitizer { +public: + static const long kShadowOffset = 0x7fff8000; + + X86AddressSanitizer64(const MCSubtargetInfo *&STI) + : X86AddressSanitizer(STI) {} + + ~X86AddressSanitizer64() override {} + + unsigned GetFrameReg(const MCContext &Ctx, MCStreamer &Out) { + unsigned FrameReg = GetFrameRegGeneric(Ctx, Out); + if (FrameReg == X86::NoRegister) + return FrameReg; + return getX86SubSuperRegister(FrameReg, 64); + } + + void SpillReg(MCStreamer &Out, unsigned Reg) { + EmitInstruction(Out, MCInstBuilder(X86::PUSH64r).addReg(Reg)); + OrigSPOffset -= 8; + } + + void RestoreReg(MCStreamer &Out, unsigned Reg) { + EmitInstruction(Out, MCInstBuilder(X86::POP64r).addReg(Reg)); + OrigSPOffset += 8; + } + + void StoreFlags(MCStreamer &Out) { + EmitInstruction(Out, MCInstBuilder(X86::PUSHF64)); + OrigSPOffset -= 8; + } + + void RestoreFlags(MCStreamer &Out) { + EmitInstruction(Out, MCInstBuilder(X86::POPF64)); + OrigSPOffset += 8; + } + + void InstrumentMemOperandPrologue(const RegisterContext &RegCtx, + MCContext &Ctx, + MCStreamer &Out) override { + unsigned LocalFrameReg = RegCtx.ChooseFrameReg(64); + assert(LocalFrameReg != X86::NoRegister); + + const MCRegisterInfo *MRI = Ctx.getRegisterInfo(); + unsigned FrameReg = GetFrameReg(Ctx, Out); + if (MRI && FrameReg != X86::NoRegister) { + SpillReg(Out, X86::RBP); + if (FrameReg == X86::RSP) { + Out.EmitCFIAdjustCfaOffset(8 /* byte size of the LocalFrameReg */); + Out.EmitCFIRelOffset( + MRI->getDwarfRegNum(LocalFrameReg, true /* IsEH */), 0); + } + EmitInstruction( + Out, + MCInstBuilder(X86::MOV64rr).addReg(LocalFrameReg).addReg(FrameReg)); + Out.EmitCFIRememberState(); + Out.EmitCFIDefCfaRegister( + MRI->getDwarfRegNum(LocalFrameReg, true /* IsEH */)); + } + + EmitAdjustRSP(Ctx, Out, -128); + SpillReg(Out, RegCtx.ShadowReg(64)); + SpillReg(Out, RegCtx.AddressReg(64)); + if (RegCtx.ScratchReg(64) != X86::NoRegister) + SpillReg(Out, RegCtx.ScratchReg(64)); + StoreFlags(Out); + } + + void InstrumentMemOperandEpilogue(const RegisterContext &RegCtx, + MCContext &Ctx, + MCStreamer &Out) override { + unsigned LocalFrameReg = RegCtx.ChooseFrameReg(64); + assert(LocalFrameReg != X86::NoRegister); + + RestoreFlags(Out); + if (RegCtx.ScratchReg(64) != X86::NoRegister) + RestoreReg(Out, RegCtx.ScratchReg(64)); + RestoreReg(Out, RegCtx.AddressReg(64)); + RestoreReg(Out, RegCtx.ShadowReg(64)); + EmitAdjustRSP(Ctx, Out, 128); + + unsigned FrameReg = GetFrameReg(Ctx, Out); + if (Ctx.getRegisterInfo() && FrameReg != X86::NoRegister) { + RestoreReg(Out, LocalFrameReg); + Out.EmitCFIRestoreState(); + if (FrameReg == X86::RSP) + Out.EmitCFIAdjustCfaOffset(-8 /* byte size of the LocalFrameReg */); + } + } + + void InstrumentMemOperandSmall(X86Operand &Op, unsigned AccessSize, + bool IsWrite, + const RegisterContext &RegCtx, + MCContext &Ctx, + MCStreamer &Out) override; + void InstrumentMemOperandLarge(X86Operand &Op, unsigned AccessSize, + bool IsWrite, + const RegisterContext &RegCtx, + MCContext &Ctx, + MCStreamer &Out) override; + void InstrumentMOVSImpl(unsigned AccessSize, MCContext &Ctx, + MCStreamer &Out) override; + +private: + void EmitAdjustRSP(MCContext &Ctx, MCStreamer &Out, long Offset) { + const MCExpr *Disp = MCConstantExpr::create(Offset, Ctx); + std::unique_ptr<X86Operand> Op( + X86Operand::CreateMem(getPointerWidth(), 0, Disp, X86::RSP, 
0, 1, + SMLoc(), SMLoc())); + EmitLEA(*Op, 64, X86::RSP, Out); + OrigSPOffset += Offset; + } + + void EmitCallAsanReport(unsigned AccessSize, bool IsWrite, MCContext &Ctx, + MCStreamer &Out, const RegisterContext &RegCtx) { + EmitInstruction(Out, MCInstBuilder(X86::CLD)); + EmitInstruction(Out, MCInstBuilder(X86::MMX_EMMS)); + + EmitInstruction(Out, MCInstBuilder(X86::AND64ri8) + .addReg(X86::RSP) + .addReg(X86::RSP) + .addImm(-16)); + + if (RegCtx.AddressReg(64) != X86::RDI) { + EmitInstruction(Out, MCInstBuilder(X86::MOV64rr).addReg(X86::RDI).addReg( + RegCtx.AddressReg(64))); + } + MCSymbol *FnSym = Ctx.getOrCreateSymbol(llvm::Twine("__asan_report_") + + (IsWrite ? "store" : "load") + + llvm::Twine(AccessSize)); + const MCSymbolRefExpr *FnExpr = + MCSymbolRefExpr::create(FnSym, MCSymbolRefExpr::VK_PLT, Ctx); + EmitInstruction(Out, MCInstBuilder(X86::CALL64pcrel32).addExpr(FnExpr)); + } +}; + +void X86AddressSanitizer64::InstrumentMemOperandSmall( + X86Operand &Op, unsigned AccessSize, bool IsWrite, + const RegisterContext &RegCtx, MCContext &Ctx, MCStreamer &Out) { + unsigned AddressRegI64 = RegCtx.AddressReg(64); + unsigned AddressRegI32 = RegCtx.AddressReg(32); + unsigned ShadowRegI64 = RegCtx.ShadowReg(64); + unsigned ShadowRegI32 = RegCtx.ShadowReg(32); + unsigned ShadowRegI8 = RegCtx.ShadowReg(8); + + assert(RegCtx.ScratchReg(32) != X86::NoRegister); + unsigned ScratchRegI32 = RegCtx.ScratchReg(32); + + ComputeMemOperandAddress(Op, 64, AddressRegI64, Ctx, Out); + + EmitInstruction(Out, MCInstBuilder(X86::MOV64rr).addReg(ShadowRegI64).addReg( + AddressRegI64)); + EmitInstruction(Out, MCInstBuilder(X86::SHR64ri) + .addReg(ShadowRegI64) + .addReg(ShadowRegI64) + .addImm(3)); + { + MCInst Inst; + Inst.setOpcode(X86::MOV8rm); + Inst.addOperand(MCOperand::createReg(ShadowRegI8)); + const MCExpr *Disp = MCConstantExpr::create(kShadowOffset, Ctx); + std::unique_ptr<X86Operand> Op( + X86Operand::CreateMem(getPointerWidth(), 0, Disp, ShadowRegI64, 0, 1, + SMLoc(), SMLoc())); + Op->addMemOperands(Inst, 5); + EmitInstruction(Out, Inst); + } + + EmitInstruction( + Out, MCInstBuilder(X86::TEST8rr).addReg(ShadowRegI8).addReg(ShadowRegI8)); + MCSymbol *DoneSym = Ctx.createTempSymbol(); + const MCExpr *DoneExpr = MCSymbolRefExpr::create(DoneSym, Ctx); + EmitInstruction(Out, MCInstBuilder(X86::JE_1).addExpr(DoneExpr)); + + EmitInstruction(Out, MCInstBuilder(X86::MOV32rr).addReg(ScratchRegI32).addReg( + AddressRegI32)); + EmitInstruction(Out, MCInstBuilder(X86::AND32ri) + .addReg(ScratchRegI32) + .addReg(ScratchRegI32) + .addImm(7)); + + switch (AccessSize) { + default: llvm_unreachable("Incorrect access size"); + case 1: + break; + case 2: { + const MCExpr *Disp = MCConstantExpr::create(1, Ctx); + std::unique_ptr<X86Operand> Op( + X86Operand::CreateMem(getPointerWidth(), 0, Disp, ScratchRegI32, 0, 1, + SMLoc(), SMLoc())); + EmitLEA(*Op, 32, ScratchRegI32, Out); + break; + } + case 4: + EmitInstruction(Out, MCInstBuilder(X86::ADD32ri8) + .addReg(ScratchRegI32) + .addReg(ScratchRegI32) + .addImm(3)); + break; + } + + EmitInstruction( + Out, + MCInstBuilder(X86::MOVSX32rr8).addReg(ShadowRegI32).addReg(ShadowRegI8)); + EmitInstruction(Out, MCInstBuilder(X86::CMP32rr).addReg(ScratchRegI32).addReg( + ShadowRegI32)); + EmitInstruction(Out, MCInstBuilder(X86::JL_1).addExpr(DoneExpr)); + + EmitCallAsanReport(AccessSize, IsWrite, Ctx, Out, RegCtx); + EmitLabel(Out, DoneSym); +} + +void X86AddressSanitizer64::InstrumentMemOperandLarge( + X86Operand &Op, unsigned AccessSize, bool IsWrite, + const 
RegisterContext &RegCtx, MCContext &Ctx, MCStreamer &Out) { + unsigned AddressRegI64 = RegCtx.AddressReg(64); + unsigned ShadowRegI64 = RegCtx.ShadowReg(64); + + ComputeMemOperandAddress(Op, 64, AddressRegI64, Ctx, Out); + + EmitInstruction(Out, MCInstBuilder(X86::MOV64rr).addReg(ShadowRegI64).addReg( + AddressRegI64)); + EmitInstruction(Out, MCInstBuilder(X86::SHR64ri) + .addReg(ShadowRegI64) + .addReg(ShadowRegI64) + .addImm(3)); + { + MCInst Inst; + switch (AccessSize) { + default: llvm_unreachable("Incorrect access size"); + case 8: + Inst.setOpcode(X86::CMP8mi); + break; + case 16: + Inst.setOpcode(X86::CMP16mi); + break; + } + const MCExpr *Disp = MCConstantExpr::create(kShadowOffset, Ctx); + std::unique_ptr<X86Operand> Op( + X86Operand::CreateMem(getPointerWidth(), 0, Disp, ShadowRegI64, 0, 1, + SMLoc(), SMLoc())); + Op->addMemOperands(Inst, 5); + Inst.addOperand(MCOperand::createImm(0)); + EmitInstruction(Out, Inst); + } + + MCSymbol *DoneSym = Ctx.createTempSymbol(); + const MCExpr *DoneExpr = MCSymbolRefExpr::create(DoneSym, Ctx); + EmitInstruction(Out, MCInstBuilder(X86::JE_1).addExpr(DoneExpr)); + + EmitCallAsanReport(AccessSize, IsWrite, Ctx, Out, RegCtx); + EmitLabel(Out, DoneSym); +} + +void X86AddressSanitizer64::InstrumentMOVSImpl(unsigned AccessSize, + MCContext &Ctx, + MCStreamer &Out) { + StoreFlags(Out); + + // No need to test when RCX is equals to zero. + MCSymbol *DoneSym = Ctx.createTempSymbol(); + const MCExpr *DoneExpr = MCSymbolRefExpr::create(DoneSym, Ctx); + EmitInstruction( + Out, MCInstBuilder(X86::TEST64rr).addReg(X86::RCX).addReg(X86::RCX)); + EmitInstruction(Out, MCInstBuilder(X86::JE_1).addExpr(DoneExpr)); + + // Instrument first and last elements in src and dst range. + InstrumentMOVSBase(X86::RDI /* DstReg */, X86::RSI /* SrcReg */, + X86::RCX /* CntReg */, AccessSize, Ctx, Out); + + EmitLabel(Out, DoneSym); + RestoreFlags(Out); +} + +} // End anonymous namespace + +X86AsmInstrumentation::X86AsmInstrumentation(const MCSubtargetInfo *&STI) + : STI(STI), InitialFrameReg(0) {} + +X86AsmInstrumentation::~X86AsmInstrumentation() {} + +void X86AsmInstrumentation::InstrumentAndEmitInstruction( + const MCInst &Inst, OperandVector &Operands, MCContext &Ctx, + const MCInstrInfo &MII, MCStreamer &Out) { + EmitInstruction(Out, Inst); +} + +void X86AsmInstrumentation::EmitInstruction(MCStreamer &Out, + const MCInst &Inst) { + Out.EmitInstruction(Inst, *STI); +} + +unsigned X86AsmInstrumentation::GetFrameRegGeneric(const MCContext &Ctx, + MCStreamer &Out) { + if (!Out.getNumFrameInfos()) // No active dwarf frame + return X86::NoRegister; + const MCDwarfFrameInfo &Frame = Out.getDwarfFrameInfos().back(); + if (Frame.End) // Active dwarf frame is closed + return X86::NoRegister; + const MCRegisterInfo *MRI = Ctx.getRegisterInfo(); + if (!MRI) // No register info + return X86::NoRegister; + + if (InitialFrameReg) { + // FrameReg is set explicitly, we're instrumenting a MachineFunction. 
+ return InitialFrameReg; + } + + return MRI->getLLVMRegNum(Frame.CurrentCfaRegister, true /* IsEH */); +} + +X86AsmInstrumentation * +CreateX86AsmInstrumentation(const MCTargetOptions &MCOptions, + const MCContext &Ctx, const MCSubtargetInfo *&STI) { + Triple T(STI->getTargetTriple()); + const bool hasCompilerRTSupport = T.isOSLinux(); + if (ClAsanInstrumentAssembly && hasCompilerRTSupport && + MCOptions.SanitizeAddress) { + if (STI->getFeatureBits()[X86::Mode32Bit] != 0) + return new X86AddressSanitizer32(STI); + if (STI->getFeatureBits()[X86::Mode64Bit] != 0) + return new X86AddressSanitizer64(STI); + } + return new X86AsmInstrumentation(STI); +} + +} // end llvm namespace diff --git a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmInstrumentation.h b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmInstrumentation.h new file mode 100644 index 0000000..470cead --- /dev/null +++ b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmInstrumentation.h @@ -0,0 +1,68 @@ +//===- X86AsmInstrumentation.h - Instrument X86 inline assembly *- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_X86_ASMPARSER_X86ASMINSTRUMENTATION_H +#define LLVM_LIB_TARGET_X86_ASMPARSER_X86ASMINSTRUMENTATION_H + +#include "llvm/ADT/SmallVector.h" + +#include <memory> + +namespace llvm { + +class MCContext; +class MCInst; +class MCInstrInfo; +class MCParsedAsmOperand; +class MCStreamer; +class MCSubtargetInfo; +class MCTargetOptions; + +class X86AsmInstrumentation; + +X86AsmInstrumentation * +CreateX86AsmInstrumentation(const MCTargetOptions &MCOptions, + const MCContext &Ctx, + const MCSubtargetInfo *&STI); + +class X86AsmInstrumentation { +public: + virtual ~X86AsmInstrumentation(); + + // Sets frame register corresponding to a current frame. + void SetInitialFrameRegister(unsigned RegNo) { + InitialFrameReg = RegNo; + } + + // Tries to instrument and emit instruction. + virtual void InstrumentAndEmitInstruction( + const MCInst &Inst, + SmallVectorImpl<std::unique_ptr<MCParsedAsmOperand> > &Operands, + MCContext &Ctx, const MCInstrInfo &MII, MCStreamer &Out); + +protected: + friend X86AsmInstrumentation * + CreateX86AsmInstrumentation(const MCTargetOptions &MCOptions, + const MCContext &Ctx, + const MCSubtargetInfo *&STI); + + X86AsmInstrumentation(const MCSubtargetInfo *&STI); + + unsigned GetFrameRegGeneric(const MCContext &Ctx, MCStreamer &Out); + + void EmitInstruction(MCStreamer &Out, const MCInst &Inst); + + const MCSubtargetInfo *&STI; + + unsigned InitialFrameReg; +}; + +} // End llvm namespace + +#endif diff --git a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp new file mode 100644 index 0000000..4d8ffac --- /dev/null +++ b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -0,0 +1,2951 @@ +//===-- X86AsmParser.cpp - Parse X86 assembly to MCInst instructions ------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/X86BaseInfo.h" +#include "X86AsmInstrumentation.h" +#include "X86AsmParserCommon.h" +#include "X86Operand.h" +#include "llvm/ADT/APFloat.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringSwitch.h" +#include "llvm/ADT/Twine.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCParser/MCAsmLexer.h" +#include "llvm/MC/MCParser/MCAsmParser.h" +#include "llvm/MC/MCParser/MCParsedAsmOperand.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSection.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/MC/MCTargetAsmParser.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/raw_ostream.h" +#include <algorithm> +#include <memory> + +using namespace llvm; + +namespace { + +static const char OpPrecedence[] = { + 0, // IC_OR + 1, // IC_XOR + 2, // IC_AND + 3, // IC_LSHIFT + 3, // IC_RSHIFT + 4, // IC_PLUS + 4, // IC_MINUS + 5, // IC_MULTIPLY + 5, // IC_DIVIDE + 6, // IC_RPAREN + 7, // IC_LPAREN + 0, // IC_IMM + 0 // IC_REGISTER +}; + +class X86AsmParser : public MCTargetAsmParser { + const MCInstrInfo &MII; + ParseInstructionInfo *InstInfo; + std::unique_ptr<X86AsmInstrumentation> Instrumentation; + +private: + SMLoc consumeToken() { + MCAsmParser &Parser = getParser(); + SMLoc Result = Parser.getTok().getLoc(); + Parser.Lex(); + return Result; + } + + enum InfixCalculatorTok { + IC_OR = 0, + IC_XOR, + IC_AND, + IC_LSHIFT, + IC_RSHIFT, + IC_PLUS, + IC_MINUS, + IC_MULTIPLY, + IC_DIVIDE, + IC_RPAREN, + IC_LPAREN, + IC_IMM, + IC_REGISTER + }; + + class InfixCalculator { + typedef std::pair< InfixCalculatorTok, int64_t > ICToken; + SmallVector<InfixCalculatorTok, 4> InfixOperatorStack; + SmallVector<ICToken, 4> PostfixStack; + + public: + int64_t popOperand() { + assert (!PostfixStack.empty() && "Poped an empty stack!"); + ICToken Op = PostfixStack.pop_back_val(); + assert ((Op.first == IC_IMM || Op.first == IC_REGISTER) + && "Expected and immediate or register!"); + return Op.second; + } + void pushOperand(InfixCalculatorTok Op, int64_t Val = 0) { + assert ((Op == IC_IMM || Op == IC_REGISTER) && + "Unexpected operand!"); + PostfixStack.push_back(std::make_pair(Op, Val)); + } + + void popOperator() { InfixOperatorStack.pop_back(); } + void pushOperator(InfixCalculatorTok Op) { + // Push the new operator if the stack is empty. + if (InfixOperatorStack.empty()) { + InfixOperatorStack.push_back(Op); + return; + } + + // Push the new operator if it has a higher precedence than the operator + // on the top of the stack or the operator on the top of the stack is a + // left parentheses. + unsigned Idx = InfixOperatorStack.size() - 1; + InfixCalculatorTok StackOp = InfixOperatorStack[Idx]; + if (OpPrecedence[Op] > OpPrecedence[StackOp] || StackOp == IC_LPAREN) { + InfixOperatorStack.push_back(Op); + return; + } + + // The operator on the top of the stack has higher precedence than the + // new operator. + unsigned ParenCount = 0; + while (1) { + // Nothing to process. 
+ if (InfixOperatorStack.empty()) + break; + + Idx = InfixOperatorStack.size() - 1; + StackOp = InfixOperatorStack[Idx]; + if (!(OpPrecedence[StackOp] >= OpPrecedence[Op] || ParenCount)) + break; + + // If we have an even parentheses count and we see a left parentheses, + // then stop processing. + if (!ParenCount && StackOp == IC_LPAREN) + break; + + if (StackOp == IC_RPAREN) { + ++ParenCount; + InfixOperatorStack.pop_back(); + } else if (StackOp == IC_LPAREN) { + --ParenCount; + InfixOperatorStack.pop_back(); + } else { + InfixOperatorStack.pop_back(); + PostfixStack.push_back(std::make_pair(StackOp, 0)); + } + } + // Push the new operator. + InfixOperatorStack.push_back(Op); + } + + int64_t execute() { + // Push any remaining operators onto the postfix stack. + while (!InfixOperatorStack.empty()) { + InfixCalculatorTok StackOp = InfixOperatorStack.pop_back_val(); + if (StackOp != IC_LPAREN && StackOp != IC_RPAREN) + PostfixStack.push_back(std::make_pair(StackOp, 0)); + } + + if (PostfixStack.empty()) + return 0; + + SmallVector<ICToken, 16> OperandStack; + for (unsigned i = 0, e = PostfixStack.size(); i != e; ++i) { + ICToken Op = PostfixStack[i]; + if (Op.first == IC_IMM || Op.first == IC_REGISTER) { + OperandStack.push_back(Op); + } else { + assert (OperandStack.size() > 1 && "Too few operands."); + int64_t Val; + ICToken Op2 = OperandStack.pop_back_val(); + ICToken Op1 = OperandStack.pop_back_val(); + switch (Op.first) { + default: + report_fatal_error("Unexpected operator!"); + break; + case IC_PLUS: + Val = Op1.second + Op2.second; + OperandStack.push_back(std::make_pair(IC_IMM, Val)); + break; + case IC_MINUS: + Val = Op1.second - Op2.second; + OperandStack.push_back(std::make_pair(IC_IMM, Val)); + break; + case IC_MULTIPLY: + assert (Op1.first == IC_IMM && Op2.first == IC_IMM && + "Multiply operation with an immediate and a register!"); + Val = Op1.second * Op2.second; + OperandStack.push_back(std::make_pair(IC_IMM, Val)); + break; + case IC_DIVIDE: + assert (Op1.first == IC_IMM && Op2.first == IC_IMM && + "Divide operation with an immediate and a register!"); + assert (Op2.second != 0 && "Division by zero!"); + Val = Op1.second / Op2.second; + OperandStack.push_back(std::make_pair(IC_IMM, Val)); + break; + case IC_OR: + assert (Op1.first == IC_IMM && Op2.first == IC_IMM && + "Or operation with an immediate and a register!"); + Val = Op1.second | Op2.second; + OperandStack.push_back(std::make_pair(IC_IMM, Val)); + break; + case IC_XOR: + assert(Op1.first == IC_IMM && Op2.first == IC_IMM && + "Xor operation with an immediate and a register!"); + Val = Op1.second ^ Op2.second; + OperandStack.push_back(std::make_pair(IC_IMM, Val)); + break; + case IC_AND: + assert (Op1.first == IC_IMM && Op2.first == IC_IMM && + "And operation with an immediate and a register!"); + Val = Op1.second & Op2.second; + OperandStack.push_back(std::make_pair(IC_IMM, Val)); + break; + case IC_LSHIFT: + assert (Op1.first == IC_IMM && Op2.first == IC_IMM && + "Left shift operation with an immediate and a register!"); + Val = Op1.second << Op2.second; + OperandStack.push_back(std::make_pair(IC_IMM, Val)); + break; + case IC_RSHIFT: + assert (Op1.first == IC_IMM && Op2.first == IC_IMM && + "Right shift operation with an immediate and a register!"); + Val = Op1.second >> Op2.second; + OperandStack.push_back(std::make_pair(IC_IMM, Val)); + break; + } + } + } + assert (OperandStack.size() == 1 && "Expected a single result."); + return OperandStack.pop_back_val().second; + } + }; + + enum IntelExprState { + IES_OR, 
+ IES_XOR, + IES_AND, + IES_LSHIFT, + IES_RSHIFT, + IES_PLUS, + IES_MINUS, + IES_NOT, + IES_MULTIPLY, + IES_DIVIDE, + IES_LBRAC, + IES_RBRAC, + IES_LPAREN, + IES_RPAREN, + IES_REGISTER, + IES_INTEGER, + IES_IDENTIFIER, + IES_ERROR + }; + + class IntelExprStateMachine { + IntelExprState State, PrevState; + unsigned BaseReg, IndexReg, TmpReg, Scale; + int64_t Imm; + const MCExpr *Sym; + StringRef SymName; + bool StopOnLBrac, AddImmPrefix; + InfixCalculator IC; + InlineAsmIdentifierInfo Info; + + public: + IntelExprStateMachine(int64_t imm, bool stoponlbrac, bool addimmprefix) : + State(IES_PLUS), PrevState(IES_ERROR), BaseReg(0), IndexReg(0), TmpReg(0), + Scale(1), Imm(imm), Sym(nullptr), StopOnLBrac(stoponlbrac), + AddImmPrefix(addimmprefix) { Info.clear(); } + + unsigned getBaseReg() { return BaseReg; } + unsigned getIndexReg() { return IndexReg; } + unsigned getScale() { return Scale; } + const MCExpr *getSym() { return Sym; } + StringRef getSymName() { return SymName; } + int64_t getImm() { return Imm + IC.execute(); } + bool isValidEndState() { + return State == IES_RBRAC || State == IES_INTEGER; + } + bool getStopOnLBrac() { return StopOnLBrac; } + bool getAddImmPrefix() { return AddImmPrefix; } + bool hadError() { return State == IES_ERROR; } + + InlineAsmIdentifierInfo &getIdentifierInfo() { + return Info; + } + + void onOr() { + IntelExprState CurrState = State; + switch (State) { + default: + State = IES_ERROR; + break; + case IES_INTEGER: + case IES_RPAREN: + case IES_REGISTER: + State = IES_OR; + IC.pushOperator(IC_OR); + break; + } + PrevState = CurrState; + } + void onXor() { + IntelExprState CurrState = State; + switch (State) { + default: + State = IES_ERROR; + break; + case IES_INTEGER: + case IES_RPAREN: + case IES_REGISTER: + State = IES_XOR; + IC.pushOperator(IC_XOR); + break; + } + PrevState = CurrState; + } + void onAnd() { + IntelExprState CurrState = State; + switch (State) { + default: + State = IES_ERROR; + break; + case IES_INTEGER: + case IES_RPAREN: + case IES_REGISTER: + State = IES_AND; + IC.pushOperator(IC_AND); + break; + } + PrevState = CurrState; + } + void onLShift() { + IntelExprState CurrState = State; + switch (State) { + default: + State = IES_ERROR; + break; + case IES_INTEGER: + case IES_RPAREN: + case IES_REGISTER: + State = IES_LSHIFT; + IC.pushOperator(IC_LSHIFT); + break; + } + PrevState = CurrState; + } + void onRShift() { + IntelExprState CurrState = State; + switch (State) { + default: + State = IES_ERROR; + break; + case IES_INTEGER: + case IES_RPAREN: + case IES_REGISTER: + State = IES_RSHIFT; + IC.pushOperator(IC_RSHIFT); + break; + } + PrevState = CurrState; + } + void onPlus() { + IntelExprState CurrState = State; + switch (State) { + default: + State = IES_ERROR; + break; + case IES_INTEGER: + case IES_RPAREN: + case IES_REGISTER: + State = IES_PLUS; + IC.pushOperator(IC_PLUS); + if (CurrState == IES_REGISTER && PrevState != IES_MULTIPLY) { + // If we already have a BaseReg, then assume this is the IndexReg with + // a scale of 1. 
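// (e.g. in "[ebx + esi + 4]" ebx has already been recorded as the base, so
// esi becomes the index here)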
+ if (!BaseReg) { + BaseReg = TmpReg; + } else { + assert (!IndexReg && "BaseReg/IndexReg already set!"); + IndexReg = TmpReg; + Scale = 1; + } + } + break; + } + PrevState = CurrState; + } + void onMinus() { + IntelExprState CurrState = State; + switch (State) { + default: + State = IES_ERROR; + break; + case IES_PLUS: + case IES_NOT: + case IES_MULTIPLY: + case IES_DIVIDE: + case IES_LPAREN: + case IES_RPAREN: + case IES_LBRAC: + case IES_RBRAC: + case IES_INTEGER: + case IES_REGISTER: + State = IES_MINUS; + // Only push the minus operator if it is not a unary operator. + if (!(CurrState == IES_PLUS || CurrState == IES_MINUS || + CurrState == IES_MULTIPLY || CurrState == IES_DIVIDE || + CurrState == IES_LPAREN || CurrState == IES_LBRAC)) + IC.pushOperator(IC_MINUS); + if (CurrState == IES_REGISTER && PrevState != IES_MULTIPLY) { + // If we already have a BaseReg, then assume this is the IndexReg with + // a scale of 1. + if (!BaseReg) { + BaseReg = TmpReg; + } else { + assert (!IndexReg && "BaseReg/IndexReg already set!"); + IndexReg = TmpReg; + Scale = 1; + } + } + break; + } + PrevState = CurrState; + } + void onNot() { + IntelExprState CurrState = State; + switch (State) { + default: + State = IES_ERROR; + break; + case IES_PLUS: + case IES_NOT: + State = IES_NOT; + break; + } + PrevState = CurrState; + } + void onRegister(unsigned Reg) { + IntelExprState CurrState = State; + switch (State) { + default: + State = IES_ERROR; + break; + case IES_PLUS: + case IES_LPAREN: + State = IES_REGISTER; + TmpReg = Reg; + IC.pushOperand(IC_REGISTER); + break; + case IES_MULTIPLY: + // Index Register - Scale * Register + if (PrevState == IES_INTEGER) { + assert (!IndexReg && "IndexReg already set!"); + State = IES_REGISTER; + IndexReg = Reg; + // Get the scale and replace the 'Scale * Register' with '0'. + Scale = IC.popOperand(); + IC.pushOperand(IC_IMM); + IC.popOperator(); + } else { + State = IES_ERROR; + } + break; + } + PrevState = CurrState; + } + void onIdentifierExpr(const MCExpr *SymRef, StringRef SymRefName) { + PrevState = State; + switch (State) { + default: + State = IES_ERROR; + break; + case IES_PLUS: + case IES_MINUS: + case IES_NOT: + State = IES_INTEGER; + Sym = SymRef; + SymName = SymRefName; + IC.pushOperand(IC_IMM); + break; + } + } + bool onInteger(int64_t TmpInt, StringRef &ErrMsg) { + IntelExprState CurrState = State; + switch (State) { + default: + State = IES_ERROR; + break; + case IES_PLUS: + case IES_MINUS: + case IES_NOT: + case IES_OR: + case IES_XOR: + case IES_AND: + case IES_LSHIFT: + case IES_RSHIFT: + case IES_DIVIDE: + case IES_MULTIPLY: + case IES_LPAREN: + State = IES_INTEGER; + if (PrevState == IES_REGISTER && CurrState == IES_MULTIPLY) { + // Index Register - Register * Scale + assert (!IndexReg && "IndexReg already set!"); + IndexReg = TmpReg; + Scale = TmpInt; + if(Scale != 1 && Scale != 2 && Scale != 4 && Scale != 8) { + ErrMsg = "scale factor in address must be 1, 2, 4 or 8"; + return true; + } + // Get the scale and replace the 'Register * Scale' with '0'. + IC.popOperator(); + } else if ((PrevState == IES_PLUS || PrevState == IES_MINUS || + PrevState == IES_OR || PrevState == IES_AND || + PrevState == IES_LSHIFT || PrevState == IES_RSHIFT || + PrevState == IES_MULTIPLY || PrevState == IES_DIVIDE || + PrevState == IES_LPAREN || PrevState == IES_LBRAC || + PrevState == IES_NOT || PrevState == IES_XOR) && + CurrState == IES_MINUS) { + // Unary minus. No need to pop the minus operand because it was never + // pushed. 
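// (e.g. a displacement written as "-4" reaches this point as the single
// immediate -4 rather than as a subtraction)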
+ IC.pushOperand(IC_IMM, -TmpInt); // Push -Imm. + } else if ((PrevState == IES_PLUS || PrevState == IES_MINUS || + PrevState == IES_OR || PrevState == IES_AND || + PrevState == IES_LSHIFT || PrevState == IES_RSHIFT || + PrevState == IES_MULTIPLY || PrevState == IES_DIVIDE || + PrevState == IES_LPAREN || PrevState == IES_LBRAC || + PrevState == IES_NOT || PrevState == IES_XOR) && + CurrState == IES_NOT) { + // Unary not. No need to pop the not operand because it was never + // pushed. + IC.pushOperand(IC_IMM, ~TmpInt); // Push ~Imm. + } else { + IC.pushOperand(IC_IMM, TmpInt); + } + break; + } + PrevState = CurrState; + return false; + } + void onStar() { + PrevState = State; + switch (State) { + default: + State = IES_ERROR; + break; + case IES_INTEGER: + case IES_REGISTER: + case IES_RPAREN: + State = IES_MULTIPLY; + IC.pushOperator(IC_MULTIPLY); + break; + } + } + void onDivide() { + PrevState = State; + switch (State) { + default: + State = IES_ERROR; + break; + case IES_INTEGER: + case IES_RPAREN: + State = IES_DIVIDE; + IC.pushOperator(IC_DIVIDE); + break; + } + } + void onLBrac() { + PrevState = State; + switch (State) { + default: + State = IES_ERROR; + break; + case IES_RBRAC: + State = IES_PLUS; + IC.pushOperator(IC_PLUS); + break; + } + } + void onRBrac() { + IntelExprState CurrState = State; + switch (State) { + default: + State = IES_ERROR; + break; + case IES_INTEGER: + case IES_REGISTER: + case IES_RPAREN: + State = IES_RBRAC; + if (CurrState == IES_REGISTER && PrevState != IES_MULTIPLY) { + // If we already have a BaseReg, then assume this is the IndexReg with + // a scale of 1. + if (!BaseReg) { + BaseReg = TmpReg; + } else { + assert (!IndexReg && "BaseReg/IndexReg already set!"); + IndexReg = TmpReg; + Scale = 1; + } + } + break; + } + PrevState = CurrState; + } + void onLParen() { + IntelExprState CurrState = State; + switch (State) { + default: + State = IES_ERROR; + break; + case IES_PLUS: + case IES_MINUS: + case IES_NOT: + case IES_OR: + case IES_XOR: + case IES_AND: + case IES_LSHIFT: + case IES_RSHIFT: + case IES_MULTIPLY: + case IES_DIVIDE: + case IES_LPAREN: + // FIXME: We don't handle this type of unary minus or not, yet. 
+ if ((PrevState == IES_PLUS || PrevState == IES_MINUS || + PrevState == IES_OR || PrevState == IES_AND || + PrevState == IES_LSHIFT || PrevState == IES_RSHIFT || + PrevState == IES_MULTIPLY || PrevState == IES_DIVIDE || + PrevState == IES_LPAREN || PrevState == IES_LBRAC || + PrevState == IES_NOT || PrevState == IES_XOR) && + (CurrState == IES_MINUS || CurrState == IES_NOT)) { + State = IES_ERROR; + break; + } + State = IES_LPAREN; + IC.pushOperator(IC_LPAREN); + break; + } + PrevState = CurrState; + } + void onRParen() { + PrevState = State; + switch (State) { + default: + State = IES_ERROR; + break; + case IES_INTEGER: + case IES_REGISTER: + case IES_RPAREN: + State = IES_RPAREN; + IC.pushOperator(IC_RPAREN); + break; + } + } + }; + + bool Error(SMLoc L, const Twine &Msg, + ArrayRef<SMRange> Ranges = None, + bool MatchingInlineAsm = false) { + MCAsmParser &Parser = getParser(); + if (MatchingInlineAsm) return true; + return Parser.Error(L, Msg, Ranges); + } + + bool ErrorAndEatStatement(SMLoc L, const Twine &Msg, + ArrayRef<SMRange> Ranges = None, + bool MatchingInlineAsm = false) { + MCAsmParser &Parser = getParser(); + Parser.eatToEndOfStatement(); + return Error(L, Msg, Ranges, MatchingInlineAsm); + } + + std::nullptr_t ErrorOperand(SMLoc Loc, StringRef Msg) { + Error(Loc, Msg); + return nullptr; + } + + std::unique_ptr<X86Operand> DefaultMemSIOperand(SMLoc Loc); + std::unique_ptr<X86Operand> DefaultMemDIOperand(SMLoc Loc); + void AddDefaultSrcDestOperands( + OperandVector& Operands, std::unique_ptr<llvm::MCParsedAsmOperand> &&Src, + std::unique_ptr<llvm::MCParsedAsmOperand> &&Dst); + std::unique_ptr<X86Operand> ParseOperand(); + std::unique_ptr<X86Operand> ParseATTOperand(); + std::unique_ptr<X86Operand> ParseIntelOperand(); + std::unique_ptr<X86Operand> ParseIntelOffsetOfOperator(); + bool ParseIntelDotOperator(const MCExpr *Disp, const MCExpr *&NewDisp); + std::unique_ptr<X86Operand> ParseIntelOperator(unsigned OpKind); + std::unique_ptr<X86Operand> + ParseIntelSegmentOverride(unsigned SegReg, SMLoc Start, unsigned Size); + std::unique_ptr<X86Operand> + ParseIntelMemOperand(int64_t ImmDisp, SMLoc StartLoc, unsigned Size); + std::unique_ptr<X86Operand> ParseRoundingModeOp(SMLoc Start, SMLoc End); + bool ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End); + std::unique_ptr<X86Operand> ParseIntelBracExpression(unsigned SegReg, + SMLoc Start, + int64_t ImmDisp, + unsigned Size); + bool ParseIntelIdentifier(const MCExpr *&Val, StringRef &Identifier, + InlineAsmIdentifierInfo &Info, + bool IsUnevaluatedOperand, SMLoc &End); + + std::unique_ptr<X86Operand> ParseMemOperand(unsigned SegReg, SMLoc StartLoc); + + std::unique_ptr<X86Operand> + CreateMemForInlineAsm(unsigned SegReg, const MCExpr *Disp, unsigned BaseReg, + unsigned IndexReg, unsigned Scale, SMLoc Start, + SMLoc End, unsigned Size, StringRef Identifier, + InlineAsmIdentifierInfo &Info); + + bool parseDirectiveEven(SMLoc L); + bool ParseDirectiveWord(unsigned Size, SMLoc L); + bool ParseDirectiveCode(StringRef IDVal, SMLoc L); + + bool processInstruction(MCInst &Inst, const OperandVector &Ops); + + /// Wrapper around MCStreamer::EmitInstruction(). Possibly adds + /// instrumentation around Inst. 
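/// (The work is delegated to the X86AsmInstrumentation object created in the
/// constructor below.)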
+ void EmitInstruction(MCInst &Inst, OperandVector &Operands, MCStreamer &Out); + + bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, + OperandVector &Operands, MCStreamer &Out, + uint64_t &ErrorInfo, + bool MatchingInlineAsm) override; + + void MatchFPUWaitAlias(SMLoc IDLoc, X86Operand &Op, OperandVector &Operands, + MCStreamer &Out, bool MatchingInlineAsm); + + bool ErrorMissingFeature(SMLoc IDLoc, uint64_t ErrorInfo, + bool MatchingInlineAsm); + + bool MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode, + OperandVector &Operands, MCStreamer &Out, + uint64_t &ErrorInfo, + bool MatchingInlineAsm); + + bool MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode, + OperandVector &Operands, MCStreamer &Out, + uint64_t &ErrorInfo, + bool MatchingInlineAsm); + + bool OmitRegisterFromClobberLists(unsigned RegNo) override; + + /// doSrcDstMatch - Returns true if operands are matching in their + /// word size (%si and %di, %esi and %edi, etc.). Order depends on + /// the parsing mode (Intel vs. AT&T). + bool doSrcDstMatch(X86Operand &Op1, X86Operand &Op2); + + /// Parses AVX512 specific operand primitives: masked registers ({%k<NUM>}, {z}) + /// and memory broadcasting ({1to<NUM>}) primitives, updating Operands vector if required. + /// \return \c true if no parsing errors occurred, \c false otherwise. + bool HandleAVX512Operand(OperandVector &Operands, + const MCParsedAsmOperand &Op); + + bool is64BitMode() const { + // FIXME: Can tablegen auto-generate this? + return getSTI().getFeatureBits()[X86::Mode64Bit]; + } + bool is32BitMode() const { + // FIXME: Can tablegen auto-generate this? + return getSTI().getFeatureBits()[X86::Mode32Bit]; + } + bool is16BitMode() const { + // FIXME: Can tablegen auto-generate this? + return getSTI().getFeatureBits()[X86::Mode16Bit]; + } + void SwitchMode(unsigned mode) { + MCSubtargetInfo &STI = copySTI(); + FeatureBitset AllModes({X86::Mode64Bit, X86::Mode32Bit, X86::Mode16Bit}); + FeatureBitset OldMode = STI.getFeatureBits() & AllModes; + unsigned FB = ComputeAvailableFeatures( + STI.ToggleFeature(OldMode.flip(mode))); + setAvailableFeatures(FB); + + assert(FeatureBitset({mode}) == (STI.getFeatureBits() & AllModes)); + } + + unsigned getPointerWidth() { + if (is16BitMode()) return 16; + if (is32BitMode()) return 32; + if (is64BitMode()) return 64; + llvm_unreachable("invalid mode"); + } + + bool isParsingIntelSyntax() { + return getParser().getAssemblerDialect(); + } + + /// @name Auto-generated Matcher Functions + /// { + +#define GET_ASSEMBLER_HEADER +#include "X86GenAsmMatcher.inc" + + /// } + +public: + X86AsmParser(const MCSubtargetInfo &sti, MCAsmParser &Parser, + const MCInstrInfo &mii, const MCTargetOptions &Options) + : MCTargetAsmParser(Options, sti), MII(mii), InstInfo(nullptr) { + + // Initialize the set of available features. 
+ setAvailableFeatures(ComputeAvailableFeatures(getSTI().getFeatureBits())); + Instrumentation.reset( + CreateX86AsmInstrumentation(Options, Parser.getContext(), STI)); + } + + bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override; + + void SetFrameRegister(unsigned RegNo) override; + + bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name, + SMLoc NameLoc, OperandVector &Operands) override; + + bool ParseDirective(AsmToken DirectiveID) override; +}; +} // end anonymous namespace + +/// @name Auto-generated Match Functions +/// { + +static unsigned MatchRegisterName(StringRef Name); + +/// } + +static bool CheckBaseRegAndIndexReg(unsigned BaseReg, unsigned IndexReg, + StringRef &ErrMsg) { + // If we have both a base register and an index register make sure they are + // both 64-bit or 32-bit registers. + // To support VSIB, IndexReg can be 128-bit or 256-bit registers. + if (BaseReg != 0 && IndexReg != 0) { + if (X86MCRegisterClasses[X86::GR64RegClassID].contains(BaseReg) && + (X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg) || + X86MCRegisterClasses[X86::GR32RegClassID].contains(IndexReg)) && + IndexReg != X86::RIZ) { + ErrMsg = "base register is 64-bit, but index register is not"; + return true; + } + if (X86MCRegisterClasses[X86::GR32RegClassID].contains(BaseReg) && + (X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg) || + X86MCRegisterClasses[X86::GR64RegClassID].contains(IndexReg)) && + IndexReg != X86::EIZ){ + ErrMsg = "base register is 32-bit, but index register is not"; + return true; + } + if (X86MCRegisterClasses[X86::GR16RegClassID].contains(BaseReg)) { + if (X86MCRegisterClasses[X86::GR32RegClassID].contains(IndexReg) || + X86MCRegisterClasses[X86::GR64RegClassID].contains(IndexReg)) { + ErrMsg = "base register is 16-bit, but index register is not"; + return true; + } + if (((BaseReg == X86::BX || BaseReg == X86::BP) && + IndexReg != X86::SI && IndexReg != X86::DI) || + ((BaseReg == X86::SI || BaseReg == X86::DI) && + IndexReg != X86::BX && IndexReg != X86::BP)) { + ErrMsg = "invalid 16-bit base/index register combination"; + return true; + } + } + } + return false; +} + +bool X86AsmParser::doSrcDstMatch(X86Operand &Op1, X86Operand &Op2) +{ + // Return true and let a normal complaint about bogus operands happen. + if (!Op1.isMem() || !Op2.isMem()) + return true; + + // Actually these might be the other way round if Intel syntax is + // being used. It doesn't matter. + unsigned diReg = Op1.Mem.BaseReg; + unsigned siReg = Op2.Mem.BaseReg; + + if (X86MCRegisterClasses[X86::GR16RegClassID].contains(siReg)) + return X86MCRegisterClasses[X86::GR16RegClassID].contains(diReg); + if (X86MCRegisterClasses[X86::GR32RegClassID].contains(siReg)) + return X86MCRegisterClasses[X86::GR32RegClassID].contains(diReg); + if (X86MCRegisterClasses[X86::GR64RegClassID].contains(siReg)) + return X86MCRegisterClasses[X86::GR64RegClassID].contains(diReg); + // Again, return true and let another error happen. + return true; +} + +bool X86AsmParser::ParseRegister(unsigned &RegNo, + SMLoc &StartLoc, SMLoc &EndLoc) { + MCAsmParser &Parser = getParser(); + RegNo = 0; + const AsmToken &PercentTok = Parser.getTok(); + StartLoc = PercentTok.getLoc(); + + // If we encounter a %, ignore it. This code handles registers with and + // without the prefix, unprefixed registers can occur in cfi directives. + if (!isParsingIntelSyntax() && PercentTok.is(AsmToken::Percent)) + Parser.Lex(); // Eat percent token. 
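+  // From here on the register name itself is the current token. It may have
+  // been written with or without the '%' prefix, and uppercase spellings are
+  // handled by retrying the match with a lowercased copy.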
+ + const AsmToken &Tok = Parser.getTok(); + EndLoc = Tok.getEndLoc(); + + if (Tok.isNot(AsmToken::Identifier)) { + if (isParsingIntelSyntax()) return true; + return Error(StartLoc, "invalid register name", + SMRange(StartLoc, EndLoc)); + } + + RegNo = MatchRegisterName(Tok.getString()); + + // If the match failed, try the register name as lowercase. + if (RegNo == 0) + RegNo = MatchRegisterName(Tok.getString().lower()); + + // The "flags" register cannot be referenced directly. + // Treat it as an identifier instead. + if (isParsingInlineAsm() && isParsingIntelSyntax() && RegNo == X86::EFLAGS) + RegNo = 0; + + if (!is64BitMode()) { + // FIXME: This should be done using Requires<Not64BitMode> and + // Requires<In64BitMode> so "eiz" usage in 64-bit instructions can be also + // checked. + // FIXME: Check AH, CH, DH, BH cannot be used in an instruction requiring a + // REX prefix. + if (RegNo == X86::RIZ || + X86MCRegisterClasses[X86::GR64RegClassID].contains(RegNo) || + X86II::isX86_64NonExtLowByteReg(RegNo) || + X86II::isX86_64ExtendedReg(RegNo)) + return Error(StartLoc, "register %" + + Tok.getString() + " is only available in 64-bit mode", + SMRange(StartLoc, EndLoc)); + } + + // Parse "%st" as "%st(0)" and "%st(1)", which is multiple tokens. + if (RegNo == 0 && (Tok.getString() == "st" || Tok.getString() == "ST")) { + RegNo = X86::ST0; + Parser.Lex(); // Eat 'st' + + // Check to see if we have '(4)' after %st. + if (getLexer().isNot(AsmToken::LParen)) + return false; + // Lex the paren. + getParser().Lex(); + + const AsmToken &IntTok = Parser.getTok(); + if (IntTok.isNot(AsmToken::Integer)) + return Error(IntTok.getLoc(), "expected stack index"); + switch (IntTok.getIntVal()) { + case 0: RegNo = X86::ST0; break; + case 1: RegNo = X86::ST1; break; + case 2: RegNo = X86::ST2; break; + case 3: RegNo = X86::ST3; break; + case 4: RegNo = X86::ST4; break; + case 5: RegNo = X86::ST5; break; + case 6: RegNo = X86::ST6; break; + case 7: RegNo = X86::ST7; break; + default: return Error(IntTok.getLoc(), "invalid stack index"); + } + + if (getParser().Lex().isNot(AsmToken::RParen)) + return Error(Parser.getTok().getLoc(), "expected ')'"); + + EndLoc = Parser.getTok().getEndLoc(); + Parser.Lex(); // Eat ')' + return false; + } + + EndLoc = Parser.getTok().getEndLoc(); + + // If this is "db[0-7]", match it as an alias + // for dr[0-7]. + if (RegNo == 0 && Tok.getString().size() == 3 && + Tok.getString().startswith("db")) { + switch (Tok.getString()[2]) { + case '0': RegNo = X86::DR0; break; + case '1': RegNo = X86::DR1; break; + case '2': RegNo = X86::DR2; break; + case '3': RegNo = X86::DR3; break; + case '4': RegNo = X86::DR4; break; + case '5': RegNo = X86::DR5; break; + case '6': RegNo = X86::DR6; break; + case '7': RegNo = X86::DR7; break; + } + + if (RegNo != 0) { + EndLoc = Parser.getTok().getEndLoc(); + Parser.Lex(); // Eat it. + return false; + } + } + + if (RegNo == 0) { + if (isParsingIntelSyntax()) return true; + return Error(StartLoc, "invalid register name", + SMRange(StartLoc, EndLoc)); + } + + Parser.Lex(); // Eat identifier token. + return false; +} + +void X86AsmParser::SetFrameRegister(unsigned RegNo) { + Instrumentation->SetInitialFrameRegister(RegNo); +} + +std::unique_ptr<X86Operand> X86AsmParser::DefaultMemSIOperand(SMLoc Loc) { + unsigned basereg = + is64BitMode() ? X86::RSI : (is32BitMode() ? 
X86::ESI : X86::SI); + const MCExpr *Disp = MCConstantExpr::create(0, getContext()); + return X86Operand::CreateMem(getPointerWidth(), /*SegReg=*/0, Disp, + /*BaseReg=*/basereg, /*IndexReg=*/0, /*Scale=*/1, + Loc, Loc, 0); +} + +std::unique_ptr<X86Operand> X86AsmParser::DefaultMemDIOperand(SMLoc Loc) { + unsigned basereg = + is64BitMode() ? X86::RDI : (is32BitMode() ? X86::EDI : X86::DI); + const MCExpr *Disp = MCConstantExpr::create(0, getContext()); + return X86Operand::CreateMem(getPointerWidth(), /*SegReg=*/0, Disp, + /*BaseReg=*/basereg, /*IndexReg=*/0, /*Scale=*/1, + Loc, Loc, 0); +} + +void X86AsmParser::AddDefaultSrcDestOperands( + OperandVector& Operands, std::unique_ptr<llvm::MCParsedAsmOperand> &&Src, + std::unique_ptr<llvm::MCParsedAsmOperand> &&Dst) { + if (isParsingIntelSyntax()) { + Operands.push_back(std::move(Dst)); + Operands.push_back(std::move(Src)); + } + else { + Operands.push_back(std::move(Src)); + Operands.push_back(std::move(Dst)); + } +} + +std::unique_ptr<X86Operand> X86AsmParser::ParseOperand() { + if (isParsingIntelSyntax()) + return ParseIntelOperand(); + return ParseATTOperand(); +} + +/// getIntelMemOperandSize - Return intel memory operand size. +static unsigned getIntelMemOperandSize(StringRef OpStr) { + unsigned Size = StringSwitch<unsigned>(OpStr) + .Cases("BYTE", "byte", 8) + .Cases("WORD", "word", 16) + .Cases("DWORD", "dword", 32) + .Cases("FWORD", "fword", 48) + .Cases("QWORD", "qword", 64) + .Cases("MMWORD","mmword", 64) + .Cases("XWORD", "xword", 80) + .Cases("TBYTE", "tbyte", 80) + .Cases("XMMWORD", "xmmword", 128) + .Cases("YMMWORD", "ymmword", 256) + .Cases("ZMMWORD", "zmmword", 512) + .Cases("OPAQUE", "opaque", -1U) // needs to be non-zero, but doesn't matter + .Default(0); + return Size; +} + +std::unique_ptr<X86Operand> X86AsmParser::CreateMemForInlineAsm( + unsigned SegReg, const MCExpr *Disp, unsigned BaseReg, unsigned IndexReg, + unsigned Scale, SMLoc Start, SMLoc End, unsigned Size, StringRef Identifier, + InlineAsmIdentifierInfo &Info) { + // If we found a decl other than a VarDecl, then assume it is a FuncDecl or + // some other label reference. + if (isa<MCSymbolRefExpr>(Disp) && Info.OpDecl && !Info.IsVarDecl) { + // Insert an explicit size if the user didn't have one. + if (!Size) { + Size = getPointerWidth(); + InstInfo->AsmRewrites->emplace_back(AOK_SizeDirective, Start, + /*Len=*/0, Size); + } + + // Create an absolute memory reference in order to match against + // instructions taking a PC relative operand. + return X86Operand::CreateMem(getPointerWidth(), Disp, Start, End, Size, + Identifier, Info.OpDecl); + } + + // We either have a direct symbol reference, or an offset from a symbol. The + // parser always puts the symbol on the LHS, so look there for size + // calculation purposes. + const MCBinaryExpr *BinOp = dyn_cast<MCBinaryExpr>(Disp); + bool IsSymRef = + isa<MCSymbolRefExpr>(BinOp ? BinOp->getLHS() : Disp); + if (IsSymRef) { + if (!Size) { + Size = Info.Type * 8; // Size is in terms of bits in this context. + if (Size) + InstInfo->AsmRewrites->emplace_back(AOK_SizeDirective, Start, + /*Len=*/0, Size); + } + } + + // When parsing inline assembly we set the base register to a non-zero value + // if we don't know the actual value at this time. This is necessary to + // get the matching correct in some cases. + BaseReg = BaseReg ? 
BaseReg : 1; + return X86Operand::CreateMem(getPointerWidth(), SegReg, Disp, BaseReg, + IndexReg, Scale, Start, End, Size, Identifier, + Info.OpDecl); +} + +static void +RewriteIntelBracExpression(SmallVectorImpl<AsmRewrite> &AsmRewrites, + StringRef SymName, int64_t ImmDisp, + int64_t FinalImmDisp, SMLoc &BracLoc, + SMLoc &StartInBrac, SMLoc &End) { + // Remove the '[' and ']' from the IR string. + AsmRewrites.emplace_back(AOK_Skip, BracLoc, 1); + AsmRewrites.emplace_back(AOK_Skip, End, 1); + + // If ImmDisp is non-zero, then we parsed a displacement before the + // bracketed expression (i.e., ImmDisp [ BaseReg + Scale*IndexReg + Disp]) + // If ImmDisp doesn't match the displacement computed by the state machine + // then we have an additional displacement in the bracketed expression. + if (ImmDisp != FinalImmDisp) { + if (ImmDisp) { + // We have an immediate displacement before the bracketed expression. + // Adjust this to match the final immediate displacement. + bool Found = false; + for (AsmRewrite &AR : AsmRewrites) { + if (AR.Loc.getPointer() > BracLoc.getPointer()) + continue; + if (AR.Kind == AOK_ImmPrefix || AR.Kind == AOK_Imm) { + assert (!Found && "ImmDisp already rewritten."); + AR.Kind = AOK_Imm; + AR.Len = BracLoc.getPointer() - AR.Loc.getPointer(); + AR.Val = FinalImmDisp; + Found = true; + break; + } + } + assert (Found && "Unable to rewrite ImmDisp."); + (void)Found; + } else { + // We have a symbolic and an immediate displacement, but no displacement + // before the bracketed expression. Put the immediate displacement + // before the bracketed expression. + AsmRewrites.emplace_back(AOK_Imm, BracLoc, 0, FinalImmDisp); + } + } + // Remove all the ImmPrefix rewrites within the brackets. + for (AsmRewrite &AR : AsmRewrites) { + if (AR.Loc.getPointer() < StartInBrac.getPointer()) + continue; + if (AR.Kind == AOK_ImmPrefix) + AR.Kind = AOK_Delete; + } + const char *SymLocPtr = SymName.data(); + // Skip everything before the symbol. + if (unsigned Len = SymLocPtr - StartInBrac.getPointer()) { + assert(Len > 0 && "Expected a non-negative length."); + AsmRewrites.emplace_back(AOK_Skip, StartInBrac, Len); + } + // Skip everything after the symbol. + if (unsigned Len = End.getPointer() - (SymLocPtr + SymName.size())) { + SMLoc Loc = SMLoc::getFromPointer(SymLocPtr + SymName.size()); + assert(Len > 0 && "Expected a non-negative length."); + AsmRewrites.emplace_back(AOK_Skip, Loc, Len); + } +} + +bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) { + MCAsmParser &Parser = getParser(); + const AsmToken &Tok = Parser.getTok(); + + AsmToken::TokenKind PrevTK = AsmToken::Error; + bool Done = false; + while (!Done) { + bool UpdateLocLex = true; + + // The period in the dot operator (e.g., [ebx].foo.bar) is parsed as an + // identifier. Don't try an parse it as a register. + if (Tok.getString().startswith(".")) + break; + + // If we're parsing an immediate expression, we don't expect a '['. + if (SM.getStopOnLBrac() && getLexer().getKind() == AsmToken::LBrac) + break; + + AsmToken::TokenKind TK = getLexer().getKind(); + switch (TK) { + default: { + if (SM.isValidEndState()) { + Done = true; + break; + } + return Error(Tok.getLoc(), "unknown token in expression"); + } + case AsmToken::EndOfStatement: { + Done = true; + break; + } + case AsmToken::String: + case AsmToken::Identifier: { + // This could be a register or a symbolic displacement. 
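+      // For example, in "[rax + myVar]" the identifier "rax" resolves through
+      // ParseRegister() and is fed to the state machine as a register, while
+      // "myVar" falls through to the symbolic-displacement handling below.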
+ unsigned TmpReg; + const MCExpr *Val; + SMLoc IdentLoc = Tok.getLoc(); + StringRef Identifier = Tok.getString(); + if (TK != AsmToken::String && !ParseRegister(TmpReg, IdentLoc, End)) { + SM.onRegister(TmpReg); + UpdateLocLex = false; + break; + } else { + if (!isParsingInlineAsm()) { + if (getParser().parsePrimaryExpr(Val, End)) + return Error(Tok.getLoc(), "Unexpected identifier!"); + } else { + // This is a dot operator, not an adjacent identifier. + if (Identifier.find('.') != StringRef::npos && + PrevTK == AsmToken::RBrac) { + return false; + } else { + InlineAsmIdentifierInfo &Info = SM.getIdentifierInfo(); + if (ParseIntelIdentifier(Val, Identifier, Info, + /*Unevaluated=*/false, End)) + return true; + } + } + SM.onIdentifierExpr(Val, Identifier); + UpdateLocLex = false; + break; + } + return Error(Tok.getLoc(), "Unexpected identifier!"); + } + case AsmToken::Integer: { + StringRef ErrMsg; + if (isParsingInlineAsm() && SM.getAddImmPrefix()) + InstInfo->AsmRewrites->emplace_back(AOK_ImmPrefix, Tok.getLoc()); + // Look for 'b' or 'f' following an Integer as a directional label + SMLoc Loc = getTok().getLoc(); + int64_t IntVal = getTok().getIntVal(); + End = consumeToken(); + UpdateLocLex = false; + if (getLexer().getKind() == AsmToken::Identifier) { + StringRef IDVal = getTok().getString(); + if (IDVal == "f" || IDVal == "b") { + MCSymbol *Sym = + getContext().getDirectionalLocalSymbol(IntVal, IDVal == "b"); + MCSymbolRefExpr::VariantKind Variant = MCSymbolRefExpr::VK_None; + const MCExpr *Val = + MCSymbolRefExpr::create(Sym, Variant, getContext()); + if (IDVal == "b" && Sym->isUndefined()) + return Error(Loc, "invalid reference to undefined symbol"); + StringRef Identifier = Sym->getName(); + SM.onIdentifierExpr(Val, Identifier); + End = consumeToken(); + } else { + if (SM.onInteger(IntVal, ErrMsg)) + return Error(Loc, ErrMsg); + } + } else { + if (SM.onInteger(IntVal, ErrMsg)) + return Error(Loc, ErrMsg); + } + break; + } + case AsmToken::Plus: SM.onPlus(); break; + case AsmToken::Minus: SM.onMinus(); break; + case AsmToken::Tilde: SM.onNot(); break; + case AsmToken::Star: SM.onStar(); break; + case AsmToken::Slash: SM.onDivide(); break; + case AsmToken::Pipe: SM.onOr(); break; + case AsmToken::Caret: SM.onXor(); break; + case AsmToken::Amp: SM.onAnd(); break; + case AsmToken::LessLess: + SM.onLShift(); break; + case AsmToken::GreaterGreater: + SM.onRShift(); break; + case AsmToken::LBrac: SM.onLBrac(); break; + case AsmToken::RBrac: SM.onRBrac(); break; + case AsmToken::LParen: SM.onLParen(); break; + case AsmToken::RParen: SM.onRParen(); break; + } + if (SM.hadError()) + return Error(Tok.getLoc(), "unknown token in expression"); + + if (!Done && UpdateLocLex) + End = consumeToken(); + + PrevTK = TK; + } + return false; +} + +std::unique_ptr<X86Operand> +X86AsmParser::ParseIntelBracExpression(unsigned SegReg, SMLoc Start, + int64_t ImmDisp, unsigned Size) { + MCAsmParser &Parser = getParser(); + const AsmToken &Tok = Parser.getTok(); + SMLoc BracLoc = Tok.getLoc(), End = Tok.getEndLoc(); + if (getLexer().isNot(AsmToken::LBrac)) + return ErrorOperand(BracLoc, "Expected '[' token!"); + Parser.Lex(); // Eat '[' + + SMLoc StartInBrac = Tok.getLoc(); + // Parse [ Symbol + ImmDisp ] and [ BaseReg + Scale*IndexReg + ImmDisp ]. We + // may have already parsed an immediate displacement before the bracketed + // expression. 
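+  // For example, for "mov eax, 8[ebx]" the caller has already consumed the 8
+  // and passes it in as ImmDisp; the state machine folds it together with any
+  // displacement that appears inside the brackets.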
+ IntelExprStateMachine SM(ImmDisp, /*StopOnLBrac=*/false, /*AddImmPrefix=*/true); + if (ParseIntelExpression(SM, End)) + return nullptr; + + const MCExpr *Disp = nullptr; + if (const MCExpr *Sym = SM.getSym()) { + // A symbolic displacement. + Disp = Sym; + if (isParsingInlineAsm()) + RewriteIntelBracExpression(*InstInfo->AsmRewrites, SM.getSymName(), + ImmDisp, SM.getImm(), BracLoc, StartInBrac, + End); + } + + if (SM.getImm() || !Disp) { + const MCExpr *Imm = MCConstantExpr::create(SM.getImm(), getContext()); + if (Disp) + Disp = MCBinaryExpr::createAdd(Disp, Imm, getContext()); + else + Disp = Imm; // An immediate displacement only. + } + + // Parse struct field access. Intel requires a dot, but MSVC doesn't. MSVC + // will in fact do global lookup the field name inside all global typedefs, + // but we don't emulate that. + if (Tok.getString().find('.') != StringRef::npos) { + const MCExpr *NewDisp; + if (ParseIntelDotOperator(Disp, NewDisp)) + return nullptr; + + End = Tok.getEndLoc(); + Parser.Lex(); // Eat the field. + Disp = NewDisp; + } + + int BaseReg = SM.getBaseReg(); + int IndexReg = SM.getIndexReg(); + int Scale = SM.getScale(); + if (!isParsingInlineAsm()) { + // handle [-42] + if (!BaseReg && !IndexReg) { + if (!SegReg) + return X86Operand::CreateMem(getPointerWidth(), Disp, Start, End, Size); + return X86Operand::CreateMem(getPointerWidth(), SegReg, Disp, 0, 0, 1, + Start, End, Size); + } + StringRef ErrMsg; + if (CheckBaseRegAndIndexReg(BaseReg, IndexReg, ErrMsg)) { + Error(StartInBrac, ErrMsg); + return nullptr; + } + return X86Operand::CreateMem(getPointerWidth(), SegReg, Disp, BaseReg, + IndexReg, Scale, Start, End, Size); + } + + InlineAsmIdentifierInfo &Info = SM.getIdentifierInfo(); + return CreateMemForInlineAsm(SegReg, Disp, BaseReg, IndexReg, Scale, Start, + End, Size, SM.getSymName(), Info); +} + +// Inline assembly may use variable names with namespace alias qualifiers. +bool X86AsmParser::ParseIntelIdentifier(const MCExpr *&Val, + StringRef &Identifier, + InlineAsmIdentifierInfo &Info, + bool IsUnevaluatedOperand, SMLoc &End) { + MCAsmParser &Parser = getParser(); + assert(isParsingInlineAsm() && "Expected to be parsing inline assembly."); + Val = nullptr; + + StringRef LineBuf(Identifier.data()); + void *Result = + SemaCallback->LookupInlineAsmIdentifier(LineBuf, Info, IsUnevaluatedOperand); + + const AsmToken &Tok = Parser.getTok(); + SMLoc Loc = Tok.getLoc(); + + // Advance the token stream until the end of the current token is + // after the end of what the frontend claimed. + const char *EndPtr = Tok.getLoc().getPointer() + LineBuf.size(); + do { + End = Tok.getEndLoc(); + getLexer().Lex(); + } while (End.getPointer() < EndPtr); + Identifier = LineBuf; + + // The frontend should end parsing on an assembler token boundary, unless it + // failed parsing. + assert((End.getPointer() == EndPtr || !Result) && + "frontend claimed part of a token?"); + + // If the identifier lookup was unsuccessful, assume that we are dealing with + // a label. + if (!Result) { + StringRef InternalName = + SemaCallback->LookupInlineAsmLabel(Identifier, getSourceManager(), + Loc, false); + assert(InternalName.size() && "We should have an internal name here."); + // Push a rewrite for replacing the identifier name with the internal name. + InstInfo->AsmRewrites->emplace_back(AOK_Label, Loc, Identifier.size(), + InternalName); + } + + // Create the symbol reference. 
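+  // The symbol is created from the identifier as written; when the name had to
+  // be treated as a label, the AOK_Label rewrite recorded above renames it in
+  // the rewritten assembly string.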
+ MCSymbol *Sym = getContext().getOrCreateSymbol(Identifier); + MCSymbolRefExpr::VariantKind Variant = MCSymbolRefExpr::VK_None; + Val = MCSymbolRefExpr::create(Sym, Variant, getParser().getContext()); + return false; +} + +/// \brief Parse intel style segment override. +std::unique_ptr<X86Operand> +X86AsmParser::ParseIntelSegmentOverride(unsigned SegReg, SMLoc Start, + unsigned Size) { + MCAsmParser &Parser = getParser(); + assert(SegReg != 0 && "Tried to parse a segment override without a segment!"); + const AsmToken &Tok = Parser.getTok(); // Eat colon. + if (Tok.isNot(AsmToken::Colon)) + return ErrorOperand(Tok.getLoc(), "Expected ':' token!"); + Parser.Lex(); // Eat ':' + + int64_t ImmDisp = 0; + if (getLexer().is(AsmToken::Integer)) { + ImmDisp = Tok.getIntVal(); + AsmToken ImmDispToken = Parser.Lex(); // Eat the integer. + + if (isParsingInlineAsm()) + InstInfo->AsmRewrites->emplace_back(AOK_ImmPrefix, ImmDispToken.getLoc()); + + if (getLexer().isNot(AsmToken::LBrac)) { + // An immediate following a 'segment register', 'colon' token sequence can + // be followed by a bracketed expression. If it isn't we know we have our + // final segment override. + const MCExpr *Disp = MCConstantExpr::create(ImmDisp, getContext()); + return X86Operand::CreateMem(getPointerWidth(), SegReg, Disp, + /*BaseReg=*/0, /*IndexReg=*/0, /*Scale=*/1, + Start, ImmDispToken.getEndLoc(), Size); + } + } + + if (getLexer().is(AsmToken::LBrac)) + return ParseIntelBracExpression(SegReg, Start, ImmDisp, Size); + + const MCExpr *Val; + SMLoc End; + if (!isParsingInlineAsm()) { + if (getParser().parsePrimaryExpr(Val, End)) + return ErrorOperand(Tok.getLoc(), "unknown token in expression"); + + return X86Operand::CreateMem(getPointerWidth(), Val, Start, End, Size); + } + + InlineAsmIdentifierInfo Info; + StringRef Identifier = Tok.getString(); + if (ParseIntelIdentifier(Val, Identifier, Info, + /*Unevaluated=*/false, End)) + return nullptr; + return CreateMemForInlineAsm(/*SegReg=*/0, Val, /*BaseReg=*/0,/*IndexReg=*/0, + /*Scale=*/1, Start, End, Size, Identifier, Info); +} + +//ParseRoundingModeOp - Parse AVX-512 rounding mode operand +std::unique_ptr<X86Operand> +X86AsmParser::ParseRoundingModeOp(SMLoc Start, SMLoc End) { + MCAsmParser &Parser = getParser(); + const AsmToken &Tok = Parser.getTok(); + // Eat "{" and mark the current place. 
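+  // Accepted forms are the AVX-512 embedded-rounding and SAE operands:
+  // {rn-sae}, {rd-sae}, {ru-sae}, {rz-sae}, or a bare {sae}. Anything else is
+  // rejected below.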
+ const SMLoc consumedToken = consumeToken(); + if (Tok.getIdentifier().startswith("r")){ + int rndMode = StringSwitch<int>(Tok.getIdentifier()) + .Case("rn", X86::STATIC_ROUNDING::TO_NEAREST_INT) + .Case("rd", X86::STATIC_ROUNDING::TO_NEG_INF) + .Case("ru", X86::STATIC_ROUNDING::TO_POS_INF) + .Case("rz", X86::STATIC_ROUNDING::TO_ZERO) + .Default(-1); + if (-1 == rndMode) + return ErrorOperand(Tok.getLoc(), "Invalid rounding mode."); + Parser.Lex(); // Eat "r*" of r*-sae + if (!getLexer().is(AsmToken::Minus)) + return ErrorOperand(Tok.getLoc(), "Expected - at this point"); + Parser.Lex(); // Eat "-" + Parser.Lex(); // Eat the sae + if (!getLexer().is(AsmToken::RCurly)) + return ErrorOperand(Tok.getLoc(), "Expected } at this point"); + Parser.Lex(); // Eat "}" + const MCExpr *RndModeOp = + MCConstantExpr::create(rndMode, Parser.getContext()); + return X86Operand::CreateImm(RndModeOp, Start, End); + } + if(Tok.getIdentifier().equals("sae")){ + Parser.Lex(); // Eat the sae + if (!getLexer().is(AsmToken::RCurly)) + return ErrorOperand(Tok.getLoc(), "Expected } at this point"); + Parser.Lex(); // Eat "}" + return X86Operand::CreateToken("{sae}", consumedToken); + } + return ErrorOperand(Tok.getLoc(), "unknown token in expression"); +} +/// ParseIntelMemOperand - Parse intel style memory operand. +std::unique_ptr<X86Operand> X86AsmParser::ParseIntelMemOperand(int64_t ImmDisp, + SMLoc Start, + unsigned Size) { + MCAsmParser &Parser = getParser(); + const AsmToken &Tok = Parser.getTok(); + SMLoc End; + + // Parse ImmDisp [ BaseReg + Scale*IndexReg + Disp ]. + if (getLexer().is(AsmToken::LBrac)) + return ParseIntelBracExpression(/*SegReg=*/0, Start, ImmDisp, Size); + assert(ImmDisp == 0); + + const MCExpr *Val; + if (!isParsingInlineAsm()) { + if (getParser().parsePrimaryExpr(Val, End)) + return ErrorOperand(Tok.getLoc(), "unknown token in expression"); + + return X86Operand::CreateMem(getPointerWidth(), Val, Start, End, Size); + } + + InlineAsmIdentifierInfo Info; + StringRef Identifier = Tok.getString(); + if (ParseIntelIdentifier(Val, Identifier, Info, + /*Unevaluated=*/false, End)) + return nullptr; + + if (!getLexer().is(AsmToken::LBrac)) + return CreateMemForInlineAsm(/*SegReg=*/0, Val, /*BaseReg=*/0, /*IndexReg=*/0, + /*Scale=*/1, Start, End, Size, Identifier, Info); + + Parser.Lex(); // Eat '[' + + // Parse Identifier [ ImmDisp ] + IntelExprStateMachine SM(/*ImmDisp=*/0, /*StopOnLBrac=*/true, + /*AddImmPrefix=*/false); + if (ParseIntelExpression(SM, End)) + return nullptr; + + if (SM.getSym()) { + Error(Start, "cannot use more than one symbol in memory operand"); + return nullptr; + } + if (SM.getBaseReg()) { + Error(Start, "cannot use base register with variable reference"); + return nullptr; + } + if (SM.getIndexReg()) { + Error(Start, "cannot use index register with variable reference"); + return nullptr; + } + + const MCExpr *Disp = MCConstantExpr::create(SM.getImm(), getContext()); + // BaseReg is non-zero to avoid assertions. In the context of inline asm, + // we're pointing to a local variable in memory, so the base register is + // really the frame or stack pointer. + return X86Operand::CreateMem(getPointerWidth(), /*SegReg=*/0, Disp, + /*BaseReg=*/1, /*IndexReg=*/0, /*Scale=*/1, + Start, End, Size, Identifier, Info.OpDecl); +} + +/// Parse the '.' operator. 
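+/// Handles Intel struct field references such as "[ebx].bar" or "foo.bar": the
+/// field offset (a constant, or a field resolved through the frontend's
+/// LookupInlineAsmField() when parsing MS inline assembly) is added to the
+/// existing displacement.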
+bool X86AsmParser::ParseIntelDotOperator(const MCExpr *Disp, + const MCExpr *&NewDisp) { + MCAsmParser &Parser = getParser(); + const AsmToken &Tok = Parser.getTok(); + int64_t OrigDispVal, DotDispVal; + + // FIXME: Handle non-constant expressions. + if (const MCConstantExpr *OrigDisp = dyn_cast<MCConstantExpr>(Disp)) + OrigDispVal = OrigDisp->getValue(); + else + return Error(Tok.getLoc(), "Non-constant offsets are not supported!"); + + // Drop the optional '.'. + StringRef DotDispStr = Tok.getString(); + if (DotDispStr.startswith(".")) + DotDispStr = DotDispStr.drop_front(1); + + // .Imm gets lexed as a real. + if (Tok.is(AsmToken::Real)) { + APInt DotDisp; + DotDispStr.getAsInteger(10, DotDisp); + DotDispVal = DotDisp.getZExtValue(); + } else if (isParsingInlineAsm() && Tok.is(AsmToken::Identifier)) { + unsigned DotDisp; + std::pair<StringRef, StringRef> BaseMember = DotDispStr.split('.'); + if (SemaCallback->LookupInlineAsmField(BaseMember.first, BaseMember.second, + DotDisp)) + return Error(Tok.getLoc(), "Unable to lookup field reference!"); + DotDispVal = DotDisp; + } else + return Error(Tok.getLoc(), "Unexpected token type!"); + + if (isParsingInlineAsm() && Tok.is(AsmToken::Identifier)) { + SMLoc Loc = SMLoc::getFromPointer(DotDispStr.data()); + unsigned Len = DotDispStr.size(); + unsigned Val = OrigDispVal + DotDispVal; + InstInfo->AsmRewrites->emplace_back(AOK_DotOperator, Loc, Len, Val); + } + + NewDisp = MCConstantExpr::create(OrigDispVal + DotDispVal, getContext()); + return false; +} + +/// Parse the 'offset' operator. This operator is used to specify the +/// location rather then the content of a variable. +std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOffsetOfOperator() { + MCAsmParser &Parser = getParser(); + const AsmToken &Tok = Parser.getTok(); + SMLoc OffsetOfLoc = Tok.getLoc(); + Parser.Lex(); // Eat offset. + + const MCExpr *Val; + InlineAsmIdentifierInfo Info; + SMLoc Start = Tok.getLoc(), End; + StringRef Identifier = Tok.getString(); + if (ParseIntelIdentifier(Val, Identifier, Info, + /*Unevaluated=*/false, End)) + return nullptr; + + // Don't emit the offset operator. + InstInfo->AsmRewrites->emplace_back(AOK_Skip, OffsetOfLoc, 7); + + // The offset operator will have an 'r' constraint, thus we need to create + // register operand to ensure proper matching. Just pick a GPR based on + // the size of a pointer. + unsigned RegNo = + is64BitMode() ? X86::RBX : (is32BitMode() ? X86::EBX : X86::BX); + return X86Operand::CreateReg(RegNo, Start, End, /*GetAddress=*/true, + OffsetOfLoc, Identifier, Info.OpDecl); +} + +enum IntelOperatorKind { + IOK_LENGTH, + IOK_SIZE, + IOK_TYPE +}; + +/// Parse the 'LENGTH', 'TYPE' and 'SIZE' operators. The LENGTH operator +/// returns the number of elements in an array. It returns the value 1 for +/// non-array variables. The SIZE operator returns the size of a C or C++ +/// variable. A variable's size is the product of its LENGTH and TYPE. The +/// TYPE operator returns the size of a C or C++ type or variable. If the +/// variable is an array, TYPE returns the size of a single element. +std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperator(unsigned OpKind) { + MCAsmParser &Parser = getParser(); + const AsmToken &Tok = Parser.getTok(); + SMLoc TypeLoc = Tok.getLoc(); + Parser.Lex(); // Eat operator. 
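+  // The identifier is looked up in unevaluated mode: only the frontend's
+  // metadata for it (length, size, type) is needed, because the whole operator
+  // expression is rewritten to a plain immediate below.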
+ + const MCExpr *Val = nullptr; + InlineAsmIdentifierInfo Info; + SMLoc Start = Tok.getLoc(), End; + StringRef Identifier = Tok.getString(); + if (ParseIntelIdentifier(Val, Identifier, Info, + /*Unevaluated=*/true, End)) + return nullptr; + + if (!Info.OpDecl) + return ErrorOperand(Start, "unable to lookup expression"); + + unsigned CVal = 0; + switch(OpKind) { + default: llvm_unreachable("Unexpected operand kind!"); + case IOK_LENGTH: CVal = Info.Length; break; + case IOK_SIZE: CVal = Info.Size; break; + case IOK_TYPE: CVal = Info.Type; break; + } + + // Rewrite the type operator and the C or C++ type or variable in terms of an + // immediate. E.g. TYPE foo -> $$4 + unsigned Len = End.getPointer() - TypeLoc.getPointer(); + InstInfo->AsmRewrites->emplace_back(AOK_Imm, TypeLoc, Len, CVal); + + const MCExpr *Imm = MCConstantExpr::create(CVal, getContext()); + return X86Operand::CreateImm(Imm, Start, End); +} + +std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperand() { + MCAsmParser &Parser = getParser(); + const AsmToken &Tok = Parser.getTok(); + SMLoc Start, End; + + // Offset, length, type and size operators. + if (isParsingInlineAsm()) { + StringRef AsmTokStr = Tok.getString(); + if (AsmTokStr == "offset" || AsmTokStr == "OFFSET") + return ParseIntelOffsetOfOperator(); + if (AsmTokStr == "length" || AsmTokStr == "LENGTH") + return ParseIntelOperator(IOK_LENGTH); + if (AsmTokStr == "size" || AsmTokStr == "SIZE") + return ParseIntelOperator(IOK_SIZE); + if (AsmTokStr == "type" || AsmTokStr == "TYPE") + return ParseIntelOperator(IOK_TYPE); + } + + bool PtrInOperand = false; + unsigned Size = getIntelMemOperandSize(Tok.getString()); + if (Size) { + Parser.Lex(); // Eat operand size (e.g., byte, word). + if (Tok.getString() != "PTR" && Tok.getString() != "ptr") + return ErrorOperand(Tok.getLoc(), "Expected 'PTR' or 'ptr' token!"); + Parser.Lex(); // Eat ptr. + PtrInOperand = true; + } + Start = Tok.getLoc(); + + // Immediate. + if (getLexer().is(AsmToken::Integer) || getLexer().is(AsmToken::Minus) || + getLexer().is(AsmToken::Tilde) || getLexer().is(AsmToken::LParen)) { + AsmToken StartTok = Tok; + IntelExprStateMachine SM(/*Imm=*/0, /*StopOnLBrac=*/true, + /*AddImmPrefix=*/false); + if (ParseIntelExpression(SM, End)) + return nullptr; + + int64_t Imm = SM.getImm(); + if (isParsingInlineAsm()) { + unsigned Len = Tok.getLoc().getPointer() - Start.getPointer(); + if (StartTok.getString().size() == Len) + // Just add a prefix if this wasn't a complex immediate expression. + InstInfo->AsmRewrites->emplace_back(AOK_ImmPrefix, Start); + else + // Otherwise, rewrite the complex expression as a single immediate. + InstInfo->AsmRewrites->emplace_back(AOK_Imm, Start, Len, Imm); + } + + if (getLexer().isNot(AsmToken::LBrac)) { + // If a directional label (ie. 1f or 2b) was parsed above from + // ParseIntelExpression() then SM.getSym() was set to a pointer to + // to the MCExpr with the directional local symbol and this is a + // memory operand not an immediate operand. + if (SM.getSym()) + return X86Operand::CreateMem(getPointerWidth(), SM.getSym(), Start, End, + Size); + + const MCExpr *ImmExpr = MCConstantExpr::create(Imm, getContext()); + return X86Operand::CreateImm(ImmExpr, Start, End); + } + + // Only positive immediates are valid. + if (Imm < 0) + return ErrorOperand(Start, "expected a positive immediate displacement " + "before bracketed expr."); + + // Parse ImmDisp [ BaseReg + Scale*IndexReg + Disp ]. 
+ return ParseIntelMemOperand(Imm, Start, Size); + } + + // rounding mode token + if (getSTI().getFeatureBits()[X86::FeatureAVX512] && + getLexer().is(AsmToken::LCurly)) + return ParseRoundingModeOp(Start, End); + + // Register. + unsigned RegNo = 0; + if (!ParseRegister(RegNo, Start, End)) { + // If this is a segment register followed by a ':', then this is the start + // of a segment override, otherwise this is a normal register reference. + // In case it is a normal register and there is ptr in the operand this + // is an error + if (getLexer().isNot(AsmToken::Colon)){ + if (PtrInOperand){ + return ErrorOperand(Start, "expected memory operand after " + "'ptr', found register operand instead"); + } + return X86Operand::CreateReg(RegNo, Start, End); + } + + return ParseIntelSegmentOverride(/*SegReg=*/RegNo, Start, Size); + } + + // Memory operand. + return ParseIntelMemOperand(/*Disp=*/0, Start, Size); +} + +std::unique_ptr<X86Operand> X86AsmParser::ParseATTOperand() { + MCAsmParser &Parser = getParser(); + switch (getLexer().getKind()) { + default: + // Parse a memory operand with no segment register. + return ParseMemOperand(0, Parser.getTok().getLoc()); + case AsmToken::Percent: { + // Read the register. + unsigned RegNo; + SMLoc Start, End; + if (ParseRegister(RegNo, Start, End)) return nullptr; + if (RegNo == X86::EIZ || RegNo == X86::RIZ) { + Error(Start, "%eiz and %riz can only be used as index registers", + SMRange(Start, End)); + return nullptr; + } + + // If this is a segment register followed by a ':', then this is the start + // of a memory reference, otherwise this is a normal register reference. + if (getLexer().isNot(AsmToken::Colon)) + return X86Operand::CreateReg(RegNo, Start, End); + + if (!X86MCRegisterClasses[X86::SEGMENT_REGRegClassID].contains(RegNo)) + return ErrorOperand(Start, "invalid segment register"); + + getParser().Lex(); // Eat the colon. + return ParseMemOperand(RegNo, Start); + } + case AsmToken::Dollar: { + // $42 -> immediate. + SMLoc Start = Parser.getTok().getLoc(), End; + Parser.Lex(); + const MCExpr *Val; + if (getParser().parseExpression(Val, End)) + return nullptr; + return X86Operand::CreateImm(Val, Start, End); + } + case AsmToken::LCurly:{ + SMLoc Start = Parser.getTok().getLoc(), End; + if (getSTI().getFeatureBits()[X86::FeatureAVX512]) + return ParseRoundingModeOp(Start, End); + return ErrorOperand(Start, "unknown token in expression"); + } + } +} + +bool X86AsmParser::HandleAVX512Operand(OperandVector &Operands, + const MCParsedAsmOperand &Op) { + MCAsmParser &Parser = getParser(); + if(getSTI().getFeatureBits()[X86::FeatureAVX512]) { + if (getLexer().is(AsmToken::LCurly)) { + // Eat "{" and mark the current place. + const SMLoc consumedToken = consumeToken(); + // Distinguish {1to<NUM>} from {%k<NUM>}. + if(getLexer().is(AsmToken::Integer)) { + // Parse memory broadcasting ({1to<NUM>}). + if (getLexer().getTok().getIntVal() != 1) + return !ErrorAndEatStatement(getLexer().getLoc(), + "Expected 1to<NUM> at this point"); + Parser.Lex(); // Eat "1" of 1to8 + if (!getLexer().is(AsmToken::Identifier) || + !getLexer().getTok().getIdentifier().startswith("to")) + return !ErrorAndEatStatement(getLexer().getLoc(), + "Expected 1to<NUM> at this point"); + // Recognize only reasonable suffixes. 
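+        // e.g. "vaddps zmm0, zmm1, dword ptr [rax]{1to16}"; only the element
+        // counts 2, 4, 8 and 16 are accepted as broadcast multipliers.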
+ const char *BroadcastPrimitive = + StringSwitch<const char*>(getLexer().getTok().getIdentifier()) + .Case("to2", "{1to2}") + .Case("to4", "{1to4}") + .Case("to8", "{1to8}") + .Case("to16", "{1to16}") + .Default(nullptr); + if (!BroadcastPrimitive) + return !ErrorAndEatStatement(getLexer().getLoc(), + "Invalid memory broadcast primitive."); + Parser.Lex(); // Eat "toN" of 1toN + if (!getLexer().is(AsmToken::RCurly)) + return !ErrorAndEatStatement(getLexer().getLoc(), + "Expected } at this point"); + Parser.Lex(); // Eat "}" + Operands.push_back(X86Operand::CreateToken(BroadcastPrimitive, + consumedToken)); + // No AVX512 specific primitives can pass + // after memory broadcasting, so return. + return true; + } else { + // Parse mask register {%k1} + Operands.push_back(X86Operand::CreateToken("{", consumedToken)); + if (std::unique_ptr<X86Operand> Op = ParseOperand()) { + Operands.push_back(std::move(Op)); + if (!getLexer().is(AsmToken::RCurly)) + return !ErrorAndEatStatement(getLexer().getLoc(), + "Expected } at this point"); + Operands.push_back(X86Operand::CreateToken("}", consumeToken())); + + // Parse "zeroing non-masked" semantic {z} + if (getLexer().is(AsmToken::LCurly)) { + Operands.push_back(X86Operand::CreateToken("{z}", consumeToken())); + if (!getLexer().is(AsmToken::Identifier) || + getLexer().getTok().getIdentifier() != "z") + return !ErrorAndEatStatement(getLexer().getLoc(), + "Expected z at this point"); + Parser.Lex(); // Eat the z + if (!getLexer().is(AsmToken::RCurly)) + return !ErrorAndEatStatement(getLexer().getLoc(), + "Expected } at this point"); + Parser.Lex(); // Eat the } + } + } + } + } + } + return true; +} + +/// ParseMemOperand: segment: disp(basereg, indexreg, scale). The '%ds:' prefix +/// has already been parsed if present. +std::unique_ptr<X86Operand> X86AsmParser::ParseMemOperand(unsigned SegReg, + SMLoc MemStart) { + + MCAsmParser &Parser = getParser(); + // We have to disambiguate a parenthesized expression "(4+5)" from the start + // of a memory operand with a missing displacement "(%ebx)" or "(,%eax)". The + // only way to do this without lookahead is to eat the '(' and see what is + // after it. + const MCExpr *Disp = MCConstantExpr::create(0, getParser().getContext()); + if (getLexer().isNot(AsmToken::LParen)) { + SMLoc ExprEnd; + if (getParser().parseExpression(Disp, ExprEnd)) return nullptr; + + // After parsing the base expression we could either have a parenthesized + // memory address or not. If not, return now. If so, eat the (. + if (getLexer().isNot(AsmToken::LParen)) { + // Unless we have a segment register, treat this as an immediate. + if (SegReg == 0) + return X86Operand::CreateMem(getPointerWidth(), Disp, MemStart, ExprEnd); + return X86Operand::CreateMem(getPointerWidth(), SegReg, Disp, 0, 0, 1, + MemStart, ExprEnd); + } + + // Eat the '('. + Parser.Lex(); + } else { + // Okay, we have a '('. We don't know if this is an expression or not, but + // so we have to eat the ( to see beyond it. + SMLoc LParenLoc = Parser.getTok().getLoc(); + Parser.Lex(); // Eat the '('. + + if (getLexer().is(AsmToken::Percent) || getLexer().is(AsmToken::Comma)) { + // Nothing to do here, fall into the code below with the '(' part of the + // memory operand consumed. + } else { + SMLoc ExprEnd; + + // It must be an parenthesized expression, parse it now. + if (getParser().parseParenExpression(Disp, ExprEnd)) + return nullptr; + + // After parsing the base expression we could either have a parenthesized + // memory address or not. If not, return now. 
If so, eat the (. + if (getLexer().isNot(AsmToken::LParen)) { + // Unless we have a segment register, treat this as an immediate. + if (SegReg == 0) + return X86Operand::CreateMem(getPointerWidth(), Disp, LParenLoc, + ExprEnd); + return X86Operand::CreateMem(getPointerWidth(), SegReg, Disp, 0, 0, 1, + MemStart, ExprEnd); + } + + // Eat the '('. + Parser.Lex(); + } + } + + // If we reached here, then we just ate the ( of the memory operand. Process + // the rest of the memory operand. + unsigned BaseReg = 0, IndexReg = 0, Scale = 1; + SMLoc IndexLoc, BaseLoc; + + if (getLexer().is(AsmToken::Percent)) { + SMLoc StartLoc, EndLoc; + BaseLoc = Parser.getTok().getLoc(); + if (ParseRegister(BaseReg, StartLoc, EndLoc)) return nullptr; + if (BaseReg == X86::EIZ || BaseReg == X86::RIZ) { + Error(StartLoc, "eiz and riz can only be used as index registers", + SMRange(StartLoc, EndLoc)); + return nullptr; + } + } + + if (getLexer().is(AsmToken::Comma)) { + Parser.Lex(); // Eat the comma. + IndexLoc = Parser.getTok().getLoc(); + + // Following the comma we should have either an index register, or a scale + // value. We don't support the later form, but we want to parse it + // correctly. + // + // Not that even though it would be completely consistent to support syntax + // like "1(%eax,,1)", the assembler doesn't. Use "eiz" or "riz" for this. + if (getLexer().is(AsmToken::Percent)) { + SMLoc L; + if (ParseRegister(IndexReg, L, L)) return nullptr; + + if (getLexer().isNot(AsmToken::RParen)) { + // Parse the scale amount: + // ::= ',' [scale-expression] + if (getLexer().isNot(AsmToken::Comma)) { + Error(Parser.getTok().getLoc(), + "expected comma in scale expression"); + return nullptr; + } + Parser.Lex(); // Eat the comma. + + if (getLexer().isNot(AsmToken::RParen)) { + SMLoc Loc = Parser.getTok().getLoc(); + + int64_t ScaleVal; + if (getParser().parseAbsoluteExpression(ScaleVal)){ + Error(Loc, "expected scale expression"); + return nullptr; + } + + // Validate the scale amount. + if (X86MCRegisterClasses[X86::GR16RegClassID].contains(BaseReg) && + ScaleVal != 1) { + Error(Loc, "scale factor in 16-bit address must be 1"); + return nullptr; + } + if (ScaleVal != 1 && ScaleVal != 2 && ScaleVal != 4 && + ScaleVal != 8) { + Error(Loc, "scale factor in address must be 1, 2, 4 or 8"); + return nullptr; + } + Scale = (unsigned)ScaleVal; + } + } + } else if (getLexer().isNot(AsmToken::RParen)) { + // A scale amount without an index is ignored. + // index. + SMLoc Loc = Parser.getTok().getLoc(); + + int64_t Value; + if (getParser().parseAbsoluteExpression(Value)) + return nullptr; + + if (Value != 1) + Warning(Loc, "scale factor without index register is ignored"); + Scale = 1; + } + } + + // Ok, we've eaten the memory operand, verify we have a ')' and eat it too. + if (getLexer().isNot(AsmToken::RParen)) { + Error(Parser.getTok().getLoc(), "unexpected token in memory operand"); + return nullptr; + } + SMLoc MemEnd = Parser.getTok().getEndLoc(); + Parser.Lex(); // Eat the ')'. + + // Check for use of invalid 16-bit registers. Only BX/BP/SI/DI are allowed, + // and then only in non-64-bit modes. Except for DX, which is a special case + // because an unofficial form of in/out instructions uses it. 
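+  // For example, "(%bx,%si)" and "(%bp,%di)" are accepted outside 64-bit mode,
+  // while "(%cx)", or any 16-bit base register in 64-bit mode, is rejected
+  // here.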
+ if (X86MCRegisterClasses[X86::GR16RegClassID].contains(BaseReg) && + (is64BitMode() || (BaseReg != X86::BX && BaseReg != X86::BP && + BaseReg != X86::SI && BaseReg != X86::DI)) && + BaseReg != X86::DX) { + Error(BaseLoc, "invalid 16-bit base register"); + return nullptr; + } + if (BaseReg == 0 && + X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg)) { + Error(IndexLoc, "16-bit memory operand may not include only index register"); + return nullptr; + } + + StringRef ErrMsg; + if (CheckBaseRegAndIndexReg(BaseReg, IndexReg, ErrMsg)) { + Error(BaseLoc, ErrMsg); + return nullptr; + } + + if (SegReg || BaseReg || IndexReg) + return X86Operand::CreateMem(getPointerWidth(), SegReg, Disp, BaseReg, + IndexReg, Scale, MemStart, MemEnd); + return X86Operand::CreateMem(getPointerWidth(), Disp, MemStart, MemEnd); +} + +bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, + SMLoc NameLoc, OperandVector &Operands) { + MCAsmParser &Parser = getParser(); + InstInfo = &Info; + StringRef PatchedName = Name; + + // FIXME: Hack to recognize setneb as setne. + if (PatchedName.startswith("set") && PatchedName.endswith("b") && + PatchedName != "setb" && PatchedName != "setnb") + PatchedName = PatchedName.substr(0, Name.size()-1); + + // FIXME: Hack to recognize cmp<comparison code>{ss,sd,ps,pd}. + if ((PatchedName.startswith("cmp") || PatchedName.startswith("vcmp")) && + (PatchedName.endswith("ss") || PatchedName.endswith("sd") || + PatchedName.endswith("ps") || PatchedName.endswith("pd"))) { + bool IsVCMP = PatchedName[0] == 'v'; + unsigned CCIdx = IsVCMP ? 4 : 3; + unsigned ComparisonCode = StringSwitch<unsigned>( + PatchedName.slice(CCIdx, PatchedName.size() - 2)) + .Case("eq", 0x00) + .Case("lt", 0x01) + .Case("le", 0x02) + .Case("unord", 0x03) + .Case("neq", 0x04) + .Case("nlt", 0x05) + .Case("nle", 0x06) + .Case("ord", 0x07) + /* AVX only from here */ + .Case("eq_uq", 0x08) + .Case("nge", 0x09) + .Case("ngt", 0x0A) + .Case("false", 0x0B) + .Case("neq_oq", 0x0C) + .Case("ge", 0x0D) + .Case("gt", 0x0E) + .Case("true", 0x0F) + .Case("eq_os", 0x10) + .Case("lt_oq", 0x11) + .Case("le_oq", 0x12) + .Case("unord_s", 0x13) + .Case("neq_us", 0x14) + .Case("nlt_uq", 0x15) + .Case("nle_uq", 0x16) + .Case("ord_s", 0x17) + .Case("eq_us", 0x18) + .Case("nge_uq", 0x19) + .Case("ngt_uq", 0x1A) + .Case("false_os", 0x1B) + .Case("neq_os", 0x1C) + .Case("ge_oq", 0x1D) + .Case("gt_oq", 0x1E) + .Case("true_us", 0x1F) + .Default(~0U); + if (ComparisonCode != ~0U && (IsVCMP || ComparisonCode < 8)) { + + Operands.push_back(X86Operand::CreateToken(PatchedName.slice(0, CCIdx), + NameLoc)); + + const MCExpr *ImmOp = MCConstantExpr::create(ComparisonCode, + getParser().getContext()); + Operands.push_back(X86Operand::CreateImm(ImmOp, NameLoc, NameLoc)); + + PatchedName = PatchedName.substr(PatchedName.size() - 2); + } + } + + // FIXME: Hack to recognize vpcmp<comparison code>{ub,uw,ud,uq,b,w,d,q}. + if (PatchedName.startswith("vpcmp") && + (PatchedName.endswith("b") || PatchedName.endswith("w") || + PatchedName.endswith("d") || PatchedName.endswith("q"))) { + unsigned CCIdx = PatchedName.drop_back().back() == 'u' ? 2 : 1; + unsigned ComparisonCode = StringSwitch<unsigned>( + PatchedName.slice(5, PatchedName.size() - CCIdx)) + .Case("eq", 0x0) // Only allowed on unsigned. Checked below. + .Case("lt", 0x1) + .Case("le", 0x2) + //.Case("false", 0x3) // Not a documented alias. + .Case("neq", 0x4) + .Case("nlt", 0x5) + .Case("nle", 0x6) + //.Case("true", 0x7) // Not a documented alias. 
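+        // A matched spelling is handled as plain "vpcmp" plus the comparison
+        // code as an extra immediate operand, e.g. "vpcmpltud" is treated as
+        // "vpcmpud" with immediate 1.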
+ .Default(~0U); + if (ComparisonCode != ~0U && (ComparisonCode != 0 || CCIdx == 2)) { + Operands.push_back(X86Operand::CreateToken("vpcmp", NameLoc)); + + const MCExpr *ImmOp = MCConstantExpr::create(ComparisonCode, + getParser().getContext()); + Operands.push_back(X86Operand::CreateImm(ImmOp, NameLoc, NameLoc)); + + PatchedName = PatchedName.substr(PatchedName.size() - CCIdx); + } + } + + // FIXME: Hack to recognize vpcom<comparison code>{ub,uw,ud,uq,b,w,d,q}. + if (PatchedName.startswith("vpcom") && + (PatchedName.endswith("b") || PatchedName.endswith("w") || + PatchedName.endswith("d") || PatchedName.endswith("q"))) { + unsigned CCIdx = PatchedName.drop_back().back() == 'u' ? 2 : 1; + unsigned ComparisonCode = StringSwitch<unsigned>( + PatchedName.slice(5, PatchedName.size() - CCIdx)) + .Case("lt", 0x0) + .Case("le", 0x1) + .Case("gt", 0x2) + .Case("ge", 0x3) + .Case("eq", 0x4) + .Case("neq", 0x5) + .Case("false", 0x6) + .Case("true", 0x7) + .Default(~0U); + if (ComparisonCode != ~0U) { + Operands.push_back(X86Operand::CreateToken("vpcom", NameLoc)); + + const MCExpr *ImmOp = MCConstantExpr::create(ComparisonCode, + getParser().getContext()); + Operands.push_back(X86Operand::CreateImm(ImmOp, NameLoc, NameLoc)); + + PatchedName = PatchedName.substr(PatchedName.size() - CCIdx); + } + } + + Operands.push_back(X86Operand::CreateToken(PatchedName, NameLoc)); + + // Determine whether this is an instruction prefix. + bool isPrefix = + Name == "lock" || Name == "rep" || + Name == "repe" || Name == "repz" || + Name == "repne" || Name == "repnz" || + Name == "rex64" || Name == "data16"; + + // This does the actual operand parsing. Don't parse any more if we have a + // prefix juxtaposed with an operation like "lock incl 4(%rax)", because we + // just want to parse the "lock" as the first instruction and the "incl" as + // the next one. + if (getLexer().isNot(AsmToken::EndOfStatement) && !isPrefix) { + + // Parse '*' modifier. + if (getLexer().is(AsmToken::Star)) + Operands.push_back(X86Operand::CreateToken("*", consumeToken())); + + // Read the operands. + while(1) { + if (std::unique_ptr<X86Operand> Op = ParseOperand()) { + Operands.push_back(std::move(Op)); + if (!HandleAVX512Operand(Operands, *Operands.back())) + return true; + } else { + Parser.eatToEndOfStatement(); + return true; + } + // check for comma and eat it + if (getLexer().is(AsmToken::Comma)) + Parser.Lex(); + else + break; + } + + if (getLexer().isNot(AsmToken::EndOfStatement)) + return ErrorAndEatStatement(getLexer().getLoc(), + "unexpected token in argument list"); + } + + // Consume the EndOfStatement or the prefix separator Slash + if (getLexer().is(AsmToken::EndOfStatement) || + (isPrefix && getLexer().is(AsmToken::Slash))) + Parser.Lex(); + + // This is for gas compatibility and cannot be done in td. + // Adding "p" for some floating point with no argument. + // For example: fsub --> fsubp + bool IsFp = + Name == "fsub" || Name == "fdiv" || Name == "fsubr" || Name == "fdivr"; + if (IsFp && Operands.size() == 1) { + const char *Repl = StringSwitch<const char *>(Name) + .Case("fsub", "fsubp") + .Case("fdiv", "fdivp") + .Case("fsubr", "fsubrp") + .Case("fdivr", "fdivrp"); + static_cast<X86Operand &>(*Operands[0]).setTokenValue(Repl); + } + + // This is a terrible hack to handle "out[bwl]? %al, (%dx)" -> + // "outb %al, %dx". Out doesn't take a memory form, but this is a widely + // documented form in various unofficial manuals, so a lot of code uses it. 
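+  // The rewrite below only fires when the operand really is a bare (%dx): no
+  // segment, no index register and a zero displacement. The memory operand is
+  // then replaced by a plain DX register operand; "in" gets the same treatment.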
+ if ((Name == "outb" || Name == "outw" || Name == "outl" || Name == "out") && + Operands.size() == 3) { + X86Operand &Op = (X86Operand &)*Operands.back(); + if (Op.isMem() && Op.Mem.SegReg == 0 && + isa<MCConstantExpr>(Op.Mem.Disp) && + cast<MCConstantExpr>(Op.Mem.Disp)->getValue() == 0 && + Op.Mem.BaseReg == MatchRegisterName("dx") && Op.Mem.IndexReg == 0) { + SMLoc Loc = Op.getEndLoc(); + Operands.back() = X86Operand::CreateReg(Op.Mem.BaseReg, Loc, Loc); + } + } + // Same hack for "in[bwl]? (%dx), %al" -> "inb %dx, %al". + if ((Name == "inb" || Name == "inw" || Name == "inl" || Name == "in") && + Operands.size() == 3) { + X86Operand &Op = (X86Operand &)*Operands[1]; + if (Op.isMem() && Op.Mem.SegReg == 0 && + isa<MCConstantExpr>(Op.Mem.Disp) && + cast<MCConstantExpr>(Op.Mem.Disp)->getValue() == 0 && + Op.Mem.BaseReg == MatchRegisterName("dx") && Op.Mem.IndexReg == 0) { + SMLoc Loc = Op.getEndLoc(); + Operands[1] = X86Operand::CreateReg(Op.Mem.BaseReg, Loc, Loc); + } + } + + // Append default arguments to "ins[bwld]" + if (Name.startswith("ins") && Operands.size() == 1 && + (Name == "insb" || Name == "insw" || Name == "insl" || Name == "insd")) { + AddDefaultSrcDestOperands(Operands, + X86Operand::CreateReg(X86::DX, NameLoc, NameLoc), + DefaultMemDIOperand(NameLoc)); + } + + // Append default arguments to "outs[bwld]" + if (Name.startswith("outs") && Operands.size() == 1 && + (Name == "outsb" || Name == "outsw" || Name == "outsl" || + Name == "outsd" )) { + AddDefaultSrcDestOperands(Operands, + DefaultMemSIOperand(NameLoc), + X86Operand::CreateReg(X86::DX, NameLoc, NameLoc)); + } + + // Transform "lods[bwlq]" into "lods[bwlq] ($SIREG)" for appropriate + // values of $SIREG according to the mode. It would be nice if this + // could be achieved with InstAlias in the tables. + if (Name.startswith("lods") && Operands.size() == 1 && + (Name == "lods" || Name == "lodsb" || Name == "lodsw" || + Name == "lodsl" || Name == "lodsd" || Name == "lodsq")) + Operands.push_back(DefaultMemSIOperand(NameLoc)); + + // Transform "stos[bwlq]" into "stos[bwlq] ($DIREG)" for appropriate + // values of $DIREG according to the mode. It would be nice if this + // could be achieved with InstAlias in the tables. + if (Name.startswith("stos") && Operands.size() == 1 && + (Name == "stos" || Name == "stosb" || Name == "stosw" || + Name == "stosl" || Name == "stosd" || Name == "stosq")) + Operands.push_back(DefaultMemDIOperand(NameLoc)); + + // Transform "scas[bwlq]" into "scas[bwlq] ($DIREG)" for appropriate + // values of $DIREG according to the mode. It would be nice if this + // could be achieved with InstAlias in the tables. + if (Name.startswith("scas") && Operands.size() == 1 && + (Name == "scas" || Name == "scasb" || Name == "scasw" || + Name == "scasl" || Name == "scasd" || Name == "scasq")) + Operands.push_back(DefaultMemDIOperand(NameLoc)); + + // Add default SI and DI operands to "cmps[bwlq]". + if (Name.startswith("cmps") && + (Name == "cmps" || Name == "cmpsb" || Name == "cmpsw" || + Name == "cmpsl" || Name == "cmpsd" || Name == "cmpsq")) { + if (Operands.size() == 1) { + AddDefaultSrcDestOperands(Operands, + DefaultMemDIOperand(NameLoc), + DefaultMemSIOperand(NameLoc)); + } else if (Operands.size() == 3) { + X86Operand &Op = (X86Operand &)*Operands[1]; + X86Operand &Op2 = (X86Operand &)*Operands[2]; + if (!doSrcDstMatch(Op, Op2)) + return Error(Op.getStartLoc(), + "mismatching source and destination index registers"); + } + } + + // Add default SI and DI operands to "movs[bwlq]". 
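+  // For example, a bare "movsb" gets (%esi)/(%edi) appended in 32-bit mode (or
+  // the 16/64-bit equivalents), and a bare "movsd" is treated as the string
+  // move, rewritten to "movsl", rather than the SSE2 instruction of that name.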
+ if ((Name.startswith("movs") && + (Name == "movs" || Name == "movsb" || Name == "movsw" || + Name == "movsl" || Name == "movsd" || Name == "movsq")) || + (Name.startswith("smov") && + (Name == "smov" || Name == "smovb" || Name == "smovw" || + Name == "smovl" || Name == "smovd" || Name == "smovq"))) { + if (Operands.size() == 1) { + if (Name == "movsd") + Operands.back() = X86Operand::CreateToken("movsl", NameLoc); + AddDefaultSrcDestOperands(Operands, + DefaultMemSIOperand(NameLoc), + DefaultMemDIOperand(NameLoc)); + } else if (Operands.size() == 3) { + X86Operand &Op = (X86Operand &)*Operands[1]; + X86Operand &Op2 = (X86Operand &)*Operands[2]; + if (!doSrcDstMatch(Op, Op2)) + return Error(Op.getStartLoc(), + "mismatching source and destination index registers"); + } + } + + // FIXME: Hack to handle recognize s{hr,ar,hl} $1, <op>. Canonicalize to + // "shift <op>". + if ((Name.startswith("shr") || Name.startswith("sar") || + Name.startswith("shl") || Name.startswith("sal") || + Name.startswith("rcl") || Name.startswith("rcr") || + Name.startswith("rol") || Name.startswith("ror")) && + Operands.size() == 3) { + if (isParsingIntelSyntax()) { + // Intel syntax + X86Operand &Op1 = static_cast<X86Operand &>(*Operands[2]); + if (Op1.isImm() && isa<MCConstantExpr>(Op1.getImm()) && + cast<MCConstantExpr>(Op1.getImm())->getValue() == 1) + Operands.pop_back(); + } else { + X86Operand &Op1 = static_cast<X86Operand &>(*Operands[1]); + if (Op1.isImm() && isa<MCConstantExpr>(Op1.getImm()) && + cast<MCConstantExpr>(Op1.getImm())->getValue() == 1) + Operands.erase(Operands.begin() + 1); + } + } + + // Transforms "int $3" into "int3" as a size optimization. We can't write an + // instalias with an immediate operand yet. + if (Name == "int" && Operands.size() == 2) { + X86Operand &Op1 = static_cast<X86Operand &>(*Operands[1]); + if (Op1.isImm()) + if (auto *CE = dyn_cast<MCConstantExpr>(Op1.getImm())) + if (CE->getValue() == 3) { + Operands.erase(Operands.begin() + 1); + static_cast<X86Operand &>(*Operands[0]).setTokenValue("int3"); + } + } + + return false; +} + +bool X86AsmParser::processInstruction(MCInst &Inst, const OperandVector &Ops) { + switch (Inst.getOpcode()) { + default: return false; + case X86::VMOVZPQILo2PQIrr: + case X86::VMOVAPDrr: + case X86::VMOVAPDYrr: + case X86::VMOVAPSrr: + case X86::VMOVAPSYrr: + case X86::VMOVDQArr: + case X86::VMOVDQAYrr: + case X86::VMOVDQUrr: + case X86::VMOVDQUYrr: + case X86::VMOVUPDrr: + case X86::VMOVUPDYrr: + case X86::VMOVUPSrr: + case X86::VMOVUPSYrr: { + if (X86II::isX86_64ExtendedReg(Inst.getOperand(0).getReg()) || + !X86II::isX86_64ExtendedReg(Inst.getOperand(1).getReg())) + return false; + + unsigned NewOpc; + switch (Inst.getOpcode()) { + default: llvm_unreachable("Invalid opcode"); + case X86::VMOVZPQILo2PQIrr: NewOpc = X86::VMOVPQI2QIrr; break; + case X86::VMOVAPDrr: NewOpc = X86::VMOVAPDrr_REV; break; + case X86::VMOVAPDYrr: NewOpc = X86::VMOVAPDYrr_REV; break; + case X86::VMOVAPSrr: NewOpc = X86::VMOVAPSrr_REV; break; + case X86::VMOVAPSYrr: NewOpc = X86::VMOVAPSYrr_REV; break; + case X86::VMOVDQArr: NewOpc = X86::VMOVDQArr_REV; break; + case X86::VMOVDQAYrr: NewOpc = X86::VMOVDQAYrr_REV; break; + case X86::VMOVDQUrr: NewOpc = X86::VMOVDQUrr_REV; break; + case X86::VMOVDQUYrr: NewOpc = X86::VMOVDQUYrr_REV; break; + case X86::VMOVUPDrr: NewOpc = X86::VMOVUPDrr_REV; break; + case X86::VMOVUPDYrr: NewOpc = X86::VMOVUPDYrr_REV; break; + case X86::VMOVUPSrr: NewOpc = X86::VMOVUPSrr_REV; break; + case X86::VMOVUPSYrr: NewOpc = X86::VMOVUPSYrr_REV; 
break; + } + Inst.setOpcode(NewOpc); + return true; + } + case X86::VMOVSDrr: + case X86::VMOVSSrr: { + if (X86II::isX86_64ExtendedReg(Inst.getOperand(0).getReg()) || + !X86II::isX86_64ExtendedReg(Inst.getOperand(2).getReg())) + return false; + unsigned NewOpc; + switch (Inst.getOpcode()) { + default: llvm_unreachable("Invalid opcode"); + case X86::VMOVSDrr: NewOpc = X86::VMOVSDrr_REV; break; + case X86::VMOVSSrr: NewOpc = X86::VMOVSSrr_REV; break; + } + Inst.setOpcode(NewOpc); + return true; + } + } +} + +static const char *getSubtargetFeatureName(uint64_t Val); + +void X86AsmParser::EmitInstruction(MCInst &Inst, OperandVector &Operands, + MCStreamer &Out) { + Instrumentation->InstrumentAndEmitInstruction(Inst, Operands, getContext(), + MII, Out); +} + +bool X86AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, + OperandVector &Operands, + MCStreamer &Out, uint64_t &ErrorInfo, + bool MatchingInlineAsm) { + if (isParsingIntelSyntax()) + return MatchAndEmitIntelInstruction(IDLoc, Opcode, Operands, Out, ErrorInfo, + MatchingInlineAsm); + return MatchAndEmitATTInstruction(IDLoc, Opcode, Operands, Out, ErrorInfo, + MatchingInlineAsm); +} + +void X86AsmParser::MatchFPUWaitAlias(SMLoc IDLoc, X86Operand &Op, + OperandVector &Operands, MCStreamer &Out, + bool MatchingInlineAsm) { + // FIXME: This should be replaced with a real .td file alias mechanism. + // Also, MatchInstructionImpl should actually *do* the EmitInstruction + // call. + const char *Repl = StringSwitch<const char *>(Op.getToken()) + .Case("finit", "fninit") + .Case("fsave", "fnsave") + .Case("fstcw", "fnstcw") + .Case("fstcww", "fnstcw") + .Case("fstenv", "fnstenv") + .Case("fstsw", "fnstsw") + .Case("fstsww", "fnstsw") + .Case("fclex", "fnclex") + .Default(nullptr); + if (Repl) { + MCInst Inst; + Inst.setOpcode(X86::WAIT); + Inst.setLoc(IDLoc); + if (!MatchingInlineAsm) + EmitInstruction(Inst, Operands, Out); + Operands[0] = X86Operand::CreateToken(Repl, IDLoc); + } +} + +bool X86AsmParser::ErrorMissingFeature(SMLoc IDLoc, uint64_t ErrorInfo, + bool MatchingInlineAsm) { + assert(ErrorInfo && "Unknown missing feature!"); + ArrayRef<SMRange> EmptyRanges = None; + SmallString<126> Msg; + raw_svector_ostream OS(Msg); + OS << "instruction requires:"; + uint64_t Mask = 1; + for (unsigned i = 0; i < (sizeof(ErrorInfo)*8-1); ++i) { + if (ErrorInfo & Mask) + OS << ' ' << getSubtargetFeatureName(ErrorInfo & Mask); + Mask <<= 1; + } + return Error(IDLoc, OS.str(), EmptyRanges, MatchingInlineAsm); +} + +bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode, + OperandVector &Operands, + MCStreamer &Out, + uint64_t &ErrorInfo, + bool MatchingInlineAsm) { + assert(!Operands.empty() && "Unexpect empty operand list!"); + X86Operand &Op = static_cast<X86Operand &>(*Operands[0]); + assert(Op.isToken() && "Leading operand should always be a mnemonic!"); + ArrayRef<SMRange> EmptyRanges = None; + + // First, handle aliases that expand to multiple instructions. + MatchFPUWaitAlias(IDLoc, Op, Operands, Out, MatchingInlineAsm); + + bool WasOriginallyInvalidOperand = false; + MCInst Inst; + + // First, try a direct match. + switch (MatchInstructionImpl(Operands, Inst, + ErrorInfo, MatchingInlineAsm, + isParsingIntelSyntax())) { + default: llvm_unreachable("Unexpected match result!"); + case Match_Success: + // Some instructions need post-processing to, for example, tweak which + // encoding is selected. Loop on it while changes happen so the + // individual transformations can chain off each other. 
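+ // When only matching (as for inline assembly), the caller just needs the
+ // resulting opcode, so the rewriting loop and the emission below are
+ // skipped.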
+ if (!MatchingInlineAsm) + while (processInstruction(Inst, Operands)) + ; + + Inst.setLoc(IDLoc); + if (!MatchingInlineAsm) + EmitInstruction(Inst, Operands, Out); + Opcode = Inst.getOpcode(); + return false; + case Match_MissingFeature: + return ErrorMissingFeature(IDLoc, ErrorInfo, MatchingInlineAsm); + case Match_InvalidOperand: + WasOriginallyInvalidOperand = true; + break; + case Match_MnemonicFail: + break; + } + + // FIXME: Ideally, we would only attempt suffix matches for things which are + // valid prefixes, and we could just infer the right unambiguous + // type. However, that requires substantially more matcher support than the + // following hack. + + // Change the operand to point to a temporary token. + StringRef Base = Op.getToken(); + SmallString<16> Tmp; + Tmp += Base; + Tmp += ' '; + Op.setTokenValue(Tmp); + + // If this instruction starts with an 'f', then it is a floating point stack + // instruction. These come in up to three forms for 32-bit, 64-bit, and + // 80-bit floating point, which use the suffixes s,l,t respectively. + // + // Otherwise, we assume that this may be an integer instruction, which comes + // in 8/16/32/64-bit forms using the b,w,l,q suffixes respectively. + const char *Suffixes = Base[0] != 'f' ? "bwlq" : "slt\0"; + + // Check for the various suffix matches. + uint64_t ErrorInfoIgnore; + uint64_t ErrorInfoMissingFeature = 0; // Init suppresses compiler warnings. + unsigned Match[4]; + + for (unsigned I = 0, E = array_lengthof(Match); I != E; ++I) { + Tmp.back() = Suffixes[I]; + Match[I] = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore, + MatchingInlineAsm, isParsingIntelSyntax()); + // If this returned as a missing feature failure, remember that. + if (Match[I] == Match_MissingFeature) + ErrorInfoMissingFeature = ErrorInfoIgnore; + } + + // Restore the old token. + Op.setTokenValue(Base); + + // If exactly one matched, then we treat that as a successful match (and the + // instruction will already have been filled in correctly, since the failing + // matches won't have modified it). + unsigned NumSuccessfulMatches = + std::count(std::begin(Match), std::end(Match), Match_Success); + if (NumSuccessfulMatches == 1) { + Inst.setLoc(IDLoc); + if (!MatchingInlineAsm) + EmitInstruction(Inst, Operands, Out); + Opcode = Inst.getOpcode(); + return false; + } + + // Otherwise, the match failed, try to produce a decent error message. + + // If we had multiple suffix matches, then identify this as an ambiguous + // match. + if (NumSuccessfulMatches > 1) { + char MatchChars[4]; + unsigned NumMatches = 0; + for (unsigned I = 0, E = array_lengthof(Match); I != E; ++I) + if (Match[I] == Match_Success) + MatchChars[NumMatches++] = Suffixes[I]; + + SmallString<126> Msg; + raw_svector_ostream OS(Msg); + OS << "ambiguous instructions require an explicit suffix (could be "; + for (unsigned i = 0; i != NumMatches; ++i) { + if (i != 0) + OS << ", "; + if (i + 1 == NumMatches) + OS << "or "; + OS << "'" << Base << MatchChars[i] << "'"; + } + OS << ")"; + Error(IDLoc, OS.str(), EmptyRanges, MatchingInlineAsm); + return true; + } + + // Okay, we know that none of the variants matched successfully. + + // If all of the instructions reported an invalid mnemonic, then the original + // mnemonic was invalid. + if (std::count(std::begin(Match), std::end(Match), Match_MnemonicFail) == 4) { + if (!WasOriginallyInvalidOperand) { + ArrayRef<SMRange> Ranges = + MatchingInlineAsm ? 
EmptyRanges : Op.getLocRange(); + return Error(IDLoc, "invalid instruction mnemonic '" + Base + "'", + Ranges, MatchingInlineAsm); + } + + // Recover location info for the operand if we know which was the problem. + if (ErrorInfo != ~0ULL) { + if (ErrorInfo >= Operands.size()) + return Error(IDLoc, "too few operands for instruction", + EmptyRanges, MatchingInlineAsm); + + X86Operand &Operand = (X86Operand &)*Operands[ErrorInfo]; + if (Operand.getStartLoc().isValid()) { + SMRange OperandRange = Operand.getLocRange(); + return Error(Operand.getStartLoc(), "invalid operand for instruction", + OperandRange, MatchingInlineAsm); + } + } + + return Error(IDLoc, "invalid operand for instruction", EmptyRanges, + MatchingInlineAsm); + } + + // If one instruction matched with a missing feature, report this as a + // missing feature. + if (std::count(std::begin(Match), std::end(Match), + Match_MissingFeature) == 1) { + ErrorInfo = ErrorInfoMissingFeature; + return ErrorMissingFeature(IDLoc, ErrorInfoMissingFeature, + MatchingInlineAsm); + } + + // If one instruction matched with an invalid operand, report this as an + // operand failure. + if (std::count(std::begin(Match), std::end(Match), + Match_InvalidOperand) == 1) { + return Error(IDLoc, "invalid operand for instruction", EmptyRanges, + MatchingInlineAsm); + } + + // If all of these were an outright failure, report it in a useless way. + Error(IDLoc, "unknown use of instruction mnemonic without a size suffix", + EmptyRanges, MatchingInlineAsm); + return true; +} + +bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode, + OperandVector &Operands, + MCStreamer &Out, + uint64_t &ErrorInfo, + bool MatchingInlineAsm) { + assert(!Operands.empty() && "Unexpect empty operand list!"); + X86Operand &Op = static_cast<X86Operand &>(*Operands[0]); + assert(Op.isToken() && "Leading operand should always be a mnemonic!"); + StringRef Mnemonic = Op.getToken(); + ArrayRef<SMRange> EmptyRanges = None; + + // First, handle aliases that expand to multiple instructions. + MatchFPUWaitAlias(IDLoc, Op, Operands, Out, MatchingInlineAsm); + + MCInst Inst; + + // Find one unsized memory operand, if present. + X86Operand *UnsizedMemOp = nullptr; + for (const auto &Op : Operands) { + X86Operand *X86Op = static_cast<X86Operand *>(Op.get()); + if (X86Op->isMemUnsized()) + UnsizedMemOp = X86Op; + } + + // Allow some instructions to have implicitly pointer-sized operands. This is + // compatible with gas. + if (UnsizedMemOp) { + static const char *const PtrSizedInstrs[] = {"call", "jmp", "push"}; + for (const char *Instr : PtrSizedInstrs) { + if (Mnemonic == Instr) { + UnsizedMemOp->Mem.Size = getPointerWidth(); + break; + } + } + } + + // If an unsized memory operand is present, try to match with each memory + // operand size. In Intel assembly, the size is not part of the instruction + // mnemonic. + SmallVector<unsigned, 8> Match; + uint64_t ErrorInfoMissingFeature = 0; + if (UnsizedMemOp && UnsizedMemOp->isMemUnsized()) { + static const unsigned MopSizes[] = {8, 16, 32, 64, 80, 128, 256, 512}; + for (unsigned Size : MopSizes) { + UnsizedMemOp->Mem.Size = Size; + uint64_t ErrorInfoIgnore; + unsigned LastOpcode = Inst.getOpcode(); + unsigned M = + MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore, + MatchingInlineAsm, isParsingIntelSyntax()); + if (Match.empty() || LastOpcode != Inst.getOpcode()) + Match.push_back(M); + + // If this returned as a missing feature failure, remember that. 
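+ // (Only the most recent ErrorInfo is kept; a single value is enough to
+ // build the missing-feature diagnostic later on.)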
+ if (Match.back() == Match_MissingFeature) + ErrorInfoMissingFeature = ErrorInfoIgnore; + } + + // Restore the size of the unsized memory operand if we modified it. + if (UnsizedMemOp) + UnsizedMemOp->Mem.Size = 0; + } + + // If we haven't matched anything yet, this is not a basic integer or FPU + // operation. There shouldn't be any ambiguity in our mnemonic table, so try + // matching with the unsized operand. + if (Match.empty()) { + Match.push_back(MatchInstructionImpl(Operands, Inst, ErrorInfo, + MatchingInlineAsm, + isParsingIntelSyntax())); + // If this returned as a missing feature failure, remember that. + if (Match.back() == Match_MissingFeature) + ErrorInfoMissingFeature = ErrorInfo; + } + + // Restore the size of the unsized memory operand if we modified it. + if (UnsizedMemOp) + UnsizedMemOp->Mem.Size = 0; + + // If it's a bad mnemonic, all results will be the same. + if (Match.back() == Match_MnemonicFail) { + ArrayRef<SMRange> Ranges = + MatchingInlineAsm ? EmptyRanges : Op.getLocRange(); + return Error(IDLoc, "invalid instruction mnemonic '" + Mnemonic + "'", + Ranges, MatchingInlineAsm); + } + + // If exactly one matched, then we treat that as a successful match (and the + // instruction will already have been filled in correctly, since the failing + // matches won't have modified it). + unsigned NumSuccessfulMatches = + std::count(std::begin(Match), std::end(Match), Match_Success); + if (NumSuccessfulMatches == 1) { + // Some instructions need post-processing to, for example, tweak which + // encoding is selected. Loop on it while changes happen so the individual + // transformations can chain off each other. + if (!MatchingInlineAsm) + while (processInstruction(Inst, Operands)) + ; + Inst.setLoc(IDLoc); + if (!MatchingInlineAsm) + EmitInstruction(Inst, Operands, Out); + Opcode = Inst.getOpcode(); + return false; + } else if (NumSuccessfulMatches > 1) { + assert(UnsizedMemOp && + "multiple matches only possible with unsized memory operands"); + ArrayRef<SMRange> Ranges = + MatchingInlineAsm ? EmptyRanges : UnsizedMemOp->getLocRange(); + return Error(UnsizedMemOp->getStartLoc(), + "ambiguous operand size for instruction '" + Mnemonic + "\'", + Ranges, MatchingInlineAsm); + } + + // If one instruction matched with a missing feature, report this as a + // missing feature. + if (std::count(std::begin(Match), std::end(Match), + Match_MissingFeature) == 1) { + ErrorInfo = ErrorInfoMissingFeature; + return ErrorMissingFeature(IDLoc, ErrorInfoMissingFeature, + MatchingInlineAsm); + } + + // If one instruction matched with an invalid operand, report this as an + // operand failure. + if (std::count(std::begin(Match), std::end(Match), + Match_InvalidOperand) == 1) { + return Error(IDLoc, "invalid operand for instruction", EmptyRanges, + MatchingInlineAsm); + } + + // If all of these were an outright failure, report it in a useless way. 
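+ // (None of the per-size attempts produced a single classifiable failure,
+ // so there is nothing more specific to point at.)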
+ return Error(IDLoc, "unknown instruction mnemonic", EmptyRanges, + MatchingInlineAsm); +} + +bool X86AsmParser::OmitRegisterFromClobberLists(unsigned RegNo) { + return X86MCRegisterClasses[X86::SEGMENT_REGRegClassID].contains(RegNo); +} + +bool X86AsmParser::ParseDirective(AsmToken DirectiveID) { + MCAsmParser &Parser = getParser(); + StringRef IDVal = DirectiveID.getIdentifier(); + if (IDVal == ".word") + return ParseDirectiveWord(2, DirectiveID.getLoc()); + else if (IDVal.startswith(".code")) + return ParseDirectiveCode(IDVal, DirectiveID.getLoc()); + else if (IDVal.startswith(".att_syntax")) { + if (getLexer().isNot(AsmToken::EndOfStatement)) { + if (Parser.getTok().getString() == "prefix") + Parser.Lex(); + else if (Parser.getTok().getString() == "noprefix") + return Error(DirectiveID.getLoc(), "'.att_syntax noprefix' is not " + "supported: registers must have a " + "'%' prefix in .att_syntax"); + } + getParser().setAssemblerDialect(0); + return false; + } else if (IDVal.startswith(".intel_syntax")) { + getParser().setAssemblerDialect(1); + if (getLexer().isNot(AsmToken::EndOfStatement)) { + if (Parser.getTok().getString() == "noprefix") + Parser.Lex(); + else if (Parser.getTok().getString() == "prefix") + return Error(DirectiveID.getLoc(), "'.intel_syntax prefix' is not " + "supported: registers must not have " + "a '%' prefix in .intel_syntax"); + } + return false; + } else if (IDVal == ".even") + return parseDirectiveEven(DirectiveID.getLoc()); + return true; +} + +/// parseDirectiveEven +/// ::= .even +bool X86AsmParser::parseDirectiveEven(SMLoc L) { + const MCSection *Section = getStreamer().getCurrentSection().first; + if (getLexer().isNot(AsmToken::EndOfStatement)) { + TokError("unexpected token in directive"); + return false; + } + if (!Section) { + getStreamer().InitSections(false); + Section = getStreamer().getCurrentSection().first; + } + if (Section->UseCodeAlign()) + getStreamer().EmitCodeAlignment(2, 0); + else + getStreamer().EmitValueToAlignment(2, 0, 1, 0); + return false; +} +/// ParseDirectiveWord +/// ::= .word [ expression (, expression)* ] +bool X86AsmParser::ParseDirectiveWord(unsigned Size, SMLoc L) { + MCAsmParser &Parser = getParser(); + if (getLexer().isNot(AsmToken::EndOfStatement)) { + for (;;) { + const MCExpr *Value; + SMLoc ExprLoc = getLexer().getLoc(); + if (getParser().parseExpression(Value)) + return false; + + if (const auto *MCE = dyn_cast<MCConstantExpr>(Value)) { + assert(Size <= 8 && "Invalid size"); + uint64_t IntValue = MCE->getValue(); + if (!isUIntN(8 * Size, IntValue) && !isIntN(8 * Size, IntValue)) + return Error(ExprLoc, "literal value out of range for directive"); + getStreamer().EmitIntValue(IntValue, Size); + } else { + getStreamer().EmitValue(Value, Size, ExprLoc); + } + + if (getLexer().is(AsmToken::EndOfStatement)) + break; + + // FIXME: Improve diagnostic. 
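+ // Anything other than a comma between expressions is currently reported
+ // at the directive's location rather than at the offending token.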
+ if (getLexer().isNot(AsmToken::Comma)) { + Error(L, "unexpected token in directive"); + return false; + } + Parser.Lex(); + } + } + + Parser.Lex(); + return false; +} + +/// ParseDirectiveCode +/// ::= .code16 | .code32 | .code64 +bool X86AsmParser::ParseDirectiveCode(StringRef IDVal, SMLoc L) { + MCAsmParser &Parser = getParser(); + if (IDVal == ".code16") { + Parser.Lex(); + if (!is16BitMode()) { + SwitchMode(X86::Mode16Bit); + getParser().getStreamer().EmitAssemblerFlag(MCAF_Code16); + } + } else if (IDVal == ".code32") { + Parser.Lex(); + if (!is32BitMode()) { + SwitchMode(X86::Mode32Bit); + getParser().getStreamer().EmitAssemblerFlag(MCAF_Code32); + } + } else if (IDVal == ".code64") { + Parser.Lex(); + if (!is64BitMode()) { + SwitchMode(X86::Mode64Bit); + getParser().getStreamer().EmitAssemblerFlag(MCAF_Code64); + } + } else { + Error(L, "unknown directive " + IDVal); + return false; + } + + return false; +} + +// Force static initialization. +extern "C" void LLVMInitializeX86AsmParser() { + RegisterMCAsmParser<X86AsmParser> X(TheX86_32Target); + RegisterMCAsmParser<X86AsmParser> Y(TheX86_64Target); +} + +#define GET_REGISTER_MATCHER +#define GET_MATCHER_IMPLEMENTATION +#define GET_SUBTARGET_FEATURE_NAME +#include "X86GenAsmMatcher.inc" diff --git a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParserCommon.h b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParserCommon.h new file mode 100644 index 0000000..54538c8 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParserCommon.h @@ -0,0 +1,39 @@ +//===-- X86AsmParserCommon.h - Common functions for X86AsmParser ---------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_X86_ASMPARSER_X86ASMPARSERCOMMON_H +#define LLVM_LIB_TARGET_X86_ASMPARSER_X86ASMPARSERCOMMON_H + +namespace llvm { + +inline bool isImmSExti16i8Value(uint64_t Value) { + return isInt<8>(Value) || + (isUInt<16>(Value) && isInt<8>(static_cast<int16_t>(Value))); +} + +inline bool isImmSExti32i8Value(uint64_t Value) { + return isInt<8>(Value) || + (isUInt<32>(Value) && isInt<8>(static_cast<int32_t>(Value))); +} + +inline bool isImmSExti64i8Value(uint64_t Value) { + return isInt<8>(Value); +} + +inline bool isImmSExti64i32Value(uint64_t Value) { + return isInt<32>(Value); +} + +inline bool isImmUnsignedi8Value(uint64_t Value) { + return isUInt<8>(Value) || isInt<8>(Value); +} + +} // End of namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/X86/AsmParser/X86Operand.h b/contrib/llvm/lib/Target/X86/AsmParser/X86Operand.h new file mode 100644 index 0000000..7ec0240 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/AsmParser/X86Operand.h @@ -0,0 +1,543 @@ +//===-- X86Operand.h - Parsed X86 machine instruction --------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_X86_ASMPARSER_X86OPERAND_H +#define LLVM_LIB_TARGET_X86_ASMPARSER_X86OPERAND_H + +#include "X86AsmParserCommon.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCParser/MCParsedAsmOperand.h" +#include "llvm/ADT/STLExtras.h" +#include "MCTargetDesc/X86MCTargetDesc.h" + +namespace llvm { + +/// X86Operand - Instances of this class represent a parsed X86 machine +/// instruction. +struct X86Operand : public MCParsedAsmOperand { + enum KindTy { + Token, + Register, + Immediate, + Memory + } Kind; + + SMLoc StartLoc, EndLoc; + SMLoc OffsetOfLoc; + StringRef SymName; + void *OpDecl; + bool AddressOf; + + struct TokOp { + const char *Data; + unsigned Length; + }; + + struct RegOp { + unsigned RegNo; + }; + + struct ImmOp { + const MCExpr *Val; + }; + + struct MemOp { + unsigned SegReg; + const MCExpr *Disp; + unsigned BaseReg; + unsigned IndexReg; + unsigned Scale; + unsigned Size; + unsigned ModeSize; + }; + + union { + struct TokOp Tok; + struct RegOp Reg; + struct ImmOp Imm; + struct MemOp Mem; + }; + + X86Operand(KindTy K, SMLoc Start, SMLoc End) + : Kind(K), StartLoc(Start), EndLoc(End) {} + + StringRef getSymName() override { return SymName; } + void *getOpDecl() override { return OpDecl; } + + /// getStartLoc - Get the location of the first token of this operand. + SMLoc getStartLoc() const override { return StartLoc; } + /// getEndLoc - Get the location of the last token of this operand. + SMLoc getEndLoc() const override { return EndLoc; } + /// getLocRange - Get the range between the first and last token of this + /// operand. + SMRange getLocRange() const { return SMRange(StartLoc, EndLoc); } + /// getOffsetOfLoc - Get the location of the offset operator. + SMLoc getOffsetOfLoc() const override { return OffsetOfLoc; } + + void print(raw_ostream &OS) const override {} + + StringRef getToken() const { + assert(Kind == Token && "Invalid access!"); + return StringRef(Tok.Data, Tok.Length); + } + void setTokenValue(StringRef Value) { + assert(Kind == Token && "Invalid access!"); + Tok.Data = Value.data(); + Tok.Length = Value.size(); + } + + unsigned getReg() const override { + assert(Kind == Register && "Invalid access!"); + return Reg.RegNo; + } + + const MCExpr *getImm() const { + assert(Kind == Immediate && "Invalid access!"); + return Imm.Val; + } + + const MCExpr *getMemDisp() const { + assert(Kind == Memory && "Invalid access!"); + return Mem.Disp; + } + unsigned getMemSegReg() const { + assert(Kind == Memory && "Invalid access!"); + return Mem.SegReg; + } + unsigned getMemBaseReg() const { + assert(Kind == Memory && "Invalid access!"); + return Mem.BaseReg; + } + unsigned getMemIndexReg() const { + assert(Kind == Memory && "Invalid access!"); + return Mem.IndexReg; + } + unsigned getMemScale() const { + assert(Kind == Memory && "Invalid access!"); + return Mem.Scale; + } + unsigned getMemModeSize() const { + assert(Kind == Memory && "Invalid access!"); + return Mem.ModeSize; + } + + bool isToken() const override {return Kind == Token; } + + bool isImm() const override { return Kind == Immediate; } + + bool isImmSExti16i8() const { + if (!isImm()) + return false; + + // If this isn't a constant expr, just assume it fits and let relaxation + // handle it. 
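+ // (For constants, isImmSExti16i8Value accepts values such as 0xFF80,
+ // whose low 16 bits sign-extend from an 8-bit immediate.)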
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) + return true; + + // Otherwise, check the value is in a range that makes sense for this + // extension. + return isImmSExti16i8Value(CE->getValue()); + } + bool isImmSExti32i8() const { + if (!isImm()) + return false; + + // If this isn't a constant expr, just assume it fits and let relaxation + // handle it. + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) + return true; + + // Otherwise, check the value is in a range that makes sense for this + // extension. + return isImmSExti32i8Value(CE->getValue()); + } + bool isImmSExti64i8() const { + if (!isImm()) + return false; + + // If this isn't a constant expr, just assume it fits and let relaxation + // handle it. + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) + return true; + + // Otherwise, check the value is in a range that makes sense for this + // extension. + return isImmSExti64i8Value(CE->getValue()); + } + bool isImmSExti64i32() const { + if (!isImm()) + return false; + + // If this isn't a constant expr, just assume it fits and let relaxation + // handle it. + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) + return true; + + // Otherwise, check the value is in a range that makes sense for this + // extension. + return isImmSExti64i32Value(CE->getValue()); + } + + bool isImmUnsignedi8() const { + if (!isImm()) return false; + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) return false; + return isImmUnsignedi8Value(CE->getValue()); + } + + bool isOffsetOf() const override { + return OffsetOfLoc.getPointer(); + } + + bool needAddressOf() const override { + return AddressOf; + } + + bool isMem() const override { return Kind == Memory; } + bool isMemUnsized() const { + return Kind == Memory && Mem.Size == 0; + } + bool isMem8() const { + return Kind == Memory && (!Mem.Size || Mem.Size == 8); + } + bool isMem16() const { + return Kind == Memory && (!Mem.Size || Mem.Size == 16); + } + bool isMem32() const { + return Kind == Memory && (!Mem.Size || Mem.Size == 32); + } + bool isMem64() const { + return Kind == Memory && (!Mem.Size || Mem.Size == 64); + } + bool isMem80() const { + return Kind == Memory && (!Mem.Size || Mem.Size == 80); + } + bool isMem128() const { + return Kind == Memory && (!Mem.Size || Mem.Size == 128); + } + bool isMem256() const { + return Kind == Memory && (!Mem.Size || Mem.Size == 256); + } + bool isMem512() const { + return Kind == Memory && (!Mem.Size || Mem.Size == 512); + } + + bool isMemVX32() const { + return Kind == Memory && (!Mem.Size || Mem.Size == 32) && + getMemIndexReg() >= X86::XMM0 && getMemIndexReg() <= X86::XMM15; + } + bool isMemVX32X() const { + return Kind == Memory && (!Mem.Size || Mem.Size == 32) && + getMemIndexReg() >= X86::XMM0 && getMemIndexReg() <= X86::XMM31; + } + bool isMemVY32() const { + return Kind == Memory && (!Mem.Size || Mem.Size == 32) && + getMemIndexReg() >= X86::YMM0 && getMemIndexReg() <= X86::YMM15; + } + bool isMemVY32X() const { + return Kind == Memory && (!Mem.Size || Mem.Size == 32) && + getMemIndexReg() >= X86::YMM0 && getMemIndexReg() <= X86::YMM31; + } + bool isMemVX64() const { + return Kind == Memory && (!Mem.Size || Mem.Size == 64) && + getMemIndexReg() >= X86::XMM0 && getMemIndexReg() <= X86::XMM15; + } + bool isMemVX64X() const { + return Kind == Memory && (!Mem.Size || Mem.Size == 64) && + getMemIndexReg() >= X86::XMM0 && getMemIndexReg() <= X86::XMM31; + } + bool isMemVY64() 
const { + return Kind == Memory && (!Mem.Size || Mem.Size == 64) && + getMemIndexReg() >= X86::YMM0 && getMemIndexReg() <= X86::YMM15; + } + bool isMemVY64X() const { + return Kind == Memory && (!Mem.Size || Mem.Size == 64) && + getMemIndexReg() >= X86::YMM0 && getMemIndexReg() <= X86::YMM31; + } + bool isMemVZ32() const { + return Kind == Memory && (!Mem.Size || Mem.Size == 32) && + getMemIndexReg() >= X86::ZMM0 && getMemIndexReg() <= X86::ZMM31; + } + bool isMemVZ64() const { + return Kind == Memory && (!Mem.Size || Mem.Size == 64) && + getMemIndexReg() >= X86::ZMM0 && getMemIndexReg() <= X86::ZMM31; + } + + bool isAbsMem() const { + return Kind == Memory && !getMemSegReg() && !getMemBaseReg() && + !getMemIndexReg() && getMemScale() == 1; + } + bool isAVX512RC() const{ + return isImm(); + } + + bool isAbsMem16() const { + return isAbsMem() && Mem.ModeSize == 16; + } + + bool isSrcIdx() const { + return !getMemIndexReg() && getMemScale() == 1 && + (getMemBaseReg() == X86::RSI || getMemBaseReg() == X86::ESI || + getMemBaseReg() == X86::SI) && isa<MCConstantExpr>(getMemDisp()) && + cast<MCConstantExpr>(getMemDisp())->getValue() == 0; + } + bool isSrcIdx8() const { + return isMem8() && isSrcIdx(); + } + bool isSrcIdx16() const { + return isMem16() && isSrcIdx(); + } + bool isSrcIdx32() const { + return isMem32() && isSrcIdx(); + } + bool isSrcIdx64() const { + return isMem64() && isSrcIdx(); + } + + bool isDstIdx() const { + return !getMemIndexReg() && getMemScale() == 1 && + (getMemSegReg() == 0 || getMemSegReg() == X86::ES) && + (getMemBaseReg() == X86::RDI || getMemBaseReg() == X86::EDI || + getMemBaseReg() == X86::DI) && isa<MCConstantExpr>(getMemDisp()) && + cast<MCConstantExpr>(getMemDisp())->getValue() == 0; + } + bool isDstIdx8() const { + return isMem8() && isDstIdx(); + } + bool isDstIdx16() const { + return isMem16() && isDstIdx(); + } + bool isDstIdx32() const { + return isMem32() && isDstIdx(); + } + bool isDstIdx64() const { + return isMem64() && isDstIdx(); + } + + bool isMemOffs() const { + return Kind == Memory && !getMemBaseReg() && !getMemIndexReg() && + getMemScale() == 1; + } + + bool isMemOffs16_8() const { + return isMemOffs() && Mem.ModeSize == 16 && (!Mem.Size || Mem.Size == 8); + } + bool isMemOffs16_16() const { + return isMemOffs() && Mem.ModeSize == 16 && (!Mem.Size || Mem.Size == 16); + } + bool isMemOffs16_32() const { + return isMemOffs() && Mem.ModeSize == 16 && (!Mem.Size || Mem.Size == 32); + } + bool isMemOffs32_8() const { + return isMemOffs() && Mem.ModeSize == 32 && (!Mem.Size || Mem.Size == 8); + } + bool isMemOffs32_16() const { + return isMemOffs() && Mem.ModeSize == 32 && (!Mem.Size || Mem.Size == 16); + } + bool isMemOffs32_32() const { + return isMemOffs() && Mem.ModeSize == 32 && (!Mem.Size || Mem.Size == 32); + } + bool isMemOffs32_64() const { + return isMemOffs() && Mem.ModeSize == 32 && (!Mem.Size || Mem.Size == 64); + } + bool isMemOffs64_8() const { + return isMemOffs() && Mem.ModeSize == 64 && (!Mem.Size || Mem.Size == 8); + } + bool isMemOffs64_16() const { + return isMemOffs() && Mem.ModeSize == 64 && (!Mem.Size || Mem.Size == 16); + } + bool isMemOffs64_32() const { + return isMemOffs() && Mem.ModeSize == 64 && (!Mem.Size || Mem.Size == 32); + } + bool isMemOffs64_64() const { + return isMemOffs() && Mem.ModeSize == 64 && (!Mem.Size || Mem.Size == 64); + } + + bool isReg() const override { return Kind == Register; } + + bool isGR32orGR64() const { + return Kind == Register && + 
(X86MCRegisterClasses[X86::GR32RegClassID].contains(getReg()) || + X86MCRegisterClasses[X86::GR64RegClassID].contains(getReg())); + } + + void addExpr(MCInst &Inst, const MCExpr *Expr) const { + // Add as immediates when possible. + if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr)) + Inst.addOperand(MCOperand::createImm(CE->getValue())); + else + Inst.addOperand(MCOperand::createExpr(Expr)); + } + + void addRegOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::createReg(getReg())); + } + + static unsigned getGR32FromGR64(unsigned RegNo) { + switch (RegNo) { + default: llvm_unreachable("Unexpected register"); + case X86::RAX: return X86::EAX; + case X86::RCX: return X86::ECX; + case X86::RDX: return X86::EDX; + case X86::RBX: return X86::EBX; + case X86::RBP: return X86::EBP; + case X86::RSP: return X86::ESP; + case X86::RSI: return X86::ESI; + case X86::RDI: return X86::EDI; + case X86::R8: return X86::R8D; + case X86::R9: return X86::R9D; + case X86::R10: return X86::R10D; + case X86::R11: return X86::R11D; + case X86::R12: return X86::R12D; + case X86::R13: return X86::R13D; + case X86::R14: return X86::R14D; + case X86::R15: return X86::R15D; + case X86::RIP: return X86::EIP; + } + } + + void addGR32orGR64Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + unsigned RegNo = getReg(); + if (X86MCRegisterClasses[X86::GR64RegClassID].contains(RegNo)) + RegNo = getGR32FromGR64(RegNo); + Inst.addOperand(MCOperand::createReg(RegNo)); + } + void addAVX512RCOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + addExpr(Inst, getImm()); + } + void addImmOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + addExpr(Inst, getImm()); + } + + void addMemOperands(MCInst &Inst, unsigned N) const { + assert((N == 5) && "Invalid number of operands!"); + Inst.addOperand(MCOperand::createReg(getMemBaseReg())); + Inst.addOperand(MCOperand::createImm(getMemScale())); + Inst.addOperand(MCOperand::createReg(getMemIndexReg())); + addExpr(Inst, getMemDisp()); + Inst.addOperand(MCOperand::createReg(getMemSegReg())); + } + + void addAbsMemOperands(MCInst &Inst, unsigned N) const { + assert((N == 1) && "Invalid number of operands!"); + // Add as immediates when possible. + if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getMemDisp())) + Inst.addOperand(MCOperand::createImm(CE->getValue())); + else + Inst.addOperand(MCOperand::createExpr(getMemDisp())); + } + + void addSrcIdxOperands(MCInst &Inst, unsigned N) const { + assert((N == 2) && "Invalid number of operands!"); + Inst.addOperand(MCOperand::createReg(getMemBaseReg())); + Inst.addOperand(MCOperand::createReg(getMemSegReg())); + } + void addDstIdxOperands(MCInst &Inst, unsigned N) const { + assert((N == 1) && "Invalid number of operands!"); + Inst.addOperand(MCOperand::createReg(getMemBaseReg())); + } + + void addMemOffsOperands(MCInst &Inst, unsigned N) const { + assert((N == 2) && "Invalid number of operands!"); + // Add as immediates when possible. 
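+ // (A memory-offset operand is represented by just the displacement and
+ // the segment register; the base/index/scale fields emitted for full
+ // memory operands above are omitted here.)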
+ if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getMemDisp())) + Inst.addOperand(MCOperand::createImm(CE->getValue())); + else + Inst.addOperand(MCOperand::createExpr(getMemDisp())); + Inst.addOperand(MCOperand::createReg(getMemSegReg())); + } + + static std::unique_ptr<X86Operand> CreateToken(StringRef Str, SMLoc Loc) { + SMLoc EndLoc = SMLoc::getFromPointer(Loc.getPointer() + Str.size()); + auto Res = llvm::make_unique<X86Operand>(Token, Loc, EndLoc); + Res->Tok.Data = Str.data(); + Res->Tok.Length = Str.size(); + return Res; + } + + static std::unique_ptr<X86Operand> + CreateReg(unsigned RegNo, SMLoc StartLoc, SMLoc EndLoc, + bool AddressOf = false, SMLoc OffsetOfLoc = SMLoc(), + StringRef SymName = StringRef(), void *OpDecl = nullptr) { + auto Res = llvm::make_unique<X86Operand>(Register, StartLoc, EndLoc); + Res->Reg.RegNo = RegNo; + Res->AddressOf = AddressOf; + Res->OffsetOfLoc = OffsetOfLoc; + Res->SymName = SymName; + Res->OpDecl = OpDecl; + return Res; + } + + static std::unique_ptr<X86Operand> CreateImm(const MCExpr *Val, + SMLoc StartLoc, SMLoc EndLoc) { + auto Res = llvm::make_unique<X86Operand>(Immediate, StartLoc, EndLoc); + Res->Imm.Val = Val; + return Res; + } + + /// Create an absolute memory operand. + static std::unique_ptr<X86Operand> + CreateMem(unsigned ModeSize, const MCExpr *Disp, SMLoc StartLoc, SMLoc EndLoc, + unsigned Size = 0, StringRef SymName = StringRef(), + void *OpDecl = nullptr) { + auto Res = llvm::make_unique<X86Operand>(Memory, StartLoc, EndLoc); + Res->Mem.SegReg = 0; + Res->Mem.Disp = Disp; + Res->Mem.BaseReg = 0; + Res->Mem.IndexReg = 0; + Res->Mem.Scale = 1; + Res->Mem.Size = Size; + Res->Mem.ModeSize = ModeSize; + Res->SymName = SymName; + Res->OpDecl = OpDecl; + Res->AddressOf = false; + return Res; + } + + /// Create a generalized memory operand. + static std::unique_ptr<X86Operand> + CreateMem(unsigned ModeSize, unsigned SegReg, const MCExpr *Disp, + unsigned BaseReg, unsigned IndexReg, unsigned Scale, SMLoc StartLoc, + SMLoc EndLoc, unsigned Size = 0, StringRef SymName = StringRef(), + void *OpDecl = nullptr) { + // We should never just have a displacement, that should be parsed as an + // absolute memory operand. + assert((SegReg || BaseReg || IndexReg) && "Invalid memory operand!"); + + // The scale should always be one of {1,2,4,8}. + assert(((Scale == 1 || Scale == 2 || Scale == 4 || Scale == 8)) && + "Invalid scale!"); + auto Res = llvm::make_unique<X86Operand>(Memory, StartLoc, EndLoc); + Res->Mem.SegReg = SegReg; + Res->Mem.Disp = Disp; + Res->Mem.BaseReg = BaseReg; + Res->Mem.IndexReg = IndexReg; + Res->Mem.Scale = Scale; + Res->Mem.Size = Size; + Res->Mem.ModeSize = ModeSize; + Res->SymName = SymName; + Res->OpDecl = OpDecl; + Res->AddressOf = false; + return Res; + } +}; + +} // End of namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp b/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp new file mode 100644 index 0000000..ce8fcf1 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp @@ -0,0 +1,1009 @@ +//===-- X86Disassembler.cpp - Disassembler for x86 and x86_64 -------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file is part of the X86 Disassembler. 
+// It contains code to translate the data produced by the decoder into +// MCInsts. +// Documentation for the disassembler can be found in X86Disassembler.h. +// +//===----------------------------------------------------------------------===// + +#include "X86Disassembler.h" +#include "X86DisassemblerDecoder.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCDisassembler.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; +using namespace llvm::X86Disassembler; + +#define DEBUG_TYPE "x86-disassembler" + +#define GET_REGINFO_ENUM +#include "X86GenRegisterInfo.inc" +#define GET_INSTRINFO_ENUM +#include "X86GenInstrInfo.inc" +#define GET_SUBTARGETINFO_ENUM +#include "X86GenSubtargetInfo.inc" + +void llvm::X86Disassembler::Debug(const char *file, unsigned line, + const char *s) { + dbgs() << file << ":" << line << ": " << s; +} + +const char *llvm::X86Disassembler::GetInstrName(unsigned Opcode, + const void *mii) { + const MCInstrInfo *MII = static_cast<const MCInstrInfo *>(mii); + return MII->getName(Opcode); +} + +#define debug(s) DEBUG(Debug(__FILE__, __LINE__, s)); + +namespace llvm { + +// Fill-ins to make the compiler happy. These constants are never actually +// assigned; they are just filler to make an automatically-generated switch +// statement work. +namespace X86 { + enum { + BX_SI = 500, + BX_DI = 501, + BP_SI = 502, + BP_DI = 503, + sib = 504, + sib64 = 505 + }; +} + +extern Target TheX86_32Target, TheX86_64Target; + +} + +static bool translateInstruction(MCInst &target, + InternalInstruction &source, + const MCDisassembler *Dis); + +X86GenericDisassembler::X86GenericDisassembler( + const MCSubtargetInfo &STI, + MCContext &Ctx, + std::unique_ptr<const MCInstrInfo> MII) + : MCDisassembler(STI, Ctx), MII(std::move(MII)) { + const FeatureBitset &FB = STI.getFeatureBits(); + if (FB[X86::Mode16Bit]) { + fMode = MODE_16BIT; + return; + } else if (FB[X86::Mode32Bit]) { + fMode = MODE_32BIT; + return; + } else if (FB[X86::Mode64Bit]) { + fMode = MODE_64BIT; + return; + } + + llvm_unreachable("Invalid CPU mode"); +} + +namespace { +struct Region { + ArrayRef<uint8_t> Bytes; + uint64_t Base; + Region(ArrayRef<uint8_t> Bytes, uint64_t Base) : Bytes(Bytes), Base(Base) {} +}; +} // end anonymous namespace + +/// A callback function that wraps the readByte method from Region. +/// +/// @param Arg - The generic callback parameter. In this case, this should +/// be a pointer to a Region. +/// @param Byte - A pointer to the byte to be read. +/// @param Address - The address to be read. +static int regionReader(const void *Arg, uint8_t *Byte, uint64_t Address) { + auto *R = static_cast<const Region *>(Arg); + ArrayRef<uint8_t> Bytes = R->Bytes; + unsigned Index = Address - R->Base; + if (Bytes.size() <= Index) + return -1; + *Byte = Bytes[Index]; + return 0; +} + +/// logger - a callback function that wraps the operator<< method from +/// raw_ostream. +/// +/// @param arg - The generic callback parameter. This should be a pointe +/// to a raw_ostream. +/// @param log - A string to be logged. logger() adds a newline. 
+static void logger(void* arg, const char* log) { + if (!arg) + return; + + raw_ostream &vStream = *(static_cast<raw_ostream*>(arg)); + vStream << log << "\n"; +} + +// +// Public interface for the disassembler +// + +MCDisassembler::DecodeStatus X86GenericDisassembler::getInstruction( + MCInst &Instr, uint64_t &Size, ArrayRef<uint8_t> Bytes, uint64_t Address, + raw_ostream &VStream, raw_ostream &CStream) const { + CommentStream = &CStream; + + InternalInstruction InternalInstr; + + dlog_t LoggerFn = logger; + if (&VStream == &nulls()) + LoggerFn = nullptr; // Disable logging completely if it's going to nulls(). + + Region R(Bytes, Address); + + int Ret = decodeInstruction(&InternalInstr, regionReader, (const void *)&R, + LoggerFn, (void *)&VStream, + (const void *)MII.get(), Address, fMode); + + if (Ret) { + Size = InternalInstr.readerCursor - Address; + return Fail; + } else { + Size = InternalInstr.length; + return (!translateInstruction(Instr, InternalInstr, this)) ? Success : Fail; + } +} + +// +// Private code that translates from struct InternalInstructions to MCInsts. +// + +/// translateRegister - Translates an internal register to the appropriate LLVM +/// register, and appends it as an operand to an MCInst. +/// +/// @param mcInst - The MCInst to append to. +/// @param reg - The Reg to append. +static void translateRegister(MCInst &mcInst, Reg reg) { +#define ENTRY(x) X86::x, + uint8_t llvmRegnums[] = { + ALL_REGS + 0 + }; +#undef ENTRY + + uint8_t llvmRegnum = llvmRegnums[reg]; + mcInst.addOperand(MCOperand::createReg(llvmRegnum)); +} + +/// tryAddingSymbolicOperand - trys to add a symbolic operand in place of the +/// immediate Value in the MCInst. +/// +/// @param Value - The immediate Value, has had any PC adjustment made by +/// the caller. +/// @param isBranch - If the instruction is a branch instruction +/// @param Address - The starting address of the instruction +/// @param Offset - The byte offset to this immediate in the instruction +/// @param Width - The byte width of this immediate in the instruction +/// +/// If the getOpInfo() function was set when setupForSymbolicDisassembly() was +/// called then that function is called to get any symbolic information for the +/// immediate in the instruction using the Address, Offset and Width. If that +/// returns non-zero then the symbolic information it returns is used to create +/// an MCExpr and that is added as an operand to the MCInst. If getOpInfo() +/// returns zero and isBranch is true then a symbol look up for immediate Value +/// is done and if a symbol is found an MCExpr is created with that, else +/// an MCExpr with the immediate Value is created. This function returns true +/// if it adds an operand to the MCInst and false otherwise. +static bool tryAddingSymbolicOperand(int64_t Value, bool isBranch, + uint64_t Address, uint64_t Offset, + uint64_t Width, MCInst &MI, + const MCDisassembler *Dis) { + return Dis->tryAddingSymbolicOperand(MI, Value, Address, isBranch, + Offset, Width); +} + +/// tryAddingPcLoadReferenceComment - trys to add a comment as to what is being +/// referenced by a load instruction with the base register that is the rip. +/// These can often be addresses in a literal pool. The Address of the +/// instruction and its immediate Value are used to determine the address +/// being referenced in the literal pool entry. The SymbolLookUp call back will +/// return a pointer to a literal 'C' string if the referenced address is an +/// address into a section with 'C' string literals. 
+static void tryAddingPcLoadReferenceComment(uint64_t Address, uint64_t Value, + const void *Decoder) { + const MCDisassembler *Dis = static_cast<const MCDisassembler*>(Decoder); + Dis->tryAddingPcLoadReferenceComment(Value, Address); +} + +static const uint8_t segmentRegnums[SEG_OVERRIDE_max] = { + 0, // SEG_OVERRIDE_NONE + X86::CS, + X86::SS, + X86::DS, + X86::ES, + X86::FS, + X86::GS +}; + +/// translateSrcIndex - Appends a source index operand to an MCInst. +/// +/// @param mcInst - The MCInst to append to. +/// @param insn - The internal instruction. +static bool translateSrcIndex(MCInst &mcInst, InternalInstruction &insn) { + unsigned baseRegNo; + + if (insn.mode == MODE_64BIT) + baseRegNo = insn.prefixPresent[0x67] ? X86::ESI : X86::RSI; + else if (insn.mode == MODE_32BIT) + baseRegNo = insn.prefixPresent[0x67] ? X86::SI : X86::ESI; + else { + assert(insn.mode == MODE_16BIT); + baseRegNo = insn.prefixPresent[0x67] ? X86::ESI : X86::SI; + } + MCOperand baseReg = MCOperand::createReg(baseRegNo); + mcInst.addOperand(baseReg); + + MCOperand segmentReg; + segmentReg = MCOperand::createReg(segmentRegnums[insn.segmentOverride]); + mcInst.addOperand(segmentReg); + return false; +} + +/// translateDstIndex - Appends a destination index operand to an MCInst. +/// +/// @param mcInst - The MCInst to append to. +/// @param insn - The internal instruction. + +static bool translateDstIndex(MCInst &mcInst, InternalInstruction &insn) { + unsigned baseRegNo; + + if (insn.mode == MODE_64BIT) + baseRegNo = insn.prefixPresent[0x67] ? X86::EDI : X86::RDI; + else if (insn.mode == MODE_32BIT) + baseRegNo = insn.prefixPresent[0x67] ? X86::DI : X86::EDI; + else { + assert(insn.mode == MODE_16BIT); + baseRegNo = insn.prefixPresent[0x67] ? X86::EDI : X86::DI; + } + MCOperand baseReg = MCOperand::createReg(baseRegNo); + mcInst.addOperand(baseReg); + return false; +} + +/// translateImmediate - Appends an immediate operand to an MCInst. +/// +/// @param mcInst - The MCInst to append to. +/// @param immediate - The immediate value to append. +/// @param operand - The operand, as stored in the descriptor table. +/// @param insn - The internal instruction. +static void translateImmediate(MCInst &mcInst, uint64_t immediate, + const OperandSpecifier &operand, + InternalInstruction &insn, + const MCDisassembler *Dis) { + // Sign-extend the immediate if necessary. + + OperandType type = (OperandType)operand.type; + + bool isBranch = false; + uint64_t pcrel = 0; + if (type == TYPE_RELv) { + isBranch = true; + pcrel = insn.startLocation + + insn.immediateOffset + insn.immediateSize; + switch (insn.displacementSize) { + default: + break; + case 1: + if(immediate & 0x80) + immediate |= ~(0xffull); + break; + case 2: + if(immediate & 0x8000) + immediate |= ~(0xffffull); + break; + case 4: + if(immediate & 0x80000000) + immediate |= ~(0xffffffffull); + break; + case 8: + break; + } + } + // By default sign-extend all X86 immediates based on their encoding. + else if (type == TYPE_IMM8 || type == TYPE_IMM16 || type == TYPE_IMM32 || + type == TYPE_IMM64 || type == TYPE_IMMv) { + switch (operand.encoding) { + default: + break; + case ENCODING_IB: + if(immediate & 0x80) + immediate |= ~(0xffull); + break; + case ENCODING_IW: + if(immediate & 0x8000) + immediate |= ~(0xffffull); + break; + case ENCODING_ID: + if(immediate & 0x80000000) + immediate |= ~(0xffffffffull); + break; + case ENCODING_IO: + break; + } + } else if (type == TYPE_IMM3) { + // Check for immediates that printSSECC can't handle. 
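+ // (printSSECC only has symbolic names for the eight SSE comparison
+ // predicates 0-7; larger values are remapped below to the "_alt" forms,
+ // which print the raw immediate instead.)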
+ if (immediate >= 8) { + unsigned NewOpc; + switch (mcInst.getOpcode()) { + default: llvm_unreachable("unexpected opcode"); + case X86::CMPPDrmi: NewOpc = X86::CMPPDrmi_alt; break; + case X86::CMPPDrri: NewOpc = X86::CMPPDrri_alt; break; + case X86::CMPPSrmi: NewOpc = X86::CMPPSrmi_alt; break; + case X86::CMPPSrri: NewOpc = X86::CMPPSrri_alt; break; + case X86::CMPSDrm: NewOpc = X86::CMPSDrm_alt; break; + case X86::CMPSDrr: NewOpc = X86::CMPSDrr_alt; break; + case X86::CMPSSrm: NewOpc = X86::CMPSSrm_alt; break; + case X86::CMPSSrr: NewOpc = X86::CMPSSrr_alt; break; + case X86::VPCOMBri: NewOpc = X86::VPCOMBri_alt; break; + case X86::VPCOMBmi: NewOpc = X86::VPCOMBmi_alt; break; + case X86::VPCOMWri: NewOpc = X86::VPCOMWri_alt; break; + case X86::VPCOMWmi: NewOpc = X86::VPCOMWmi_alt; break; + case X86::VPCOMDri: NewOpc = X86::VPCOMDri_alt; break; + case X86::VPCOMDmi: NewOpc = X86::VPCOMDmi_alt; break; + case X86::VPCOMQri: NewOpc = X86::VPCOMQri_alt; break; + case X86::VPCOMQmi: NewOpc = X86::VPCOMQmi_alt; break; + case X86::VPCOMUBri: NewOpc = X86::VPCOMUBri_alt; break; + case X86::VPCOMUBmi: NewOpc = X86::VPCOMUBmi_alt; break; + case X86::VPCOMUWri: NewOpc = X86::VPCOMUWri_alt; break; + case X86::VPCOMUWmi: NewOpc = X86::VPCOMUWmi_alt; break; + case X86::VPCOMUDri: NewOpc = X86::VPCOMUDri_alt; break; + case X86::VPCOMUDmi: NewOpc = X86::VPCOMUDmi_alt; break; + case X86::VPCOMUQri: NewOpc = X86::VPCOMUQri_alt; break; + case X86::VPCOMUQmi: NewOpc = X86::VPCOMUQmi_alt; break; + } + // Switch opcode to the one that doesn't get special printing. + mcInst.setOpcode(NewOpc); + } + } else if (type == TYPE_IMM5) { + // Check for immediates that printAVXCC can't handle. + if (immediate >= 32) { + unsigned NewOpc; + switch (mcInst.getOpcode()) { + default: llvm_unreachable("unexpected opcode"); + case X86::VCMPPDrmi: NewOpc = X86::VCMPPDrmi_alt; break; + case X86::VCMPPDrri: NewOpc = X86::VCMPPDrri_alt; break; + case X86::VCMPPSrmi: NewOpc = X86::VCMPPSrmi_alt; break; + case X86::VCMPPSrri: NewOpc = X86::VCMPPSrri_alt; break; + case X86::VCMPSDrm: NewOpc = X86::VCMPSDrm_alt; break; + case X86::VCMPSDrr: NewOpc = X86::VCMPSDrr_alt; break; + case X86::VCMPSSrm: NewOpc = X86::VCMPSSrm_alt; break; + case X86::VCMPSSrr: NewOpc = X86::VCMPSSrr_alt; break; + case X86::VCMPPDYrmi: NewOpc = X86::VCMPPDYrmi_alt; break; + case X86::VCMPPDYrri: NewOpc = X86::VCMPPDYrri_alt; break; + case X86::VCMPPSYrmi: NewOpc = X86::VCMPPSYrmi_alt; break; + case X86::VCMPPSYrri: NewOpc = X86::VCMPPSYrri_alt; break; + case X86::VCMPPDZrmi: NewOpc = X86::VCMPPDZrmi_alt; break; + case X86::VCMPPDZrri: NewOpc = X86::VCMPPDZrri_alt; break; + case X86::VCMPPDZrrib: NewOpc = X86::VCMPPDZrrib_alt; break; + case X86::VCMPPSZrmi: NewOpc = X86::VCMPPSZrmi_alt; break; + case X86::VCMPPSZrri: NewOpc = X86::VCMPPSZrri_alt; break; + case X86::VCMPPSZrrib: NewOpc = X86::VCMPPSZrrib_alt; break; + case X86::VCMPSDZrm: NewOpc = X86::VCMPSDZrmi_alt; break; + case X86::VCMPSDZrr: NewOpc = X86::VCMPSDZrri_alt; break; + case X86::VCMPSSZrm: NewOpc = X86::VCMPSSZrmi_alt; break; + case X86::VCMPSSZrr: NewOpc = X86::VCMPSSZrri_alt; break; + } + // Switch opcode to the one that doesn't get special printing. 
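+ // (The "_alt" variant prints the raw immediate rather than an AVX
+ // comparison-predicate name.)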
+ mcInst.setOpcode(NewOpc); + } + } else if (type == TYPE_AVX512ICC) { + if (immediate >= 8 || ((immediate & 0x3) == 3)) { + unsigned NewOpc; + switch (mcInst.getOpcode()) { + default: llvm_unreachable("unexpected opcode"); + case X86::VPCMPBZ128rmi: NewOpc = X86::VPCMPBZ128rmi_alt; break; + case X86::VPCMPBZ128rmik: NewOpc = X86::VPCMPBZ128rmik_alt; break; + case X86::VPCMPBZ128rri: NewOpc = X86::VPCMPBZ128rri_alt; break; + case X86::VPCMPBZ128rrik: NewOpc = X86::VPCMPBZ128rrik_alt; break; + case X86::VPCMPBZ256rmi: NewOpc = X86::VPCMPBZ256rmi_alt; break; + case X86::VPCMPBZ256rmik: NewOpc = X86::VPCMPBZ256rmik_alt; break; + case X86::VPCMPBZ256rri: NewOpc = X86::VPCMPBZ256rri_alt; break; + case X86::VPCMPBZ256rrik: NewOpc = X86::VPCMPBZ256rrik_alt; break; + case X86::VPCMPBZrmi: NewOpc = X86::VPCMPBZrmi_alt; break; + case X86::VPCMPBZrmik: NewOpc = X86::VPCMPBZrmik_alt; break; + case X86::VPCMPBZrri: NewOpc = X86::VPCMPBZrri_alt; break; + case X86::VPCMPBZrrik: NewOpc = X86::VPCMPBZrrik_alt; break; + case X86::VPCMPDZ128rmi: NewOpc = X86::VPCMPDZ128rmi_alt; break; + case X86::VPCMPDZ128rmib: NewOpc = X86::VPCMPDZ128rmib_alt; break; + case X86::VPCMPDZ128rmibk: NewOpc = X86::VPCMPDZ128rmibk_alt; break; + case X86::VPCMPDZ128rmik: NewOpc = X86::VPCMPDZ128rmik_alt; break; + case X86::VPCMPDZ128rri: NewOpc = X86::VPCMPDZ128rri_alt; break; + case X86::VPCMPDZ128rrik: NewOpc = X86::VPCMPDZ128rrik_alt; break; + case X86::VPCMPDZ256rmi: NewOpc = X86::VPCMPDZ256rmi_alt; break; + case X86::VPCMPDZ256rmib: NewOpc = X86::VPCMPDZ256rmib_alt; break; + case X86::VPCMPDZ256rmibk: NewOpc = X86::VPCMPDZ256rmibk_alt; break; + case X86::VPCMPDZ256rmik: NewOpc = X86::VPCMPDZ256rmik_alt; break; + case X86::VPCMPDZ256rri: NewOpc = X86::VPCMPDZ256rri_alt; break; + case X86::VPCMPDZ256rrik: NewOpc = X86::VPCMPDZ256rrik_alt; break; + case X86::VPCMPDZrmi: NewOpc = X86::VPCMPDZrmi_alt; break; + case X86::VPCMPDZrmib: NewOpc = X86::VPCMPDZrmib_alt; break; + case X86::VPCMPDZrmibk: NewOpc = X86::VPCMPDZrmibk_alt; break; + case X86::VPCMPDZrmik: NewOpc = X86::VPCMPDZrmik_alt; break; + case X86::VPCMPDZrri: NewOpc = X86::VPCMPDZrri_alt; break; + case X86::VPCMPDZrrik: NewOpc = X86::VPCMPDZrrik_alt; break; + case X86::VPCMPQZ128rmi: NewOpc = X86::VPCMPQZ128rmi_alt; break; + case X86::VPCMPQZ128rmib: NewOpc = X86::VPCMPQZ128rmib_alt; break; + case X86::VPCMPQZ128rmibk: NewOpc = X86::VPCMPQZ128rmibk_alt; break; + case X86::VPCMPQZ128rmik: NewOpc = X86::VPCMPQZ128rmik_alt; break; + case X86::VPCMPQZ128rri: NewOpc = X86::VPCMPQZ128rri_alt; break; + case X86::VPCMPQZ128rrik: NewOpc = X86::VPCMPQZ128rrik_alt; break; + case X86::VPCMPQZ256rmi: NewOpc = X86::VPCMPQZ256rmi_alt; break; + case X86::VPCMPQZ256rmib: NewOpc = X86::VPCMPQZ256rmib_alt; break; + case X86::VPCMPQZ256rmibk: NewOpc = X86::VPCMPQZ256rmibk_alt; break; + case X86::VPCMPQZ256rmik: NewOpc = X86::VPCMPQZ256rmik_alt; break; + case X86::VPCMPQZ256rri: NewOpc = X86::VPCMPQZ256rri_alt; break; + case X86::VPCMPQZ256rrik: NewOpc = X86::VPCMPQZ256rrik_alt; break; + case X86::VPCMPQZrmi: NewOpc = X86::VPCMPQZrmi_alt; break; + case X86::VPCMPQZrmib: NewOpc = X86::VPCMPQZrmib_alt; break; + case X86::VPCMPQZrmibk: NewOpc = X86::VPCMPQZrmibk_alt; break; + case X86::VPCMPQZrmik: NewOpc = X86::VPCMPQZrmik_alt; break; + case X86::VPCMPQZrri: NewOpc = X86::VPCMPQZrri_alt; break; + case X86::VPCMPQZrrik: NewOpc = X86::VPCMPQZrrik_alt; break; + case X86::VPCMPUBZ128rmi: NewOpc = X86::VPCMPUBZ128rmi_alt; break; + case X86::VPCMPUBZ128rmik: NewOpc = X86::VPCMPUBZ128rmik_alt; 
break; + case X86::VPCMPUBZ128rri: NewOpc = X86::VPCMPUBZ128rri_alt; break; + case X86::VPCMPUBZ128rrik: NewOpc = X86::VPCMPUBZ128rrik_alt; break; + case X86::VPCMPUBZ256rmi: NewOpc = X86::VPCMPUBZ256rmi_alt; break; + case X86::VPCMPUBZ256rmik: NewOpc = X86::VPCMPUBZ256rmik_alt; break; + case X86::VPCMPUBZ256rri: NewOpc = X86::VPCMPUBZ256rri_alt; break; + case X86::VPCMPUBZ256rrik: NewOpc = X86::VPCMPUBZ256rrik_alt; break; + case X86::VPCMPUBZrmi: NewOpc = X86::VPCMPUBZrmi_alt; break; + case X86::VPCMPUBZrmik: NewOpc = X86::VPCMPUBZrmik_alt; break; + case X86::VPCMPUBZrri: NewOpc = X86::VPCMPUBZrri_alt; break; + case X86::VPCMPUBZrrik: NewOpc = X86::VPCMPUBZrrik_alt; break; + case X86::VPCMPUDZ128rmi: NewOpc = X86::VPCMPUDZ128rmi_alt; break; + case X86::VPCMPUDZ128rmib: NewOpc = X86::VPCMPUDZ128rmib_alt; break; + case X86::VPCMPUDZ128rmibk: NewOpc = X86::VPCMPUDZ128rmibk_alt; break; + case X86::VPCMPUDZ128rmik: NewOpc = X86::VPCMPUDZ128rmik_alt; break; + case X86::VPCMPUDZ128rri: NewOpc = X86::VPCMPUDZ128rri_alt; break; + case X86::VPCMPUDZ128rrik: NewOpc = X86::VPCMPUDZ128rrik_alt; break; + case X86::VPCMPUDZ256rmi: NewOpc = X86::VPCMPUDZ256rmi_alt; break; + case X86::VPCMPUDZ256rmib: NewOpc = X86::VPCMPUDZ256rmib_alt; break; + case X86::VPCMPUDZ256rmibk: NewOpc = X86::VPCMPUDZ256rmibk_alt; break; + case X86::VPCMPUDZ256rmik: NewOpc = X86::VPCMPUDZ256rmik_alt; break; + case X86::VPCMPUDZ256rri: NewOpc = X86::VPCMPUDZ256rri_alt; break; + case X86::VPCMPUDZ256rrik: NewOpc = X86::VPCMPUDZ256rrik_alt; break; + case X86::VPCMPUDZrmi: NewOpc = X86::VPCMPUDZrmi_alt; break; + case X86::VPCMPUDZrmib: NewOpc = X86::VPCMPUDZrmib_alt; break; + case X86::VPCMPUDZrmibk: NewOpc = X86::VPCMPUDZrmibk_alt; break; + case X86::VPCMPUDZrmik: NewOpc = X86::VPCMPUDZrmik_alt; break; + case X86::VPCMPUDZrri: NewOpc = X86::VPCMPUDZrri_alt; break; + case X86::VPCMPUDZrrik: NewOpc = X86::VPCMPUDZrrik_alt; break; + case X86::VPCMPUQZ128rmi: NewOpc = X86::VPCMPUQZ128rmi_alt; break; + case X86::VPCMPUQZ128rmib: NewOpc = X86::VPCMPUQZ128rmib_alt; break; + case X86::VPCMPUQZ128rmibk: NewOpc = X86::VPCMPUQZ128rmibk_alt; break; + case X86::VPCMPUQZ128rmik: NewOpc = X86::VPCMPUQZ128rmik_alt; break; + case X86::VPCMPUQZ128rri: NewOpc = X86::VPCMPUQZ128rri_alt; break; + case X86::VPCMPUQZ128rrik: NewOpc = X86::VPCMPUQZ128rrik_alt; break; + case X86::VPCMPUQZ256rmi: NewOpc = X86::VPCMPUQZ256rmi_alt; break; + case X86::VPCMPUQZ256rmib: NewOpc = X86::VPCMPUQZ256rmib_alt; break; + case X86::VPCMPUQZ256rmibk: NewOpc = X86::VPCMPUQZ256rmibk_alt; break; + case X86::VPCMPUQZ256rmik: NewOpc = X86::VPCMPUQZ256rmik_alt; break; + case X86::VPCMPUQZ256rri: NewOpc = X86::VPCMPUQZ256rri_alt; break; + case X86::VPCMPUQZ256rrik: NewOpc = X86::VPCMPUQZ256rrik_alt; break; + case X86::VPCMPUQZrmi: NewOpc = X86::VPCMPUQZrmi_alt; break; + case X86::VPCMPUQZrmib: NewOpc = X86::VPCMPUQZrmib_alt; break; + case X86::VPCMPUQZrmibk: NewOpc = X86::VPCMPUQZrmibk_alt; break; + case X86::VPCMPUQZrmik: NewOpc = X86::VPCMPUQZrmik_alt; break; + case X86::VPCMPUQZrri: NewOpc = X86::VPCMPUQZrri_alt; break; + case X86::VPCMPUQZrrik: NewOpc = X86::VPCMPUQZrrik_alt; break; + case X86::VPCMPUWZ128rmi: NewOpc = X86::VPCMPUWZ128rmi_alt; break; + case X86::VPCMPUWZ128rmik: NewOpc = X86::VPCMPUWZ128rmik_alt; break; + case X86::VPCMPUWZ128rri: NewOpc = X86::VPCMPUWZ128rri_alt; break; + case X86::VPCMPUWZ128rrik: NewOpc = X86::VPCMPUWZ128rrik_alt; break; + case X86::VPCMPUWZ256rmi: NewOpc = X86::VPCMPUWZ256rmi_alt; break; + case X86::VPCMPUWZ256rmik: NewOpc = 
X86::VPCMPUWZ256rmik_alt; break; + case X86::VPCMPUWZ256rri: NewOpc = X86::VPCMPUWZ256rri_alt; break; + case X86::VPCMPUWZ256rrik: NewOpc = X86::VPCMPUWZ256rrik_alt; break; + case X86::VPCMPUWZrmi: NewOpc = X86::VPCMPUWZrmi_alt; break; + case X86::VPCMPUWZrmik: NewOpc = X86::VPCMPUWZrmik_alt; break; + case X86::VPCMPUWZrri: NewOpc = X86::VPCMPUWZrri_alt; break; + case X86::VPCMPUWZrrik: NewOpc = X86::VPCMPUWZrrik_alt; break; + case X86::VPCMPWZ128rmi: NewOpc = X86::VPCMPWZ128rmi_alt; break; + case X86::VPCMPWZ128rmik: NewOpc = X86::VPCMPWZ128rmik_alt; break; + case X86::VPCMPWZ128rri: NewOpc = X86::VPCMPWZ128rri_alt; break; + case X86::VPCMPWZ128rrik: NewOpc = X86::VPCMPWZ128rrik_alt; break; + case X86::VPCMPWZ256rmi: NewOpc = X86::VPCMPWZ256rmi_alt; break; + case X86::VPCMPWZ256rmik: NewOpc = X86::VPCMPWZ256rmik_alt; break; + case X86::VPCMPWZ256rri: NewOpc = X86::VPCMPWZ256rri_alt; break; + case X86::VPCMPWZ256rrik: NewOpc = X86::VPCMPWZ256rrik_alt; break; + case X86::VPCMPWZrmi: NewOpc = X86::VPCMPWZrmi_alt; break; + case X86::VPCMPWZrmik: NewOpc = X86::VPCMPWZrmik_alt; break; + case X86::VPCMPWZrri: NewOpc = X86::VPCMPWZrri_alt; break; + case X86::VPCMPWZrrik: NewOpc = X86::VPCMPWZrrik_alt; break; + } + // Switch opcode to the one that doesn't get special printing. + mcInst.setOpcode(NewOpc); + } + } + + switch (type) { + case TYPE_XMM32: + case TYPE_XMM64: + case TYPE_XMM128: + mcInst.addOperand(MCOperand::createReg(X86::XMM0 + (immediate >> 4))); + return; + case TYPE_XMM256: + mcInst.addOperand(MCOperand::createReg(X86::YMM0 + (immediate >> 4))); + return; + case TYPE_XMM512: + mcInst.addOperand(MCOperand::createReg(X86::ZMM0 + (immediate >> 4))); + return; + case TYPE_BNDR: + mcInst.addOperand(MCOperand::createReg(X86::BND0 + (immediate >> 4))); + case TYPE_REL8: + isBranch = true; + pcrel = insn.startLocation + insn.immediateOffset + insn.immediateSize; + if (immediate & 0x80) + immediate |= ~(0xffull); + break; + case TYPE_REL16: + isBranch = true; + pcrel = insn.startLocation + insn.immediateOffset + insn.immediateSize; + if (immediate & 0x8000) + immediate |= ~(0xffffull); + break; + case TYPE_REL32: + case TYPE_REL64: + isBranch = true; + pcrel = insn.startLocation + insn.immediateOffset + insn.immediateSize; + if(immediate & 0x80000000) + immediate |= ~(0xffffffffull); + break; + default: + // operand is 64 bits wide. Do nothing. + break; + } + + if(!tryAddingSymbolicOperand(immediate + pcrel, isBranch, insn.startLocation, + insn.immediateOffset, insn.immediateSize, + mcInst, Dis)) + mcInst.addOperand(MCOperand::createImm(immediate)); + + if (type == TYPE_MOFFS8 || type == TYPE_MOFFS16 || + type == TYPE_MOFFS32 || type == TYPE_MOFFS64) { + MCOperand segmentReg; + segmentReg = MCOperand::createReg(segmentRegnums[insn.segmentOverride]); + mcInst.addOperand(segmentReg); + } +} + +/// translateRMRegister - Translates a register stored in the R/M field of the +/// ModR/M byte to its LLVM equivalent and appends it to an MCInst. +/// @param mcInst - The MCInst to append to. +/// @param insn - The internal instruction to extract the R/M field +/// from. 
+/// @return - 0 on success; -1 otherwise +static bool translateRMRegister(MCInst &mcInst, + InternalInstruction &insn) { + if (insn.eaBase == EA_BASE_sib || insn.eaBase == EA_BASE_sib64) { + debug("A R/M register operand may not have a SIB byte"); + return true; + } + + switch (insn.eaBase) { + default: + debug("Unexpected EA base register"); + return true; + case EA_BASE_NONE: + debug("EA_BASE_NONE for ModR/M base"); + return true; +#define ENTRY(x) case EA_BASE_##x: + ALL_EA_BASES +#undef ENTRY + debug("A R/M register operand may not have a base; " + "the operand must be a register."); + return true; +#define ENTRY(x) \ + case EA_REG_##x: \ + mcInst.addOperand(MCOperand::createReg(X86::x)); break; + ALL_REGS +#undef ENTRY + } + + return false; +} + +/// translateRMMemory - Translates a memory operand stored in the Mod and R/M +/// fields of an internal instruction (and possibly its SIB byte) to a memory +/// operand in LLVM's format, and appends it to an MCInst. +/// +/// @param mcInst - The MCInst to append to. +/// @param insn - The instruction to extract Mod, R/M, and SIB fields +/// from. +/// @return - 0 on success; nonzero otherwise +static bool translateRMMemory(MCInst &mcInst, InternalInstruction &insn, + const MCDisassembler *Dis) { + // Addresses in an MCInst are represented as five operands: + // 1. basereg (register) The R/M base, or (if there is a SIB) the + // SIB base + // 2. scaleamount (immediate) 1, or (if there is a SIB) the specified + // scale amount + // 3. indexreg (register) x86_registerNONE, or (if there is a SIB) + // the index (which is multiplied by the + // scale amount) + // 4. displacement (immediate) 0, or the displacement if there is one + // 5. segmentreg (register) x86_registerNONE for now, but could be set + // if we have segment overrides + + MCOperand baseReg; + MCOperand scaleAmount; + MCOperand indexReg; + MCOperand displacement; + MCOperand segmentReg; + uint64_t pcrel = 0; + + if (insn.eaBase == EA_BASE_sib || insn.eaBase == EA_BASE_sib64) { + if (insn.sibBase != SIB_BASE_NONE) { + switch (insn.sibBase) { + default: + debug("Unexpected sibBase"); + return true; +#define ENTRY(x) \ + case SIB_BASE_##x: \ + baseReg = MCOperand::createReg(X86::x); break; + ALL_SIB_BASES +#undef ENTRY + } + } else { + baseReg = MCOperand::createReg(0); + } + + // Check whether we are handling VSIB addressing mode for GATHER. + // If sibIndex was set to SIB_INDEX_NONE, index offset is 4 and + // we should use SIB_INDEX_XMM4|YMM4 for VSIB. + // I don't see a way to get the correct IndexReg in readSIB: + // We can tell whether it is VSIB or SIB after instruction ID is decoded, + // but instruction ID may not be decoded yet when calling readSIB. 
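+    // For example, a VPGATHERDDrm decoded in 64-bit mode with sibIndex set
+    // to SIB_INDEX_R12 is remapped below to SIB_INDEX_XMM12, while a
+    // sibIndex of SIB_INDEX_NONE becomes SIB_INDEX_XMM4, because under VSIB
+    // the index encoding 0b100 means XMM4 rather than "no index".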
+ uint32_t Opcode = mcInst.getOpcode(); + bool IndexIs128 = (Opcode == X86::VGATHERDPDrm || + Opcode == X86::VGATHERDPDYrm || + Opcode == X86::VGATHERQPDrm || + Opcode == X86::VGATHERDPSrm || + Opcode == X86::VGATHERQPSrm || + Opcode == X86::VPGATHERDQrm || + Opcode == X86::VPGATHERDQYrm || + Opcode == X86::VPGATHERQQrm || + Opcode == X86::VPGATHERDDrm || + Opcode == X86::VPGATHERQDrm); + bool IndexIs256 = (Opcode == X86::VGATHERQPDYrm || + Opcode == X86::VGATHERDPSYrm || + Opcode == X86::VGATHERQPSYrm || + Opcode == X86::VGATHERDPDZrm || + Opcode == X86::VPGATHERDQZrm || + Opcode == X86::VPGATHERQQYrm || + Opcode == X86::VPGATHERDDYrm || + Opcode == X86::VPGATHERQDYrm); + bool IndexIs512 = (Opcode == X86::VGATHERQPDZrm || + Opcode == X86::VGATHERDPSZrm || + Opcode == X86::VGATHERQPSZrm || + Opcode == X86::VPGATHERQQZrm || + Opcode == X86::VPGATHERDDZrm || + Opcode == X86::VPGATHERQDZrm); + if (IndexIs128 || IndexIs256 || IndexIs512) { + unsigned IndexOffset = insn.sibIndex - + (insn.addressSize == 8 ? SIB_INDEX_RAX:SIB_INDEX_EAX); + SIBIndex IndexBase = IndexIs512 ? SIB_INDEX_ZMM0 : + IndexIs256 ? SIB_INDEX_YMM0 : SIB_INDEX_XMM0; + insn.sibIndex = (SIBIndex)(IndexBase + + (insn.sibIndex == SIB_INDEX_NONE ? 4 : IndexOffset)); + } + + if (insn.sibIndex != SIB_INDEX_NONE) { + switch (insn.sibIndex) { + default: + debug("Unexpected sibIndex"); + return true; +#define ENTRY(x) \ + case SIB_INDEX_##x: \ + indexReg = MCOperand::createReg(X86::x); break; + EA_BASES_32BIT + EA_BASES_64BIT + REGS_XMM + REGS_YMM + REGS_ZMM +#undef ENTRY + } + } else { + indexReg = MCOperand::createReg(0); + } + + scaleAmount = MCOperand::createImm(insn.sibScale); + } else { + switch (insn.eaBase) { + case EA_BASE_NONE: + if (insn.eaDisplacement == EA_DISP_NONE) { + debug("EA_BASE_NONE and EA_DISP_NONE for ModR/M base"); + return true; + } + if (insn.mode == MODE_64BIT){ + pcrel = insn.startLocation + + insn.displacementOffset + insn.displacementSize; + tryAddingPcLoadReferenceComment(insn.startLocation + + insn.displacementOffset, + insn.displacement + pcrel, Dis); + baseReg = MCOperand::createReg(X86::RIP); // Section 2.2.1.6 + } + else + baseReg = MCOperand::createReg(0); + + indexReg = MCOperand::createReg(0); + break; + case EA_BASE_BX_SI: + baseReg = MCOperand::createReg(X86::BX); + indexReg = MCOperand::createReg(X86::SI); + break; + case EA_BASE_BX_DI: + baseReg = MCOperand::createReg(X86::BX); + indexReg = MCOperand::createReg(X86::DI); + break; + case EA_BASE_BP_SI: + baseReg = MCOperand::createReg(X86::BP); + indexReg = MCOperand::createReg(X86::SI); + break; + case EA_BASE_BP_DI: + baseReg = MCOperand::createReg(X86::BP); + indexReg = MCOperand::createReg(X86::DI); + break; + default: + indexReg = MCOperand::createReg(0); + switch (insn.eaBase) { + default: + debug("Unexpected eaBase"); + return true; + // Here, we will use the fill-ins defined above. However, + // BX_SI, BX_DI, BP_SI, and BP_DI are all handled above and + // sib and sib64 were handled in the top-level if, so they're only + // placeholders to keep the compiler happy. 
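+      // For example, a 32-bit address with mod == 0b01 and rm == 0b110
+      // decodes to eaBase == EA_BASE_ESI, which reaches this default case and
+      // is mapped by the fill-ins below to the ESI base register.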
+#define ENTRY(x) \ + case EA_BASE_##x: \ + baseReg = MCOperand::createReg(X86::x); break; + ALL_EA_BASES +#undef ENTRY +#define ENTRY(x) case EA_REG_##x: + ALL_REGS +#undef ENTRY + debug("A R/M memory operand may not be a register; " + "the base field must be a base."); + return true; + } + } + + scaleAmount = MCOperand::createImm(1); + } + + displacement = MCOperand::createImm(insn.displacement); + + segmentReg = MCOperand::createReg(segmentRegnums[insn.segmentOverride]); + + mcInst.addOperand(baseReg); + mcInst.addOperand(scaleAmount); + mcInst.addOperand(indexReg); + if(!tryAddingSymbolicOperand(insn.displacement + pcrel, false, + insn.startLocation, insn.displacementOffset, + insn.displacementSize, mcInst, Dis)) + mcInst.addOperand(displacement); + mcInst.addOperand(segmentReg); + return false; +} + +/// translateRM - Translates an operand stored in the R/M (and possibly SIB) +/// byte of an instruction to LLVM form, and appends it to an MCInst. +/// +/// @param mcInst - The MCInst to append to. +/// @param operand - The operand, as stored in the descriptor table. +/// @param insn - The instruction to extract Mod, R/M, and SIB fields +/// from. +/// @return - 0 on success; nonzero otherwise +static bool translateRM(MCInst &mcInst, const OperandSpecifier &operand, + InternalInstruction &insn, const MCDisassembler *Dis) { + switch (operand.type) { + default: + debug("Unexpected type for a R/M operand"); + return true; + case TYPE_R8: + case TYPE_R16: + case TYPE_R32: + case TYPE_R64: + case TYPE_Rv: + case TYPE_MM64: + case TYPE_XMM: + case TYPE_XMM32: + case TYPE_XMM64: + case TYPE_XMM128: + case TYPE_XMM256: + case TYPE_XMM512: + case TYPE_VK1: + case TYPE_VK2: + case TYPE_VK4: + case TYPE_VK8: + case TYPE_VK16: + case TYPE_VK32: + case TYPE_VK64: + case TYPE_DEBUGREG: + case TYPE_CONTROLREG: + case TYPE_BNDR: + return translateRMRegister(mcInst, insn); + case TYPE_M: + case TYPE_M8: + case TYPE_M16: + case TYPE_M32: + case TYPE_M64: + case TYPE_M128: + case TYPE_M256: + case TYPE_M512: + case TYPE_Mv: + case TYPE_M32FP: + case TYPE_M64FP: + case TYPE_M80FP: + case TYPE_M1616: + case TYPE_M1632: + case TYPE_M1664: + case TYPE_LEA: + return translateRMMemory(mcInst, insn, Dis); + } +} + +/// translateFPRegister - Translates a stack position on the FPU stack to its +/// LLVM form, and appends it to an MCInst. +/// +/// @param mcInst - The MCInst to append to. +/// @param stackPos - The stack position to translate. +static void translateFPRegister(MCInst &mcInst, + uint8_t stackPos) { + mcInst.addOperand(MCOperand::createReg(X86::ST0 + stackPos)); +} + +/// translateMaskRegister - Translates a 3-bit mask register number to +/// LLVM form, and appends it to an MCInst. +/// +/// @param mcInst - The MCInst to append to. +/// @param maskRegNum - Number of mask register from 0 to 7. +/// @return - false on success; true otherwise. +static bool translateMaskRegister(MCInst &mcInst, + uint8_t maskRegNum) { + if (maskRegNum >= 8) { + debug("Invalid mask register number"); + return true; + } + + mcInst.addOperand(MCOperand::createReg(X86::K0 + maskRegNum)); + return false; +} + +/// translateOperand - Translates an operand stored in an internal instruction +/// to LLVM's format and appends it to an MCInst. +/// +/// @param mcInst - The MCInst to append to. +/// @param operand - The operand, as stored in the descriptor table. +/// @param insn - The internal instruction. +/// @return - false on success; true otherwise. 
+static bool translateOperand(MCInst &mcInst, const OperandSpecifier &operand, + InternalInstruction &insn, + const MCDisassembler *Dis) { + switch (operand.encoding) { + default: + debug("Unhandled operand encoding during translation"); + return true; + case ENCODING_REG: + translateRegister(mcInst, insn.reg); + return false; + case ENCODING_WRITEMASK: + return translateMaskRegister(mcInst, insn.writemask); + CASE_ENCODING_RM: + return translateRM(mcInst, operand, insn, Dis); + case ENCODING_CB: + case ENCODING_CW: + case ENCODING_CD: + case ENCODING_CP: + case ENCODING_CO: + case ENCODING_CT: + debug("Translation of code offsets isn't supported."); + return true; + case ENCODING_IB: + case ENCODING_IW: + case ENCODING_ID: + case ENCODING_IO: + case ENCODING_Iv: + case ENCODING_Ia: + translateImmediate(mcInst, + insn.immediates[insn.numImmediatesTranslated++], + operand, + insn, + Dis); + return false; + case ENCODING_SI: + return translateSrcIndex(mcInst, insn); + case ENCODING_DI: + return translateDstIndex(mcInst, insn); + case ENCODING_RB: + case ENCODING_RW: + case ENCODING_RD: + case ENCODING_RO: + case ENCODING_Rv: + translateRegister(mcInst, insn.opcodeRegister); + return false; + case ENCODING_FP: + translateFPRegister(mcInst, insn.modRM & 7); + return false; + case ENCODING_VVVV: + translateRegister(mcInst, insn.vvvv); + return false; + case ENCODING_DUP: + return translateOperand(mcInst, insn.operands[operand.type - TYPE_DUP0], + insn, Dis); + } +} + +/// translateInstruction - Translates an internal instruction and all its +/// operands to an MCInst. +/// +/// @param mcInst - The MCInst to populate with the instruction's data. +/// @param insn - The internal instruction. +/// @return - false on success; true otherwise. +static bool translateInstruction(MCInst &mcInst, + InternalInstruction &insn, + const MCDisassembler *Dis) { + if (!insn.spec) { + debug("Instruction has no specification"); + return true; + } + + mcInst.clear(); + mcInst.setOpcode(insn.instructionID); + // If when reading the prefix bytes we determined the overlapping 0xf2 or 0xf3 + // prefix bytes should be disassembled as xrelease and xacquire then set the + // opcode to those instead of the rep and repne opcodes. + if (insn.xAcquireRelease) { + if(mcInst.getOpcode() == X86::REP_PREFIX) + mcInst.setOpcode(X86::XRELEASE_PREFIX); + else if(mcInst.getOpcode() == X86::REPNE_PREFIX) + mcInst.setOpcode(X86::XACQUIRE_PREFIX); + } + + insn.numImmediatesTranslated = 0; + + for (const auto &Op : insn.operands) { + if (Op.encoding != ENCODING_NONE) { + if (translateOperand(mcInst, Op, insn, Dis)) { + return true; + } + } + } + + return false; +} + +static MCDisassembler *createX86Disassembler(const Target &T, + const MCSubtargetInfo &STI, + MCContext &Ctx) { + std::unique_ptr<const MCInstrInfo> MII(T.createMCInstrInfo()); + return new X86Disassembler::X86GenericDisassembler(STI, Ctx, std::move(MII)); +} + +extern "C" void LLVMInitializeX86Disassembler() { + // Register the disassembler. 
+ TargetRegistry::RegisterMCDisassembler(TheX86_32Target, + createX86Disassembler); + TargetRegistry::RegisterMCDisassembler(TheX86_64Target, + createX86Disassembler); +} diff --git a/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.h b/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.h new file mode 100644 index 0000000..d7f426b --- /dev/null +++ b/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.h @@ -0,0 +1,112 @@ +//===-- X86Disassembler.h - Disassembler for x86 and x86_64 -----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// The X86 disassembler is a table-driven disassembler for the 16-, 32-, and +// 64-bit X86 instruction sets. The main decode sequence for an assembly +// instruction in this disassembler is: +// +// 1. Read the prefix bytes and determine the attributes of the instruction. +// These attributes, recorded in enum attributeBits +// (X86DisassemblerDecoderCommon.h), form a bitmask. The table CONTEXTS_SYM +// provides a mapping from bitmasks to contexts, which are represented by +// enum InstructionContext (ibid.). +// +// 2. Read the opcode, and determine what kind of opcode it is. The +// disassembler distinguishes four kinds of opcodes, which are enumerated in +// OpcodeType (X86DisassemblerDecoderCommon.h): one-byte (0xnn), two-byte +// (0x0f 0xnn), three-byte-38 (0x0f 0x38 0xnn), or three-byte-3a +// (0x0f 0x3a 0xnn). Mandatory prefixes are treated as part of the context. +// +// 3. Depending on the opcode type, look in one of four ClassDecision structures +// (X86DisassemblerDecoderCommon.h). Use the opcode class to determine which +// OpcodeDecision (ibid.) to look the opcode in. Look up the opcode, to get +// a ModRMDecision (ibid.). +// +// 4. Some instructions, such as escape opcodes or extended opcodes, or even +// instructions that have ModRM*Reg / ModRM*Mem forms in LLVM, need the +// ModR/M byte to complete decode. The ModRMDecision's type is an entry from +// ModRMDecisionType (X86DisassemblerDecoderCommon.h) that indicates if the +// ModR/M byte is required and how to interpret it. +// +// 5. After resolving the ModRMDecision, the disassembler has a unique ID +// of type InstrUID (X86DisassemblerDecoderCommon.h). Looking this ID up in +// INSTRUCTIONS_SYM yields the name of the instruction and the encodings and +// meanings of its operands. +// +// 6. For each operand, its encoding is an entry from OperandEncoding +// (X86DisassemblerDecoderCommon.h) and its type is an entry from +// OperandType (ibid.). The encoding indicates how to read it from the +// instruction; the type indicates how to interpret the value once it has +// been read. For example, a register operand could be stored in the R/M +// field of the ModR/M byte, the REG field of the ModR/M byte, or added to +// the main opcode. This is orthogonal from its meaning (an GPR or an XMM +// register, for instance). Given this information, the operands can be +// extracted and interpreted. +// +// 7. As the last step, the disassembler translates the instruction information +// and operands into a format understandable by the client - in this case, an +// MCInst for use by the MC infrastructure. 
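+//
+// As a concrete illustration of steps 1-7, consider the byte sequence
+// 66 0f 58 c1. The 0x66 prefix contributes the operand-size attribute
+// (step 1); the 0x0f escape selects the two-byte opcode map, where 0x58 is
+// the opcode (steps 2-3); the ModR/M byte 0xc1 (mod = 0b11, reg = 0, r/m = 1)
+// completes the decode to the register form of ADDPD, i.e. ADDPDrr
+// (steps 4-5); finally, the destination XMM0 is read from the reg field and
+// the source XMM1 from the R/M field, and both are appended to the resulting
+// MCInst (steps 6-7).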
+// +// The disassembler is broken broadly into two parts: the table emitter that +// emits the instruction decode tables discussed above during compilation, and +// the disassembler itself. The table emitter is documented in more detail in +// utils/TableGen/X86DisassemblerEmitter.h. +// +// X86Disassembler.h contains the public interface for the disassembler, +// adhering to the MCDisassembler interface. +// X86Disassembler.cpp contains the code responsible for step 7, and for +// invoking the decoder to execute steps 1-6. +// X86DisassemblerDecoderCommon.h contains the definitions needed by both the +// table emitter and the disassembler. +// X86DisassemblerDecoder.h contains the public interface of the decoder, +// factored out into C for possible use by other projects. +// X86DisassemblerDecoder.c contains the source code of the decoder, which is +// responsible for steps 1-6. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_X86_DISASSEMBLER_X86DISASSEMBLER_H +#define LLVM_LIB_TARGET_X86_DISASSEMBLER_X86DISASSEMBLER_H + +#include "X86DisassemblerDecoderCommon.h" +#include "llvm/MC/MCDisassembler.h" + +namespace llvm { + +class MCInst; +class MCInstrInfo; +class MCSubtargetInfo; +class MemoryObject; +class raw_ostream; + +namespace X86Disassembler { + +/// Generic disassembler for all X86 platforms. All each platform class should +/// have to do is subclass the constructor, and provide a different +/// disassemblerMode value. +class X86GenericDisassembler : public MCDisassembler { + std::unique_ptr<const MCInstrInfo> MII; +public: + X86GenericDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx, + std::unique_ptr<const MCInstrInfo> MII); +public: + DecodeStatus getInstruction(MCInst &instr, uint64_t &size, + ArrayRef<uint8_t> Bytes, uint64_t Address, + raw_ostream &vStream, + raw_ostream &cStream) const override; + +private: + DisassemblerMode fMode; +}; + +} // namespace X86Disassembler + +} // namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp new file mode 100644 index 0000000..040143b --- /dev/null +++ b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp @@ -0,0 +1,1909 @@ +//===-- X86DisassemblerDecoder.cpp - Disassembler decoder -----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file is part of the X86 Disassembler. +// It contains the implementation of the instruction decoder. +// Documentation for the disassembler can be found in X86Disassembler.h. +// +//===----------------------------------------------------------------------===// + +#include <cstdarg> /* for va_*() */ +#include <cstdio> /* for vsnprintf() */ +#include <cstdlib> /* for exit() */ +#include <cstring> /* for memset() */ + +#include "X86DisassemblerDecoder.h" + +using namespace llvm::X86Disassembler; + +/// Specifies whether a ModR/M byte is needed and (if so) which +/// instruction each possible value of the ModR/M byte corresponds to. Once +/// this information is known, we have narrowed down to a single instruction. 
+struct ModRMDecision { + uint8_t modrm_type; + uint16_t instructionIDs; +}; + +/// Specifies which set of ModR/M->instruction tables to look at +/// given a particular opcode. +struct OpcodeDecision { + ModRMDecision modRMDecisions[256]; +}; + +/// Specifies which opcode->instruction tables to look at given +/// a particular context (set of attributes). Since there are many possible +/// contexts, the decoder first uses CONTEXTS_SYM to determine which context +/// applies given a specific set of attributes. Hence there are only IC_max +/// entries in this table, rather than 2^(ATTR_max). +struct ContextDecision { + OpcodeDecision opcodeDecisions[IC_max]; +}; + +#include "X86GenDisassemblerTables.inc" + +#ifndef NDEBUG +#define debug(s) do { Debug(__FILE__, __LINE__, s); } while (0) +#else +#define debug(s) do { } while (0) +#endif + + +/* + * contextForAttrs - Client for the instruction context table. Takes a set of + * attributes and returns the appropriate decode context. + * + * @param attrMask - Attributes, from the enumeration attributeBits. + * @return - The InstructionContext to use when looking up an + * an instruction with these attributes. + */ +static InstructionContext contextForAttrs(uint16_t attrMask) { + return static_cast<InstructionContext>(CONTEXTS_SYM[attrMask]); +} + +/* + * modRMRequired - Reads the appropriate instruction table to determine whether + * the ModR/M byte is required to decode a particular instruction. + * + * @param type - The opcode type (i.e., how many bytes it has). + * @param insnContext - The context for the instruction, as returned by + * contextForAttrs. + * @param opcode - The last byte of the instruction's opcode, not counting + * ModR/M extensions and escapes. + * @return - true if the ModR/M byte is required, false otherwise. + */ +static int modRMRequired(OpcodeType type, + InstructionContext insnContext, + uint16_t opcode) { + const struct ContextDecision* decision = nullptr; + + switch (type) { + case ONEBYTE: + decision = &ONEBYTE_SYM; + break; + case TWOBYTE: + decision = &TWOBYTE_SYM; + break; + case THREEBYTE_38: + decision = &THREEBYTE38_SYM; + break; + case THREEBYTE_3A: + decision = &THREEBYTE3A_SYM; + break; + case XOP8_MAP: + decision = &XOP8_MAP_SYM; + break; + case XOP9_MAP: + decision = &XOP9_MAP_SYM; + break; + case XOPA_MAP: + decision = &XOPA_MAP_SYM; + break; + } + + return decision->opcodeDecisions[insnContext].modRMDecisions[opcode]. + modrm_type != MODRM_ONEENTRY; +} + +/* + * decode - Reads the appropriate instruction table to obtain the unique ID of + * an instruction. + * + * @param type - See modRMRequired(). + * @param insnContext - See modRMRequired(). + * @param opcode - See modRMRequired(). + * @param modRM - The ModR/M byte if required, or any value if not. + * @return - The UID of the instruction, or 0 on failure. 
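+ *
+ * For example, a MODRM_FULL decision has 256 entries in modRMTable, one per
+ * possible ModR/M byte, while a MODRM_SPLITREG decision has eight entries
+ * indexed by the reg field plus a second bank of eight that is used when
+ * mod == 0b11.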
+ */ +static InstrUID decode(OpcodeType type, + InstructionContext insnContext, + uint8_t opcode, + uint8_t modRM) { + const struct ModRMDecision* dec = nullptr; + + switch (type) { + case ONEBYTE: + dec = &ONEBYTE_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode]; + break; + case TWOBYTE: + dec = &TWOBYTE_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode]; + break; + case THREEBYTE_38: + dec = &THREEBYTE38_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode]; + break; + case THREEBYTE_3A: + dec = &THREEBYTE3A_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode]; + break; + case XOP8_MAP: + dec = &XOP8_MAP_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode]; + break; + case XOP9_MAP: + dec = &XOP9_MAP_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode]; + break; + case XOPA_MAP: + dec = &XOPA_MAP_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode]; + break; + } + + switch (dec->modrm_type) { + default: + debug("Corrupt table! Unknown modrm_type"); + return 0; + case MODRM_ONEENTRY: + return modRMTable[dec->instructionIDs]; + case MODRM_SPLITRM: + if (modFromModRM(modRM) == 0x3) + return modRMTable[dec->instructionIDs+1]; + return modRMTable[dec->instructionIDs]; + case MODRM_SPLITREG: + if (modFromModRM(modRM) == 0x3) + return modRMTable[dec->instructionIDs+((modRM & 0x38) >> 3)+8]; + return modRMTable[dec->instructionIDs+((modRM & 0x38) >> 3)]; + case MODRM_SPLITMISC: + if (modFromModRM(modRM) == 0x3) + return modRMTable[dec->instructionIDs+(modRM & 0x3f)+8]; + return modRMTable[dec->instructionIDs+((modRM & 0x38) >> 3)]; + case MODRM_FULL: + return modRMTable[dec->instructionIDs+modRM]; + } +} + +/* + * specifierForUID - Given a UID, returns the name and operand specification for + * that instruction. + * + * @param uid - The unique ID for the instruction. This should be returned by + * decode(); specifierForUID will not check bounds. + * @return - A pointer to the specification for that instruction. + */ +static const struct InstructionSpecifier *specifierForUID(InstrUID uid) { + return &INSTRUCTIONS_SYM[uid]; +} + +/* + * consumeByte - Uses the reader function provided by the user to consume one + * byte from the instruction's memory and advance the cursor. + * + * @param insn - The instruction with the reader function to use. The cursor + * for this instruction is advanced. + * @param byte - A pointer to a pre-allocated memory buffer to be populated + * with the data read. + * @return - 0 if the read was successful; nonzero otherwise. + */ +static int consumeByte(struct InternalInstruction* insn, uint8_t* byte) { + int ret = insn->reader(insn->readerArg, byte, insn->readerCursor); + + if (!ret) + ++(insn->readerCursor); + + return ret; +} + +/* + * lookAtByte - Like consumeByte, but does not advance the cursor. + * + * @param insn - See consumeByte(). + * @param byte - See consumeByte(). + * @return - See consumeByte(). 
+ */ +static int lookAtByte(struct InternalInstruction* insn, uint8_t* byte) { + return insn->reader(insn->readerArg, byte, insn->readerCursor); +} + +static void unconsumeByte(struct InternalInstruction* insn) { + insn->readerCursor--; +} + +#define CONSUME_FUNC(name, type) \ + static int name(struct InternalInstruction* insn, type* ptr) { \ + type combined = 0; \ + unsigned offset; \ + for (offset = 0; offset < sizeof(type); ++offset) { \ + uint8_t byte; \ + int ret = insn->reader(insn->readerArg, \ + &byte, \ + insn->readerCursor + offset); \ + if (ret) \ + return ret; \ + combined = combined | ((uint64_t)byte << (offset * 8)); \ + } \ + *ptr = combined; \ + insn->readerCursor += sizeof(type); \ + return 0; \ + } + +/* + * consume* - Use the reader function provided by the user to consume data + * values of various sizes from the instruction's memory and advance the + * cursor appropriately. These readers perform endian conversion. + * + * @param insn - See consumeByte(). + * @param ptr - A pointer to a pre-allocated memory of appropriate size to + * be populated with the data read. + * @return - See consumeByte(). + */ +CONSUME_FUNC(consumeInt8, int8_t) +CONSUME_FUNC(consumeInt16, int16_t) +CONSUME_FUNC(consumeInt32, int32_t) +CONSUME_FUNC(consumeUInt16, uint16_t) +CONSUME_FUNC(consumeUInt32, uint32_t) +CONSUME_FUNC(consumeUInt64, uint64_t) + +/* + * dbgprintf - Uses the logging function provided by the user to log a single + * message, typically without a carriage-return. + * + * @param insn - The instruction containing the logging function. + * @param format - See printf(). + * @param ... - See printf(). + */ +static void dbgprintf(struct InternalInstruction* insn, + const char* format, + ...) { + char buffer[256]; + va_list ap; + + if (!insn->dlog) + return; + + va_start(ap, format); + (void)vsnprintf(buffer, sizeof(buffer), format, ap); + va_end(ap); + + insn->dlog(insn->dlogArg, buffer); + + return; +} + +/* + * setPrefixPresent - Marks that a particular prefix is present at a particular + * location. + * + * @param insn - The instruction to be marked as having the prefix. + * @param prefix - The prefix that is present. + * @param location - The location where the prefix is located (in the address + * space of the instruction's reader). + */ +static void setPrefixPresent(struct InternalInstruction* insn, + uint8_t prefix, + uint64_t location) +{ + insn->prefixPresent[prefix] = 1; + insn->prefixLocations[prefix] = location; +} + +/* + * isPrefixAtLocation - Queries an instruction to determine whether a prefix is + * present at a given location. + * + * @param insn - The instruction to be queried. + * @param prefix - The prefix. + * @param location - The location to query. + * @return - Whether the prefix is at that location. + */ +static bool isPrefixAtLocation(struct InternalInstruction* insn, + uint8_t prefix, + uint64_t location) +{ + return insn->prefixPresent[prefix] == 1 && + insn->prefixLocations[prefix] == location; +} + +/* + * readPrefixes - Consumes all of an instruction's prefix bytes, and marks the + * instruction as having them. Also sets the instruction's default operand, + * address, and other relevant data sizes to report operands correctly. + * + * @param insn - The instruction whose prefixes are to be read. + * @return - 0 if the instruction could be read until the end of the prefix + * bytes, and no prefixes conflicted; nonzero otherwise. 
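+ *
+ * For example, in 32-bit mode a 0x66 prefix flips registerSize and
+ * immediateSize from 4 to 2, and a 0x67 prefix flips addressSize and
+ * displacementSize from 4 to 2; the corresponding defaults are set at the
+ * end of this function.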
+ */ +static int readPrefixes(struct InternalInstruction* insn) { + bool isPrefix = true; + bool prefixGroups[4] = { false }; + uint64_t prefixLocation; + uint8_t byte = 0; + uint8_t nextByte; + + bool hasAdSize = false; + bool hasOpSize = false; + + dbgprintf(insn, "readPrefixes()"); + + while (isPrefix) { + prefixLocation = insn->readerCursor; + + /* If we fail reading prefixes, just stop here and let the opcode reader deal with it */ + if (consumeByte(insn, &byte)) + break; + + /* + * If the byte is a LOCK/REP/REPNE prefix and not a part of the opcode, then + * break and let it be disassembled as a normal "instruction". + */ + if (insn->readerCursor - 1 == insn->startLocation && byte == 0xf0) + break; + + if (insn->readerCursor - 1 == insn->startLocation + && (byte == 0xf2 || byte == 0xf3) + && !lookAtByte(insn, &nextByte)) + { + /* + * If the byte is 0xf2 or 0xf3, and any of the following conditions are + * met: + * - it is followed by a LOCK (0xf0) prefix + * - it is followed by an xchg instruction + * then it should be disassembled as a xacquire/xrelease not repne/rep. + */ + if ((byte == 0xf2 || byte == 0xf3) && + ((nextByte == 0xf0) || + ((nextByte & 0xfe) == 0x86 || (nextByte & 0xf8) == 0x90))) + insn->xAcquireRelease = true; + /* + * Also if the byte is 0xf3, and the following condition is met: + * - it is followed by a "mov mem, reg" (opcode 0x88/0x89) or + * "mov mem, imm" (opcode 0xc6/0xc7) instructions. + * then it should be disassembled as an xrelease not rep. + */ + if (byte == 0xf3 && + (nextByte == 0x88 || nextByte == 0x89 || + nextByte == 0xc6 || nextByte == 0xc7)) + insn->xAcquireRelease = true; + if (insn->mode == MODE_64BIT && (nextByte & 0xf0) == 0x40) { + if (consumeByte(insn, &nextByte)) + return -1; + if (lookAtByte(insn, &nextByte)) + return -1; + unconsumeByte(insn); + } + if (nextByte != 0x0f && nextByte != 0x90) + break; + } + + switch (byte) { + case 0xf0: /* LOCK */ + case 0xf2: /* REPNE/REPNZ */ + case 0xf3: /* REP or REPE/REPZ */ + if (prefixGroups[0]) + dbgprintf(insn, "Redundant Group 1 prefix"); + prefixGroups[0] = true; + setPrefixPresent(insn, byte, prefixLocation); + break; + case 0x2e: /* CS segment override -OR- Branch not taken */ + case 0x36: /* SS segment override -OR- Branch taken */ + case 0x3e: /* DS segment override */ + case 0x26: /* ES segment override */ + case 0x64: /* FS segment override */ + case 0x65: /* GS segment override */ + switch (byte) { + case 0x2e: + insn->segmentOverride = SEG_OVERRIDE_CS; + break; + case 0x36: + insn->segmentOverride = SEG_OVERRIDE_SS; + break; + case 0x3e: + insn->segmentOverride = SEG_OVERRIDE_DS; + break; + case 0x26: + insn->segmentOverride = SEG_OVERRIDE_ES; + break; + case 0x64: + insn->segmentOverride = SEG_OVERRIDE_FS; + break; + case 0x65: + insn->segmentOverride = SEG_OVERRIDE_GS; + break; + default: + debug("Unhandled override"); + return -1; + } + if (prefixGroups[1]) + dbgprintf(insn, "Redundant Group 2 prefix"); + prefixGroups[1] = true; + setPrefixPresent(insn, byte, prefixLocation); + break; + case 0x66: /* Operand-size override */ + if (prefixGroups[2]) + dbgprintf(insn, "Redundant Group 3 prefix"); + prefixGroups[2] = true; + hasOpSize = true; + setPrefixPresent(insn, byte, prefixLocation); + break; + case 0x67: /* Address-size override */ + if (prefixGroups[3]) + dbgprintf(insn, "Redundant Group 4 prefix"); + prefixGroups[3] = true; + hasAdSize = true; + setPrefixPresent(insn, byte, prefixLocation); + break; + default: /* Not a prefix byte */ + isPrefix = false; + break; + } + + if 
(isPrefix) + dbgprintf(insn, "Found prefix 0x%hhx", byte); + } + + insn->vectorExtensionType = TYPE_NO_VEX_XOP; + + if (byte == 0x62) { + uint8_t byte1, byte2; + + if (consumeByte(insn, &byte1)) { + dbgprintf(insn, "Couldn't read second byte of EVEX prefix"); + return -1; + } + + if (lookAtByte(insn, &byte2)) { + dbgprintf(insn, "Couldn't read third byte of EVEX prefix"); + return -1; + } + + if ((insn->mode == MODE_64BIT || (byte1 & 0xc0) == 0xc0) && + ((~byte1 & 0xc) == 0xc) && ((byte2 & 0x4) == 0x4)) { + insn->vectorExtensionType = TYPE_EVEX; + } else { + unconsumeByte(insn); /* unconsume byte1 */ + unconsumeByte(insn); /* unconsume byte */ + insn->necessaryPrefixLocation = insn->readerCursor - 2; + } + + if (insn->vectorExtensionType == TYPE_EVEX) { + insn->vectorExtensionPrefix[0] = byte; + insn->vectorExtensionPrefix[1] = byte1; + if (consumeByte(insn, &insn->vectorExtensionPrefix[2])) { + dbgprintf(insn, "Couldn't read third byte of EVEX prefix"); + return -1; + } + if (consumeByte(insn, &insn->vectorExtensionPrefix[3])) { + dbgprintf(insn, "Couldn't read fourth byte of EVEX prefix"); + return -1; + } + + /* We simulate the REX prefix for simplicity's sake */ + if (insn->mode == MODE_64BIT) { + insn->rexPrefix = 0x40 + | (wFromEVEX3of4(insn->vectorExtensionPrefix[2]) << 3) + | (rFromEVEX2of4(insn->vectorExtensionPrefix[1]) << 2) + | (xFromEVEX2of4(insn->vectorExtensionPrefix[1]) << 1) + | (bFromEVEX2of4(insn->vectorExtensionPrefix[1]) << 0); + } + + dbgprintf(insn, "Found EVEX prefix 0x%hhx 0x%hhx 0x%hhx 0x%hhx", + insn->vectorExtensionPrefix[0], insn->vectorExtensionPrefix[1], + insn->vectorExtensionPrefix[2], insn->vectorExtensionPrefix[3]); + } + } else if (byte == 0xc4) { + uint8_t byte1; + + if (lookAtByte(insn, &byte1)) { + dbgprintf(insn, "Couldn't read second byte of VEX"); + return -1; + } + + if (insn->mode == MODE_64BIT || (byte1 & 0xc0) == 0xc0) { + insn->vectorExtensionType = TYPE_VEX_3B; + insn->necessaryPrefixLocation = insn->readerCursor - 1; + } else { + unconsumeByte(insn); + insn->necessaryPrefixLocation = insn->readerCursor - 1; + } + + if (insn->vectorExtensionType == TYPE_VEX_3B) { + insn->vectorExtensionPrefix[0] = byte; + consumeByte(insn, &insn->vectorExtensionPrefix[1]); + consumeByte(insn, &insn->vectorExtensionPrefix[2]); + + /* We simulate the REX prefix for simplicity's sake */ + + if (insn->mode == MODE_64BIT) { + insn->rexPrefix = 0x40 + | (wFromVEX3of3(insn->vectorExtensionPrefix[2]) << 3) + | (rFromVEX2of3(insn->vectorExtensionPrefix[1]) << 2) + | (xFromVEX2of3(insn->vectorExtensionPrefix[1]) << 1) + | (bFromVEX2of3(insn->vectorExtensionPrefix[1]) << 0); + } + + dbgprintf(insn, "Found VEX prefix 0x%hhx 0x%hhx 0x%hhx", + insn->vectorExtensionPrefix[0], insn->vectorExtensionPrefix[1], + insn->vectorExtensionPrefix[2]); + } + } else if (byte == 0xc5) { + uint8_t byte1; + + if (lookAtByte(insn, &byte1)) { + dbgprintf(insn, "Couldn't read second byte of VEX"); + return -1; + } + + if (insn->mode == MODE_64BIT || (byte1 & 0xc0) == 0xc0) { + insn->vectorExtensionType = TYPE_VEX_2B; + } else { + unconsumeByte(insn); + } + + if (insn->vectorExtensionType == TYPE_VEX_2B) { + insn->vectorExtensionPrefix[0] = byte; + consumeByte(insn, &insn->vectorExtensionPrefix[1]); + + if (insn->mode == MODE_64BIT) { + insn->rexPrefix = 0x40 + | (rFromVEX2of2(insn->vectorExtensionPrefix[1]) << 2); + } + + switch (ppFromVEX2of2(insn->vectorExtensionPrefix[1])) { + default: + break; + case VEX_PREFIX_66: + hasOpSize = true; + break; + } + + dbgprintf(insn, "Found VEX prefix 
0x%hhx 0x%hhx", + insn->vectorExtensionPrefix[0], + insn->vectorExtensionPrefix[1]); + } + } else if (byte == 0x8f) { + uint8_t byte1; + + if (lookAtByte(insn, &byte1)) { + dbgprintf(insn, "Couldn't read second byte of XOP"); + return -1; + } + + if ((byte1 & 0x38) != 0x0) { /* 0 in these 3 bits is a POP instruction. */ + insn->vectorExtensionType = TYPE_XOP; + insn->necessaryPrefixLocation = insn->readerCursor - 1; + } else { + unconsumeByte(insn); + insn->necessaryPrefixLocation = insn->readerCursor - 1; + } + + if (insn->vectorExtensionType == TYPE_XOP) { + insn->vectorExtensionPrefix[0] = byte; + consumeByte(insn, &insn->vectorExtensionPrefix[1]); + consumeByte(insn, &insn->vectorExtensionPrefix[2]); + + /* We simulate the REX prefix for simplicity's sake */ + + if (insn->mode == MODE_64BIT) { + insn->rexPrefix = 0x40 + | (wFromXOP3of3(insn->vectorExtensionPrefix[2]) << 3) + | (rFromXOP2of3(insn->vectorExtensionPrefix[1]) << 2) + | (xFromXOP2of3(insn->vectorExtensionPrefix[1]) << 1) + | (bFromXOP2of3(insn->vectorExtensionPrefix[1]) << 0); + } + + switch (ppFromXOP3of3(insn->vectorExtensionPrefix[2])) { + default: + break; + case VEX_PREFIX_66: + hasOpSize = true; + break; + } + + dbgprintf(insn, "Found XOP prefix 0x%hhx 0x%hhx 0x%hhx", + insn->vectorExtensionPrefix[0], insn->vectorExtensionPrefix[1], + insn->vectorExtensionPrefix[2]); + } + } else { + if (insn->mode == MODE_64BIT) { + if ((byte & 0xf0) == 0x40) { + uint8_t opcodeByte; + + if (lookAtByte(insn, &opcodeByte) || ((opcodeByte & 0xf0) == 0x40)) { + dbgprintf(insn, "Redundant REX prefix"); + return -1; + } + + insn->rexPrefix = byte; + insn->necessaryPrefixLocation = insn->readerCursor - 2; + + dbgprintf(insn, "Found REX prefix 0x%hhx", byte); + } else { + unconsumeByte(insn); + insn->necessaryPrefixLocation = insn->readerCursor - 1; + } + } else { + unconsumeByte(insn); + insn->necessaryPrefixLocation = insn->readerCursor - 1; + } + } + + if (insn->mode == MODE_16BIT) { + insn->registerSize = (hasOpSize ? 4 : 2); + insn->addressSize = (hasAdSize ? 4 : 2); + insn->displacementSize = (hasAdSize ? 4 : 2); + insn->immediateSize = (hasOpSize ? 4 : 2); + } else if (insn->mode == MODE_32BIT) { + insn->registerSize = (hasOpSize ? 2 : 4); + insn->addressSize = (hasAdSize ? 2 : 4); + insn->displacementSize = (hasAdSize ? 2 : 4); + insn->immediateSize = (hasOpSize ? 2 : 4); + } else if (insn->mode == MODE_64BIT) { + if (insn->rexPrefix && wFromREX(insn->rexPrefix)) { + insn->registerSize = 8; + insn->addressSize = (hasAdSize ? 4 : 8); + insn->displacementSize = 4; + insn->immediateSize = 4; + } else if (insn->rexPrefix) { + insn->registerSize = (hasOpSize ? 2 : 4); + insn->addressSize = (hasAdSize ? 4 : 8); + insn->displacementSize = (hasOpSize ? 2 : 4); + insn->immediateSize = (hasOpSize ? 2 : 4); + } else { + insn->registerSize = (hasOpSize ? 2 : 4); + insn->addressSize = (hasAdSize ? 4 : 8); + insn->displacementSize = (hasOpSize ? 2 : 4); + insn->immediateSize = (hasOpSize ? 2 : 4); + } + } + + return 0; +} + +/* + * readOpcode - Reads the opcode (excepting the ModR/M byte in the case of + * extended or escape opcodes). + * + * @param insn - The instruction whose opcode is to be read. + * @return - 0 if the opcode could be read successfully; nonzero otherwise. 
+ */
+static int readOpcode(struct InternalInstruction* insn) {
+  /* Determine the length of the primary opcode */
+
+  uint8_t current;
+
+  dbgprintf(insn, "readOpcode()");
+
+  insn->opcodeType = ONEBYTE;
+
+  if (insn->vectorExtensionType == TYPE_EVEX) {
+    switch (mmFromEVEX2of4(insn->vectorExtensionPrefix[1])) {
+    default:
+      dbgprintf(insn, "Unhandled mm field for instruction (0x%hhx)",
+                mmFromEVEX2of4(insn->vectorExtensionPrefix[1]));
+      return -1;
+    case VEX_LOB_0F:
+      insn->opcodeType = TWOBYTE;
+      return consumeByte(insn, &insn->opcode);
+    case VEX_LOB_0F38:
+      insn->opcodeType = THREEBYTE_38;
+      return consumeByte(insn, &insn->opcode);
+    case VEX_LOB_0F3A:
+      insn->opcodeType = THREEBYTE_3A;
+      return consumeByte(insn, &insn->opcode);
+    }
+  } else if (insn->vectorExtensionType == TYPE_VEX_3B) {
+    switch (mmmmmFromVEX2of3(insn->vectorExtensionPrefix[1])) {
+    default:
+      dbgprintf(insn, "Unhandled m-mmmm field for instruction (0x%hhx)",
+                mmmmmFromVEX2of3(insn->vectorExtensionPrefix[1]));
+      return -1;
+    case VEX_LOB_0F:
+      insn->opcodeType = TWOBYTE;
+      return consumeByte(insn, &insn->opcode);
+    case VEX_LOB_0F38:
+      insn->opcodeType = THREEBYTE_38;
+      return consumeByte(insn, &insn->opcode);
+    case VEX_LOB_0F3A:
+      insn->opcodeType = THREEBYTE_3A;
+      return consumeByte(insn, &insn->opcode);
+    }
+  } else if (insn->vectorExtensionType == TYPE_VEX_2B) {
+    insn->opcodeType = TWOBYTE;
+    return consumeByte(insn, &insn->opcode);
+  } else if (insn->vectorExtensionType == TYPE_XOP) {
+    switch (mmmmmFromXOP2of3(insn->vectorExtensionPrefix[1])) {
+    default:
+      dbgprintf(insn, "Unhandled m-mmmm field for instruction (0x%hhx)",
+                mmmmmFromVEX2of3(insn->vectorExtensionPrefix[1]));
+      return -1;
+    case XOP_MAP_SELECT_8:
+      insn->opcodeType = XOP8_MAP;
+      return consumeByte(insn, &insn->opcode);
+    case XOP_MAP_SELECT_9:
+      insn->opcodeType = XOP9_MAP;
+      return consumeByte(insn, &insn->opcode);
+    case XOP_MAP_SELECT_A:
+      insn->opcodeType = XOPA_MAP;
+      return consumeByte(insn, &insn->opcode);
+    }
+  }
+
+  if (consumeByte(insn, &current))
+    return -1;
+
+  if (current == 0x0f) {
+    dbgprintf(insn, "Found a two-byte escape prefix (0x%hhx)", current);
+
+    if (consumeByte(insn, &current))
+      return -1;
+
+    if (current == 0x38) {
+      dbgprintf(insn, "Found a three-byte escape prefix (0x%hhx)", current);
+
+      if (consumeByte(insn, &current))
+        return -1;
+
+      insn->opcodeType = THREEBYTE_38;
+    } else if (current == 0x3a) {
+      dbgprintf(insn, "Found a three-byte escape prefix (0x%hhx)", current);
+
+      if (consumeByte(insn, &current))
+        return -1;
+
+      insn->opcodeType = THREEBYTE_3A;
+    } else {
+      dbgprintf(insn, "Didn't find a three-byte escape prefix");
+
+      insn->opcodeType = TWOBYTE;
+    }
+  }
+
+  /*
+   * At this point we have consumed the full opcode.
+   * Anything we consume from here on must be unconsumed.
+   */
+
+  insn->opcode = current;
+
+  return 0;
+}
+
+static int readModRM(struct InternalInstruction* insn);
+
+/*
+ * getIDWithAttrMask - Determines the ID of an instruction, consuming
+ *   the ModR/M byte as appropriate for extended and escape opcodes,
+ *   and using a supplied attribute mask.
+ *
+ * @param instructionID - A pointer whose target is filled in with the ID of the
+ *   instruction.
+ * @param insn - The instruction whose ID is to be determined.
+ * @param attrMask - The attribute mask to search.
+ * @return - 0 if the ModR/M could be read when needed or was not
+ *   needed; nonzero otherwise.
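+ *
+ * For example, the one-byte group opcode 0xf7 needs its ModR/M reg field to
+ * distinguish TEST from NOT, NEG, MUL, IMUL, DIV and IDIV, so modRMRequired()
+ * reports that the ModR/M byte is needed and it is consumed here before
+ * decode() is called.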
+ */ +static int getIDWithAttrMask(uint16_t* instructionID, + struct InternalInstruction* insn, + uint16_t attrMask) { + bool hasModRMExtension; + + InstructionContext instructionClass = contextForAttrs(attrMask); + + hasModRMExtension = modRMRequired(insn->opcodeType, + instructionClass, + insn->opcode); + + if (hasModRMExtension) { + if (readModRM(insn)) + return -1; + + *instructionID = decode(insn->opcodeType, + instructionClass, + insn->opcode, + insn->modRM); + } else { + *instructionID = decode(insn->opcodeType, + instructionClass, + insn->opcode, + 0); + } + + return 0; +} + +/* + * is16BitEquivalent - Determines whether two instruction names refer to + * equivalent instructions but one is 16-bit whereas the other is not. + * + * @param orig - The instruction that is not 16-bit + * @param equiv - The instruction that is 16-bit + */ +static bool is16BitEquivalent(const char* orig, const char* equiv) { + off_t i; + + for (i = 0;; i++) { + if (orig[i] == '\0' && equiv[i] == '\0') + return true; + if (orig[i] == '\0' || equiv[i] == '\0') + return false; + if (orig[i] != equiv[i]) { + if ((orig[i] == 'Q' || orig[i] == 'L') && equiv[i] == 'W') + continue; + if ((orig[i] == '6' || orig[i] == '3') && equiv[i] == '1') + continue; + if ((orig[i] == '4' || orig[i] == '2') && equiv[i] == '6') + continue; + return false; + } + } +} + +/* + * is64Bit - Determines whether this instruction is a 64-bit instruction. + * + * @param name - The instruction that is not 16-bit + */ +static bool is64Bit(const char* name) { + off_t i; + + for (i = 0;; ++i) { + if (name[i] == '\0') + return false; + if (name[i] == '6' && name[i+1] == '4') + return true; + } +} + +/* + * getID - Determines the ID of an instruction, consuming the ModR/M byte as + * appropriate for extended and escape opcodes. Determines the attributes and + * context for the instruction before doing so. + * + * @param insn - The instruction whose ID is to be determined. + * @return - 0 if the ModR/M could be read when needed or was not needed; + * nonzero otherwise. + */ +static int getID(struct InternalInstruction* insn, const void *miiArg) { + uint16_t attrMask; + uint16_t instructionID; + + dbgprintf(insn, "getID()"); + + attrMask = ATTR_NONE; + + if (insn->mode == MODE_64BIT) + attrMask |= ATTR_64BIT; + + if (insn->vectorExtensionType != TYPE_NO_VEX_XOP) { + attrMask |= (insn->vectorExtensionType == TYPE_EVEX) ? 
ATTR_EVEX : ATTR_VEX; + + if (insn->vectorExtensionType == TYPE_EVEX) { + switch (ppFromEVEX3of4(insn->vectorExtensionPrefix[2])) { + case VEX_PREFIX_66: + attrMask |= ATTR_OPSIZE; + break; + case VEX_PREFIX_F3: + attrMask |= ATTR_XS; + break; + case VEX_PREFIX_F2: + attrMask |= ATTR_XD; + break; + } + + if (zFromEVEX4of4(insn->vectorExtensionPrefix[3])) + attrMask |= ATTR_EVEXKZ; + if (bFromEVEX4of4(insn->vectorExtensionPrefix[3])) + attrMask |= ATTR_EVEXB; + if (aaaFromEVEX4of4(insn->vectorExtensionPrefix[3])) + attrMask |= ATTR_EVEXK; + if (lFromEVEX4of4(insn->vectorExtensionPrefix[3])) + attrMask |= ATTR_EVEXL; + if (l2FromEVEX4of4(insn->vectorExtensionPrefix[3])) + attrMask |= ATTR_EVEXL2; + } else if (insn->vectorExtensionType == TYPE_VEX_3B) { + switch (ppFromVEX3of3(insn->vectorExtensionPrefix[2])) { + case VEX_PREFIX_66: + attrMask |= ATTR_OPSIZE; + break; + case VEX_PREFIX_F3: + attrMask |= ATTR_XS; + break; + case VEX_PREFIX_F2: + attrMask |= ATTR_XD; + break; + } + + if (lFromVEX3of3(insn->vectorExtensionPrefix[2])) + attrMask |= ATTR_VEXL; + } else if (insn->vectorExtensionType == TYPE_VEX_2B) { + switch (ppFromVEX2of2(insn->vectorExtensionPrefix[1])) { + case VEX_PREFIX_66: + attrMask |= ATTR_OPSIZE; + break; + case VEX_PREFIX_F3: + attrMask |= ATTR_XS; + break; + case VEX_PREFIX_F2: + attrMask |= ATTR_XD; + break; + } + + if (lFromVEX2of2(insn->vectorExtensionPrefix[1])) + attrMask |= ATTR_VEXL; + } else if (insn->vectorExtensionType == TYPE_XOP) { + switch (ppFromXOP3of3(insn->vectorExtensionPrefix[2])) { + case VEX_PREFIX_66: + attrMask |= ATTR_OPSIZE; + break; + case VEX_PREFIX_F3: + attrMask |= ATTR_XS; + break; + case VEX_PREFIX_F2: + attrMask |= ATTR_XD; + break; + } + + if (lFromXOP3of3(insn->vectorExtensionPrefix[2])) + attrMask |= ATTR_VEXL; + } else { + return -1; + } + } else { + if (insn->mode != MODE_16BIT && isPrefixAtLocation(insn, 0x66, insn->necessaryPrefixLocation)) + attrMask |= ATTR_OPSIZE; + else if (isPrefixAtLocation(insn, 0x67, insn->necessaryPrefixLocation)) + attrMask |= ATTR_ADSIZE; + else if (isPrefixAtLocation(insn, 0xf3, insn->necessaryPrefixLocation)) + attrMask |= ATTR_XS; + else if (isPrefixAtLocation(insn, 0xf2, insn->necessaryPrefixLocation)) + attrMask |= ATTR_XD; + } + + if (insn->rexPrefix & 0x08) + attrMask |= ATTR_REXW; + + /* + * JCXZ/JECXZ need special handling for 16-bit mode because the meaning + * of the AdSize prefix is inverted w.r.t. 32-bit mode. + */ + if (insn->mode == MODE_16BIT && insn->opcodeType == ONEBYTE && + insn->opcode == 0xE3) + attrMask ^= ATTR_ADSIZE; + + /* + * In 64-bit mode all f64 superscripted opcodes ignore opcode size prefix + * CALL/JMP/JCC instructions need to ignore 0x66 and consume 4 bytes + */ + + if (insn->mode == MODE_64BIT && + isPrefixAtLocation(insn, 0x66, insn->necessaryPrefixLocation)) { + switch (insn->opcode) { + case 0xE8: + case 0xE9: + // Take care of psubsb and other mmx instructions. + if (insn->opcodeType == ONEBYTE) { + attrMask ^= ATTR_OPSIZE; + insn->immediateSize = 4; + insn->displacementSize = 4; + } + break; + case 0x82: + case 0x83: + case 0x84: + case 0x85: + case 0x86: + case 0x87: + case 0x88: + case 0x89: + case 0x8A: + case 0x8B: + case 0x8C: + case 0x8D: + case 0x8E: + case 0x8F: + // Take care of lea and three byte ops. 
+      if (insn->opcodeType == TWOBYTE) {
+        attrMask ^= ATTR_OPSIZE;
+        insn->immediateSize = 4;
+        insn->displacementSize = 4;
+      }
+      break;
+    }
+  }
+
+  if (getIDWithAttrMask(&instructionID, insn, attrMask))
+    return -1;
+
+  /* The following clauses compensate for limitations of the tables. */
+
+  if (insn->mode != MODE_64BIT &&
+      insn->vectorExtensionType != TYPE_NO_VEX_XOP) {
+    /*
+     * The tables can't distinguish between cases where the W-bit is used to
+     * select register size and cases where it's a required part of the opcode.
+     */
+    if ((insn->vectorExtensionType == TYPE_EVEX &&
+         wFromEVEX3of4(insn->vectorExtensionPrefix[2])) ||
+        (insn->vectorExtensionType == TYPE_VEX_3B &&
+         wFromVEX3of3(insn->vectorExtensionPrefix[2])) ||
+        (insn->vectorExtensionType == TYPE_XOP &&
+         wFromXOP3of3(insn->vectorExtensionPrefix[2]))) {
+
+      uint16_t instructionIDWithREXW;
+      if (getIDWithAttrMask(&instructionIDWithREXW,
+                            insn, attrMask | ATTR_REXW)) {
+        insn->instructionID = instructionID;
+        insn->spec = specifierForUID(instructionID);
+        return 0;
+      }
+
+      const char *SpecName = GetInstrName(instructionIDWithREXW, miiArg);
+      // If the REXW form is not a 64-bit instruction, switch to that opcode.
+      if (!is64Bit(SpecName)) {
+        insn->instructionID = instructionIDWithREXW;
+        insn->spec = specifierForUID(instructionIDWithREXW);
+        return 0;
+      }
+    }
+  }
+
+  /*
+   * Absolute moves need special handling.
+   * -For 16-bit mode, because the meaning of the AdSize and OpSize prefixes is
+   *  inverted w.r.t. 32-bit mode.
+   * -For 32-bit mode, we need to ensure the ADSIZE prefix is observed in
+   *  any position.
+   */
+  if (insn->opcodeType == ONEBYTE && ((insn->opcode & 0xFC) == 0xA0)) {
+    /* Make sure we observed the prefixes in any position. */
+    if (insn->prefixPresent[0x67])
+      attrMask |= ATTR_ADSIZE;
+    if (insn->prefixPresent[0x66])
+      attrMask |= ATTR_OPSIZE;
+
+    /* In 16-bit, invert the attributes. */
+    if (insn->mode == MODE_16BIT)
+      attrMask ^= ATTR_ADSIZE | ATTR_OPSIZE;
+
+    if (getIDWithAttrMask(&instructionID, insn, attrMask))
+      return -1;
+
+    insn->instructionID = instructionID;
+    insn->spec = specifierForUID(instructionID);
+    return 0;
+  }
+
+  if ((insn->mode == MODE_16BIT || insn->prefixPresent[0x66]) &&
+      !(attrMask & ATTR_OPSIZE)) {
+    /*
+     * The instruction tables make no distinction between instructions that
+     * allow OpSize anywhere (i.e., 16-bit operations) and instructions that
+     * need it in a particular spot (i.e., many MMX operations). In general
+     * we're conservative, but in the specific case where OpSize is present
+     * but not in the right place we check whether there's a 16-bit operation.
+ */ + + const struct InstructionSpecifier *spec; + uint16_t instructionIDWithOpsize; + const char *specName, *specWithOpSizeName; + + spec = specifierForUID(instructionID); + + if (getIDWithAttrMask(&instructionIDWithOpsize, + insn, + attrMask | ATTR_OPSIZE)) { + /* + * ModRM required with OpSize but not present; give up and return version + * without OpSize set + */ + + insn->instructionID = instructionID; + insn->spec = spec; + return 0; + } + + specName = GetInstrName(instructionID, miiArg); + specWithOpSizeName = GetInstrName(instructionIDWithOpsize, miiArg); + + if (is16BitEquivalent(specName, specWithOpSizeName) && + (insn->mode == MODE_16BIT) ^ insn->prefixPresent[0x66]) { + insn->instructionID = instructionIDWithOpsize; + insn->spec = specifierForUID(instructionIDWithOpsize); + } else { + insn->instructionID = instructionID; + insn->spec = spec; + } + return 0; + } + + if (insn->opcodeType == ONEBYTE && insn->opcode == 0x90 && + insn->rexPrefix & 0x01) { + /* + * NOOP shouldn't decode as NOOP if REX.b is set. Instead + * it should decode as XCHG %r8, %eax. + */ + + const struct InstructionSpecifier *spec; + uint16_t instructionIDWithNewOpcode; + const struct InstructionSpecifier *specWithNewOpcode; + + spec = specifierForUID(instructionID); + + /* Borrow opcode from one of the other XCHGar opcodes */ + insn->opcode = 0x91; + + if (getIDWithAttrMask(&instructionIDWithNewOpcode, + insn, + attrMask)) { + insn->opcode = 0x90; + + insn->instructionID = instructionID; + insn->spec = spec; + return 0; + } + + specWithNewOpcode = specifierForUID(instructionIDWithNewOpcode); + + /* Change back */ + insn->opcode = 0x90; + + insn->instructionID = instructionIDWithNewOpcode; + insn->spec = specWithNewOpcode; + + return 0; + } + + insn->instructionID = instructionID; + insn->spec = specifierForUID(insn->instructionID); + + return 0; +} + +/* + * readSIB - Consumes the SIB byte to determine addressing information for an + * instruction. + * + * @param insn - The instruction whose SIB byte is to be read. + * @return - 0 if the SIB byte was successfully read; nonzero otherwise. + */ +static int readSIB(struct InternalInstruction* insn) { + SIBIndex sibIndexBase = SIB_INDEX_NONE; + SIBBase sibBaseBase = SIB_BASE_NONE; + uint8_t index, base; + + dbgprintf(insn, "readSIB()"); + + if (insn->consumedSIB) + return 0; + + insn->consumedSIB = true; + + switch (insn->addressSize) { + case 2: + dbgprintf(insn, "SIB-based addressing doesn't work in 16-bit mode"); + return -1; + case 4: + sibIndexBase = SIB_INDEX_EAX; + sibBaseBase = SIB_BASE_EAX; + break; + case 8: + sibIndexBase = SIB_INDEX_RAX; + sibBaseBase = SIB_BASE_RAX; + break; + } + + if (consumeByte(insn, &insn->sib)) + return -1; + + index = indexFromSIB(insn->sib) | (xFromREX(insn->rexPrefix) << 3); + + // FIXME: The fifth bit (bit index 4) is only to be used for instructions + // that understand VSIB indexing. ORing the bit in here is mildy dangerous + // because performing math on an 'enum SIBIndex' can produce garbage. 
+ // Excluding the "none" value, it should cover 6 spaces of register names: + // - 16 possibilities for 16-bit GPR starting at SIB_INDEX_BX_SI + // - 16 possibilities for 32-bit GPR starting at SIB_INDEX_EAX + // - 16 possibilities for 64-bit GPR starting at SIB_INDEX_RAX + // - 32 possibilities for each of XMM, YMM, ZMM registers + // When sibIndexBase gets assigned SIB_INDEX_RAX as it does in 64-bit mode, + // summing in a fully decoded index between 0 and 31 can end up with a value + // that looks like something in the low half of the XMM range. + // translateRMMemory() tries to reverse the damage, with only partial success, + // as evidenced by known bugs in "test/MC/Disassembler/X86/x86-64.txt" + if (insn->vectorExtensionType == TYPE_EVEX) + index |= v2FromEVEX4of4(insn->vectorExtensionPrefix[3]) << 4; + + if (index == 0x4) { + insn->sibIndex = SIB_INDEX_NONE; + } else { + insn->sibIndex = (SIBIndex)(sibIndexBase + index); + } + + insn->sibScale = 1 << scaleFromSIB(insn->sib); + + base = baseFromSIB(insn->sib) | (bFromREX(insn->rexPrefix) << 3); + + switch (base) { + case 0x5: + case 0xd: + switch (modFromModRM(insn->modRM)) { + case 0x0: + insn->eaDisplacement = EA_DISP_32; + insn->sibBase = SIB_BASE_NONE; + break; + case 0x1: + insn->eaDisplacement = EA_DISP_8; + insn->sibBase = (SIBBase)(sibBaseBase + base); + break; + case 0x2: + insn->eaDisplacement = EA_DISP_32; + insn->sibBase = (SIBBase)(sibBaseBase + base); + break; + case 0x3: + debug("Cannot have Mod = 0b11 and a SIB byte"); + return -1; + } + break; + default: + insn->sibBase = (SIBBase)(sibBaseBase + base); + break; + } + + return 0; +} + +/* + * readDisplacement - Consumes the displacement of an instruction. + * + * @param insn - The instruction whose displacement is to be read. + * @return - 0 if the displacement byte was successfully read; nonzero + * otherwise. + */ +static int readDisplacement(struct InternalInstruction* insn) { + int8_t d8; + int16_t d16; + int32_t d32; + + dbgprintf(insn, "readDisplacement()"); + + if (insn->consumedDisplacement) + return 0; + + insn->consumedDisplacement = true; + insn->displacementOffset = insn->readerCursor - insn->startLocation; + + switch (insn->eaDisplacement) { + case EA_DISP_NONE: + insn->consumedDisplacement = false; + break; + case EA_DISP_8: + if (consumeInt8(insn, &d8)) + return -1; + insn->displacement = d8; + break; + case EA_DISP_16: + if (consumeInt16(insn, &d16)) + return -1; + insn->displacement = d16; + break; + case EA_DISP_32: + if (consumeInt32(insn, &d32)) + return -1; + insn->displacement = d32; + break; + } + + insn->consumedDisplacement = true; + return 0; +} + +/* + * readModRM - Consumes all addressing information (ModR/M byte, SIB byte, and + * displacement) for an instruction and interprets it. + * + * @param insn - The instruction whose addressing information is to be read. + * @return - 0 if the information was successfully read; nonzero otherwise. + */ +static int readModRM(struct InternalInstruction* insn) { + uint8_t mod, rm, reg; + + dbgprintf(insn, "readModRM()"); + + if (insn->consumedModRM) + return 0; + + if (consumeByte(insn, &insn->modRM)) + return -1; + insn->consumedModRM = true; + + mod = modFromModRM(insn->modRM); + rm = rmFromModRM(insn->modRM); + reg = regFromModRM(insn->modRM); + + /* + * This goes by insn->registerSize to pick the correct register, which messes + * up if we're using (say) XMM or 8-bit register operands. That gets fixed in + * fixupReg(). 
+ */ + switch (insn->registerSize) { + case 2: + insn->regBase = MODRM_REG_AX; + insn->eaRegBase = EA_REG_AX; + break; + case 4: + insn->regBase = MODRM_REG_EAX; + insn->eaRegBase = EA_REG_EAX; + break; + case 8: + insn->regBase = MODRM_REG_RAX; + insn->eaRegBase = EA_REG_RAX; + break; + } + + reg |= rFromREX(insn->rexPrefix) << 3; + rm |= bFromREX(insn->rexPrefix) << 3; + if (insn->vectorExtensionType == TYPE_EVEX) { + reg |= r2FromEVEX2of4(insn->vectorExtensionPrefix[1]) << 4; + rm |= xFromEVEX2of4(insn->vectorExtensionPrefix[1]) << 4; + } + + insn->reg = (Reg)(insn->regBase + reg); + + switch (insn->addressSize) { + case 2: + insn->eaBaseBase = EA_BASE_BX_SI; + + switch (mod) { + case 0x0: + if (rm == 0x6) { + insn->eaBase = EA_BASE_NONE; + insn->eaDisplacement = EA_DISP_16; + if (readDisplacement(insn)) + return -1; + } else { + insn->eaBase = (EABase)(insn->eaBaseBase + rm); + insn->eaDisplacement = EA_DISP_NONE; + } + break; + case 0x1: + insn->eaBase = (EABase)(insn->eaBaseBase + rm); + insn->eaDisplacement = EA_DISP_8; + insn->displacementSize = 1; + if (readDisplacement(insn)) + return -1; + break; + case 0x2: + insn->eaBase = (EABase)(insn->eaBaseBase + rm); + insn->eaDisplacement = EA_DISP_16; + if (readDisplacement(insn)) + return -1; + break; + case 0x3: + insn->eaBase = (EABase)(insn->eaRegBase + rm); + if (readDisplacement(insn)) + return -1; + break; + } + break; + case 4: + case 8: + insn->eaBaseBase = (insn->addressSize == 4 ? EA_BASE_EAX : EA_BASE_RAX); + + switch (mod) { + case 0x0: + insn->eaDisplacement = EA_DISP_NONE; /* readSIB may override this */ + // In determining whether RIP-relative mode is used (rm=5), + // or whether a SIB byte is present (rm=4), + // the extension bits (REX.b and EVEX.x) are ignored. + switch (rm & 7) { + case 0x4: // SIB byte is present + insn->eaBase = (insn->addressSize == 4 ? + EA_BASE_sib : EA_BASE_sib64); + if (readSIB(insn) || readDisplacement(insn)) + return -1; + break; + case 0x5: // RIP-relative + insn->eaBase = EA_BASE_NONE; + insn->eaDisplacement = EA_DISP_32; + if (readDisplacement(insn)) + return -1; + break; + default: + insn->eaBase = (EABase)(insn->eaBaseBase + rm); + break; + } + break; + case 0x1: + insn->displacementSize = 1; + /* FALLTHROUGH */ + case 0x2: + insn->eaDisplacement = (mod == 0x1 ? 
EA_DISP_8 : EA_DISP_32); + switch (rm & 7) { + case 0x4: // SIB byte is present + insn->eaBase = EA_BASE_sib; + if (readSIB(insn) || readDisplacement(insn)) + return -1; + break; + default: + insn->eaBase = (EABase)(insn->eaBaseBase + rm); + if (readDisplacement(insn)) + return -1; + break; + } + break; + case 0x3: + insn->eaDisplacement = EA_DISP_NONE; + insn->eaBase = (EABase)(insn->eaRegBase + rm); + break; + } + break; + } /* switch (insn->addressSize) */ + + return 0; +} + +#define GENERIC_FIXUP_FUNC(name, base, prefix) \ + static uint8_t name(struct InternalInstruction *insn, \ + OperandType type, \ + uint8_t index, \ + uint8_t *valid) { \ + *valid = 1; \ + switch (type) { \ + default: \ + debug("Unhandled register type"); \ + *valid = 0; \ + return 0; \ + case TYPE_Rv: \ + return base + index; \ + case TYPE_R8: \ + if (insn->rexPrefix && \ + index >= 4 && index <= 7) { \ + return prefix##_SPL + (index - 4); \ + } else { \ + return prefix##_AL + index; \ + } \ + case TYPE_R16: \ + return prefix##_AX + index; \ + case TYPE_R32: \ + return prefix##_EAX + index; \ + case TYPE_R64: \ + return prefix##_RAX + index; \ + case TYPE_XMM512: \ + return prefix##_ZMM0 + index; \ + case TYPE_XMM256: \ + return prefix##_YMM0 + index; \ + case TYPE_XMM128: \ + case TYPE_XMM64: \ + case TYPE_XMM32: \ + case TYPE_XMM: \ + return prefix##_XMM0 + index; \ + case TYPE_VK1: \ + case TYPE_VK2: \ + case TYPE_VK4: \ + case TYPE_VK8: \ + case TYPE_VK16: \ + case TYPE_VK32: \ + case TYPE_VK64: \ + if (index > 7) \ + *valid = 0; \ + return prefix##_K0 + index; \ + case TYPE_MM64: \ + return prefix##_MM0 + (index & 0x7); \ + case TYPE_SEGMENTREG: \ + if (index > 5) \ + *valid = 0; \ + return prefix##_ES + index; \ + case TYPE_DEBUGREG: \ + return prefix##_DR0 + index; \ + case TYPE_CONTROLREG: \ + return prefix##_CR0 + index; \ + } \ + } + +/* + * fixup*Value - Consults an operand type to determine the meaning of the + * reg or R/M field. If the operand is an XMM operand, for example, an + * operand would be XMM0 instead of AX, which readModRM() would otherwise + * misinterpret it as. + * + * @param insn - The instruction containing the operand. + * @param type - The operand type. + * @param index - The existing value of the field as reported by readModRM(). + * @param valid - The address of a uint8_t. The target is set to 1 if the + * field is valid for the register class; 0 if not. + * @return - The proper value. + */ +GENERIC_FIXUP_FUNC(fixupRegValue, insn->regBase, MODRM_REG) +GENERIC_FIXUP_FUNC(fixupRMValue, insn->eaRegBase, EA_REG) + +/* + * fixupReg - Consults an operand specifier to determine which of the + * fixup*Value functions to use in correcting readModRM()'ss interpretation. + * + * @param insn - See fixup*Value(). + * @param op - The operand specifier. + * @return - 0 if fixup was successful; -1 if the register returned was + * invalid for its class. 
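+ *
+ * For example, with 16-bit registers (insn->regBase == MODRM_REG_AX) and a
+ * TYPE_XMM128 operand whose ModR/M reg field is 3, readModRM() records the
+ * provisional value MODRM_REG_BX and fixupRegValue() remaps it to
+ * MODRM_REG_XMM3.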
+ */ +static int fixupReg(struct InternalInstruction *insn, + const struct OperandSpecifier *op) { + uint8_t valid; + + dbgprintf(insn, "fixupReg()"); + + switch ((OperandEncoding)op->encoding) { + default: + debug("Expected a REG or R/M encoding in fixupReg"); + return -1; + case ENCODING_VVVV: + insn->vvvv = (Reg)fixupRegValue(insn, + (OperandType)op->type, + insn->vvvv, + &valid); + if (!valid) + return -1; + break; + case ENCODING_REG: + insn->reg = (Reg)fixupRegValue(insn, + (OperandType)op->type, + insn->reg - insn->regBase, + &valid); + if (!valid) + return -1; + break; + CASE_ENCODING_RM: + if (insn->eaBase >= insn->eaRegBase) { + insn->eaBase = (EABase)fixupRMValue(insn, + (OperandType)op->type, + insn->eaBase - insn->eaRegBase, + &valid); + if (!valid) + return -1; + } + break; + } + + return 0; +} + +/* + * readOpcodeRegister - Reads an operand from the opcode field of an + * instruction and interprets it appropriately given the operand width. + * Handles AddRegFrm instructions. + * + * @param insn - the instruction whose opcode field is to be read. + * @param size - The width (in bytes) of the register being specified. + * 1 means AL and friends, 2 means AX, 4 means EAX, and 8 means + * RAX. + * @return - 0 on success; nonzero otherwise. + */ +static int readOpcodeRegister(struct InternalInstruction* insn, uint8_t size) { + dbgprintf(insn, "readOpcodeRegister()"); + + if (size == 0) + size = insn->registerSize; + + switch (size) { + case 1: + insn->opcodeRegister = (Reg)(MODRM_REG_AL + ((bFromREX(insn->rexPrefix) << 3) + | (insn->opcode & 7))); + if (insn->rexPrefix && + insn->opcodeRegister >= MODRM_REG_AL + 0x4 && + insn->opcodeRegister < MODRM_REG_AL + 0x8) { + insn->opcodeRegister = (Reg)(MODRM_REG_SPL + + (insn->opcodeRegister - MODRM_REG_AL - 4)); + } + + break; + case 2: + insn->opcodeRegister = (Reg)(MODRM_REG_AX + + ((bFromREX(insn->rexPrefix) << 3) + | (insn->opcode & 7))); + break; + case 4: + insn->opcodeRegister = (Reg)(MODRM_REG_EAX + + ((bFromREX(insn->rexPrefix) << 3) + | (insn->opcode & 7))); + break; + case 8: + insn->opcodeRegister = (Reg)(MODRM_REG_RAX + + ((bFromREX(insn->rexPrefix) << 3) + | (insn->opcode & 7))); + break; + } + + return 0; +} + +/* + * readImmediate - Consumes an immediate operand from an instruction, given the + * desired operand size. + * + * @param insn - The instruction whose operand is to be read. + * @param size - The width (in bytes) of the operand. + * @return - 0 if the immediate was successfully consumed; nonzero + * otherwise. 
+ */ +static int readImmediate(struct InternalInstruction* insn, uint8_t size) { + uint8_t imm8; + uint16_t imm16; + uint32_t imm32; + uint64_t imm64; + + dbgprintf(insn, "readImmediate()"); + + if (insn->numImmediatesConsumed == 2) { + debug("Already consumed two immediates"); + return -1; + } + + if (size == 0) + size = insn->immediateSize; + else + insn->immediateSize = size; + insn->immediateOffset = insn->readerCursor - insn->startLocation; + + switch (size) { + case 1: + if (consumeByte(insn, &imm8)) + return -1; + insn->immediates[insn->numImmediatesConsumed] = imm8; + break; + case 2: + if (consumeUInt16(insn, &imm16)) + return -1; + insn->immediates[insn->numImmediatesConsumed] = imm16; + break; + case 4: + if (consumeUInt32(insn, &imm32)) + return -1; + insn->immediates[insn->numImmediatesConsumed] = imm32; + break; + case 8: + if (consumeUInt64(insn, &imm64)) + return -1; + insn->immediates[insn->numImmediatesConsumed] = imm64; + break; + } + + insn->numImmediatesConsumed++; + + return 0; +} + +/* + * readVVVV - Consumes vvvv from an instruction if it has a VEX prefix. + * + * @param insn - The instruction whose operand is to be read. + * @return - 0 if the vvvv was successfully consumed; nonzero + * otherwise. + */ +static int readVVVV(struct InternalInstruction* insn) { + dbgprintf(insn, "readVVVV()"); + + int vvvv; + if (insn->vectorExtensionType == TYPE_EVEX) + vvvv = (v2FromEVEX4of4(insn->vectorExtensionPrefix[3]) << 4 | + vvvvFromEVEX3of4(insn->vectorExtensionPrefix[2])); + else if (insn->vectorExtensionType == TYPE_VEX_3B) + vvvv = vvvvFromVEX3of3(insn->vectorExtensionPrefix[2]); + else if (insn->vectorExtensionType == TYPE_VEX_2B) + vvvv = vvvvFromVEX2of2(insn->vectorExtensionPrefix[1]); + else if (insn->vectorExtensionType == TYPE_XOP) + vvvv = vvvvFromXOP3of3(insn->vectorExtensionPrefix[2]); + else + return -1; + + if (insn->mode != MODE_64BIT) + vvvv &= 0x7; + + insn->vvvv = static_cast<Reg>(vvvv); + return 0; +} + +/* + * readMaskRegister - Reads an mask register from the opcode field of an + * instruction. + * + * @param insn - The instruction whose opcode field is to be read. + * @return - 0 on success; nonzero otherwise. + */ +static int readMaskRegister(struct InternalInstruction* insn) { + dbgprintf(insn, "readMaskRegister()"); + + if (insn->vectorExtensionType != TYPE_EVEX) + return -1; + + insn->writemask = + static_cast<Reg>(aaaFromEVEX4of4(insn->vectorExtensionPrefix[3])); + return 0; +} + +/* + * readOperands - Consults the specifier for an instruction and consumes all + * operands for that instruction, interpreting them as it goes. + * + * @param insn - The instruction whose operands are to be read and interpreted. + * @return - 0 if all operands could be read; nonzero otherwise. + */ +static int readOperands(struct InternalInstruction* insn) { + int hasVVVV, needVVVV; + int sawRegImm = 0; + + dbgprintf(insn, "readOperands()"); + + /* If non-zero vvvv specified, need to make sure one of the operands + uses it. */ + hasVVVV = !readVVVV(insn); + needVVVV = hasVVVV && (insn->vvvv != 0); + + for (const auto &Op : x86OperandSets[insn->spec->operands]) { + switch (Op.encoding) { + case ENCODING_NONE: + case ENCODING_SI: + case ENCODING_DI: + break; + case ENCODING_REG: + CASE_ENCODING_RM: + if (readModRM(insn)) + return -1; + if (fixupReg(insn, &Op)) + return -1; + // Apply the AVX512 compressed displacement scaling factor. 
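+      // The ENCODING_RM_CD* values are laid out consecutively after
+      // ENCODING_RM, so (Op.encoding - ENCODING_RM) is log2 of the scaling
+      // factor: e.g. a disp8 of 0x10 under ENCODING_RM_CD8 expands to
+      // 0x10 * 8 = 0x80.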
+ if (Op.encoding != ENCODING_REG && insn->eaDisplacement == EA_DISP_8) + insn->displacement *= 1 << (Op.encoding - ENCODING_RM); + break; + case ENCODING_CB: + case ENCODING_CW: + case ENCODING_CD: + case ENCODING_CP: + case ENCODING_CO: + case ENCODING_CT: + dbgprintf(insn, "We currently don't hande code-offset encodings"); + return -1; + case ENCODING_IB: + if (sawRegImm) { + /* Saw a register immediate so don't read again and instead split the + previous immediate. FIXME: This is a hack. */ + insn->immediates[insn->numImmediatesConsumed] = + insn->immediates[insn->numImmediatesConsumed - 1] & 0xf; + ++insn->numImmediatesConsumed; + break; + } + if (readImmediate(insn, 1)) + return -1; + if (Op.type == TYPE_XMM128 || + Op.type == TYPE_XMM256) + sawRegImm = 1; + break; + case ENCODING_IW: + if (readImmediate(insn, 2)) + return -1; + break; + case ENCODING_ID: + if (readImmediate(insn, 4)) + return -1; + break; + case ENCODING_IO: + if (readImmediate(insn, 8)) + return -1; + break; + case ENCODING_Iv: + if (readImmediate(insn, insn->immediateSize)) + return -1; + break; + case ENCODING_Ia: + if (readImmediate(insn, insn->addressSize)) + return -1; + break; + case ENCODING_RB: + if (readOpcodeRegister(insn, 1)) + return -1; + break; + case ENCODING_RW: + if (readOpcodeRegister(insn, 2)) + return -1; + break; + case ENCODING_RD: + if (readOpcodeRegister(insn, 4)) + return -1; + break; + case ENCODING_RO: + if (readOpcodeRegister(insn, 8)) + return -1; + break; + case ENCODING_Rv: + if (readOpcodeRegister(insn, 0)) + return -1; + break; + case ENCODING_FP: + break; + case ENCODING_VVVV: + needVVVV = 0; /* Mark that we have found a VVVV operand. */ + if (!hasVVVV) + return -1; + if (fixupReg(insn, &Op)) + return -1; + break; + case ENCODING_WRITEMASK: + if (readMaskRegister(insn)) + return -1; + break; + case ENCODING_DUP: + break; + default: + dbgprintf(insn, "Encountered an operand with an unknown encoding."); + return -1; + } + } + + /* If we didn't find ENCODING_VVVV operand, but non-zero vvvv present, fail */ + if (needVVVV) return -1; + + return 0; +} + +/* + * decodeInstruction - Reads and interprets a full instruction provided by the + * user. + * + * @param insn - A pointer to the instruction to be populated. Must be + * pre-allocated. + * @param reader - The function to be used to read the instruction's bytes. + * @param readerArg - A generic argument to be passed to the reader to store + * any internal state. + * @param logger - If non-NULL, the function to be used to write log messages + * and warnings. + * @param loggerArg - A generic argument to be passed to the logger to store + * any internal state. + * @param startLoc - The address (in the reader's address space) of the first + * byte in the instruction. + * @param mode - The mode (real mode, IA-32e, or IA-32e in 64-bit mode) to + * decode the instruction in. + * @return - 0 if the instruction's memory could be read; nonzero if + * not. 
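+ *
+ * Example usage (a minimal sketch; bufReader, bytes, MII and
+ * handleInstruction below are illustrative names, and the byte buffer is
+ * assumed to start at address 0):
+ *
+ *   static int bufReader(const void *arg, uint8_t *byte, uint64_t address) {
+ *     *byte = ((const uint8_t *)arg)[address];
+ *     return 0;
+ *   }
+ *   ...
+ *   struct InternalInstruction insn;
+ *   if (!decodeInstruction(&insn, bufReader, bytes, nullptr, nullptr, MII,
+ *                          0, MODE_64BIT))
+ *     handleInstruction(insn);  // insn.length bytes were consumed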
+ */ +int llvm::X86Disassembler::decodeInstruction( + struct InternalInstruction *insn, byteReader_t reader, + const void *readerArg, dlog_t logger, void *loggerArg, const void *miiArg, + uint64_t startLoc, DisassemblerMode mode) { + memset(insn, 0, sizeof(struct InternalInstruction)); + + insn->reader = reader; + insn->readerArg = readerArg; + insn->dlog = logger; + insn->dlogArg = loggerArg; + insn->startLocation = startLoc; + insn->readerCursor = startLoc; + insn->mode = mode; + insn->numImmediatesConsumed = 0; + + if (readPrefixes(insn) || + readOpcode(insn) || + getID(insn, miiArg) || + insn->instructionID == 0 || + readOperands(insn)) + return -1; + + insn->operands = x86OperandSets[insn->spec->operands]; + + insn->length = insn->readerCursor - insn->startLocation; + + dbgprintf(insn, "Read from 0x%llx to 0x%llx: length %zu", + startLoc, insn->readerCursor, insn->length); + + if (insn->length > 15) + dbgprintf(insn, "Instruction exceeds 15-byte limit"); + + return 0; +} diff --git a/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h new file mode 100644 index 0000000..28a628e --- /dev/null +++ b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h @@ -0,0 +1,675 @@ +//===-- X86DisassemblerDecoderInternal.h - Disassembler decoder -*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file is part of the X86 Disassembler. +// It contains the public interface of the instruction decoder. +// Documentation for the disassembler can be found in X86Disassembler.h. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_X86_DISASSEMBLER_X86DISASSEMBLERDECODER_H +#define LLVM_LIB_TARGET_X86_DISASSEMBLER_X86DISASSEMBLERDECODER_H + +#include "X86DisassemblerDecoderCommon.h" +#include "llvm/ADT/ArrayRef.h" + +namespace llvm { +namespace X86Disassembler { + +// Accessor functions for various fields of an Intel instruction +#define modFromModRM(modRM) (((modRM) & 0xc0) >> 6) +#define regFromModRM(modRM) (((modRM) & 0x38) >> 3) +#define rmFromModRM(modRM) ((modRM) & 0x7) +#define scaleFromSIB(sib) (((sib) & 0xc0) >> 6) +#define indexFromSIB(sib) (((sib) & 0x38) >> 3) +#define baseFromSIB(sib) ((sib) & 0x7) +#define wFromREX(rex) (((rex) & 0x8) >> 3) +#define rFromREX(rex) (((rex) & 0x4) >> 2) +#define xFromREX(rex) (((rex) & 0x2) >> 1) +#define bFromREX(rex) ((rex) & 0x1) + +#define rFromEVEX2of4(evex) (((~(evex)) & 0x80) >> 7) +#define xFromEVEX2of4(evex) (((~(evex)) & 0x40) >> 6) +#define bFromEVEX2of4(evex) (((~(evex)) & 0x20) >> 5) +#define r2FromEVEX2of4(evex) (((~(evex)) & 0x10) >> 4) +#define mmFromEVEX2of4(evex) ((evex) & 0x3) +#define wFromEVEX3of4(evex) (((evex) & 0x80) >> 7) +#define vvvvFromEVEX3of4(evex) (((~(evex)) & 0x78) >> 3) +#define ppFromEVEX3of4(evex) ((evex) & 0x3) +#define zFromEVEX4of4(evex) (((evex) & 0x80) >> 7) +#define l2FromEVEX4of4(evex) (((evex) & 0x40) >> 6) +#define lFromEVEX4of4(evex) (((evex) & 0x20) >> 5) +#define bFromEVEX4of4(evex) (((evex) & 0x10) >> 4) +#define v2FromEVEX4of4(evex) (((~evex) & 0x8) >> 3) +#define aaaFromEVEX4of4(evex) ((evex) & 0x7) + +#define rFromVEX2of3(vex) (((~(vex)) & 0x80) >> 7) +#define xFromVEX2of3(vex) (((~(vex)) & 0x40) >> 6) +#define bFromVEX2of3(vex) (((~(vex)) & 0x20) >> 5) +#define mmmmmFromVEX2of3(vex) ((vex) & 0x1f) +#define wFromVEX3of3(vex) (((vex) & 0x80) >> 7) +#define vvvvFromVEX3of3(vex) (((~(vex)) & 0x78) >> 3) +#define lFromVEX3of3(vex) (((vex) & 0x4) >> 2) +#define ppFromVEX3of3(vex) ((vex) & 0x3) + +#define rFromVEX2of2(vex) (((~(vex)) & 0x80) >> 7) +#define vvvvFromVEX2of2(vex) (((~(vex)) & 0x78) >> 3) +#define lFromVEX2of2(vex) (((vex) & 0x4) >> 2) +#define ppFromVEX2of2(vex) ((vex) & 0x3) + +#define rFromXOP2of3(xop) (((~(xop)) & 0x80) >> 7) +#define xFromXOP2of3(xop) (((~(xop)) & 0x40) >> 6) +#define bFromXOP2of3(xop) (((~(xop)) & 0x20) >> 5) +#define mmmmmFromXOP2of3(xop) ((xop) & 0x1f) +#define wFromXOP3of3(xop) (((xop) & 0x80) >> 7) +#define vvvvFromXOP3of3(vex) (((~(vex)) & 0x78) >> 3) +#define lFromXOP3of3(xop) (((xop) & 0x4) >> 2) +#define ppFromXOP3of3(xop) ((xop) & 0x3) + +// These enums represent Intel registers for use by the decoder. 
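+//
+// The register lists that follow are X-macros: a consumer defines ENTRY(x)
+// to stamp out one element per register and then expands a list. A sketch of
+// the pattern (the real expansions are the EABase/SIBIndex/SIBBase/Reg enums
+// further down in this file):
+//
+//   #define ENTRY(x) MODRM_REG_##x,
+//   enum Reg { ALL_REGS MODRM_REG_max };
+//   #undef ENTRY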
+#define REGS_8BIT \ + ENTRY(AL) \ + ENTRY(CL) \ + ENTRY(DL) \ + ENTRY(BL) \ + ENTRY(AH) \ + ENTRY(CH) \ + ENTRY(DH) \ + ENTRY(BH) \ + ENTRY(R8B) \ + ENTRY(R9B) \ + ENTRY(R10B) \ + ENTRY(R11B) \ + ENTRY(R12B) \ + ENTRY(R13B) \ + ENTRY(R14B) \ + ENTRY(R15B) \ + ENTRY(SPL) \ + ENTRY(BPL) \ + ENTRY(SIL) \ + ENTRY(DIL) + +#define EA_BASES_16BIT \ + ENTRY(BX_SI) \ + ENTRY(BX_DI) \ + ENTRY(BP_SI) \ + ENTRY(BP_DI) \ + ENTRY(SI) \ + ENTRY(DI) \ + ENTRY(BP) \ + ENTRY(BX) \ + ENTRY(R8W) \ + ENTRY(R9W) \ + ENTRY(R10W) \ + ENTRY(R11W) \ + ENTRY(R12W) \ + ENTRY(R13W) \ + ENTRY(R14W) \ + ENTRY(R15W) + +#define REGS_16BIT \ + ENTRY(AX) \ + ENTRY(CX) \ + ENTRY(DX) \ + ENTRY(BX) \ + ENTRY(SP) \ + ENTRY(BP) \ + ENTRY(SI) \ + ENTRY(DI) \ + ENTRY(R8W) \ + ENTRY(R9W) \ + ENTRY(R10W) \ + ENTRY(R11W) \ + ENTRY(R12W) \ + ENTRY(R13W) \ + ENTRY(R14W) \ + ENTRY(R15W) + +#define EA_BASES_32BIT \ + ENTRY(EAX) \ + ENTRY(ECX) \ + ENTRY(EDX) \ + ENTRY(EBX) \ + ENTRY(sib) \ + ENTRY(EBP) \ + ENTRY(ESI) \ + ENTRY(EDI) \ + ENTRY(R8D) \ + ENTRY(R9D) \ + ENTRY(R10D) \ + ENTRY(R11D) \ + ENTRY(R12D) \ + ENTRY(R13D) \ + ENTRY(R14D) \ + ENTRY(R15D) + +#define REGS_32BIT \ + ENTRY(EAX) \ + ENTRY(ECX) \ + ENTRY(EDX) \ + ENTRY(EBX) \ + ENTRY(ESP) \ + ENTRY(EBP) \ + ENTRY(ESI) \ + ENTRY(EDI) \ + ENTRY(R8D) \ + ENTRY(R9D) \ + ENTRY(R10D) \ + ENTRY(R11D) \ + ENTRY(R12D) \ + ENTRY(R13D) \ + ENTRY(R14D) \ + ENTRY(R15D) + +#define EA_BASES_64BIT \ + ENTRY(RAX) \ + ENTRY(RCX) \ + ENTRY(RDX) \ + ENTRY(RBX) \ + ENTRY(sib64) \ + ENTRY(RBP) \ + ENTRY(RSI) \ + ENTRY(RDI) \ + ENTRY(R8) \ + ENTRY(R9) \ + ENTRY(R10) \ + ENTRY(R11) \ + ENTRY(R12) \ + ENTRY(R13) \ + ENTRY(R14) \ + ENTRY(R15) + +#define REGS_64BIT \ + ENTRY(RAX) \ + ENTRY(RCX) \ + ENTRY(RDX) \ + ENTRY(RBX) \ + ENTRY(RSP) \ + ENTRY(RBP) \ + ENTRY(RSI) \ + ENTRY(RDI) \ + ENTRY(R8) \ + ENTRY(R9) \ + ENTRY(R10) \ + ENTRY(R11) \ + ENTRY(R12) \ + ENTRY(R13) \ + ENTRY(R14) \ + ENTRY(R15) + +#define REGS_MMX \ + ENTRY(MM0) \ + ENTRY(MM1) \ + ENTRY(MM2) \ + ENTRY(MM3) \ + ENTRY(MM4) \ + ENTRY(MM5) \ + ENTRY(MM6) \ + ENTRY(MM7) + +#define REGS_XMM \ + ENTRY(XMM0) \ + ENTRY(XMM1) \ + ENTRY(XMM2) \ + ENTRY(XMM3) \ + ENTRY(XMM4) \ + ENTRY(XMM5) \ + ENTRY(XMM6) \ + ENTRY(XMM7) \ + ENTRY(XMM8) \ + ENTRY(XMM9) \ + ENTRY(XMM10) \ + ENTRY(XMM11) \ + ENTRY(XMM12) \ + ENTRY(XMM13) \ + ENTRY(XMM14) \ + ENTRY(XMM15) \ + ENTRY(XMM16) \ + ENTRY(XMM17) \ + ENTRY(XMM18) \ + ENTRY(XMM19) \ + ENTRY(XMM20) \ + ENTRY(XMM21) \ + ENTRY(XMM22) \ + ENTRY(XMM23) \ + ENTRY(XMM24) \ + ENTRY(XMM25) \ + ENTRY(XMM26) \ + ENTRY(XMM27) \ + ENTRY(XMM28) \ + ENTRY(XMM29) \ + ENTRY(XMM30) \ + ENTRY(XMM31) + +#define REGS_YMM \ + ENTRY(YMM0) \ + ENTRY(YMM1) \ + ENTRY(YMM2) \ + ENTRY(YMM3) \ + ENTRY(YMM4) \ + ENTRY(YMM5) \ + ENTRY(YMM6) \ + ENTRY(YMM7) \ + ENTRY(YMM8) \ + ENTRY(YMM9) \ + ENTRY(YMM10) \ + ENTRY(YMM11) \ + ENTRY(YMM12) \ + ENTRY(YMM13) \ + ENTRY(YMM14) \ + ENTRY(YMM15) \ + ENTRY(YMM16) \ + ENTRY(YMM17) \ + ENTRY(YMM18) \ + ENTRY(YMM19) \ + ENTRY(YMM20) \ + ENTRY(YMM21) \ + ENTRY(YMM22) \ + ENTRY(YMM23) \ + ENTRY(YMM24) \ + ENTRY(YMM25) \ + ENTRY(YMM26) \ + ENTRY(YMM27) \ + ENTRY(YMM28) \ + ENTRY(YMM29) \ + ENTRY(YMM30) \ + ENTRY(YMM31) + +#define REGS_ZMM \ + ENTRY(ZMM0) \ + ENTRY(ZMM1) \ + ENTRY(ZMM2) \ + ENTRY(ZMM3) \ + ENTRY(ZMM4) \ + ENTRY(ZMM5) \ + ENTRY(ZMM6) \ + ENTRY(ZMM7) \ + ENTRY(ZMM8) \ + ENTRY(ZMM9) \ + ENTRY(ZMM10) \ + ENTRY(ZMM11) \ + ENTRY(ZMM12) \ + ENTRY(ZMM13) \ + ENTRY(ZMM14) \ + ENTRY(ZMM15) \ + ENTRY(ZMM16) \ + ENTRY(ZMM17) \ + ENTRY(ZMM18) \ + ENTRY(ZMM19) \ + ENTRY(ZMM20) \ + ENTRY(ZMM21) \ + 
ENTRY(ZMM22) \ + ENTRY(ZMM23) \ + ENTRY(ZMM24) \ + ENTRY(ZMM25) \ + ENTRY(ZMM26) \ + ENTRY(ZMM27) \ + ENTRY(ZMM28) \ + ENTRY(ZMM29) \ + ENTRY(ZMM30) \ + ENTRY(ZMM31) + +#define REGS_MASKS \ + ENTRY(K0) \ + ENTRY(K1) \ + ENTRY(K2) \ + ENTRY(K3) \ + ENTRY(K4) \ + ENTRY(K5) \ + ENTRY(K6) \ + ENTRY(K7) + +#define REGS_SEGMENT \ + ENTRY(ES) \ + ENTRY(CS) \ + ENTRY(SS) \ + ENTRY(DS) \ + ENTRY(FS) \ + ENTRY(GS) + +#define REGS_DEBUG \ + ENTRY(DR0) \ + ENTRY(DR1) \ + ENTRY(DR2) \ + ENTRY(DR3) \ + ENTRY(DR4) \ + ENTRY(DR5) \ + ENTRY(DR6) \ + ENTRY(DR7) \ + ENTRY(DR8) \ + ENTRY(DR9) \ + ENTRY(DR10) \ + ENTRY(DR11) \ + ENTRY(DR12) \ + ENTRY(DR13) \ + ENTRY(DR14) \ + ENTRY(DR15) + +#define REGS_CONTROL \ + ENTRY(CR0) \ + ENTRY(CR1) \ + ENTRY(CR2) \ + ENTRY(CR3) \ + ENTRY(CR4) \ + ENTRY(CR5) \ + ENTRY(CR6) \ + ENTRY(CR7) \ + ENTRY(CR8) \ + ENTRY(CR9) \ + ENTRY(CR10) \ + ENTRY(CR11) \ + ENTRY(CR12) \ + ENTRY(CR13) \ + ENTRY(CR14) \ + ENTRY(CR15) + +#define ALL_EA_BASES \ + EA_BASES_16BIT \ + EA_BASES_32BIT \ + EA_BASES_64BIT + +#define ALL_SIB_BASES \ + REGS_32BIT \ + REGS_64BIT + +#define ALL_REGS \ + REGS_8BIT \ + REGS_16BIT \ + REGS_32BIT \ + REGS_64BIT \ + REGS_MMX \ + REGS_XMM \ + REGS_YMM \ + REGS_ZMM \ + REGS_MASKS \ + REGS_SEGMENT \ + REGS_DEBUG \ + REGS_CONTROL \ + ENTRY(RIP) + +/// \brief All possible values of the base field for effective-address +/// computations, a.k.a. the Mod and R/M fields of the ModR/M byte. +/// We distinguish between bases (EA_BASE_*) and registers that just happen +/// to be referred to when Mod == 0b11 (EA_REG_*). +enum EABase { + EA_BASE_NONE, +#define ENTRY(x) EA_BASE_##x, + ALL_EA_BASES +#undef ENTRY +#define ENTRY(x) EA_REG_##x, + ALL_REGS +#undef ENTRY + EA_max +}; + +/// \brief All possible values of the SIB index field. +/// borrows entries from ALL_EA_BASES with the special case that +/// sib is synonymous with NONE. +/// Vector SIB: index can be XMM or YMM. +enum SIBIndex { + SIB_INDEX_NONE, +#define ENTRY(x) SIB_INDEX_##x, + ALL_EA_BASES + REGS_XMM + REGS_YMM + REGS_ZMM +#undef ENTRY + SIB_INDEX_max +}; + +/// \brief All possible values of the SIB base field. +enum SIBBase { + SIB_BASE_NONE, +#define ENTRY(x) SIB_BASE_##x, + ALL_SIB_BASES +#undef ENTRY + SIB_BASE_max +}; + +/// \brief Possible displacement types for effective-address computations. +typedef enum { + EA_DISP_NONE, + EA_DISP_8, + EA_DISP_16, + EA_DISP_32 +} EADisplacement; + +/// \brief All possible values of the reg field in the ModR/M byte. +enum Reg { +#define ENTRY(x) MODRM_REG_##x, + ALL_REGS +#undef ENTRY + MODRM_REG_max +}; + +/// \brief All possible segment overrides. +enum SegmentOverride { + SEG_OVERRIDE_NONE, + SEG_OVERRIDE_CS, + SEG_OVERRIDE_SS, + SEG_OVERRIDE_DS, + SEG_OVERRIDE_ES, + SEG_OVERRIDE_FS, + SEG_OVERRIDE_GS, + SEG_OVERRIDE_max +}; + +/// \brief Possible values for the VEX.m-mmmm field +enum VEXLeadingOpcodeByte { + VEX_LOB_0F = 0x1, + VEX_LOB_0F38 = 0x2, + VEX_LOB_0F3A = 0x3 +}; + +enum XOPMapSelect { + XOP_MAP_SELECT_8 = 0x8, + XOP_MAP_SELECT_9 = 0x9, + XOP_MAP_SELECT_A = 0xA +}; + +/// \brief Possible values for the VEX.pp/EVEX.pp field +enum VEXPrefixCode { + VEX_PREFIX_NONE = 0x0, + VEX_PREFIX_66 = 0x1, + VEX_PREFIX_F3 = 0x2, + VEX_PREFIX_F2 = 0x3 +}; + +enum VectorExtensionType { + TYPE_NO_VEX_XOP = 0x0, + TYPE_VEX_2B = 0x1, + TYPE_VEX_3B = 0x2, + TYPE_EVEX = 0x3, + TYPE_XOP = 0x4 +}; + +/// \brief Type for the byte reader that the consumer must provide to +/// the decoder. Reads a single byte from the instruction's address space. 
+/// \param arg A baton that the consumer can associate with any internal +/// state that it needs. +/// \param byte A pointer to a single byte in memory that should be set to +/// contain the value at address. +/// \param address The address in the instruction's address space that should +/// be read from. +/// \return -1 if the byte cannot be read for any reason; 0 otherwise. +typedef int (*byteReader_t)(const void *arg, uint8_t *byte, uint64_t address); + +/// \brief Type for the logging function that the consumer can provide to +/// get debugging output from the decoder. +/// \param arg A baton that the consumer can associate with any internal +/// state that it needs. +/// \param log A string that contains the message. Will be reused after +/// the logger returns. +typedef void (*dlog_t)(void *arg, const char *log); + +/// The specification for how to extract and interpret a full instruction and +/// its operands. +struct InstructionSpecifier { + uint16_t operands; +}; + +/// The x86 internal instruction, which is produced by the decoder. +struct InternalInstruction { + // Reader interface (C) + byteReader_t reader; + // Opaque value passed to the reader + const void* readerArg; + // The address of the next byte to read via the reader + uint64_t readerCursor; + + // Logger interface (C) + dlog_t dlog; + // Opaque value passed to the logger + void* dlogArg; + + // General instruction information + + // The mode to disassemble for (64-bit, protected, real) + DisassemblerMode mode; + // The start of the instruction, usable with the reader + uint64_t startLocation; + // The length of the instruction, in bytes + size_t length; + + // Prefix state + + // 1 if the prefix byte corresponding to the entry is present; 0 if not + uint8_t prefixPresent[0x100]; + // contains the location (for use with the reader) of the prefix byte + uint64_t prefixLocations[0x100]; + // The value of the vector extension prefix(EVEX/VEX/XOP), if present + uint8_t vectorExtensionPrefix[4]; + // The type of the vector extension prefix + VectorExtensionType vectorExtensionType; + // The value of the REX prefix, if present + uint8_t rexPrefix; + // The location where a mandatory prefix would have to be (i.e., right before + // the opcode, or right before the REX prefix if one is present). + uint64_t necessaryPrefixLocation; + // The segment override type + SegmentOverride segmentOverride; + // 1 if the prefix byte, 0xf2 or 0xf3 is xacquire or xrelease + bool xAcquireRelease; + + // Sizes of various critical pieces of data, in bytes + uint8_t registerSize; + uint8_t addressSize; + uint8_t displacementSize; + uint8_t immediateSize; + + // Offsets from the start of the instruction to the pieces of data, which is + // needed to find relocation entries for adding symbolic operands. + uint8_t displacementOffset; + uint8_t immediateOffset; + + // opcode state + + // The last byte of the opcode, not counting any ModR/M extension + uint8_t opcode; + + // decode state + + // The type of opcode, used for indexing into the array of decode tables + OpcodeType opcodeType; + // The instruction ID, extracted from the decode table + uint16_t instructionID; + // The specifier for the instruction, from the instruction info table + const InstructionSpecifier *spec; + + // state for additional bytes, consumed during operand decode. Pattern: + // consumed___ indicates that the byte was already consumed and does not + // need to be consumed again. 
+ + // The VEX.vvvv field, which contains a third register operand for some AVX + // instructions. + Reg vvvv; + + // The writemask for AVX-512 instructions which is contained in EVEX.aaa + Reg writemask; + + // The ModR/M byte, which contains most register operands and some portion of + // all memory operands. + bool consumedModRM; + uint8_t modRM; + + // The SIB byte, used for more complex 32- or 64-bit memory operands + bool consumedSIB; + uint8_t sib; + + // The displacement, used for memory operands + bool consumedDisplacement; + int32_t displacement; + + // Immediates. There can be two in some cases + uint8_t numImmediatesConsumed; + uint8_t numImmediatesTranslated; + uint64_t immediates[2]; + + // A register or immediate operand encoded into the opcode + Reg opcodeRegister; + + // Portions of the ModR/M byte + + // These fields determine the allowable values for the ModR/M fields, which + // depend on operand and address widths. + EABase eaBaseBase; + EABase eaRegBase; + Reg regBase; + + // The Mod and R/M fields can encode a base for an effective address, or a + // register. These are separated into two fields here. + EABase eaBase; + EADisplacement eaDisplacement; + // The reg field always encodes a register + Reg reg; + + // SIB state + SIBIndex sibIndex; + uint8_t sibScale; + SIBBase sibBase; + + ArrayRef<OperandSpecifier> operands; +}; + +/// \brief Decode one instruction and store the decoding results in +/// a buffer provided by the consumer. +/// \param insn The buffer to store the instruction in. Allocated by the +/// consumer. +/// \param reader The byteReader_t for the bytes to be read. +/// \param readerArg An argument to pass to the reader for storing context +/// specific to the consumer. May be NULL. +/// \param logger The dlog_t to be used in printing status messages from the +/// disassembler. May be NULL. +/// \param loggerArg An argument to pass to the logger for storing context +/// specific to the logger. May be NULL. +/// \param startLoc The address (in the reader's address space) of the first +/// byte in the instruction. +/// \param mode The mode (16-bit, 32-bit, 64-bit) to decode in. +/// \return Nonzero if there was an error during decode, 0 otherwise. +int decodeInstruction(InternalInstruction *insn, + byteReader_t reader, + const void *readerArg, + dlog_t logger, + void *loggerArg, + const void *miiArg, + uint64_t startLoc, + DisassemblerMode mode); + +/// \brief Print a message to debugs() +/// \param file The name of the file printing the debug message. +/// \param line The line number that printed the debug message. +/// \param s The message to print. +void Debug(const char *file, unsigned line, const char *s); + +const char *GetInstrName(unsigned Opcode, const void *mii); + +} // namespace X86Disassembler +} // namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h new file mode 100644 index 0000000..301db72 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h @@ -0,0 +1,503 @@ +//===-- X86DisassemblerDecoderCommon.h - Disassembler decoder ---*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file is part of the X86 Disassembler. 
+// It contains common definitions used by both the disassembler and the table +// generator. +// Documentation for the disassembler can be found in X86Disassembler.h. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_X86_DISASSEMBLER_X86DISASSEMBLERDECODERCOMMON_H +#define LLVM_LIB_TARGET_X86_DISASSEMBLER_X86DISASSEMBLERDECODERCOMMON_H + +#include "llvm/Support/DataTypes.h" + +namespace llvm { +namespace X86Disassembler { + +#define INSTRUCTIONS_SYM x86DisassemblerInstrSpecifiers +#define CONTEXTS_SYM x86DisassemblerContexts +#define ONEBYTE_SYM x86DisassemblerOneByteOpcodes +#define TWOBYTE_SYM x86DisassemblerTwoByteOpcodes +#define THREEBYTE38_SYM x86DisassemblerThreeByte38Opcodes +#define THREEBYTE3A_SYM x86DisassemblerThreeByte3AOpcodes +#define XOP8_MAP_SYM x86DisassemblerXOP8Opcodes +#define XOP9_MAP_SYM x86DisassemblerXOP9Opcodes +#define XOPA_MAP_SYM x86DisassemblerXOPAOpcodes + +#define INSTRUCTIONS_STR "x86DisassemblerInstrSpecifiers" +#define CONTEXTS_STR "x86DisassemblerContexts" +#define ONEBYTE_STR "x86DisassemblerOneByteOpcodes" +#define TWOBYTE_STR "x86DisassemblerTwoByteOpcodes" +#define THREEBYTE38_STR "x86DisassemblerThreeByte38Opcodes" +#define THREEBYTE3A_STR "x86DisassemblerThreeByte3AOpcodes" +#define XOP8_MAP_STR "x86DisassemblerXOP8Opcodes" +#define XOP9_MAP_STR "x86DisassemblerXOP9Opcodes" +#define XOPA_MAP_STR "x86DisassemblerXOPAOpcodes" + +// Attributes of an instruction that must be known before the opcode can be +// processed correctly. Most of these indicate the presence of particular +// prefixes, but ATTR_64BIT is simply an attribute of the decoding context. +#define ATTRIBUTE_BITS \ + ENUM_ENTRY(ATTR_NONE, 0x00) \ + ENUM_ENTRY(ATTR_64BIT, (0x1 << 0)) \ + ENUM_ENTRY(ATTR_XS, (0x1 << 1)) \ + ENUM_ENTRY(ATTR_XD, (0x1 << 2)) \ + ENUM_ENTRY(ATTR_REXW, (0x1 << 3)) \ + ENUM_ENTRY(ATTR_OPSIZE, (0x1 << 4)) \ + ENUM_ENTRY(ATTR_ADSIZE, (0x1 << 5)) \ + ENUM_ENTRY(ATTR_VEX, (0x1 << 6)) \ + ENUM_ENTRY(ATTR_VEXL, (0x1 << 7)) \ + ENUM_ENTRY(ATTR_EVEX, (0x1 << 8)) \ + ENUM_ENTRY(ATTR_EVEXL, (0x1 << 9)) \ + ENUM_ENTRY(ATTR_EVEXL2, (0x1 << 10)) \ + ENUM_ENTRY(ATTR_EVEXK, (0x1 << 11)) \ + ENUM_ENTRY(ATTR_EVEXKZ, (0x1 << 12)) \ + ENUM_ENTRY(ATTR_EVEXB, (0x1 << 13)) + +#define ENUM_ENTRY(n, v) n = v, +enum attributeBits { + ATTRIBUTE_BITS + ATTR_max +}; +#undef ENUM_ENTRY + +// Combinations of the above attributes that are relevant to instruction +// decode. Although other combinations are possible, they can be reduced to +// these without affecting the ultimately decoded instruction. 
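+//
+// Roughly speaking, the decoder ORs together the ATTR_* bits it has observed
+// and uses that mask to index the generated CONTEXTS_SYM table, which yields
+// one of the IC_* values below. A sketch (not the actual lookup code):
+//
+//   uint16_t attrMask = ATTR_EVEX | ATTR_REXW | ATTR_OPSIZE;
+//   InstructionContext IC = (InstructionContext)CONTEXTS_SYM[attrMask];
+//   // expected to yield IC_EVEX_W_OPSIZE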
+ +// Class name Rank Rationale for rank assignment +#define INSTRUCTION_CONTEXTS \ + ENUM_ENTRY(IC, 0, "says nothing about the instruction") \ + ENUM_ENTRY(IC_64BIT, 1, "says the instruction applies in " \ + "64-bit mode but no more") \ + ENUM_ENTRY(IC_OPSIZE, 3, "requires an OPSIZE prefix, so " \ + "operands change width") \ + ENUM_ENTRY(IC_ADSIZE, 3, "requires an ADSIZE prefix, so " \ + "operands change width") \ + ENUM_ENTRY(IC_OPSIZE_ADSIZE, 4, "requires ADSIZE and OPSIZE prefixes") \ + ENUM_ENTRY(IC_XD, 2, "may say something about the opcode " \ + "but not the operands") \ + ENUM_ENTRY(IC_XS, 2, "may say something about the opcode " \ + "but not the operands") \ + ENUM_ENTRY(IC_XD_OPSIZE, 3, "requires an OPSIZE prefix, so " \ + "operands change width") \ + ENUM_ENTRY(IC_XS_OPSIZE, 3, "requires an OPSIZE prefix, so " \ + "operands change width") \ + ENUM_ENTRY(IC_64BIT_REXW, 5, "requires a REX.W prefix, so operands "\ + "change width; overrides IC_OPSIZE") \ + ENUM_ENTRY(IC_64BIT_REXW_ADSIZE, 6, "requires a REX.W prefix and 0x67 " \ + "prefix") \ + ENUM_ENTRY(IC_64BIT_OPSIZE, 3, "Just as meaningful as IC_OPSIZE") \ + ENUM_ENTRY(IC_64BIT_ADSIZE, 3, "Just as meaningful as IC_ADSIZE") \ + ENUM_ENTRY(IC_64BIT_OPSIZE_ADSIZE, 4, "Just as meaningful as IC_OPSIZE/" \ + "IC_ADSIZE") \ + ENUM_ENTRY(IC_64BIT_XD, 6, "XD instructions are SSE; REX.W is " \ + "secondary") \ + ENUM_ENTRY(IC_64BIT_XS, 6, "Just as meaningful as IC_64BIT_XD") \ + ENUM_ENTRY(IC_64BIT_XD_OPSIZE, 3, "Just as meaningful as IC_XD_OPSIZE") \ + ENUM_ENTRY(IC_64BIT_XS_OPSIZE, 3, "Just as meaningful as IC_XS_OPSIZE") \ + ENUM_ENTRY(IC_64BIT_REXW_XS, 7, "OPSIZE could mean a different " \ + "opcode") \ + ENUM_ENTRY(IC_64BIT_REXW_XD, 7, "Just as meaningful as " \ + "IC_64BIT_REXW_XS") \ + ENUM_ENTRY(IC_64BIT_REXW_OPSIZE, 8, "The Dynamic Duo! 
Prefer over all " \ + "else because this changes most " \ + "operands' meaning") \ + ENUM_ENTRY(IC_VEX, 1, "requires a VEX prefix") \ + ENUM_ENTRY(IC_VEX_XS, 2, "requires VEX and the XS prefix") \ + ENUM_ENTRY(IC_VEX_XD, 2, "requires VEX and the XD prefix") \ + ENUM_ENTRY(IC_VEX_OPSIZE, 2, "requires VEX and the OpSize prefix") \ + ENUM_ENTRY(IC_VEX_W, 3, "requires VEX and the W prefix") \ + ENUM_ENTRY(IC_VEX_W_XS, 4, "requires VEX, W, and XS prefix") \ + ENUM_ENTRY(IC_VEX_W_XD, 4, "requires VEX, W, and XD prefix") \ + ENUM_ENTRY(IC_VEX_W_OPSIZE, 4, "requires VEX, W, and OpSize") \ + ENUM_ENTRY(IC_VEX_L, 3, "requires VEX and the L prefix") \ + ENUM_ENTRY(IC_VEX_L_XS, 4, "requires VEX and the L and XS prefix")\ + ENUM_ENTRY(IC_VEX_L_XD, 4, "requires VEX and the L and XD prefix")\ + ENUM_ENTRY(IC_VEX_L_OPSIZE, 4, "requires VEX, L, and OpSize") \ + ENUM_ENTRY(IC_VEX_L_W, 4, "requires VEX, L and W") \ + ENUM_ENTRY(IC_VEX_L_W_XS, 5, "requires VEX, L, W and XS prefix") \ + ENUM_ENTRY(IC_VEX_L_W_XD, 5, "requires VEX, L, W and XD prefix") \ + ENUM_ENTRY(IC_VEX_L_W_OPSIZE, 5, "requires VEX, L, W and OpSize") \ + ENUM_ENTRY(IC_EVEX, 1, "requires an EVEX prefix") \ + ENUM_ENTRY(IC_EVEX_XS, 2, "requires EVEX and the XS prefix") \ + ENUM_ENTRY(IC_EVEX_XD, 2, "requires EVEX and the XD prefix") \ + ENUM_ENTRY(IC_EVEX_OPSIZE, 2, "requires EVEX and the OpSize prefix") \ + ENUM_ENTRY(IC_EVEX_W, 3, "requires EVEX and the W prefix") \ + ENUM_ENTRY(IC_EVEX_W_XS, 4, "requires EVEX, W, and XS prefix") \ + ENUM_ENTRY(IC_EVEX_W_XD, 4, "requires EVEX, W, and XD prefix") \ + ENUM_ENTRY(IC_EVEX_W_OPSIZE, 4, "requires EVEX, W, and OpSize") \ + ENUM_ENTRY(IC_EVEX_L, 3, "requires EVEX and the L prefix") \ + ENUM_ENTRY(IC_EVEX_L_XS, 4, "requires EVEX and the L and XS prefix")\ + ENUM_ENTRY(IC_EVEX_L_XD, 4, "requires EVEX and the L and XD prefix")\ + ENUM_ENTRY(IC_EVEX_L_OPSIZE, 4, "requires EVEX, L, and OpSize") \ + ENUM_ENTRY(IC_EVEX_L_W, 3, "requires EVEX, L and W") \ + ENUM_ENTRY(IC_EVEX_L_W_XS, 4, "requires EVEX, L, W and XS prefix") \ + ENUM_ENTRY(IC_EVEX_L_W_XD, 4, "requires EVEX, L, W and XD prefix") \ + ENUM_ENTRY(IC_EVEX_L_W_OPSIZE, 4, "requires EVEX, L, W and OpSize") \ + ENUM_ENTRY(IC_EVEX_L2, 3, "requires EVEX and the L2 prefix") \ + ENUM_ENTRY(IC_EVEX_L2_XS, 4, "requires EVEX and the L2 and XS prefix")\ + ENUM_ENTRY(IC_EVEX_L2_XD, 4, "requires EVEX and the L2 and XD prefix")\ + ENUM_ENTRY(IC_EVEX_L2_OPSIZE, 4, "requires EVEX, L2, and OpSize") \ + ENUM_ENTRY(IC_EVEX_L2_W, 3, "requires EVEX, L2 and W") \ + ENUM_ENTRY(IC_EVEX_L2_W_XS, 4, "requires EVEX, L2, W and XS prefix") \ + ENUM_ENTRY(IC_EVEX_L2_W_XD, 4, "requires EVEX, L2, W and XD prefix") \ + ENUM_ENTRY(IC_EVEX_L2_W_OPSIZE, 4, "requires EVEX, L2, W and OpSize") \ + ENUM_ENTRY(IC_EVEX_K, 1, "requires an EVEX_K prefix") \ + ENUM_ENTRY(IC_EVEX_XS_K, 2, "requires EVEX_K and the XS prefix") \ + ENUM_ENTRY(IC_EVEX_XD_K, 2, "requires EVEX_K and the XD prefix") \ + ENUM_ENTRY(IC_EVEX_OPSIZE_K, 2, "requires EVEX_K and the OpSize prefix") \ + ENUM_ENTRY(IC_EVEX_W_K, 3, "requires EVEX_K and the W prefix") \ + ENUM_ENTRY(IC_EVEX_W_XS_K, 4, "requires EVEX_K, W, and XS prefix") \ + ENUM_ENTRY(IC_EVEX_W_XD_K, 4, "requires EVEX_K, W, and XD prefix") \ + ENUM_ENTRY(IC_EVEX_W_OPSIZE_K, 4, "requires EVEX_K, W, and OpSize") \ + ENUM_ENTRY(IC_EVEX_L_K, 3, "requires EVEX_K and the L prefix") \ + ENUM_ENTRY(IC_EVEX_L_XS_K, 4, "requires EVEX_K and the L and XS prefix")\ + ENUM_ENTRY(IC_EVEX_L_XD_K, 4, "requires EVEX_K and the L and XD prefix")\ + ENUM_ENTRY(IC_EVEX_L_OPSIZE_K, 
4, "requires EVEX_K, L, and OpSize") \ + ENUM_ENTRY(IC_EVEX_L_W_K, 3, "requires EVEX_K, L and W") \ + ENUM_ENTRY(IC_EVEX_L_W_XS_K, 4, "requires EVEX_K, L, W and XS prefix") \ + ENUM_ENTRY(IC_EVEX_L_W_XD_K, 4, "requires EVEX_K, L, W and XD prefix") \ + ENUM_ENTRY(IC_EVEX_L_W_OPSIZE_K, 4, "requires EVEX_K, L, W and OpSize") \ + ENUM_ENTRY(IC_EVEX_L2_K, 3, "requires EVEX_K and the L2 prefix") \ + ENUM_ENTRY(IC_EVEX_L2_XS_K, 4, "requires EVEX_K and the L2 and XS prefix")\ + ENUM_ENTRY(IC_EVEX_L2_XD_K, 4, "requires EVEX_K and the L2 and XD prefix")\ + ENUM_ENTRY(IC_EVEX_L2_OPSIZE_K, 4, "requires EVEX_K, L2, and OpSize") \ + ENUM_ENTRY(IC_EVEX_L2_W_K, 3, "requires EVEX_K, L2 and W") \ + ENUM_ENTRY(IC_EVEX_L2_W_XS_K, 4, "requires EVEX_K, L2, W and XS prefix") \ + ENUM_ENTRY(IC_EVEX_L2_W_XD_K, 4, "requires EVEX_K, L2, W and XD prefix") \ + ENUM_ENTRY(IC_EVEX_L2_W_OPSIZE_K, 4, "requires EVEX_K, L2, W and OpSize") \ + ENUM_ENTRY(IC_EVEX_B, 1, "requires an EVEX_B prefix") \ + ENUM_ENTRY(IC_EVEX_XS_B, 2, "requires EVEX_B and the XS prefix") \ + ENUM_ENTRY(IC_EVEX_XD_B, 2, "requires EVEX_B and the XD prefix") \ + ENUM_ENTRY(IC_EVEX_OPSIZE_B, 2, "requires EVEX_B and the OpSize prefix") \ + ENUM_ENTRY(IC_EVEX_W_B, 3, "requires EVEX_B and the W prefix") \ + ENUM_ENTRY(IC_EVEX_W_XS_B, 4, "requires EVEX_B, W, and XS prefix") \ + ENUM_ENTRY(IC_EVEX_W_XD_B, 4, "requires EVEX_B, W, and XD prefix") \ + ENUM_ENTRY(IC_EVEX_W_OPSIZE_B, 4, "requires EVEX_B, W, and OpSize") \ + ENUM_ENTRY(IC_EVEX_L_B, 3, "requires EVEX_B and the L prefix") \ + ENUM_ENTRY(IC_EVEX_L_XS_B, 4, "requires EVEX_B and the L and XS prefix")\ + ENUM_ENTRY(IC_EVEX_L_XD_B, 4, "requires EVEX_B and the L and XD prefix")\ + ENUM_ENTRY(IC_EVEX_L_OPSIZE_B, 4, "requires EVEX_B, L, and OpSize") \ + ENUM_ENTRY(IC_EVEX_L_W_B, 3, "requires EVEX_B, L and W") \ + ENUM_ENTRY(IC_EVEX_L_W_XS_B, 4, "requires EVEX_B, L, W and XS prefix") \ + ENUM_ENTRY(IC_EVEX_L_W_XD_B, 4, "requires EVEX_B, L, W and XD prefix") \ + ENUM_ENTRY(IC_EVEX_L_W_OPSIZE_B, 4, "requires EVEX_B, L, W and OpSize") \ + ENUM_ENTRY(IC_EVEX_L2_B, 3, "requires EVEX_B and the L2 prefix") \ + ENUM_ENTRY(IC_EVEX_L2_XS_B, 4, "requires EVEX_B and the L2 and XS prefix")\ + ENUM_ENTRY(IC_EVEX_L2_XD_B, 4, "requires EVEX_B and the L2 and XD prefix")\ + ENUM_ENTRY(IC_EVEX_L2_OPSIZE_B, 4, "requires EVEX_B, L2, and OpSize") \ + ENUM_ENTRY(IC_EVEX_L2_W_B, 3, "requires EVEX_B, L2 and W") \ + ENUM_ENTRY(IC_EVEX_L2_W_XS_B, 4, "requires EVEX_B, L2, W and XS prefix") \ + ENUM_ENTRY(IC_EVEX_L2_W_XD_B, 4, "requires EVEX_B, L2, W and XD prefix") \ + ENUM_ENTRY(IC_EVEX_L2_W_OPSIZE_B, 4, "requires EVEX_B, L2, W and OpSize") \ + ENUM_ENTRY(IC_EVEX_K_B, 1, "requires EVEX_B and EVEX_K prefix") \ + ENUM_ENTRY(IC_EVEX_XS_K_B, 2, "requires EVEX_B, EVEX_K and the XS prefix") \ + ENUM_ENTRY(IC_EVEX_XD_K_B, 2, "requires EVEX_B, EVEX_K and the XD prefix") \ + ENUM_ENTRY(IC_EVEX_OPSIZE_K_B, 2, "requires EVEX_B, EVEX_K and the OpSize prefix") \ + ENUM_ENTRY(IC_EVEX_W_K_B, 3, "requires EVEX_B, EVEX_K and the W prefix") \ + ENUM_ENTRY(IC_EVEX_W_XS_K_B, 4, "requires EVEX_B, EVEX_K, W, and XS prefix") \ + ENUM_ENTRY(IC_EVEX_W_XD_K_B, 4, "requires EVEX_B, EVEX_K, W, and XD prefix") \ + ENUM_ENTRY(IC_EVEX_W_OPSIZE_K_B, 4, "requires EVEX_B, EVEX_K, W, and OpSize") \ + ENUM_ENTRY(IC_EVEX_L_K_B, 3, "requires EVEX_B, EVEX_K and the L prefix") \ + ENUM_ENTRY(IC_EVEX_L_XS_K_B, 4, "requires EVEX_B, EVEX_K and the L and XS prefix")\ + ENUM_ENTRY(IC_EVEX_L_XD_K_B, 4, "requires EVEX_B, EVEX_K and the L and XD prefix")\ + 
ENUM_ENTRY(IC_EVEX_L_OPSIZE_K_B, 4, "requires EVEX_B, EVEX_K, L, and OpSize") \ + ENUM_ENTRY(IC_EVEX_L_W_K_B, 3, "requires EVEX_B, EVEX_K, L and W") \ + ENUM_ENTRY(IC_EVEX_L_W_XS_K_B, 4, "requires EVEX_B, EVEX_K, L, W and XS prefix") \ + ENUM_ENTRY(IC_EVEX_L_W_XD_K_B, 4, "requires EVEX_B, EVEX_K, L, W and XD prefix") \ + ENUM_ENTRY(IC_EVEX_L_W_OPSIZE_K_B,4, "requires EVEX_B, EVEX_K, L, W and OpSize") \ + ENUM_ENTRY(IC_EVEX_L2_K_B, 3, "requires EVEX_B, EVEX_K and the L2 prefix") \ + ENUM_ENTRY(IC_EVEX_L2_XS_K_B, 4, "requires EVEX_B, EVEX_K and the L2 and XS prefix")\ + ENUM_ENTRY(IC_EVEX_L2_XD_K_B, 4, "requires EVEX_B, EVEX_K and the L2 and XD prefix")\ + ENUM_ENTRY(IC_EVEX_L2_OPSIZE_K_B, 4, "requires EVEX_B, EVEX_K, L2, and OpSize") \ + ENUM_ENTRY(IC_EVEX_L2_W_K_B, 3, "requires EVEX_B, EVEX_K, L2 and W") \ + ENUM_ENTRY(IC_EVEX_L2_W_XS_K_B, 4, "requires EVEX_B, EVEX_K, L2, W and XS prefix") \ + ENUM_ENTRY(IC_EVEX_L2_W_XD_K_B, 4, "requires EVEX_B, EVEX_K, L2, W and XD prefix") \ + ENUM_ENTRY(IC_EVEX_L2_W_OPSIZE_K_B,4, "requires EVEX_B, EVEX_K, L2, W and OpSize") \ + ENUM_ENTRY(IC_EVEX_KZ_B, 1, "requires EVEX_B and EVEX_KZ prefix") \ + ENUM_ENTRY(IC_EVEX_XS_KZ_B, 2, "requires EVEX_B, EVEX_KZ and the XS prefix") \ + ENUM_ENTRY(IC_EVEX_XD_KZ_B, 2, "requires EVEX_B, EVEX_KZ and the XD prefix") \ + ENUM_ENTRY(IC_EVEX_OPSIZE_KZ_B, 2, "requires EVEX_B, EVEX_KZ and the OpSize prefix") \ + ENUM_ENTRY(IC_EVEX_W_KZ_B, 3, "requires EVEX_B, EVEX_KZ and the W prefix") \ + ENUM_ENTRY(IC_EVEX_W_XS_KZ_B, 4, "requires EVEX_B, EVEX_KZ, W, and XS prefix") \ + ENUM_ENTRY(IC_EVEX_W_XD_KZ_B, 4, "requires EVEX_B, EVEX_KZ, W, and XD prefix") \ + ENUM_ENTRY(IC_EVEX_W_OPSIZE_KZ_B, 4, "requires EVEX_B, EVEX_KZ, W, and OpSize") \ + ENUM_ENTRY(IC_EVEX_L_KZ_B, 3, "requires EVEX_B, EVEX_KZ and the L prefix") \ + ENUM_ENTRY(IC_EVEX_L_XS_KZ_B, 4, "requires EVEX_B, EVEX_KZ and the L and XS prefix")\ + ENUM_ENTRY(IC_EVEX_L_XD_KZ_B, 4, "requires EVEX_B, EVEX_KZ and the L and XD prefix")\ + ENUM_ENTRY(IC_EVEX_L_OPSIZE_KZ_B, 4, "requires EVEX_B, EVEX_KZ, L, and OpSize") \ + ENUM_ENTRY(IC_EVEX_L_W_KZ_B, 3, "requires EVEX_B, EVEX_KZ, L and W") \ + ENUM_ENTRY(IC_EVEX_L_W_XS_KZ_B, 4, "requires EVEX_B, EVEX_KZ, L, W and XS prefix") \ + ENUM_ENTRY(IC_EVEX_L_W_XD_KZ_B, 4, "requires EVEX_B, EVEX_KZ, L, W and XD prefix") \ + ENUM_ENTRY(IC_EVEX_L_W_OPSIZE_KZ_B, 4, "requires EVEX_B, EVEX_KZ, L, W and OpSize") \ + ENUM_ENTRY(IC_EVEX_L2_KZ_B, 3, "requires EVEX_B, EVEX_KZ and the L2 prefix") \ + ENUM_ENTRY(IC_EVEX_L2_XS_KZ_B, 4, "requires EVEX_B, EVEX_KZ and the L2 and XS prefix")\ + ENUM_ENTRY(IC_EVEX_L2_XD_KZ_B, 4, "requires EVEX_B, EVEX_KZ and the L2 and XD prefix")\ + ENUM_ENTRY(IC_EVEX_L2_OPSIZE_KZ_B, 4, "requires EVEX_B, EVEX_KZ, L2, and OpSize") \ + ENUM_ENTRY(IC_EVEX_L2_W_KZ_B, 3, "requires EVEX_B, EVEX_KZ, L2 and W") \ + ENUM_ENTRY(IC_EVEX_L2_W_XS_KZ_B, 4, "requires EVEX_B, EVEX_KZ, L2, W and XS prefix") \ + ENUM_ENTRY(IC_EVEX_L2_W_XD_KZ_B, 4, "requires EVEX_B, EVEX_KZ, L2, W and XD prefix") \ + ENUM_ENTRY(IC_EVEX_L2_W_OPSIZE_KZ_B, 4, "requires EVEX_B, EVEX_KZ, L2, W and OpSize") \ + ENUM_ENTRY(IC_EVEX_KZ, 1, "requires an EVEX_KZ prefix") \ + ENUM_ENTRY(IC_EVEX_XS_KZ, 2, "requires EVEX_KZ and the XS prefix") \ + ENUM_ENTRY(IC_EVEX_XD_KZ, 2, "requires EVEX_KZ and the XD prefix") \ + ENUM_ENTRY(IC_EVEX_OPSIZE_KZ, 2, "requires EVEX_KZ and the OpSize prefix") \ + ENUM_ENTRY(IC_EVEX_W_KZ, 3, "requires EVEX_KZ and the W prefix") \ + ENUM_ENTRY(IC_EVEX_W_XS_KZ, 4, "requires EVEX_KZ, W, and XS prefix") \ + ENUM_ENTRY(IC_EVEX_W_XD_KZ, 4, 
"requires EVEX_KZ, W, and XD prefix") \ + ENUM_ENTRY(IC_EVEX_W_OPSIZE_KZ, 4, "requires EVEX_KZ, W, and OpSize") \ + ENUM_ENTRY(IC_EVEX_L_KZ, 3, "requires EVEX_KZ and the L prefix") \ + ENUM_ENTRY(IC_EVEX_L_XS_KZ, 4, "requires EVEX_KZ and the L and XS prefix")\ + ENUM_ENTRY(IC_EVEX_L_XD_KZ, 4, "requires EVEX_KZ and the L and XD prefix")\ + ENUM_ENTRY(IC_EVEX_L_OPSIZE_KZ, 4, "requires EVEX_KZ, L, and OpSize") \ + ENUM_ENTRY(IC_EVEX_L_W_KZ, 3, "requires EVEX_KZ, L and W") \ + ENUM_ENTRY(IC_EVEX_L_W_XS_KZ, 4, "requires EVEX_KZ, L, W and XS prefix") \ + ENUM_ENTRY(IC_EVEX_L_W_XD_KZ, 4, "requires EVEX_KZ, L, W and XD prefix") \ + ENUM_ENTRY(IC_EVEX_L_W_OPSIZE_KZ, 4, "requires EVEX_KZ, L, W and OpSize") \ + ENUM_ENTRY(IC_EVEX_L2_KZ, 3, "requires EVEX_KZ and the L2 prefix") \ + ENUM_ENTRY(IC_EVEX_L2_XS_KZ, 4, "requires EVEX_KZ and the L2 and XS prefix")\ + ENUM_ENTRY(IC_EVEX_L2_XD_KZ, 4, "requires EVEX_KZ and the L2 and XD prefix")\ + ENUM_ENTRY(IC_EVEX_L2_OPSIZE_KZ, 4, "requires EVEX_KZ, L2, and OpSize") \ + ENUM_ENTRY(IC_EVEX_L2_W_KZ, 3, "requires EVEX_KZ, L2 and W") \ + ENUM_ENTRY(IC_EVEX_L2_W_XS_KZ, 4, "requires EVEX_KZ, L2, W and XS prefix") \ + ENUM_ENTRY(IC_EVEX_L2_W_XD_KZ, 4, "requires EVEX_KZ, L2, W and XD prefix") \ + ENUM_ENTRY(IC_EVEX_L2_W_OPSIZE_KZ, 4, "requires EVEX_KZ, L2, W and OpSize") + +#define ENUM_ENTRY(n, r, d) n, +enum InstructionContext { + INSTRUCTION_CONTEXTS + IC_max +}; +#undef ENUM_ENTRY + +// Opcode types, which determine which decode table to use, both in the Intel +// manual and also for the decoder. +enum OpcodeType { + ONEBYTE = 0, + TWOBYTE = 1, + THREEBYTE_38 = 2, + THREEBYTE_3A = 3, + XOP8_MAP = 4, + XOP9_MAP = 5, + XOPA_MAP = 6 +}; + +// The following structs are used for the hierarchical decode table. After +// determining the instruction's class (i.e., which IC_* constant applies to +// it), the decoder reads the opcode. Some instructions require specific +// values of the ModR/M byte, so the ModR/M byte indexes into the final table. +// +// If a ModR/M byte is not required, "required" is left unset, and the values +// for each instructionID are identical. +typedef uint16_t InstrUID; + +// ModRMDecisionType - describes the type of ModR/M decision, allowing the +// consumer to determine the number of entries in it. +// +// MODRM_ONEENTRY - No matter what the value of the ModR/M byte is, the decoded +// instruction is the same. +// MODRM_SPLITRM - If the ModR/M byte is between 0x00 and 0xbf, the opcode +// corresponds to one instruction; otherwise, it corresponds to +// a different instruction. +// MODRM_SPLITMISC- If the ModR/M byte is between 0x00 and 0xbf, ModR/M byte +// divided by 8 is used to select instruction; otherwise, each +// value of the ModR/M byte could correspond to a different +// instruction. +// MODRM_SPLITREG - ModR/M byte divided by 8 is used to select instruction. This +// corresponds to instructions that use reg field as opcode +// MODRM_FULL - Potentially, each value of the ModR/M byte could correspond +// to a different instruction. 
+#define MODRMTYPES \ + ENUM_ENTRY(MODRM_ONEENTRY) \ + ENUM_ENTRY(MODRM_SPLITRM) \ + ENUM_ENTRY(MODRM_SPLITMISC) \ + ENUM_ENTRY(MODRM_SPLITREG) \ + ENUM_ENTRY(MODRM_FULL) + +#define ENUM_ENTRY(n) n, +enum ModRMDecisionType { + MODRMTYPES + MODRM_max +}; +#undef ENUM_ENTRY + +#define CASE_ENCODING_RM \ + case ENCODING_RM: \ + case ENCODING_RM_CD2: \ + case ENCODING_RM_CD4: \ + case ENCODING_RM_CD8: \ + case ENCODING_RM_CD16: \ + case ENCODING_RM_CD32: \ + case ENCODING_RM_CD64 + +// Physical encodings of instruction operands. +#define ENCODINGS \ + ENUM_ENTRY(ENCODING_NONE, "") \ + ENUM_ENTRY(ENCODING_REG, "Register operand in ModR/M byte.") \ + ENUM_ENTRY(ENCODING_RM, "R/M operand in ModR/M byte.") \ + ENUM_ENTRY(ENCODING_RM_CD2, "R/M operand with CDisp scaling of 2") \ + ENUM_ENTRY(ENCODING_RM_CD4, "R/M operand with CDisp scaling of 4") \ + ENUM_ENTRY(ENCODING_RM_CD8, "R/M operand with CDisp scaling of 8") \ + ENUM_ENTRY(ENCODING_RM_CD16,"R/M operand with CDisp scaling of 16") \ + ENUM_ENTRY(ENCODING_RM_CD32,"R/M operand with CDisp scaling of 32") \ + ENUM_ENTRY(ENCODING_RM_CD64,"R/M operand with CDisp scaling of 64") \ + ENUM_ENTRY(ENCODING_VVVV, "Register operand in VEX.vvvv byte.") \ + ENUM_ENTRY(ENCODING_WRITEMASK, "Register operand in EVEX.aaa byte.") \ + ENUM_ENTRY(ENCODING_CB, "1-byte code offset (possible new CS value)") \ + ENUM_ENTRY(ENCODING_CW, "2-byte") \ + ENUM_ENTRY(ENCODING_CD, "4-byte") \ + ENUM_ENTRY(ENCODING_CP, "6-byte") \ + ENUM_ENTRY(ENCODING_CO, "8-byte") \ + ENUM_ENTRY(ENCODING_CT, "10-byte") \ + ENUM_ENTRY(ENCODING_IB, "1-byte immediate") \ + ENUM_ENTRY(ENCODING_IW, "2-byte") \ + ENUM_ENTRY(ENCODING_ID, "4-byte") \ + ENUM_ENTRY(ENCODING_IO, "8-byte") \ + ENUM_ENTRY(ENCODING_RB, "(AL..DIL, R8L..R15L) Register code added to " \ + "the opcode byte") \ + ENUM_ENTRY(ENCODING_RW, "(AX..DI, R8W..R15W)") \ + ENUM_ENTRY(ENCODING_RD, "(EAX..EDI, R8D..R15D)") \ + ENUM_ENTRY(ENCODING_RO, "(RAX..RDI, R8..R15)") \ + ENUM_ENTRY(ENCODING_FP, "Position on floating-point stack in ModR/M " \ + "byte.") \ + \ + ENUM_ENTRY(ENCODING_Iv, "Immediate of operand size") \ + ENUM_ENTRY(ENCODING_Ia, "Immediate of address size") \ + ENUM_ENTRY(ENCODING_Rv, "Register code of operand size added to the " \ + "opcode byte") \ + ENUM_ENTRY(ENCODING_DUP, "Duplicate of another operand; ID is encoded " \ + "in type") \ + ENUM_ENTRY(ENCODING_SI, "Source index; encoded in OpSize/Adsize prefix") \ + ENUM_ENTRY(ENCODING_DI, "Destination index; encoded in prefixes") + +#define ENUM_ENTRY(n, d) n, +enum OperandEncoding { + ENCODINGS + ENCODING_max +}; +#undef ENUM_ENTRY + +// Semantic interpretations of instruction operands. 
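+//
+// Each operand in the generated tables pairs one of the ENCODING_* values
+// above with one of the TYPE_* values below (see struct OperandSpecifier
+// later in this file); for instance, a plain byte immediate would typically
+// pair ENCODING_IB with TYPE_IMM8.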
+#define TYPES \ + ENUM_ENTRY(TYPE_NONE, "") \ + ENUM_ENTRY(TYPE_REL8, "1-byte immediate address") \ + ENUM_ENTRY(TYPE_REL16, "2-byte") \ + ENUM_ENTRY(TYPE_REL32, "4-byte") \ + ENUM_ENTRY(TYPE_REL64, "8-byte") \ + ENUM_ENTRY(TYPE_PTR1616, "2+2-byte segment+offset address") \ + ENUM_ENTRY(TYPE_PTR1632, "2+4-byte") \ + ENUM_ENTRY(TYPE_PTR1664, "2+8-byte") \ + ENUM_ENTRY(TYPE_R8, "1-byte register operand") \ + ENUM_ENTRY(TYPE_R16, "2-byte") \ + ENUM_ENTRY(TYPE_R32, "4-byte") \ + ENUM_ENTRY(TYPE_R64, "8-byte") \ + ENUM_ENTRY(TYPE_IMM8, "1-byte immediate operand") \ + ENUM_ENTRY(TYPE_IMM16, "2-byte") \ + ENUM_ENTRY(TYPE_IMM32, "4-byte") \ + ENUM_ENTRY(TYPE_IMM64, "8-byte") \ + ENUM_ENTRY(TYPE_IMM3, "1-byte immediate operand between 0 and 7") \ + ENUM_ENTRY(TYPE_IMM5, "1-byte immediate operand between 0 and 31") \ + ENUM_ENTRY(TYPE_AVX512ICC, "1-byte immediate operand for AVX512 icmp") \ + ENUM_ENTRY(TYPE_UIMM8, "1-byte unsigned immediate operand") \ + ENUM_ENTRY(TYPE_RM8, "1-byte register or memory operand") \ + ENUM_ENTRY(TYPE_RM16, "2-byte") \ + ENUM_ENTRY(TYPE_RM32, "4-byte") \ + ENUM_ENTRY(TYPE_RM64, "8-byte") \ + ENUM_ENTRY(TYPE_M, "Memory operand") \ + ENUM_ENTRY(TYPE_M8, "1-byte") \ + ENUM_ENTRY(TYPE_M16, "2-byte") \ + ENUM_ENTRY(TYPE_M32, "4-byte") \ + ENUM_ENTRY(TYPE_M64, "8-byte") \ + ENUM_ENTRY(TYPE_LEA, "Effective address") \ + ENUM_ENTRY(TYPE_M128, "16-byte (SSE/SSE2)") \ + ENUM_ENTRY(TYPE_M256, "256-byte (AVX)") \ + ENUM_ENTRY(TYPE_M1616, "2+2-byte segment+offset address") \ + ENUM_ENTRY(TYPE_M1632, "2+4-byte") \ + ENUM_ENTRY(TYPE_M1664, "2+8-byte") \ + ENUM_ENTRY(TYPE_SRCIDX8, "1-byte memory at source index") \ + ENUM_ENTRY(TYPE_SRCIDX16, "2-byte memory at source index") \ + ENUM_ENTRY(TYPE_SRCIDX32, "4-byte memory at source index") \ + ENUM_ENTRY(TYPE_SRCIDX64, "8-byte memory at source index") \ + ENUM_ENTRY(TYPE_DSTIDX8, "1-byte memory at destination index") \ + ENUM_ENTRY(TYPE_DSTIDX16, "2-byte memory at destination index") \ + ENUM_ENTRY(TYPE_DSTIDX32, "4-byte memory at destination index") \ + ENUM_ENTRY(TYPE_DSTIDX64, "8-byte memory at destination index") \ + ENUM_ENTRY(TYPE_MOFFS8, "1-byte memory offset (relative to segment " \ + "base)") \ + ENUM_ENTRY(TYPE_MOFFS16, "2-byte") \ + ENUM_ENTRY(TYPE_MOFFS32, "4-byte") \ + ENUM_ENTRY(TYPE_MOFFS64, "8-byte") \ + ENUM_ENTRY(TYPE_SREG, "Byte with single bit set: 0 = ES, 1 = CS, " \ + "2 = SS, 3 = DS, 4 = FS, 5 = GS") \ + ENUM_ENTRY(TYPE_M32FP, "32-bit IEE754 memory floating-point operand") \ + ENUM_ENTRY(TYPE_M64FP, "64-bit") \ + ENUM_ENTRY(TYPE_M80FP, "80-bit extended") \ + ENUM_ENTRY(TYPE_ST, "Position on the floating-point stack") \ + ENUM_ENTRY(TYPE_MM64, "8-byte MMX register") \ + ENUM_ENTRY(TYPE_XMM, "XMM register operand") \ + ENUM_ENTRY(TYPE_XMM32, "4-byte XMM register or memory operand") \ + ENUM_ENTRY(TYPE_XMM64, "8-byte") \ + ENUM_ENTRY(TYPE_XMM128, "16-byte") \ + ENUM_ENTRY(TYPE_XMM256, "32-byte") \ + ENUM_ENTRY(TYPE_XMM512, "64-byte") \ + ENUM_ENTRY(TYPE_VK1, "1-bit") \ + ENUM_ENTRY(TYPE_VK2, "2-bit") \ + ENUM_ENTRY(TYPE_VK4, "4-bit") \ + ENUM_ENTRY(TYPE_VK8, "8-bit") \ + ENUM_ENTRY(TYPE_VK16, "16-bit") \ + ENUM_ENTRY(TYPE_VK32, "32-bit") \ + ENUM_ENTRY(TYPE_VK64, "64-bit") \ + ENUM_ENTRY(TYPE_XMM0, "Implicit use of XMM0") \ + ENUM_ENTRY(TYPE_SEGMENTREG, "Segment register operand") \ + ENUM_ENTRY(TYPE_DEBUGREG, "Debug register operand") \ + ENUM_ENTRY(TYPE_CONTROLREG, "Control register operand") \ + ENUM_ENTRY(TYPE_BNDR, "MPX bounds register") \ + \ + ENUM_ENTRY(TYPE_Mv, "Memory operand of operand size") \ + 
ENUM_ENTRY(TYPE_Rv, "Register operand of operand size") \ + ENUM_ENTRY(TYPE_IMMv, "Immediate operand of operand size") \ + ENUM_ENTRY(TYPE_RELv, "Immediate address of operand size") \ + ENUM_ENTRY(TYPE_DUP0, "Duplicate of operand 0") \ + ENUM_ENTRY(TYPE_DUP1, "operand 1") \ + ENUM_ENTRY(TYPE_DUP2, "operand 2") \ + ENUM_ENTRY(TYPE_DUP3, "operand 3") \ + ENUM_ENTRY(TYPE_DUP4, "operand 4") \ + ENUM_ENTRY(TYPE_M512, "512-bit FPU/MMX/XMM/MXCSR state") + +#define ENUM_ENTRY(n, d) n, +enum OperandType { + TYPES + TYPE_max +}; +#undef ENUM_ENTRY + +/// \brief The specification for how to extract and interpret one operand. +struct OperandSpecifier { + uint8_t encoding; + uint8_t type; +}; + +static const unsigned X86_MAX_OPERANDS = 6; + +/// Decoding mode for the Intel disassembler. 16-bit, 32-bit, and 64-bit mode +/// are supported, and represent real mode, IA-32e, and IA-32e in 64-bit mode, +/// respectively. +enum DisassemblerMode { + MODE_16BIT, + MODE_32BIT, + MODE_64BIT +}; + +} // namespace X86Disassembler +} // namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp b/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp new file mode 100644 index 0000000..b4c0bc4 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp @@ -0,0 +1,289 @@ +//===-- X86ATTInstPrinter.cpp - AT&T assembly instruction printing --------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file includes code for rendering MCInst instances as AT&T-style +// assembly. +// +//===----------------------------------------------------------------------===// + +#include "X86ATTInstPrinter.h" +#include "MCTargetDesc/X86BaseInfo.h" +#include "MCTargetDesc/X86MCTargetDesc.h" +#include "X86InstComments.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/Format.h" +#include "llvm/Support/FormattedStream.h" +#include <map> +using namespace llvm; + +#define DEBUG_TYPE "asm-printer" + +// Include the auto-generated portion of the assembly writer. +#define PRINT_ALIAS_INSTR +#include "X86GenAsmWriter.inc" + +void X86ATTInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const { + OS << markup("<reg:") << '%' << getRegisterName(RegNo) << markup(">"); +} + +void X86ATTInstPrinter::printInst(const MCInst *MI, raw_ostream &OS, + StringRef Annot, const MCSubtargetInfo &STI) { + const MCInstrDesc &Desc = MII.get(MI->getOpcode()); + uint64_t TSFlags = Desc.TSFlags; + + // If verbose assembly is enabled, we can print some informative comments. + if (CommentStream) + HasCustomInstComment = + EmitAnyX86InstComments(MI, *CommentStream, getRegisterName); + + if (TSFlags & X86II::LOCK) + OS << "\tlock\t"; + + // Output CALLpcrel32 as "callq" in 64-bit mode. + // In Intel annotation it's always emitted as "call". + // + // TODO: Probably this hack should be redesigned via InstAlias in + // InstrInfo.td as soon as Requires clause is supported properly + // for InstAlias. 
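+ // For illustration: "call foo" parsed in 64-bit mode still produces
+ // CALLpcrel32, so the block below prints it with the "q" suffix that
+ // 64-bit AT&T output conventionally uses (example mnemonics, not verbatim
+ // parser output).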
+ if (MI->getOpcode() == X86::CALLpcrel32 && + (STI.getFeatureBits()[X86::Mode64Bit])) { + OS << "\tcallq\t"; + printPCRelImm(MI, 0, OS); + } + // Try to print any aliases first. + else if (!printAliasInstr(MI, OS)) + printInstruction(MI, OS); + + // Next always print the annotation. + printAnnotation(OS, Annot); +} + +void X86ATTInstPrinter::printSSEAVXCC(const MCInst *MI, unsigned Op, + raw_ostream &O) { + int64_t Imm = MI->getOperand(Op).getImm(); + switch (Imm) { + default: llvm_unreachable("Invalid ssecc/avxcc argument!"); + case 0: O << "eq"; break; + case 1: O << "lt"; break; + case 2: O << "le"; break; + case 3: O << "unord"; break; + case 4: O << "neq"; break; + case 5: O << "nlt"; break; + case 6: O << "nle"; break; + case 7: O << "ord"; break; + case 8: O << "eq_uq"; break; + case 9: O << "nge"; break; + case 0xa: O << "ngt"; break; + case 0xb: O << "false"; break; + case 0xc: O << "neq_oq"; break; + case 0xd: O << "ge"; break; + case 0xe: O << "gt"; break; + case 0xf: O << "true"; break; + case 0x10: O << "eq_os"; break; + case 0x11: O << "lt_oq"; break; + case 0x12: O << "le_oq"; break; + case 0x13: O << "unord_s"; break; + case 0x14: O << "neq_us"; break; + case 0x15: O << "nlt_uq"; break; + case 0x16: O << "nle_uq"; break; + case 0x17: O << "ord_s"; break; + case 0x18: O << "eq_us"; break; + case 0x19: O << "nge_uq"; break; + case 0x1a: O << "ngt_uq"; break; + case 0x1b: O << "false_os"; break; + case 0x1c: O << "neq_os"; break; + case 0x1d: O << "ge_oq"; break; + case 0x1e: O << "gt_oq"; break; + case 0x1f: O << "true_us"; break; + } +} + +void X86ATTInstPrinter::printXOPCC(const MCInst *MI, unsigned Op, + raw_ostream &O) { + int64_t Imm = MI->getOperand(Op).getImm(); + switch (Imm) { + default: llvm_unreachable("Invalid xopcc argument!"); + case 0: O << "lt"; break; + case 1: O << "le"; break; + case 2: O << "gt"; break; + case 3: O << "ge"; break; + case 4: O << "eq"; break; + case 5: O << "neq"; break; + case 6: O << "false"; break; + case 7: O << "true"; break; + } +} + +void X86ATTInstPrinter::printRoundingControl(const MCInst *MI, unsigned Op, + raw_ostream &O) { + int64_t Imm = MI->getOperand(Op).getImm() & 0x3; + switch (Imm) { + case 0: O << "{rn-sae}"; break; + case 1: O << "{rd-sae}"; break; + case 2: O << "{ru-sae}"; break; + case 3: O << "{rz-sae}"; break; + } +} +/// printPCRelImm - This is used to print an immediate value that ends up +/// being encoded as a pc-relative value (e.g. for jumps and calls). These +/// print slightly differently than normal immediates. For example, a $ is not +/// emitted. +void X86ATTInstPrinter::printPCRelImm(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNo); + if (Op.isImm()) + O << formatImm(Op.getImm()); + else { + assert(Op.isExpr() && "unknown pcrel immediate operand"); + // If a symbolic branch target was added as a constant expression then print + // that address in hex. + const MCConstantExpr *BranchTarget = dyn_cast<MCConstantExpr>(Op.getExpr()); + int64_t Address; + if (BranchTarget && BranchTarget->evaluateAsAbsolute(Address)) { + O << formatHex((uint64_t)Address); + } else { + // Otherwise, just print the expression. + Op.getExpr()->print(O, &MAI); + } + } +} + +void X86ATTInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNo); + if (Op.isReg()) { + printRegName(O, Op.getReg()); + } else if (Op.isImm()) { + // Print X86 immediates as signed values. 
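+ // For example (illustrative): an operand value of 0xffffffffffffffff is
+ // printed as $-1, and a value such as 4096 is printed as $4096 with an
+ // "imm = 0x1000" comment appended below when no custom comment exists.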
+ O << markup("<imm:") << '$' << formatImm((int64_t)Op.getImm()) + << markup(">"); + + // If there are no instruction-specific comments, add a comment clarifying + // the hex value of the immediate operand when it isn't in the range + // [-256,255]. + if (CommentStream && !HasCustomInstComment && + (Op.getImm() > 255 || Op.getImm() < -256)) + *CommentStream << format("imm = 0x%" PRIX64 "\n", (uint64_t)Op.getImm()); + + } else { + assert(Op.isExpr() && "unknown operand kind in printOperand"); + O << markup("<imm:") << '$'; + Op.getExpr()->print(O, &MAI); + O << markup(">"); + } +} + +void X86ATTInstPrinter::printMemReference(const MCInst *MI, unsigned Op, + raw_ostream &O) { + const MCOperand &BaseReg = MI->getOperand(Op + X86::AddrBaseReg); + const MCOperand &IndexReg = MI->getOperand(Op + X86::AddrIndexReg); + const MCOperand &DispSpec = MI->getOperand(Op + X86::AddrDisp); + const MCOperand &SegReg = MI->getOperand(Op + X86::AddrSegmentReg); + + O << markup("<mem:"); + + // If this has a segment register, print it. + if (SegReg.getReg()) { + printOperand(MI, Op + X86::AddrSegmentReg, O); + O << ':'; + } + + if (DispSpec.isImm()) { + int64_t DispVal = DispSpec.getImm(); + if (DispVal || (!IndexReg.getReg() && !BaseReg.getReg())) + O << formatImm(DispVal); + } else { + assert(DispSpec.isExpr() && "non-immediate displacement for LEA?"); + DispSpec.getExpr()->print(O, &MAI); + } + + if (IndexReg.getReg() || BaseReg.getReg()) { + O << '('; + if (BaseReg.getReg()) + printOperand(MI, Op + X86::AddrBaseReg, O); + + if (IndexReg.getReg()) { + O << ','; + printOperand(MI, Op + X86::AddrIndexReg, O); + unsigned ScaleVal = MI->getOperand(Op + X86::AddrScaleAmt).getImm(); + if (ScaleVal != 1) { + O << ',' << markup("<imm:") << ScaleVal // never printed in hex. + << markup(">"); + } + } + O << ')'; + } + + O << markup(">"); +} + +void X86ATTInstPrinter::printSrcIdx(const MCInst *MI, unsigned Op, + raw_ostream &O) { + const MCOperand &SegReg = MI->getOperand(Op + 1); + + O << markup("<mem:"); + + // If this has a segment register, print it. + if (SegReg.getReg()) { + printOperand(MI, Op + 1, O); + O << ':'; + } + + O << "("; + printOperand(MI, Op, O); + O << ")"; + + O << markup(">"); +} + +void X86ATTInstPrinter::printDstIdx(const MCInst *MI, unsigned Op, + raw_ostream &O) { + O << markup("<mem:"); + + O << "%es:("; + printOperand(MI, Op, O); + O << ")"; + + O << markup(">"); +} + +void X86ATTInstPrinter::printMemOffset(const MCInst *MI, unsigned Op, + raw_ostream &O) { + const MCOperand &DispSpec = MI->getOperand(Op); + const MCOperand &SegReg = MI->getOperand(Op + 1); + + O << markup("<mem:"); + + // If this has a segment register, print it. 
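+ // (e.g. a %gs-relative access prints as "%gs:0x30"; a plain moffs operand,
+ // such as the absolute address in "movabsq 0x1122334455667788, %rax",
+ // prints just the displacement -- illustrative examples.)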
+ if (SegReg.getReg()) { + printOperand(MI, Op + 1, O); + O << ':'; + } + + if (DispSpec.isImm()) { + O << formatImm(DispSpec.getImm()); + } else { + assert(DispSpec.isExpr() && "non-immediate displacement?"); + DispSpec.getExpr()->print(O, &MAI); + } + + O << markup(">"); +} + +void X86ATTInstPrinter::printU8Imm(const MCInst *MI, unsigned Op, + raw_ostream &O) { + O << markup("<imm:") << '$' << formatImm(MI->getOperand(Op).getImm() & 0xff) + << markup(">"); +} diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h b/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h new file mode 100644 index 0000000..bbb3090 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h @@ -0,0 +1,142 @@ +//==- X86ATTInstPrinter.h - Convert X86 MCInst to assembly syntax -*- C++ -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This class prints an X86 MCInst to AT&T style .s file syntax. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_X86_INSTPRINTER_X86ATTINSTPRINTER_H +#define LLVM_LIB_TARGET_X86_INSTPRINTER_X86ATTINSTPRINTER_H + +#include "llvm/MC/MCInstPrinter.h" + +namespace llvm { + +class X86ATTInstPrinter final : public MCInstPrinter { +public: + X86ATTInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, + const MCRegisterInfo &MRI) + : MCInstPrinter(MAI, MII, MRI) {} + + void printRegName(raw_ostream &OS, unsigned RegNo) const override; + void printInst(const MCInst *MI, raw_ostream &OS, StringRef Annot, + const MCSubtargetInfo &STI) override; + + // Autogenerated by tblgen, returns true if we successfully printed an + // alias. + bool printAliasInstr(const MCInst *MI, raw_ostream &OS); + void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx, + unsigned PrintMethodIdx, raw_ostream &O); + + // Autogenerated by tblgen. 
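+ // (These are implemented in the TableGen-generated X86GenAsmWriter.inc that
+ // the .cpp includes; e.g. getRegisterName(X86::RAX) yields "rax", and
+ // printRegName() wraps it with the '%' prefix and markup.)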
+ void printInstruction(const MCInst *MI, raw_ostream &OS); + static const char *getRegisterName(unsigned RegNo); + + void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &OS); + void printMemReference(const MCInst *MI, unsigned Op, raw_ostream &OS); + void printSSEAVXCC(const MCInst *MI, unsigned Op, raw_ostream &OS); + void printXOPCC(const MCInst *MI, unsigned Op, raw_ostream &OS); + void printPCRelImm(const MCInst *MI, unsigned OpNo, raw_ostream &OS); + void printSrcIdx(const MCInst *MI, unsigned OpNo, raw_ostream &OS); + void printDstIdx(const MCInst *MI, unsigned OpNo, raw_ostream &OS); + void printMemOffset(const MCInst *MI, unsigned OpNo, raw_ostream &OS); + void printRoundingControl(const MCInst *MI, unsigned Op, raw_ostream &OS); + void printU8Imm(const MCInst *MI, unsigned Op, raw_ostream &OS); + + void printanymem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printMemReference(MI, OpNo, O); + } + + void printopaquemem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printMemReference(MI, OpNo, O); + } + + void printi8mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printMemReference(MI, OpNo, O); + } + void printi16mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printMemReference(MI, OpNo, O); + } + void printi32mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printMemReference(MI, OpNo, O); + } + void printi64mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printMemReference(MI, OpNo, O); + } + void printi128mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printMemReference(MI, OpNo, O); + } + void printi256mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printMemReference(MI, OpNo, O); + } + void printi512mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printMemReference(MI, OpNo, O); + } + void printf32mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printMemReference(MI, OpNo, O); + } + void printf64mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printMemReference(MI, OpNo, O); + } + void printf80mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printMemReference(MI, OpNo, O); + } + void printf128mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printMemReference(MI, OpNo, O); + } + void printf256mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printMemReference(MI, OpNo, O); + } + void printf512mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printMemReference(MI, OpNo, O); + } + + void printSrcIdx8(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printSrcIdx(MI, OpNo, O); + } + void printSrcIdx16(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printSrcIdx(MI, OpNo, O); + } + void printSrcIdx32(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printSrcIdx(MI, OpNo, O); + } + void printSrcIdx64(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printSrcIdx(MI, OpNo, O); + } + void printDstIdx8(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printDstIdx(MI, OpNo, O); + } + void printDstIdx16(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printDstIdx(MI, OpNo, O); + } + void printDstIdx32(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printDstIdx(MI, OpNo, O); + } + void printDstIdx64(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printDstIdx(MI, OpNo, O); + } + void printMemOffs8(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printMemOffset(MI, OpNo, O); + } + void printMemOffs16(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printMemOffset(MI, OpNo, O); + } + void printMemOffs32(const MCInst *MI, 
unsigned OpNo, raw_ostream &O) { + printMemOffset(MI, OpNo, O); + } + void printMemOffs64(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printMemOffset(MI, OpNo, O); + } + +private: + bool HasCustomInstComment; +}; +} + +#endif diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.cpp b/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.cpp new file mode 100644 index 0000000..73f654c --- /dev/null +++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.cpp @@ -0,0 +1,820 @@ +//===-- X86InstComments.cpp - Generate verbose-asm comments for instrs ----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This defines functionality used to emit comments about X86 instructions to +// an output stream for -fverbose-asm. +// +//===----------------------------------------------------------------------===// + +#include "X86InstComments.h" +#include "MCTargetDesc/X86MCTargetDesc.h" +#include "Utils/X86ShuffleDecode.h" +#include "llvm/MC/MCInst.h" +#include "llvm/CodeGen/MachineValueType.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +static unsigned getVectorRegSize(unsigned RegNo) { + if (X86::ZMM0 <= RegNo && RegNo <= X86::ZMM31) + return 512; + if (X86::YMM0 <= RegNo && RegNo <= X86::YMM31) + return 256; + if (X86::XMM0 <= RegNo && RegNo <= X86::XMM31) + return 128; + if (X86::MM0 <= RegNo && RegNo <= X86::MM7) + return 64; + + llvm_unreachable("Unknown vector reg!"); +} + +static MVT getRegOperandVectorVT(const MCInst *MI, const MVT &ScalarVT, + unsigned OperandIndex) { + unsigned OpReg = MI->getOperand(OperandIndex).getReg(); + return MVT::getVectorVT(ScalarVT, + getVectorRegSize(OpReg)/ScalarVT.getSizeInBits()); +} + +/// \brief Extracts the src/dst types for a given zero extension instruction. +/// \note While the number of elements in DstVT type correct, the +/// number in the SrcVT type is expanded to fill the src xmm register and the +/// upper elements may not be included in the dst xmm/ymm register. 
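+/// For example (illustrative): for PMOVZXBWrr this reports SrcVT = v16i8 and
+/// DstVT = v8i16, even though only the low 8 source bytes contribute to the
+/// result.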
+static void getZeroExtensionTypes(const MCInst *MI, MVT &SrcVT, MVT &DstVT) { + switch (MI->getOpcode()) { + default: + llvm_unreachable("Unknown zero extension instruction"); + // i8 zero extension + case X86::PMOVZXBWrm: + case X86::PMOVZXBWrr: + case X86::VPMOVZXBWrm: + case X86::VPMOVZXBWrr: + SrcVT = MVT::v16i8; + DstVT = MVT::v8i16; + break; + case X86::VPMOVZXBWYrm: + case X86::VPMOVZXBWYrr: + SrcVT = MVT::v16i8; + DstVT = MVT::v16i16; + break; + case X86::PMOVZXBDrm: + case X86::PMOVZXBDrr: + case X86::VPMOVZXBDrm: + case X86::VPMOVZXBDrr: + SrcVT = MVT::v16i8; + DstVT = MVT::v4i32; + break; + case X86::VPMOVZXBDYrm: + case X86::VPMOVZXBDYrr: + SrcVT = MVT::v16i8; + DstVT = MVT::v8i32; + break; + case X86::PMOVZXBQrm: + case X86::PMOVZXBQrr: + case X86::VPMOVZXBQrm: + case X86::VPMOVZXBQrr: + SrcVT = MVT::v16i8; + DstVT = MVT::v2i64; + break; + case X86::VPMOVZXBQYrm: + case X86::VPMOVZXBQYrr: + SrcVT = MVT::v16i8; + DstVT = MVT::v4i64; + break; + // i16 zero extension + case X86::PMOVZXWDrm: + case X86::PMOVZXWDrr: + case X86::VPMOVZXWDrm: + case X86::VPMOVZXWDrr: + SrcVT = MVT::v8i16; + DstVT = MVT::v4i32; + break; + case X86::VPMOVZXWDYrm: + case X86::VPMOVZXWDYrr: + SrcVT = MVT::v8i16; + DstVT = MVT::v8i32; + break; + case X86::PMOVZXWQrm: + case X86::PMOVZXWQrr: + case X86::VPMOVZXWQrm: + case X86::VPMOVZXWQrr: + SrcVT = MVT::v8i16; + DstVT = MVT::v2i64; + break; + case X86::VPMOVZXWQYrm: + case X86::VPMOVZXWQYrr: + SrcVT = MVT::v8i16; + DstVT = MVT::v4i64; + break; + // i32 zero extension + case X86::PMOVZXDQrm: + case X86::PMOVZXDQrr: + case X86::VPMOVZXDQrm: + case X86::VPMOVZXDQrr: + SrcVT = MVT::v4i32; + DstVT = MVT::v2i64; + break; + case X86::VPMOVZXDQYrm: + case X86::VPMOVZXDQYrr: + SrcVT = MVT::v4i32; + DstVT = MVT::v4i64; + break; + } +} + +#define CASE_MASK_INS_COMMON(Inst, Suffix, src) \ + case X86::V##Inst##Suffix##src: \ + case X86::V##Inst##Suffix##src##k: \ + case X86::V##Inst##Suffix##src##kz: + +#define CASE_SSE_INS_COMMON(Inst, src) \ + case X86::Inst##src: + +#define CASE_AVX_INS_COMMON(Inst, Suffix, src) \ + case X86::V##Inst##Suffix##src: + +#define CASE_MOVDUP(Inst, src) \ + CASE_MASK_INS_COMMON(Inst, Z, r##src) \ + CASE_MASK_INS_COMMON(Inst, Z256, r##src) \ + CASE_MASK_INS_COMMON(Inst, Z128, r##src) \ + CASE_AVX_INS_COMMON(Inst, , r##src) \ + CASE_AVX_INS_COMMON(Inst, Y, r##src) \ + CASE_SSE_INS_COMMON(Inst, r##src) \ + +#define CASE_UNPCK(Inst, src) \ + CASE_MASK_INS_COMMON(Inst, Z, r##src) \ + CASE_MASK_INS_COMMON(Inst, Z256, r##src) \ + CASE_MASK_INS_COMMON(Inst, Z128, r##src) \ + CASE_AVX_INS_COMMON(Inst, , r##src) \ + CASE_AVX_INS_COMMON(Inst, Y, r##src) \ + CASE_SSE_INS_COMMON(Inst, r##src) \ + +#define CASE_SHUF(Inst, src) \ + CASE_MASK_INS_COMMON(Inst, Z, r##src##i) \ + CASE_MASK_INS_COMMON(Inst, Z256, r##src##i) \ + CASE_MASK_INS_COMMON(Inst, Z128, r##src##i) \ + CASE_AVX_INS_COMMON(Inst, , r##src##i) \ + CASE_AVX_INS_COMMON(Inst, Y, r##src##i) \ + CASE_SSE_INS_COMMON(Inst, r##src##i) \ + +#define CASE_VPERM(Inst, src) \ + CASE_MASK_INS_COMMON(Inst, Z, src##i) \ + CASE_MASK_INS_COMMON(Inst, Z256, src##i) \ + CASE_MASK_INS_COMMON(Inst, Z128, src##i) \ + CASE_AVX_INS_COMMON(Inst, , src##i) \ + CASE_AVX_INS_COMMON(Inst, Y, src##i) \ + +#define CASE_VSHUF(Inst, src) \ + CASE_MASK_INS_COMMON(SHUFF##Inst, Z, r##src##i) \ + CASE_MASK_INS_COMMON(SHUFI##Inst, Z, r##src##i) \ + CASE_MASK_INS_COMMON(SHUFF##Inst, Z256, r##src##i) \ + CASE_MASK_INS_COMMON(SHUFI##Inst, Z256, r##src##i) \ + +/// \brief Extracts the types and if it has memory operand for a 
given +/// (SHUFF32x4/SHUFF64x2/SHUFI32x4/SHUFI64x2) instruction. +static void getVSHUF64x2FamilyInfo(const MCInst *MI, MVT &VT, bool &HasMemOp) { + HasMemOp = false; + switch (MI->getOpcode()) { + default: + llvm_unreachable("Unknown VSHUF64x2 family instructions."); + break; + CASE_VSHUF(64X2, m) + HasMemOp = true; // FALL THROUGH. + CASE_VSHUF(64X2, r) + VT = getRegOperandVectorVT(MI, MVT::i64, 0); + break; + CASE_VSHUF(32X4, m) + HasMemOp = true; // FALL THROUGH. + CASE_VSHUF(32X4, r) + VT = getRegOperandVectorVT(MI, MVT::i32, 0); + break; + } +} + +//===----------------------------------------------------------------------===// +// Top Level Entrypoint +//===----------------------------------------------------------------------===// + +/// EmitAnyX86InstComments - This function decodes x86 instructions and prints +/// newline terminated strings to the specified string if desired. This +/// information is shown in disassembly dumps when verbose assembly is enabled. +bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, + const char *(*getRegName)(unsigned)) { + // If this is a shuffle operation, the switch should fill in this state. + SmallVector<int, 8> ShuffleMask; + const char *DestName = nullptr, *Src1Name = nullptr, *Src2Name = nullptr; + + switch (MI->getOpcode()) { + default: + // Not an instruction for which we can decode comments. + return false; + + case X86::BLENDPDrri: + case X86::VBLENDPDrri: + case X86::VBLENDPDYrri: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::BLENDPDrmi: + case X86::VBLENDPDrmi: + case X86::VBLENDPDYrmi: + if (MI->getOperand(MI->getNumOperands() - 1).isImm()) + DecodeBLENDMask(getRegOperandVectorVT(MI, MVT::f64, 0), + MI->getOperand(MI->getNumOperands() - 1).getImm(), + ShuffleMask); + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + + case X86::BLENDPSrri: + case X86::VBLENDPSrri: + case X86::VBLENDPSYrri: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::BLENDPSrmi: + case X86::VBLENDPSrmi: + case X86::VBLENDPSYrmi: + if (MI->getOperand(MI->getNumOperands() - 1).isImm()) + DecodeBLENDMask(getRegOperandVectorVT(MI, MVT::f32, 0), + MI->getOperand(MI->getNumOperands() - 1).getImm(), + ShuffleMask); + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + + case X86::PBLENDWrri: + case X86::VPBLENDWrri: + case X86::VPBLENDWYrri: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::PBLENDWrmi: + case X86::VPBLENDWrmi: + case X86::VPBLENDWYrmi: + if (MI->getOperand(MI->getNumOperands() - 1).isImm()) + DecodeBLENDMask(getRegOperandVectorVT(MI, MVT::i16, 0), + MI->getOperand(MI->getNumOperands() - 1).getImm(), + ShuffleMask); + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + + case X86::VPBLENDDrri: + case X86::VPBLENDDYrri: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. 
+ case X86::VPBLENDDrmi: + case X86::VPBLENDDYrmi: + if (MI->getOperand(MI->getNumOperands() - 1).isImm()) + DecodeBLENDMask(getRegOperandVectorVT(MI, MVT::i32, 0), + MI->getOperand(MI->getNumOperands() - 1).getImm(), + ShuffleMask); + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + + case X86::INSERTPSrr: + case X86::VINSERTPSrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::INSERTPSrm: + case X86::VINSERTPSrm: + DestName = getRegName(MI->getOperand(0).getReg()); + Src1Name = getRegName(MI->getOperand(1).getReg()); + if (MI->getOperand(MI->getNumOperands() - 1).isImm()) + DecodeINSERTPSMask(MI->getOperand(MI->getNumOperands() - 1).getImm(), + ShuffleMask); + break; + + case X86::MOVLHPSrr: + case X86::VMOVLHPSrr: + case X86::VMOVLHPSZrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeMOVLHPSMask(2, ShuffleMask); + break; + + case X86::MOVHLPSrr: + case X86::VMOVHLPSrr: + case X86::VMOVHLPSZrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeMOVHLPSMask(2, ShuffleMask); + break; + + CASE_MOVDUP(MOVSLDUP, r) + Src1Name = getRegName(MI->getOperand(MI->getNumOperands() - 1).getReg()); + // FALL THROUGH. + CASE_MOVDUP(MOVSLDUP, m) + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeMOVSLDUPMask(getRegOperandVectorVT(MI, MVT::f32, 0), ShuffleMask); + break; + + CASE_MOVDUP(MOVSHDUP, r) + Src1Name = getRegName(MI->getOperand(MI->getNumOperands() - 1).getReg()); + // FALL THROUGH. + CASE_MOVDUP(MOVSHDUP, m) + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeMOVSHDUPMask(getRegOperandVectorVT(MI, MVT::f32, 0), ShuffleMask); + break; + + CASE_MOVDUP(MOVDDUP, r) + Src1Name = getRegName(MI->getOperand(MI->getNumOperands() - 1).getReg()); + // FALL THROUGH. + CASE_MOVDUP(MOVDDUP, m) + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeMOVDDUPMask(getRegOperandVectorVT(MI, MVT::f64, 0), ShuffleMask); + break; + + case X86::PSLLDQri: + case X86::VPSLLDQri: + case X86::VPSLLDQYri: + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + if (MI->getOperand(MI->getNumOperands() - 1).isImm()) + DecodePSLLDQMask(getRegOperandVectorVT(MI, MVT::i8, 0), + MI->getOperand(MI->getNumOperands() - 1).getImm(), + ShuffleMask); + break; + + case X86::PSRLDQri: + case X86::VPSRLDQri: + case X86::VPSRLDQYri: + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + if (MI->getOperand(MI->getNumOperands() - 1).isImm()) + DecodePSRLDQMask(getRegOperandVectorVT(MI, MVT::i8, 0), + MI->getOperand(MI->getNumOperands() - 1).getImm(), + ShuffleMask); + break; + + case X86::PALIGNR128rr: + case X86::VPALIGNR128rr: + case X86::VPALIGNR256rr: + Src1Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. 
+ case X86::PALIGNR128rm: + case X86::VPALIGNR128rm: + case X86::VPALIGNR256rm: + Src2Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + if (MI->getOperand(MI->getNumOperands() - 1).isImm()) + DecodePALIGNRMask(getRegOperandVectorVT(MI, MVT::i8, 0), + MI->getOperand(MI->getNumOperands() - 1).getImm(), + ShuffleMask); + break; + + case X86::PSHUFDri: + case X86::VPSHUFDri: + case X86::VPSHUFDYri: + Src1Name = getRegName(MI->getOperand(1).getReg()); + // FALL THROUGH. + case X86::PSHUFDmi: + case X86::VPSHUFDmi: + case X86::VPSHUFDYmi: + DestName = getRegName(MI->getOperand(0).getReg()); + if (MI->getOperand(MI->getNumOperands() - 1).isImm()) + DecodePSHUFMask(getRegOperandVectorVT(MI, MVT::i32, 0), + MI->getOperand(MI->getNumOperands() - 1).getImm(), + ShuffleMask); + break; + + case X86::PSHUFHWri: + case X86::VPSHUFHWri: + case X86::VPSHUFHWYri: + Src1Name = getRegName(MI->getOperand(1).getReg()); + // FALL THROUGH. + case X86::PSHUFHWmi: + case X86::VPSHUFHWmi: + case X86::VPSHUFHWYmi: + DestName = getRegName(MI->getOperand(0).getReg()); + if (MI->getOperand(MI->getNumOperands() - 1).isImm()) + DecodePSHUFHWMask(getRegOperandVectorVT(MI, MVT::i16, 0), + MI->getOperand(MI->getNumOperands() - 1).getImm(), + ShuffleMask); + break; + + case X86::PSHUFLWri: + case X86::VPSHUFLWri: + case X86::VPSHUFLWYri: + Src1Name = getRegName(MI->getOperand(1).getReg()); + // FALL THROUGH. + case X86::PSHUFLWmi: + case X86::VPSHUFLWmi: + case X86::VPSHUFLWYmi: + DestName = getRegName(MI->getOperand(0).getReg()); + if (MI->getOperand(MI->getNumOperands() - 1).isImm()) + DecodePSHUFLWMask(getRegOperandVectorVT(MI, MVT::i16, 0), + MI->getOperand(MI->getNumOperands() - 1).getImm(), + ShuffleMask); + break; + + case X86::MMX_PSHUFWri: + Src1Name = getRegName(MI->getOperand(1).getReg()); + // FALL THROUGH. + case X86::MMX_PSHUFWmi: + DestName = getRegName(MI->getOperand(0).getReg()); + if (MI->getOperand(MI->getNumOperands() - 1).isImm()) + DecodePSHUFMask(MVT::v4i16, + MI->getOperand(MI->getNumOperands() - 1).getImm(), + ShuffleMask); + break; + + case X86::PSWAPDrr: + Src1Name = getRegName(MI->getOperand(1).getReg()); + // FALL THROUGH. + case X86::PSWAPDrm: + DestName = getRegName(MI->getOperand(0).getReg()); + DecodePSWAPMask(MVT::v2i32, ShuffleMask); + break; + + CASE_UNPCK(PUNPCKHBW, r) + case X86::MMX_PUNPCKHBWirr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + CASE_UNPCK(PUNPCKHBW, m) + case X86::MMX_PUNPCKHBWirm: + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeUNPCKHMask(getRegOperandVectorVT(MI, MVT::i8, 0), ShuffleMask); + break; + + CASE_UNPCK(PUNPCKHWD, r) + case X86::MMX_PUNPCKHWDirr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + CASE_UNPCK(PUNPCKHWD, m) + case X86::MMX_PUNPCKHWDirm: + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeUNPCKHMask(getRegOperandVectorVT(MI, MVT::i16, 0), ShuffleMask); + break; + + CASE_UNPCK(PUNPCKHDQ, r) + case X86::MMX_PUNPCKHDQirr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. 
+ CASE_UNPCK(PUNPCKHDQ, m) + case X86::MMX_PUNPCKHDQirm: + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeUNPCKHMask(getRegOperandVectorVT(MI, MVT::i32, 0), ShuffleMask); + break; + + CASE_UNPCK(PUNPCKHQDQ, r) + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + CASE_UNPCK(PUNPCKHQDQ, m) + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeUNPCKHMask(getRegOperandVectorVT(MI, MVT::i64, 0), ShuffleMask); + break; + + CASE_UNPCK(PUNPCKLBW, r) + case X86::MMX_PUNPCKLBWirr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + CASE_UNPCK(PUNPCKLBW, m) + case X86::MMX_PUNPCKLBWirm: + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeUNPCKLMask(getRegOperandVectorVT(MI, MVT::i8, 0), ShuffleMask); + break; + + CASE_UNPCK(PUNPCKLWD, r) + case X86::MMX_PUNPCKLWDirr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + CASE_UNPCK(PUNPCKLWD, m) + case X86::MMX_PUNPCKLWDirm: + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeUNPCKLMask(getRegOperandVectorVT(MI, MVT::i16, 0), ShuffleMask); + break; + + CASE_UNPCK(PUNPCKLDQ, r) + case X86::MMX_PUNPCKLDQirr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + CASE_UNPCK(PUNPCKLDQ, m) + case X86::MMX_PUNPCKLDQirm: + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeUNPCKLMask(getRegOperandVectorVT(MI, MVT::i32, 0), ShuffleMask); + break; + + CASE_UNPCK(PUNPCKLQDQ, r) + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + CASE_UNPCK(PUNPCKLQDQ, m) + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeUNPCKLMask(getRegOperandVectorVT(MI, MVT::i64, 0), ShuffleMask); + break; + + CASE_SHUF(SHUFPD, r) + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + CASE_SHUF(SHUFPD, m) + if (MI->getOperand(MI->getNumOperands() - 1).isImm()) + DecodeSHUFPMask(getRegOperandVectorVT(MI, MVT::f64, 0), + MI->getOperand(MI->getNumOperands() - 1).getImm(), + ShuffleMask); + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + + CASE_SHUF(SHUFPS, r) + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. 
+ CASE_SHUF(SHUFPS, m) + if (MI->getOperand(MI->getNumOperands() - 1).isImm()) + DecodeSHUFPMask(getRegOperandVectorVT(MI, MVT::f32, 0), + MI->getOperand(MI->getNumOperands() - 1).getImm(), + ShuffleMask); + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + + CASE_VSHUF(64X2, r) + CASE_VSHUF(64X2, m) + CASE_VSHUF(32X4, r) + CASE_VSHUF(32X4, m) { + MVT VT; + bool HasMemOp; + unsigned NumOp = MI->getNumOperands(); + getVSHUF64x2FamilyInfo(MI, VT, HasMemOp); + decodeVSHUF64x2FamilyMask(VT, MI->getOperand(NumOp - 1).getImm(), + ShuffleMask); + DestName = getRegName(MI->getOperand(0).getReg()); + if (HasMemOp) { + assert((NumOp >= 8) && "Expected at least 8 operands!"); + Src1Name = getRegName(MI->getOperand(NumOp - 7).getReg()); + } else { + assert((NumOp >= 4) && "Expected at least 4 operands!"); + Src2Name = getRegName(MI->getOperand(NumOp - 2).getReg()); + Src1Name = getRegName(MI->getOperand(NumOp - 3).getReg()); + } + break; + } + + CASE_UNPCK(UNPCKLPD, r) + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + CASE_UNPCK(UNPCKLPD, m) + DecodeUNPCKLMask(getRegOperandVectorVT(MI, MVT::f64, 0), ShuffleMask); + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + + CASE_UNPCK(UNPCKLPS, r) + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + CASE_UNPCK(UNPCKLPS, m) + DecodeUNPCKLMask(getRegOperandVectorVT(MI, MVT::f32, 0), ShuffleMask); + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + + CASE_UNPCK(UNPCKHPD, r) + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + CASE_UNPCK(UNPCKHPD, m) + DecodeUNPCKHMask(getRegOperandVectorVT(MI, MVT::f64, 0), ShuffleMask); + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + + CASE_UNPCK(UNPCKHPS, r) + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + CASE_UNPCK(UNPCKHPS, m) + DecodeUNPCKHMask(getRegOperandVectorVT(MI, MVT::f32, 0), ShuffleMask); + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + + CASE_VPERM(PERMILPS, r) + Src1Name = getRegName(MI->getOperand(1).getReg()); + // FALL THROUGH. + CASE_VPERM(PERMILPS, m) + if (MI->getOperand(MI->getNumOperands() - 1).isImm()) + DecodePSHUFMask(getRegOperandVectorVT(MI, MVT::f32, 0), + MI->getOperand(MI->getNumOperands() - 1).getImm(), + ShuffleMask); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + + CASE_VPERM(PERMILPD, r) + Src1Name = getRegName(MI->getOperand(1).getReg()); + // FALL THROUGH. + CASE_VPERM(PERMILPD, m) + if (MI->getOperand(MI->getNumOperands() - 1).isImm()) + DecodePSHUFMask(getRegOperandVectorVT(MI, MVT::f64, 0), + MI->getOperand(MI->getNumOperands() - 1).getImm(), + ShuffleMask); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + + case X86::VPERM2F128rr: + case X86::VPERM2I128rr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VPERM2F128rm: + case X86::VPERM2I128rm: + // For instruction comments purpose, assume the 256-bit vector is v4i64. 
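+ // (Each mask element then covers one 64-bit lane. Illustrative example:
+ // "vperm2f128 $0x31, %ymm1, %ymm0, %ymm2" selects the high 128-bit halves
+ // of both sources and gets a comment along the lines of
+ // "ymm2 = ymm0[2,3],ymm1[2,3]"; exact formatting may differ.)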
+ if (MI->getOperand(MI->getNumOperands() - 1).isImm()) + DecodeVPERM2X128Mask(MVT::v4i64, + MI->getOperand(MI->getNumOperands() - 1).getImm(), + ShuffleMask); + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + + case X86::VPERMQYri: + case X86::VPERMPDYri: + Src1Name = getRegName(MI->getOperand(1).getReg()); + // FALL THROUGH. + case X86::VPERMQYmi: + case X86::VPERMPDYmi: + if (MI->getOperand(MI->getNumOperands() - 1).isImm()) + DecodeVPERMMask(MI->getOperand(MI->getNumOperands() - 1).getImm(), + ShuffleMask); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + + case X86::MOVSDrr: + case X86::VMOVSDrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + Src1Name = getRegName(MI->getOperand(1).getReg()); + // FALL THROUGH. + case X86::MOVSDrm: + case X86::VMOVSDrm: + DecodeScalarMoveMask(MVT::v2f64, nullptr == Src2Name, ShuffleMask); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + + case X86::MOVSSrr: + case X86::VMOVSSrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + Src1Name = getRegName(MI->getOperand(1).getReg()); + // FALL THROUGH. + case X86::MOVSSrm: + case X86::VMOVSSrm: + DecodeScalarMoveMask(MVT::v4f32, nullptr == Src2Name, ShuffleMask); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + + case X86::MOVPQI2QIrr: + case X86::MOVZPQILo2PQIrr: + case X86::VMOVPQI2QIrr: + case X86::VMOVZPQILo2PQIrr: + case X86::VMOVZPQILo2PQIZrr: + Src1Name = getRegName(MI->getOperand(1).getReg()); + // FALL THROUGH. + case X86::MOVQI2PQIrm: + case X86::MOVZQI2PQIrm: + case X86::MOVZPQILo2PQIrm: + case X86::VMOVQI2PQIrm: + case X86::VMOVZQI2PQIrm: + case X86::VMOVZPQILo2PQIrm: + case X86::VMOVZPQILo2PQIZrm: + DecodeZeroMoveLowMask(MVT::v2i64, ShuffleMask); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + + case X86::MOVDI2PDIrm: + case X86::VMOVDI2PDIrm: + DecodeZeroMoveLowMask(MVT::v4i32, ShuffleMask); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + + case X86::EXTRQI: + if (MI->getOperand(2).isImm() && + MI->getOperand(3).isImm()) + DecodeEXTRQIMask(MI->getOperand(2).getImm(), + MI->getOperand(3).getImm(), + ShuffleMask); + + DestName = getRegName(MI->getOperand(0).getReg()); + Src1Name = getRegName(MI->getOperand(1).getReg()); + break; + + case X86::INSERTQI: + if (MI->getOperand(3).isImm() && + MI->getOperand(4).isImm()) + DecodeINSERTQIMask(MI->getOperand(3).getImm(), + MI->getOperand(4).getImm(), + ShuffleMask); + + DestName = getRegName(MI->getOperand(0).getReg()); + Src1Name = getRegName(MI->getOperand(1).getReg()); + Src2Name = getRegName(MI->getOperand(2).getReg()); + break; + + case X86::PMOVZXBWrr: + case X86::PMOVZXBDrr: + case X86::PMOVZXBQrr: + case X86::PMOVZXWDrr: + case X86::PMOVZXWQrr: + case X86::PMOVZXDQrr: + case X86::VPMOVZXBWrr: + case X86::VPMOVZXBDrr: + case X86::VPMOVZXBQrr: + case X86::VPMOVZXWDrr: + case X86::VPMOVZXWQrr: + case X86::VPMOVZXDQrr: + case X86::VPMOVZXBWYrr: + case X86::VPMOVZXBDYrr: + case X86::VPMOVZXBQYrr: + case X86::VPMOVZXWDYrr: + case X86::VPMOVZXWQYrr: + case X86::VPMOVZXDQYrr: + Src1Name = getRegName(MI->getOperand(1).getReg()); + // FALL THROUGH. 
+ case X86::PMOVZXBWrm: + case X86::PMOVZXBDrm: + case X86::PMOVZXBQrm: + case X86::PMOVZXWDrm: + case X86::PMOVZXWQrm: + case X86::PMOVZXDQrm: + case X86::VPMOVZXBWrm: + case X86::VPMOVZXBDrm: + case X86::VPMOVZXBQrm: + case X86::VPMOVZXWDrm: + case X86::VPMOVZXWQrm: + case X86::VPMOVZXDQrm: + case X86::VPMOVZXBWYrm: + case X86::VPMOVZXBDYrm: + case X86::VPMOVZXBQYrm: + case X86::VPMOVZXWDYrm: + case X86::VPMOVZXWQYrm: + case X86::VPMOVZXDQYrm: { + MVT SrcVT, DstVT; + getZeroExtensionTypes(MI, SrcVT, DstVT); + DecodeZeroExtendMask(SrcVT, DstVT, ShuffleMask); + DestName = getRegName(MI->getOperand(0).getReg()); + } break; + } + + // The only comments we decode are shuffles, so give up if we were unable to + // decode a shuffle mask. + if (ShuffleMask.empty()) + return false; + + if (!DestName) DestName = Src1Name; + OS << (DestName ? DestName : "mem") << " = "; + + // If the two sources are the same, canonicalize the input elements to be + // from the first src so that we get larger element spans. + if (Src1Name == Src2Name) { + for (unsigned i = 0, e = ShuffleMask.size(); i != e; ++i) { + if ((int)ShuffleMask[i] >= 0 && // Not sentinel. + ShuffleMask[i] >= (int)e) // From second mask. + ShuffleMask[i] -= e; + } + } + + // The shuffle mask specifies which elements of the src1/src2 fill in the + // destination, with a few sentinel values. Loop through and print them + // out. + for (unsigned i = 0, e = ShuffleMask.size(); i != e; ++i) { + if (i != 0) + OS << ','; + if (ShuffleMask[i] == SM_SentinelZero) { + OS << "zero"; + continue; + } + + // Otherwise, it must come from src1 or src2. Print the span of elements + // that comes from this src. + bool isSrc1 = ShuffleMask[i] < (int)ShuffleMask.size(); + const char *SrcName = isSrc1 ? Src1Name : Src2Name; + OS << (SrcName ? SrcName : "mem") << '['; + bool IsFirst = true; + while (i != e && (int)ShuffleMask[i] != SM_SentinelZero && + (ShuffleMask[i] < (int)ShuffleMask.size()) == isSrc1) { + if (!IsFirst) + OS << ','; + else + IsFirst = false; + if (ShuffleMask[i] == SM_SentinelUndef) + OS << "u"; + else + OS << ShuffleMask[i] % ShuffleMask.size(); + ++i; + } + OS << ']'; + --i; // For loop increments element #. + } + //MI->print(OS, 0); + OS << "\n"; + + // We successfully added a comment to this instruction. + return true; +} diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.h b/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.h new file mode 100644 index 0000000..687581b --- /dev/null +++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.h @@ -0,0 +1,25 @@ +//=- X86InstComments.h - Generate verbose-asm comments for instrs -*- C++ -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This defines functionality used to emit comments about X86 instructions to +// an output stream for -fverbose-asm. 
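+// For example (illustrative), "vblendps $0xa, %xmm1, %xmm0, %xmm2" would be
+// annotated with a comment along the lines of
+// "xmm2 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]".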
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_X86_INSTPRINTER_X86INSTCOMMENTS_H +#define LLVM_LIB_TARGET_X86_INSTPRINTER_X86INSTCOMMENTS_H + +namespace llvm { + class MCInst; + class raw_ostream; + bool EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, + const char *(*getRegName)(unsigned)); +} + +#endif diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp b/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp new file mode 100644 index 0000000..879378f --- /dev/null +++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp @@ -0,0 +1,257 @@ +//===-- X86IntelInstPrinter.cpp - Intel assembly instruction printing -----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file includes code for rendering MCInst instances as Intel-style +// assembly. +// +//===----------------------------------------------------------------------===// + +#include "X86IntelInstPrinter.h" +#include "MCTargetDesc/X86BaseInfo.h" +#include "MCTargetDesc/X86MCTargetDesc.h" +#include "X86InstComments.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/FormattedStream.h" +#include <cctype> +using namespace llvm; + +#define DEBUG_TYPE "asm-printer" + +#include "X86GenAsmWriter1.inc" + +void X86IntelInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const { + OS << getRegisterName(RegNo); +} + +void X86IntelInstPrinter::printInst(const MCInst *MI, raw_ostream &OS, + StringRef Annot, + const MCSubtargetInfo &STI) { + const MCInstrDesc &Desc = MII.get(MI->getOpcode()); + uint64_t TSFlags = Desc.TSFlags; + + if (TSFlags & X86II::LOCK) + OS << "\tlock\n"; + + printInstruction(MI, OS); + + // Next always print the annotation. + printAnnotation(OS, Annot); + + // If verbose assembly is enabled, we can print some informative comments. 
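+ // (Unlike the AT&T printer, which runs EmitAnyX86InstComments() before
+ // printing so that its printOperand() can consult HasCustomInstComment, the
+ // Intel printer only needs the comments after the instruction text.)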
+ if (CommentStream) + EmitAnyX86InstComments(MI, *CommentStream, getRegisterName); +} + +void X86IntelInstPrinter::printSSEAVXCC(const MCInst *MI, unsigned Op, + raw_ostream &O) { + int64_t Imm = MI->getOperand(Op).getImm(); + switch (Imm) { + default: llvm_unreachable("Invalid avxcc argument!"); + case 0: O << "eq"; break; + case 1: O << "lt"; break; + case 2: O << "le"; break; + case 3: O << "unord"; break; + case 4: O << "neq"; break; + case 5: O << "nlt"; break; + case 6: O << "nle"; break; + case 7: O << "ord"; break; + case 8: O << "eq_uq"; break; + case 9: O << "nge"; break; + case 0xa: O << "ngt"; break; + case 0xb: O << "false"; break; + case 0xc: O << "neq_oq"; break; + case 0xd: O << "ge"; break; + case 0xe: O << "gt"; break; + case 0xf: O << "true"; break; + case 0x10: O << "eq_os"; break; + case 0x11: O << "lt_oq"; break; + case 0x12: O << "le_oq"; break; + case 0x13: O << "unord_s"; break; + case 0x14: O << "neq_us"; break; + case 0x15: O << "nlt_uq"; break; + case 0x16: O << "nle_uq"; break; + case 0x17: O << "ord_s"; break; + case 0x18: O << "eq_us"; break; + case 0x19: O << "nge_uq"; break; + case 0x1a: O << "ngt_uq"; break; + case 0x1b: O << "false_os"; break; + case 0x1c: O << "neq_os"; break; + case 0x1d: O << "ge_oq"; break; + case 0x1e: O << "gt_oq"; break; + case 0x1f: O << "true_us"; break; + } +} + +void X86IntelInstPrinter::printXOPCC(const MCInst *MI, unsigned Op, + raw_ostream &O) { + int64_t Imm = MI->getOperand(Op).getImm(); + switch (Imm) { + default: llvm_unreachable("Invalid xopcc argument!"); + case 0: O << "lt"; break; + case 1: O << "le"; break; + case 2: O << "gt"; break; + case 3: O << "ge"; break; + case 4: O << "eq"; break; + case 5: O << "neq"; break; + case 6: O << "false"; break; + case 7: O << "true"; break; + } +} + +void X86IntelInstPrinter::printRoundingControl(const MCInst *MI, unsigned Op, + raw_ostream &O) { + int64_t Imm = MI->getOperand(Op).getImm() & 0x3; + switch (Imm) { + case 0: O << "{rn-sae}"; break; + case 1: O << "{rd-sae}"; break; + case 2: O << "{ru-sae}"; break; + case 3: O << "{rz-sae}"; break; + } +} + +/// printPCRelImm - This is used to print an immediate value that ends up +/// being encoded as a pc-relative value. +void X86IntelInstPrinter::printPCRelImm(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNo); + if (Op.isImm()) + O << formatImm(Op.getImm()); + else { + assert(Op.isExpr() && "unknown pcrel immediate operand"); + // If a symbolic branch target was added as a constant expression then print + // that address in hex. + const MCConstantExpr *BranchTarget = dyn_cast<MCConstantExpr>(Op.getExpr()); + int64_t Address; + if (BranchTarget && BranchTarget->evaluateAsAbsolute(Address)) { + O << formatHex((uint64_t)Address); + } + else { + // Otherwise, just print the expression. 
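+ // (e.g. a branch to a local label simply prints the symbol, such as
+ // ".LBB0_1" -- illustrative.)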
+ Op.getExpr()->print(O, &MAI); + } + } +} + +void X86IntelInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNo); + if (Op.isReg()) { + printRegName(O, Op.getReg()); + } else if (Op.isImm()) { + O << formatImm((int64_t)Op.getImm()); + } else { + assert(Op.isExpr() && "unknown operand kind in printOperand"); + Op.getExpr()->print(O, &MAI); + } +} + +void X86IntelInstPrinter::printMemReference(const MCInst *MI, unsigned Op, + raw_ostream &O) { + const MCOperand &BaseReg = MI->getOperand(Op+X86::AddrBaseReg); + unsigned ScaleVal = MI->getOperand(Op+X86::AddrScaleAmt).getImm(); + const MCOperand &IndexReg = MI->getOperand(Op+X86::AddrIndexReg); + const MCOperand &DispSpec = MI->getOperand(Op+X86::AddrDisp); + const MCOperand &SegReg = MI->getOperand(Op+X86::AddrSegmentReg); + + // If this has a segment register, print it. + if (SegReg.getReg()) { + printOperand(MI, Op+X86::AddrSegmentReg, O); + O << ':'; + } + + O << '['; + + bool NeedPlus = false; + if (BaseReg.getReg()) { + printOperand(MI, Op+X86::AddrBaseReg, O); + NeedPlus = true; + } + + if (IndexReg.getReg()) { + if (NeedPlus) O << " + "; + if (ScaleVal != 1) + O << ScaleVal << '*'; + printOperand(MI, Op+X86::AddrIndexReg, O); + NeedPlus = true; + } + + if (!DispSpec.isImm()) { + if (NeedPlus) O << " + "; + assert(DispSpec.isExpr() && "non-immediate displacement for LEA?"); + DispSpec.getExpr()->print(O, &MAI); + } else { + int64_t DispVal = DispSpec.getImm(); + if (DispVal || (!IndexReg.getReg() && !BaseReg.getReg())) { + if (NeedPlus) { + if (DispVal > 0) + O << " + "; + else { + O << " - "; + DispVal = -DispVal; + } + } + O << formatImm(DispVal); + } + } + + O << ']'; +} + +void X86IntelInstPrinter::printSrcIdx(const MCInst *MI, unsigned Op, + raw_ostream &O) { + const MCOperand &SegReg = MI->getOperand(Op+1); + + // If this has a segment register, print it. + if (SegReg.getReg()) { + printOperand(MI, Op+1, O); + O << ':'; + } + O << '['; + printOperand(MI, Op, O); + O << ']'; +} + +void X86IntelInstPrinter::printDstIdx(const MCInst *MI, unsigned Op, + raw_ostream &O) { + // DI accesses are always ES-based. + O << "es:["; + printOperand(MI, Op, O); + O << ']'; +} + +void X86IntelInstPrinter::printMemOffset(const MCInst *MI, unsigned Op, + raw_ostream &O) { + const MCOperand &DispSpec = MI->getOperand(Op); + const MCOperand &SegReg = MI->getOperand(Op+1); + + // If this has a segment register, print it. + if (SegReg.getReg()) { + printOperand(MI, Op+1, O); + O << ':'; + } + + O << '['; + + if (DispSpec.isImm()) { + O << formatImm(DispSpec.getImm()); + } else { + assert(DispSpec.isExpr() && "non-immediate displacement?"); + DispSpec.getExpr()->print(O, &MAI); + } + + O << ']'; +} + +void X86IntelInstPrinter::printU8Imm(const MCInst *MI, unsigned Op, + raw_ostream &O) { + O << formatImm(MI->getOperand(Op).getImm() & 0xff); +} diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h b/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h new file mode 100644 index 0000000..20cd7ff --- /dev/null +++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h @@ -0,0 +1,162 @@ +//= X86IntelInstPrinter.h - Convert X86 MCInst to assembly syntax -*- C++ -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This class prints an X86 MCInst to Intel style .s file syntax. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_X86_INSTPRINTER_X86INTELINSTPRINTER_H +#define LLVM_LIB_TARGET_X86_INSTPRINTER_X86INTELINSTPRINTER_H + +#include "llvm/MC/MCInstPrinter.h" +#include "llvm/Support/raw_ostream.h" + +namespace llvm { + +class X86IntelInstPrinter final : public MCInstPrinter { +public: + X86IntelInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, + const MCRegisterInfo &MRI) + : MCInstPrinter(MAI, MII, MRI) {} + + void printRegName(raw_ostream &OS, unsigned RegNo) const override; + void printInst(const MCInst *MI, raw_ostream &OS, StringRef Annot, + const MCSubtargetInfo &STI) override; + + // Autogenerated by tblgen. + void printInstruction(const MCInst *MI, raw_ostream &O); + static const char *getRegisterName(unsigned RegNo); + + void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printMemReference(const MCInst *MI, unsigned Op, raw_ostream &O); + void printSSEAVXCC(const MCInst *MI, unsigned Op, raw_ostream &O); + void printXOPCC(const MCInst *MI, unsigned Op, raw_ostream &O); + void printPCRelImm(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printMemOffset(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printSrcIdx(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printDstIdx(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printRoundingControl(const MCInst *MI, unsigned Op, raw_ostream &OS); + void printU8Imm(const MCInst *MI, unsigned Op, raw_ostream &O); + + void printanymem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printMemReference(MI, OpNo, O); + } + + void printopaquemem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "opaque ptr "; + printMemReference(MI, OpNo, O); + } + + void printi8mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "byte ptr "; + printMemReference(MI, OpNo, O); + } + void printi16mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "word ptr "; + printMemReference(MI, OpNo, O); + } + void printi32mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "dword ptr "; + printMemReference(MI, OpNo, O); + } + void printi64mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "qword ptr "; + printMemReference(MI, OpNo, O); + } + void printi128mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "xmmword ptr "; + printMemReference(MI, OpNo, O); + } + void printi256mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "ymmword ptr "; + printMemReference(MI, OpNo, O); + } + void printi512mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "zmmword ptr "; + printMemReference(MI, OpNo, O); + } + void printf32mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "dword ptr "; + printMemReference(MI, OpNo, O); + } + void printf64mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "qword ptr "; + printMemReference(MI, OpNo, O); + } + void printf80mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "xword ptr "; + printMemReference(MI, OpNo, O); + } + void printf128mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "xmmword ptr "; + printMemReference(MI, OpNo, O); + } + void printf256mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "ymmword ptr "; + printMemReference(MI, OpNo, O); + } + void printf512mem(const MCInst *MI, unsigned 
OpNo, raw_ostream &O) { + O << "zmmword ptr "; + printMemReference(MI, OpNo, O); + } + + + void printSrcIdx8(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "byte ptr "; + printSrcIdx(MI, OpNo, O); + } + void printSrcIdx16(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "word ptr "; + printSrcIdx(MI, OpNo, O); + } + void printSrcIdx32(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "dword ptr "; + printSrcIdx(MI, OpNo, O); + } + void printSrcIdx64(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "qword ptr "; + printSrcIdx(MI, OpNo, O); + } + void printDstIdx8(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "byte ptr "; + printDstIdx(MI, OpNo, O); + } + void printDstIdx16(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "word ptr "; + printDstIdx(MI, OpNo, O); + } + void printDstIdx32(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "dword ptr "; + printDstIdx(MI, OpNo, O); + } + void printDstIdx64(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "qword ptr "; + printDstIdx(MI, OpNo, O); + } + void printMemOffs8(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "byte ptr "; + printMemOffset(MI, OpNo, O); + } + void printMemOffs16(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "word ptr "; + printMemOffset(MI, OpNo, O); + } + void printMemOffs32(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "dword ptr "; + printMemOffset(MI, OpNo, O); + } + void printMemOffs64(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "qword ptr "; + printMemOffset(MI, OpNo, O); + } +}; + +} + +#endif diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp new file mode 100644 index 0000000..133bd0e --- /dev/null +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp @@ -0,0 +1,855 @@ +//===-- X86AsmBackend.cpp - X86 Assembler Backend -------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/X86BaseInfo.h" +#include "MCTargetDesc/X86FixupKinds.h" +#include "llvm/ADT/StringSwitch.h" +#include "llvm/MC/MCAsmBackend.h" +#include "llvm/MC/MCELFObjectWriter.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCFixupKindInfo.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCMachObjectWriter.h" +#include "llvm/MC/MCObjectWriter.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSectionCOFF.h" +#include "llvm/MC/MCSectionELF.h" +#include "llvm/MC/MCSectionMachO.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/ELF.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MachO.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +static unsigned getFixupKindLog2Size(unsigned Kind) { + switch (Kind) { + default: + llvm_unreachable("invalid fixup kind!"); + case FK_PCRel_1: + case FK_SecRel_1: + case FK_Data_1: + return 0; + case FK_PCRel_2: + case FK_SecRel_2: + case FK_Data_2: + return 1; + case FK_PCRel_4: + case X86::reloc_riprel_4byte: + case X86::reloc_riprel_4byte_movq_load: + case X86::reloc_signed_4byte: + case X86::reloc_global_offset_table: + case FK_SecRel_4: + case FK_Data_4: + return 2; + case FK_PCRel_8: + case FK_SecRel_8: + case FK_Data_8: + case X86::reloc_global_offset_table8: + return 3; + } +} + +namespace { + +class X86ELFObjectWriter : public MCELFObjectTargetWriter { +public: + X86ELFObjectWriter(bool is64Bit, uint8_t OSABI, uint16_t EMachine, + bool HasRelocationAddend, bool foobar) + : MCELFObjectTargetWriter(is64Bit, OSABI, EMachine, HasRelocationAddend) {} +}; + +class X86AsmBackend : public MCAsmBackend { + const StringRef CPU; + bool HasNopl; + uint64_t MaxNopLength; +public: + X86AsmBackend(const Target &T, StringRef CPU) : MCAsmBackend(), CPU(CPU) { + HasNopl = CPU != "generic" && CPU != "i386" && CPU != "i486" && + CPU != "i586" && CPU != "pentium" && CPU != "pentium-mmx" && + CPU != "i686" && CPU != "k6" && CPU != "k6-2" && CPU != "k6-3" && + CPU != "geode" && CPU != "winchip-c6" && CPU != "winchip2" && + CPU != "c3" && CPU != "c3-2"; + // Max length of true long nop instruction is 15 bytes. + // Max length of long nop replacement instruction is 7 bytes. + // Taking into account SilverMont architecture features max length of nops + // is reduced for it to achieve better performance. + MaxNopLength = (!HasNopl || CPU == "slm") ? 7 : 15; + } + + unsigned getNumFixupKinds() const override { + return X86::NumTargetFixupKinds; + } + + const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override { + const static MCFixupKindInfo Infos[X86::NumTargetFixupKinds] = { + { "reloc_riprel_4byte", 0, 4 * 8, MCFixupKindInfo::FKF_IsPCRel }, + { "reloc_riprel_4byte_movq_load", 0, 4 * 8, MCFixupKindInfo::FKF_IsPCRel}, + { "reloc_signed_4byte", 0, 4 * 8, 0}, + { "reloc_global_offset_table", 0, 4 * 8, 0} + }; + + if (Kind < FirstTargetFixupKind) + return MCAsmBackend::getFixupKindInfo(Kind); + + assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() && + "Invalid kind!"); + return Infos[Kind - FirstTargetFixupKind]; + } + + void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, + uint64_t Value, bool IsPCRel) const override { + unsigned Size = 1 << getFixupKindLog2Size(Fixup.getKind()); + + assert(Fixup.getOffset() + Size <= DataSize && + "Invalid fixup offset!"); + + // Check that uppper bits are either all zeros or all ones. 
+ // Specifically ignore overflow/underflow as long as the leakage is + // limited to the lower bits. This is to remain compatible with + // other assemblers. + assert(isIntN(Size * 8 + 1, Value) && + "Value does not fit in the Fixup field"); + + for (unsigned i = 0; i != Size; ++i) + Data[Fixup.getOffset() + i] = uint8_t(Value >> (i * 8)); + } + + bool mayNeedRelaxation(const MCInst &Inst) const override; + + bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value, + const MCRelaxableFragment *DF, + const MCAsmLayout &Layout) const override; + + void relaxInstruction(const MCInst &Inst, MCInst &Res) const override; + + bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override; +}; +} // end anonymous namespace + +static unsigned getRelaxedOpcodeBranch(unsigned Op) { + switch (Op) { + default: + return Op; + + case X86::JAE_1: return X86::JAE_4; + case X86::JA_1: return X86::JA_4; + case X86::JBE_1: return X86::JBE_4; + case X86::JB_1: return X86::JB_4; + case X86::JE_1: return X86::JE_4; + case X86::JGE_1: return X86::JGE_4; + case X86::JG_1: return X86::JG_4; + case X86::JLE_1: return X86::JLE_4; + case X86::JL_1: return X86::JL_4; + case X86::JMP_1: return X86::JMP_4; + case X86::JNE_1: return X86::JNE_4; + case X86::JNO_1: return X86::JNO_4; + case X86::JNP_1: return X86::JNP_4; + case X86::JNS_1: return X86::JNS_4; + case X86::JO_1: return X86::JO_4; + case X86::JP_1: return X86::JP_4; + case X86::JS_1: return X86::JS_4; + } +} + +static unsigned getRelaxedOpcodeArith(unsigned Op) { + switch (Op) { + default: + return Op; + + // IMUL + case X86::IMUL16rri8: return X86::IMUL16rri; + case X86::IMUL16rmi8: return X86::IMUL16rmi; + case X86::IMUL32rri8: return X86::IMUL32rri; + case X86::IMUL32rmi8: return X86::IMUL32rmi; + case X86::IMUL64rri8: return X86::IMUL64rri32; + case X86::IMUL64rmi8: return X86::IMUL64rmi32; + + // AND + case X86::AND16ri8: return X86::AND16ri; + case X86::AND16mi8: return X86::AND16mi; + case X86::AND32ri8: return X86::AND32ri; + case X86::AND32mi8: return X86::AND32mi; + case X86::AND64ri8: return X86::AND64ri32; + case X86::AND64mi8: return X86::AND64mi32; + + // OR + case X86::OR16ri8: return X86::OR16ri; + case X86::OR16mi8: return X86::OR16mi; + case X86::OR32ri8: return X86::OR32ri; + case X86::OR32mi8: return X86::OR32mi; + case X86::OR64ri8: return X86::OR64ri32; + case X86::OR64mi8: return X86::OR64mi32; + + // XOR + case X86::XOR16ri8: return X86::XOR16ri; + case X86::XOR16mi8: return X86::XOR16mi; + case X86::XOR32ri8: return X86::XOR32ri; + case X86::XOR32mi8: return X86::XOR32mi; + case X86::XOR64ri8: return X86::XOR64ri32; + case X86::XOR64mi8: return X86::XOR64mi32; + + // ADD + case X86::ADD16ri8: return X86::ADD16ri; + case X86::ADD16mi8: return X86::ADD16mi; + case X86::ADD32ri8: return X86::ADD32ri; + case X86::ADD32mi8: return X86::ADD32mi; + case X86::ADD64ri8: return X86::ADD64ri32; + case X86::ADD64mi8: return X86::ADD64mi32; + + // ADC + case X86::ADC16ri8: return X86::ADC16ri; + case X86::ADC16mi8: return X86::ADC16mi; + case X86::ADC32ri8: return X86::ADC32ri; + case X86::ADC32mi8: return X86::ADC32mi; + case X86::ADC64ri8: return X86::ADC64ri32; + case X86::ADC64mi8: return X86::ADC64mi32; + + // SUB + case X86::SUB16ri8: return X86::SUB16ri; + case X86::SUB16mi8: return X86::SUB16mi; + case X86::SUB32ri8: return X86::SUB32ri; + case X86::SUB32mi8: return X86::SUB32mi; + case X86::SUB64ri8: return X86::SUB64ri32; + case X86::SUB64mi8: return X86::SUB64mi32; + + // SBB + case X86::SBB16ri8: return X86::SBB16ri; + case 
X86::SBB16mi8: return X86::SBB16mi; + case X86::SBB32ri8: return X86::SBB32ri; + case X86::SBB32mi8: return X86::SBB32mi; + case X86::SBB64ri8: return X86::SBB64ri32; + case X86::SBB64mi8: return X86::SBB64mi32; + + // CMP + case X86::CMP16ri8: return X86::CMP16ri; + case X86::CMP16mi8: return X86::CMP16mi; + case X86::CMP32ri8: return X86::CMP32ri; + case X86::CMP32mi8: return X86::CMP32mi; + case X86::CMP64ri8: return X86::CMP64ri32; + case X86::CMP64mi8: return X86::CMP64mi32; + + // PUSH + case X86::PUSH32i8: return X86::PUSHi32; + case X86::PUSH16i8: return X86::PUSHi16; + case X86::PUSH64i8: return X86::PUSH64i32; + } +} + +static unsigned getRelaxedOpcode(unsigned Op) { + unsigned R = getRelaxedOpcodeArith(Op); + if (R != Op) + return R; + return getRelaxedOpcodeBranch(Op); +} + +bool X86AsmBackend::mayNeedRelaxation(const MCInst &Inst) const { + // Branches can always be relaxed. + if (getRelaxedOpcodeBranch(Inst.getOpcode()) != Inst.getOpcode()) + return true; + + // Check if this instruction is ever relaxable. + if (getRelaxedOpcodeArith(Inst.getOpcode()) == Inst.getOpcode()) + return false; + + + // Check if the relaxable operand has an expression. For the current set of + // relaxable instructions, the relaxable operand is always the last operand. + unsigned RelaxableOp = Inst.getNumOperands() - 1; + if (Inst.getOperand(RelaxableOp).isExpr()) + return true; + + return false; +} + +bool X86AsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup, + uint64_t Value, + const MCRelaxableFragment *DF, + const MCAsmLayout &Layout) const { + // Relax if the value is too big for a (signed) i8. + return int64_t(Value) != int64_t(int8_t(Value)); +} + +// FIXME: Can tblgen help at all here to verify there aren't other instructions +// we can relax? +void X86AsmBackend::relaxInstruction(const MCInst &Inst, MCInst &Res) const { + // The only relaxations X86 does is from a 1byte pcrel to a 4byte pcrel. + unsigned RelaxedOp = getRelaxedOpcode(Inst.getOpcode()); + + if (RelaxedOp == Inst.getOpcode()) { + SmallString<256> Tmp; + raw_svector_ostream OS(Tmp); + Inst.dump_pretty(OS); + OS << "\n"; + report_fatal_error("unexpected instruction to relax: " + OS.str()); + } + + Res = Inst; + Res.setOpcode(RelaxedOp); +} + +/// \brief Write a sequence of optimal nops to the output, covering \p Count +/// bytes. +/// \return - true on success, false on failure +bool X86AsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const { + static const uint8_t TrueNops[10][10] = { + // nop + {0x90}, + // xchg %ax,%ax + {0x66, 0x90}, + // nopl (%[re]ax) + {0x0f, 0x1f, 0x00}, + // nopl 0(%[re]ax) + {0x0f, 0x1f, 0x40, 0x00}, + // nopl 0(%[re]ax,%[re]ax,1) + {0x0f, 0x1f, 0x44, 0x00, 0x00}, + // nopw 0(%[re]ax,%[re]ax,1) + {0x66, 0x0f, 0x1f, 0x44, 0x00, 0x00}, + // nopl 0L(%[re]ax) + {0x0f, 0x1f, 0x80, 0x00, 0x00, 0x00, 0x00}, + // nopl 0L(%[re]ax,%[re]ax,1) + {0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00}, + // nopw 0L(%[re]ax,%[re]ax,1) + {0x66, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00}, + // nopw %cs:0L(%[re]ax,%[re]ax,1) + {0x66, 0x2e, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00}, + }; + + // Alternative nop instructions for CPUs which don't support long nops. 
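// Worked example (editorial illustration, not part of the patch): with HasNopl and
// the default MaxNopLength of 15, padding Count = 17 bytes proceeds as follows.
//   Iteration 1: ThisNopLength = min(17, 15) = 15, Prefixes = 15 - 10 = 5, so five
//                0x66 operand-size prefixes are written, followed by the 10-byte
//                "nopw %cs:0L(%[re]ax,%[re]ax,1)" entry (Nops[9]).
//   Iteration 2: ThisNopLength = 2, Prefixes = 0, so the 2-byte "xchg %ax,%ax"
//                entry is written and Count reaches 0.
// The 17 bytes therefore decode as just two instructions. When HasNopl is false,
// the same emission loop instead draws from the alternative table that follows.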
+ static const uint8_t AltNops[7][10] = { + // nop + {0x90}, + // xchg %ax,%ax + {0x66, 0x90}, + // lea 0x0(%esi),%esi + {0x8d, 0x76, 0x00}, + // lea 0x0(%esi),%esi + {0x8d, 0x74, 0x26, 0x00}, + // nop + lea 0x0(%esi),%esi + {0x90, 0x8d, 0x74, 0x26, 0x00}, + // lea 0x0(%esi),%esi + {0x8d, 0xb6, 0x00, 0x00, 0x00, 0x00 }, + // lea 0x0(%esi),%esi + {0x8d, 0xb4, 0x26, 0x00, 0x00, 0x00, 0x00}, + }; + + // Select the right NOP table. + // FIXME: Can we get if CPU supports long nops from the subtarget somehow? + const uint8_t (*Nops)[10] = HasNopl ? TrueNops : AltNops; + assert(HasNopl || MaxNopLength <= 7); + + // Emit as many largest nops as needed, then emit a nop of the remaining + // length. + do { + const uint8_t ThisNopLength = (uint8_t) std::min(Count, MaxNopLength); + const uint8_t Prefixes = ThisNopLength <= 10 ? 0 : ThisNopLength - 10; + for (uint8_t i = 0; i < Prefixes; i++) + OW->write8(0x66); + const uint8_t Rest = ThisNopLength - Prefixes; + for (uint8_t i = 0; i < Rest; i++) + OW->write8(Nops[Rest - 1][i]); + Count -= ThisNopLength; + } while (Count != 0); + + return true; +} + +/* *** */ + +namespace { + +class ELFX86AsmBackend : public X86AsmBackend { +public: + uint8_t OSABI; + ELFX86AsmBackend(const Target &T, uint8_t OSABI, StringRef CPU) + : X86AsmBackend(T, CPU), OSABI(OSABI) {} +}; + +class ELFX86_32AsmBackend : public ELFX86AsmBackend { +public: + ELFX86_32AsmBackend(const Target &T, uint8_t OSABI, StringRef CPU) + : ELFX86AsmBackend(T, OSABI, CPU) {} + + MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override { + return createX86ELFObjectWriter(OS, /*IsELF64*/ false, OSABI, ELF::EM_386); + } +}; + +class ELFX86_X32AsmBackend : public ELFX86AsmBackend { +public: + ELFX86_X32AsmBackend(const Target &T, uint8_t OSABI, StringRef CPU) + : ELFX86AsmBackend(T, OSABI, CPU) {} + + MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override { + return createX86ELFObjectWriter(OS, /*IsELF64*/ false, OSABI, + ELF::EM_X86_64); + } +}; + +class ELFX86_IAMCUAsmBackend : public ELFX86AsmBackend { +public: + ELFX86_IAMCUAsmBackend(const Target &T, uint8_t OSABI, StringRef CPU) + : ELFX86AsmBackend(T, OSABI, CPU) {} + + MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override { + return createX86ELFObjectWriter(OS, /*IsELF64*/ false, OSABI, + ELF::EM_IAMCU); + } +}; + +class ELFX86_64AsmBackend : public ELFX86AsmBackend { +public: + ELFX86_64AsmBackend(const Target &T, uint8_t OSABI, StringRef CPU) + : ELFX86AsmBackend(T, OSABI, CPU) {} + + MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override { + return createX86ELFObjectWriter(OS, /*IsELF64*/ true, OSABI, ELF::EM_X86_64); + } +}; + +class WindowsX86AsmBackend : public X86AsmBackend { + bool Is64Bit; + +public: + WindowsX86AsmBackend(const Target &T, bool is64Bit, StringRef CPU) + : X86AsmBackend(T, CPU) + , Is64Bit(is64Bit) { + } + + MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override { + return createX86WinCOFFObjectWriter(OS, Is64Bit); + } +}; + +namespace CU { + + /// Compact unwind encoding values. + enum CompactUnwindEncodings { + /// [RE]BP based frame where [RE]BP is pused on the stack immediately after + /// the return address, then [RE]SP is moved to [RE]BP. + UNWIND_MODE_BP_FRAME = 0x01000000, + + /// A frameless function with a small constant stack size. + UNWIND_MODE_STACK_IMMD = 0x02000000, + + /// A frameless function with a large constant stack size. + UNWIND_MODE_STACK_IND = 0x03000000, + + /// No compact unwind encoding is available. 
+ UNWIND_MODE_DWARF = 0x04000000, + + /// Mask for encoding the frame registers. + UNWIND_BP_FRAME_REGISTERS = 0x00007FFF, + + /// Mask for encoding the frameless registers. + UNWIND_FRAMELESS_STACK_REG_PERMUTATION = 0x000003FF + }; + +} // end CU namespace + +class DarwinX86AsmBackend : public X86AsmBackend { + const MCRegisterInfo &MRI; + + /// \brief Number of registers that can be saved in a compact unwind encoding. + enum { CU_NUM_SAVED_REGS = 6 }; + + mutable unsigned SavedRegs[CU_NUM_SAVED_REGS]; + bool Is64Bit; + + unsigned OffsetSize; ///< Offset of a "push" instruction. + unsigned MoveInstrSize; ///< Size of a "move" instruction. + unsigned StackDivide; ///< Amount to adjust stack size by. +protected: + /// \brief Size of a "push" instruction for the given register. + unsigned PushInstrSize(unsigned Reg) const { + switch (Reg) { + case X86::EBX: + case X86::ECX: + case X86::EDX: + case X86::EDI: + case X86::ESI: + case X86::EBP: + case X86::RBX: + case X86::RBP: + return 1; + case X86::R12: + case X86::R13: + case X86::R14: + case X86::R15: + return 2; + } + return 1; + } + + /// \brief Implementation of algorithm to generate the compact unwind encoding + /// for the CFI instructions. + uint32_t + generateCompactUnwindEncodingImpl(ArrayRef<MCCFIInstruction> Instrs) const { + if (Instrs.empty()) return 0; + + // Reset the saved registers. + unsigned SavedRegIdx = 0; + memset(SavedRegs, 0, sizeof(SavedRegs)); + + bool HasFP = false; + + // Encode that we are using EBP/RBP as the frame pointer. + uint32_t CompactUnwindEncoding = 0; + + unsigned SubtractInstrIdx = Is64Bit ? 3 : 2; + unsigned InstrOffset = 0; + unsigned StackAdjust = 0; + unsigned StackSize = 0; + unsigned PrevStackSize = 0; + unsigned NumDefCFAOffsets = 0; + + for (unsigned i = 0, e = Instrs.size(); i != e; ++i) { + const MCCFIInstruction &Inst = Instrs[i]; + + switch (Inst.getOperation()) { + default: + // Any other CFI directives indicate a frame that we aren't prepared + // to represent via compact unwind, so just bail out. + return 0; + case MCCFIInstruction::OpDefCfaRegister: { + // Defines a frame pointer. E.g. + // + // movq %rsp, %rbp + // L0: + // .cfi_def_cfa_register %rbp + // + HasFP = true; + assert(MRI.getLLVMRegNum(Inst.getRegister(), true) == + (Is64Bit ? X86::RBP : X86::EBP) && "Invalid frame pointer!"); + + // Reset the counts. + memset(SavedRegs, 0, sizeof(SavedRegs)); + StackAdjust = 0; + SavedRegIdx = 0; + InstrOffset += MoveInstrSize; + break; + } + case MCCFIInstruction::OpDefCfaOffset: { + // Defines a new offset for the CFA. E.g. + // + // With frame: + // + // pushq %rbp + // L0: + // .cfi_def_cfa_offset 16 + // + // Without frame: + // + // subq $72, %rsp + // L0: + // .cfi_def_cfa_offset 80 + // + PrevStackSize = StackSize; + StackSize = std::abs(Inst.getOffset()) / StackDivide; + ++NumDefCFAOffsets; + break; + } + case MCCFIInstruction::OpOffset: { + // Defines a "push" of a callee-saved register. E.g. + // + // pushq %r15 + // pushq %r14 + // pushq %rbx + // L0: + // subq $120, %rsp + // L1: + // .cfi_offset %rbx, -40 + // .cfi_offset %r14, -32 + // .cfi_offset %r15, -24 + // + if (SavedRegIdx == CU_NUM_SAVED_REGS) + // If there are too many saved registers, we cannot use a compact + // unwind encoding. 
+ return CU::UNWIND_MODE_DWARF; + + unsigned Reg = MRI.getLLVMRegNum(Inst.getRegister(), true); + SavedRegs[SavedRegIdx++] = Reg; + StackAdjust += OffsetSize; + InstrOffset += PushInstrSize(Reg); + break; + } + } + } + + StackAdjust /= StackDivide; + + if (HasFP) { + if ((StackAdjust & 0xFF) != StackAdjust) + // Offset was too big for a compact unwind encoding. + return CU::UNWIND_MODE_DWARF; + + // Get the encoding of the saved registers when we have a frame pointer. + uint32_t RegEnc = encodeCompactUnwindRegistersWithFrame(); + if (RegEnc == ~0U) return CU::UNWIND_MODE_DWARF; + + CompactUnwindEncoding |= CU::UNWIND_MODE_BP_FRAME; + CompactUnwindEncoding |= (StackAdjust & 0xFF) << 16; + CompactUnwindEncoding |= RegEnc & CU::UNWIND_BP_FRAME_REGISTERS; + } else { + // If the amount of the stack allocation is the size of a register, then + // we "push" the RAX/EAX register onto the stack instead of adjusting the + // stack pointer with a SUB instruction. We don't support the push of the + // RAX/EAX register with compact unwind. So we check for that situation + // here. + if ((NumDefCFAOffsets == SavedRegIdx + 1 && + StackSize - PrevStackSize == 1) || + (Instrs.size() == 1 && NumDefCFAOffsets == 1 && StackSize == 2)) + return CU::UNWIND_MODE_DWARF; + + SubtractInstrIdx += InstrOffset; + ++StackAdjust; + + if ((StackSize & 0xFF) == StackSize) { + // Frameless stack with a small stack size. + CompactUnwindEncoding |= CU::UNWIND_MODE_STACK_IMMD; + + // Encode the stack size. + CompactUnwindEncoding |= (StackSize & 0xFF) << 16; + } else { + if ((StackAdjust & 0x7) != StackAdjust) + // The extra stack adjustments are too big for us to handle. + return CU::UNWIND_MODE_DWARF; + + // Frameless stack with an offset too large for us to encode compactly. + CompactUnwindEncoding |= CU::UNWIND_MODE_STACK_IND; + + // Encode the offset to the nnnnnn value in the 'subl $nnnnnn, ESP' + // instruction. + CompactUnwindEncoding |= (SubtractInstrIdx & 0xFF) << 16; + + // Encode any extra stack stack adjustments (done via push + // instructions). + CompactUnwindEncoding |= (StackAdjust & 0x7) << 13; + } + + // Encode the number of registers saved. (Reverse the list first.) + std::reverse(&SavedRegs[0], &SavedRegs[SavedRegIdx]); + CompactUnwindEncoding |= (SavedRegIdx & 0x7) << 10; + + // Get the encoding of the saved registers when we don't have a frame + // pointer. + uint32_t RegEnc = encodeCompactUnwindRegistersWithoutFrame(SavedRegIdx); + if (RegEnc == ~0U) return CU::UNWIND_MODE_DWARF; + + // Encode the register encoding. + CompactUnwindEncoding |= + RegEnc & CU::UNWIND_FRAMELESS_STACK_REG_PERMUTATION; + } + + return CompactUnwindEncoding; + } + +private: + /// \brief Get the compact unwind number for a given register. The number + /// corresponds to the enum lists in compact_unwind_encoding.h. + int getCompactUnwindRegNum(unsigned Reg) const { + static const MCPhysReg CU32BitRegs[7] = { + X86::EBX, X86::ECX, X86::EDX, X86::EDI, X86::ESI, X86::EBP, 0 + }; + static const MCPhysReg CU64BitRegs[] = { + X86::RBX, X86::R12, X86::R13, X86::R14, X86::R15, X86::RBP, 0 + }; + const MCPhysReg *CURegs = Is64Bit ? CU64BitRegs : CU32BitRegs; + for (int Idx = 1; *CURegs; ++CURegs, ++Idx) + if (*CURegs == Reg) + return Idx; + + return -1; + } + + /// \brief Return the registers encoded for a compact encoding with a frame + /// pointer. + uint32_t encodeCompactUnwindRegistersWithFrame() const { + // Encode the registers in the order they were saved --- 3-bits per + // register. 
The list of saved registers is assumed to be in reverse + // order. The registers are numbered from 1 to CU_NUM_SAVED_REGS. + uint32_t RegEnc = 0; + for (int i = 0, Idx = 0; i != CU_NUM_SAVED_REGS; ++i) { + unsigned Reg = SavedRegs[i]; + if (Reg == 0) break; + + int CURegNum = getCompactUnwindRegNum(Reg); + if (CURegNum == -1) return ~0U; + + // Encode the 3-bit register number in order, skipping over 3-bits for + // each register. + RegEnc |= (CURegNum & 0x7) << (Idx++ * 3); + } + + assert((RegEnc & 0x3FFFF) == RegEnc && + "Invalid compact register encoding!"); + return RegEnc; + } + + /// \brief Create the permutation encoding used with frameless stacks. It is + /// passed the number of registers to be saved and an array of the registers + /// saved. + uint32_t encodeCompactUnwindRegistersWithoutFrame(unsigned RegCount) const { + // The saved registers are numbered from 1 to 6. In order to encode the + // order in which they were saved, we re-number them according to their + // place in the register order. The re-numbering is relative to the last + // re-numbered register. E.g., if we have registers {6, 2, 4, 5} saved in + // that order: + // + // Orig Re-Num + // ---- ------ + // 6 6 + // 2 2 + // 4 3 + // 5 3 + // + for (unsigned i = 0; i < RegCount; ++i) { + int CUReg = getCompactUnwindRegNum(SavedRegs[i]); + if (CUReg == -1) return ~0U; + SavedRegs[i] = CUReg; + } + + // Reverse the list. + std::reverse(&SavedRegs[0], &SavedRegs[CU_NUM_SAVED_REGS]); + + uint32_t RenumRegs[CU_NUM_SAVED_REGS]; + for (unsigned i = CU_NUM_SAVED_REGS - RegCount; i < CU_NUM_SAVED_REGS; ++i){ + unsigned Countless = 0; + for (unsigned j = CU_NUM_SAVED_REGS - RegCount; j < i; ++j) + if (SavedRegs[j] < SavedRegs[i]) + ++Countless; + + RenumRegs[i] = SavedRegs[i] - Countless - 1; + } + + // Take the renumbered values and encode them into a 10-bit number. + uint32_t permutationEncoding = 0; + switch (RegCount) { + case 6: + permutationEncoding |= 120 * RenumRegs[0] + 24 * RenumRegs[1] + + 6 * RenumRegs[2] + 2 * RenumRegs[3] + + RenumRegs[4]; + break; + case 5: + permutationEncoding |= 120 * RenumRegs[1] + 24 * RenumRegs[2] + + 6 * RenumRegs[3] + 2 * RenumRegs[4] + + RenumRegs[5]; + break; + case 4: + permutationEncoding |= 60 * RenumRegs[2] + 12 * RenumRegs[3] + + 3 * RenumRegs[4] + RenumRegs[5]; + break; + case 3: + permutationEncoding |= 20 * RenumRegs[3] + 4 * RenumRegs[4] + + RenumRegs[5]; + break; + case 2: + permutationEncoding |= 5 * RenumRegs[4] + RenumRegs[5]; + break; + case 1: + permutationEncoding |= RenumRegs[5]; + break; + } + + assert((permutationEncoding & 0x3FF) == permutationEncoding && + "Invalid compact register encoding!"); + return permutationEncoding; + } + +public: + DarwinX86AsmBackend(const Target &T, const MCRegisterInfo &MRI, StringRef CPU, + bool Is64Bit) + : X86AsmBackend(T, CPU), MRI(MRI), Is64Bit(Is64Bit) { + memset(SavedRegs, 0, sizeof(SavedRegs)); + OffsetSize = Is64Bit ? 8 : 4; + MoveInstrSize = Is64Bit ? 3 : 2; + StackDivide = Is64Bit ? 8 : 4; + } +}; + +class DarwinX86_32AsmBackend : public DarwinX86AsmBackend { +public: + DarwinX86_32AsmBackend(const Target &T, const MCRegisterInfo &MRI, + StringRef CPU) + : DarwinX86AsmBackend(T, MRI, CPU, false) {} + + MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override { + return createX86MachObjectWriter(OS, /*Is64Bit=*/false, + MachO::CPU_TYPE_I386, + MachO::CPU_SUBTYPE_I386_ALL); + } + + /// \brief Generate the compact unwind encoding for the CFI instructions. 
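// Rough worked example (editorial illustration, not from the original file): for a
// 64-bit frame function whose prologue records ".cfi_offset" for %rbx and %r14
// after the ".cfi_def_cfa_register %rbp" directive, generateCompactUnwindEncodingImpl
// above ends up with SavedRegs = {RBX, R14} and StackAdjust = 2 * OffsetSize = 16,
// which becomes 2 after the division by StackDivide. With CU register numbers
// RBX = 1 and R14 = 4, the frame-pointer path then produces
//   UNWIND_MODE_BP_FRAME | (2 << 16) | (1 | (4 << 3))  ==  0x01020021
// Any prologue the encoder cannot express this way falls back to UNWIND_MODE_DWARF.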
+ uint32_t generateCompactUnwindEncoding( + ArrayRef<MCCFIInstruction> Instrs) const override { + return generateCompactUnwindEncodingImpl(Instrs); + } +}; + +class DarwinX86_64AsmBackend : public DarwinX86AsmBackend { + const MachO::CPUSubTypeX86 Subtype; +public: + DarwinX86_64AsmBackend(const Target &T, const MCRegisterInfo &MRI, + StringRef CPU, MachO::CPUSubTypeX86 st) + : DarwinX86AsmBackend(T, MRI, CPU, true), Subtype(st) {} + + MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override { + return createX86MachObjectWriter(OS, /*Is64Bit=*/true, + MachO::CPU_TYPE_X86_64, Subtype); + } + + /// \brief Generate the compact unwind encoding for the CFI instructions. + uint32_t generateCompactUnwindEncoding( + ArrayRef<MCCFIInstruction> Instrs) const override { + return generateCompactUnwindEncodingImpl(Instrs); + } +}; + +} // end anonymous namespace + +MCAsmBackend *llvm::createX86_32AsmBackend(const Target &T, + const MCRegisterInfo &MRI, + const Triple &TheTriple, + StringRef CPU) { + if (TheTriple.isOSBinFormatMachO()) + return new DarwinX86_32AsmBackend(T, MRI, CPU); + + if (TheTriple.isOSWindows() && !TheTriple.isOSBinFormatELF()) + return new WindowsX86AsmBackend(T, false, CPU); + + uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TheTriple.getOS()); + + if (TheTriple.isOSIAMCU()) + return new ELFX86_IAMCUAsmBackend(T, OSABI, CPU); + + return new ELFX86_32AsmBackend(T, OSABI, CPU); +} + +MCAsmBackend *llvm::createX86_64AsmBackend(const Target &T, + const MCRegisterInfo &MRI, + const Triple &TheTriple, + StringRef CPU) { + if (TheTriple.isOSBinFormatMachO()) { + MachO::CPUSubTypeX86 CS = + StringSwitch<MachO::CPUSubTypeX86>(TheTriple.getArchName()) + .Case("x86_64h", MachO::CPU_SUBTYPE_X86_64_H) + .Default(MachO::CPU_SUBTYPE_X86_64_ALL); + return new DarwinX86_64AsmBackend(T, MRI, CPU, CS); + } + + if (TheTriple.isOSWindows() && !TheTriple.isOSBinFormatELF()) + return new WindowsX86AsmBackend(T, true, CPU); + + uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TheTriple.getOS()); + + if (TheTriple.getEnvironment() == Triple::GNUX32) + return new ELFX86_X32AsmBackend(T, OSABI, CPU); + return new ELFX86_64AsmBackend(T, OSABI, CPU); +} diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h new file mode 100644 index 0000000..9ff85b9 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h @@ -0,0 +1,779 @@ +//===-- X86BaseInfo.h - Top level definitions for X86 -------- --*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains small standalone helper functions and enum definitions for +// the X86 target useful for the compiler back-end and the MC libraries. +// As such, it deliberately does not include references to LLVM core +// code gen types, passes, etc.. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_X86_MCTARGETDESC_X86BASEINFO_H +#define LLVM_LIB_TARGET_X86_MCTARGETDESC_X86BASEINFO_H + +#include "X86MCTargetDesc.h" +#include "llvm/MC/MCInstrDesc.h" +#include "llvm/Support/DataTypes.h" +#include "llvm/Support/ErrorHandling.h" + +namespace llvm { + +namespace X86 { + // Enums for memory operand decoding. 
Each memory operand is represented with + // a 5 operand sequence in the form: + // [BaseReg, ScaleAmt, IndexReg, Disp, Segment] + // These enums help decode this. + enum { + AddrBaseReg = 0, + AddrScaleAmt = 1, + AddrIndexReg = 2, + AddrDisp = 3, + + /// AddrSegmentReg - The operand # of the segment in the memory operand. + AddrSegmentReg = 4, + + /// AddrNumOperands - Total number of operands in a memory reference. + AddrNumOperands = 5 + }; + + /// AVX512 static rounding constants. These need to match the values in + /// avx512fintrin.h. + enum STATIC_ROUNDING { + TO_NEAREST_INT = 0, + TO_NEG_INF = 1, + TO_POS_INF = 2, + TO_ZERO = 3, + CUR_DIRECTION = 4 + }; +} // end namespace X86; + +/// X86II - This namespace holds all of the target specific flags that +/// instruction info tracks. +/// +namespace X86II { + /// Target Operand Flag enum. + enum TOF { + //===------------------------------------------------------------------===// + // X86 Specific MachineOperand flags. + + MO_NO_FLAG, + + /// MO_GOT_ABSOLUTE_ADDRESS - On a symbol operand, this represents a + /// relocation of: + /// SYMBOL_LABEL + [. - PICBASELABEL] + MO_GOT_ABSOLUTE_ADDRESS, + + /// MO_PIC_BASE_OFFSET - On a symbol operand this indicates that the + /// immediate should get the value of the symbol minus the PIC base label: + /// SYMBOL_LABEL - PICBASELABEL + MO_PIC_BASE_OFFSET, + + /// MO_GOT - On a symbol operand this indicates that the immediate is the + /// offset to the GOT entry for the symbol name from the base of the GOT. + /// + /// See the X86-64 ELF ABI supplement for more details. + /// SYMBOL_LABEL @GOT + MO_GOT, + + /// MO_GOTOFF - On a symbol operand this indicates that the immediate is + /// the offset to the location of the symbol name from the base of the GOT. + /// + /// See the X86-64 ELF ABI supplement for more details. + /// SYMBOL_LABEL @GOTOFF + MO_GOTOFF, + + /// MO_GOTPCREL - On a symbol operand this indicates that the immediate is + /// offset to the GOT entry for the symbol name from the current code + /// location. + /// + /// See the X86-64 ELF ABI supplement for more details. + /// SYMBOL_LABEL @GOTPCREL + MO_GOTPCREL, + + /// MO_PLT - On a symbol operand this indicates that the immediate is + /// offset to the PLT entry of symbol name from the current code location. + /// + /// See the X86-64 ELF ABI supplement for more details. + /// SYMBOL_LABEL @PLT + MO_PLT, + + /// MO_TLSGD - On a symbol operand this indicates that the immediate is + /// the offset of the GOT entry with the TLS index structure that contains + /// the module number and variable offset for the symbol. Used in the + /// general dynamic TLS access model. + /// + /// See 'ELF Handling for Thread-Local Storage' for more details. + /// SYMBOL_LABEL @TLSGD + MO_TLSGD, + + /// MO_TLSLD - On a symbol operand this indicates that the immediate is + /// the offset of the GOT entry with the TLS index for the module that + /// contains the symbol. When this index is passed to a call to + /// __tls_get_addr, the function will return the base address of the TLS + /// block for the symbol. Used in the x86-64 local dynamic TLS access model. + /// + /// See 'ELF Handling for Thread-Local Storage' for more details. + /// SYMBOL_LABEL @TLSLD + MO_TLSLD, + + /// MO_TLSLDM - On a symbol operand this indicates that the immediate is + /// the offset of the GOT entry with the TLS index for the module that + /// contains the symbol. 
When this index is passed to a call to + /// ___tls_get_addr, the function will return the base address of the TLS + /// block for the symbol. Used in the IA32 local dynamic TLS access model. + /// + /// See 'ELF Handling for Thread-Local Storage' for more details. + /// SYMBOL_LABEL @TLSLDM + MO_TLSLDM, + + /// MO_GOTTPOFF - On a symbol operand this indicates that the immediate is + /// the offset of the GOT entry with the thread-pointer offset for the + /// symbol. Used in the x86-64 initial exec TLS access model. + /// + /// See 'ELF Handling for Thread-Local Storage' for more details. + /// SYMBOL_LABEL @GOTTPOFF + MO_GOTTPOFF, + + /// MO_INDNTPOFF - On a symbol operand this indicates that the immediate is + /// the absolute address of the GOT entry with the negative thread-pointer + /// offset for the symbol. Used in the non-PIC IA32 initial exec TLS access + /// model. + /// + /// See 'ELF Handling for Thread-Local Storage' for more details. + /// SYMBOL_LABEL @INDNTPOFF + MO_INDNTPOFF, + + /// MO_TPOFF - On a symbol operand this indicates that the immediate is + /// the thread-pointer offset for the symbol. Used in the x86-64 local + /// exec TLS access model. + /// + /// See 'ELF Handling for Thread-Local Storage' for more details. + /// SYMBOL_LABEL @TPOFF + MO_TPOFF, + + /// MO_DTPOFF - On a symbol operand this indicates that the immediate is + /// the offset of the GOT entry with the TLS offset of the symbol. Used + /// in the local dynamic TLS access model. + /// + /// See 'ELF Handling for Thread-Local Storage' for more details. + /// SYMBOL_LABEL @DTPOFF + MO_DTPOFF, + + /// MO_NTPOFF - On a symbol operand this indicates that the immediate is + /// the negative thread-pointer offset for the symbol. Used in the IA32 + /// local exec TLS access model. + /// + /// See 'ELF Handling for Thread-Local Storage' for more details. + /// SYMBOL_LABEL @NTPOFF + MO_NTPOFF, + + /// MO_GOTNTPOFF - On a symbol operand this indicates that the immediate is + /// the offset of the GOT entry with the negative thread-pointer offset for + /// the symbol. Used in the PIC IA32 initial exec TLS access model. + /// + /// See 'ELF Handling for Thread-Local Storage' for more details. + /// SYMBOL_LABEL @GOTNTPOFF + MO_GOTNTPOFF, + + /// MO_DLLIMPORT - On a symbol operand "FOO", this indicates that the + /// reference is actually to the "__imp_FOO" symbol. This is used for + /// dllimport linkage on windows. + MO_DLLIMPORT, + + /// MO_DARWIN_STUB - On a symbol operand "FOO", this indicates that the + /// reference is actually to the "FOO$stub" symbol. This is used for calls + /// and jumps to external functions on Tiger and earlier. + MO_DARWIN_STUB, + + /// MO_DARWIN_NONLAZY - On a symbol operand "FOO", this indicates that the + /// reference is actually to the "FOO$non_lazy_ptr" symbol, which is a + /// non-PIC-base-relative reference to a non-hidden dyld lazy pointer stub. + MO_DARWIN_NONLAZY, + + /// MO_DARWIN_NONLAZY_PIC_BASE - On a symbol operand "FOO", this indicates + /// that the reference is actually to "FOO$non_lazy_ptr - PICBASE", which is + /// a PIC-base-relative reference to a non-hidden dyld lazy pointer stub. + MO_DARWIN_NONLAZY_PIC_BASE, + + /// MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE - On a symbol operand "FOO", this + /// indicates that the reference is actually to "FOO$non_lazy_ptr -PICBASE", + /// which is a PIC-base-relative reference to a hidden dyld lazy pointer + /// stub. 
+ MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE, + + /// MO_TLVP - On a symbol operand this indicates that the immediate is + /// some TLS offset. + /// + /// This is the TLS offset for the Darwin TLS mechanism. + MO_TLVP, + + /// MO_TLVP_PIC_BASE - On a symbol operand this indicates that the immediate + /// is some TLS offset from the picbase. + /// + /// This is the 32-bit TLS offset for Darwin TLS in PIC mode. + MO_TLVP_PIC_BASE, + + /// MO_SECREL - On a symbol operand this indicates that the immediate is + /// the offset from beginning of section. + /// + /// This is the TLS offset for the COFF/Windows TLS mechanism. + MO_SECREL + }; + + enum : uint64_t { + //===------------------------------------------------------------------===// + // Instruction encodings. These are the standard/most common forms for X86 + // instructions. + // + + // PseudoFrm - This represents an instruction that is a pseudo instruction + // or one that has not been implemented yet. It is illegal to code generate + // it, but tolerated for intermediate implementation stages. + Pseudo = 0, + + /// Raw - This form is for instructions that don't have any operands, so + /// they are just a fixed opcode value, like 'leave'. + RawFrm = 1, + + /// AddRegFrm - This form is used for instructions like 'push r32' that have + /// their one register operand added to their opcode. + AddRegFrm = 2, + + /// MRMDestReg - This form is used for instructions that use the Mod/RM byte + /// to specify a destination, which in this case is a register. + /// + MRMDestReg = 3, + + /// MRMDestMem - This form is used for instructions that use the Mod/RM byte + /// to specify a destination, which in this case is memory. + /// + MRMDestMem = 4, + + /// MRMSrcReg - This form is used for instructions that use the Mod/RM byte + /// to specify a source, which in this case is a register. + /// + MRMSrcReg = 5, + + /// MRMSrcMem - This form is used for instructions that use the Mod/RM byte + /// to specify a source, which in this case is memory. + /// + MRMSrcMem = 6, + + /// RawFrmMemOffs - This form is for instructions that store an absolute + /// memory offset as an immediate with a possible segment override. + RawFrmMemOffs = 7, + + /// RawFrmSrc - This form is for instructions that use the source index + /// register SI/ESI/RSI with a possible segment override. + RawFrmSrc = 8, + + /// RawFrmDst - This form is for instructions that use the destination index + /// register DI/EDI/ESI. + RawFrmDst = 9, + + /// RawFrmSrc - This form is for instructions that use the source index + /// register SI/ESI/ERI with a possible segment override, and also the + /// destination index register DI/ESI/RDI. + RawFrmDstSrc = 10, + + /// RawFrmImm8 - This is used for the ENTER instruction, which has two + /// immediates, the first of which is a 16-bit immediate (specified by + /// the imm encoding) and the second is a 8-bit fixed value. + RawFrmImm8 = 11, + + /// RawFrmImm16 - This is used for CALL FAR instructions, which have two + /// immediates, the first of which is a 16 or 32-bit immediate (specified by + /// the imm encoding) and the second is a 16-bit fixed value. In the AMD + /// manual, this operand is described as pntr16:32 and pntr16:16 + RawFrmImm16 = 12, + + /// MRMX[rm] - The forms are used to represent instructions that use a + /// Mod/RM byte, and don't use the middle field for anything. 
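// A few concrete instances of the encoding forms above and below, for orientation
// only (editorial illustration; the authoritative mapping lives in the .td files):
//   RawFrm     - "leave" (fixed opcode, no operands)
//   AddRegFrm  - "pushq %rbx" (0x50 + register number)
//   MRMDestMem - "movl %eax, (%rbx)" (ModRM selects a memory destination)
//   MRMSrcMem  - "movl (%rbx), %eax" (ModRM selects a memory source)
//   MRM0m      - "incl (%rax)" (0xFF /0)
//   MRM4r      - "shll $2, %eax" (0xC1 /4)
//   MRM_F8     - "swapgs" (fixed ModRM byte 0xF8 after 0x0F 0x01)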
+ MRMXr = 14, MRMXm = 15, + + /// MRM[0-7][rm] - These forms are used to represent instructions that use + /// a Mod/RM byte, and use the middle field to hold extended opcode + /// information. In the intel manual these are represented as /0, /1, ... + /// + + // First, instructions that operate on a register r/m operand... + MRM0r = 16, MRM1r = 17, MRM2r = 18, MRM3r = 19, // Format /0 /1 /2 /3 + MRM4r = 20, MRM5r = 21, MRM6r = 22, MRM7r = 23, // Format /4 /5 /6 /7 + + // Next, instructions that operate on a memory r/m operand... + MRM0m = 24, MRM1m = 25, MRM2m = 26, MRM3m = 27, // Format /0 /1 /2 /3 + MRM4m = 28, MRM5m = 29, MRM6m = 30, MRM7m = 31, // Format /4 /5 /6 /7 + + //// MRM_XX - A mod/rm byte of exactly 0xXX. + MRM_C0 = 32, MRM_C1 = 33, MRM_C2 = 34, MRM_C3 = 35, + MRM_C4 = 36, MRM_C5 = 37, MRM_C6 = 38, MRM_C7 = 39, + MRM_C8 = 40, MRM_C9 = 41, MRM_CA = 42, MRM_CB = 43, + MRM_CC = 44, MRM_CD = 45, MRM_CE = 46, MRM_CF = 47, + MRM_D0 = 48, MRM_D1 = 49, MRM_D2 = 50, MRM_D3 = 51, + MRM_D4 = 52, MRM_D5 = 53, MRM_D6 = 54, MRM_D7 = 55, + MRM_D8 = 56, MRM_D9 = 57, MRM_DA = 58, MRM_DB = 59, + MRM_DC = 60, MRM_DD = 61, MRM_DE = 62, MRM_DF = 63, + MRM_E0 = 64, MRM_E1 = 65, MRM_E2 = 66, MRM_E3 = 67, + MRM_E4 = 68, MRM_E5 = 69, MRM_E6 = 70, MRM_E7 = 71, + MRM_E8 = 72, MRM_E9 = 73, MRM_EA = 74, MRM_EB = 75, + MRM_EC = 76, MRM_ED = 77, MRM_EE = 78, MRM_EF = 79, + MRM_F0 = 80, MRM_F1 = 81, MRM_F2 = 82, MRM_F3 = 83, + MRM_F4 = 84, MRM_F5 = 85, MRM_F6 = 86, MRM_F7 = 87, + MRM_F8 = 88, MRM_F9 = 89, MRM_FA = 90, MRM_FB = 91, + MRM_FC = 92, MRM_FD = 93, MRM_FE = 94, MRM_FF = 95, + + FormMask = 127, + + //===------------------------------------------------------------------===// + // Actual flags... + + // OpSize - OpSizeFixed implies instruction never needs a 0x66 prefix. + // OpSize16 means this is a 16-bit instruction and needs 0x66 prefix in + // 32-bit mode. OpSize32 means this is a 32-bit instruction needs a 0x66 + // prefix in 16-bit mode. + OpSizeShift = 7, + OpSizeMask = 0x3 << OpSizeShift, + + OpSizeFixed = 0 << OpSizeShift, + OpSize16 = 1 << OpSizeShift, + OpSize32 = 2 << OpSizeShift, + + // AsSize - AdSizeX implies this instruction determines its need of 0x67 + // prefix from a normal ModRM memory operand. The other types indicate that + // an operand is encoded with a specific width and a prefix is needed if + // it differs from the current mode. + AdSizeShift = OpSizeShift + 2, + AdSizeMask = 0x3 << AdSizeShift, + + AdSizeX = 1 << AdSizeShift, + AdSize16 = 1 << AdSizeShift, + AdSize32 = 2 << AdSizeShift, + AdSize64 = 3 << AdSizeShift, + + //===------------------------------------------------------------------===// + // OpPrefix - There are several prefix bytes that are used as opcode + // extensions. These are 0x66, 0xF3, and 0xF2. If this field is 0 there is + // no prefix. + // + OpPrefixShift = AdSizeShift + 2, + OpPrefixMask = 0x7 << OpPrefixShift, + + // PS, PD - Prefix code for packed single and double precision vector + // floating point operations performed in the SSE registers. + PS = 1 << OpPrefixShift, PD = 2 << OpPrefixShift, + + // XS, XD - These prefix codes are for single and double precision scalar + // floating point operations performed in the SSE registers. + XS = 3 << OpPrefixShift, XD = 4 << OpPrefixShift, + + //===------------------------------------------------------------------===// + // OpMap - This field determines which opcode map this instruction + // belongs to. i.e. one-byte, two-byte, 0x0f 0x38, 0x0f 0x3a, etc. 
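// Minimal sketch (editorial illustration, not part of this header) of how the
// packed fields defined above are read back out of a TSFlags value:
//   uint64_t Form   = TSFlags & X86II::FormMask;                        // e.g. MRMSrcMem
//   bool     Op16   = (TSFlags & X86II::OpSizeMask) == X86II::OpSize16; // 0x66 needed in 32-bit mode
//   uint64_t Prefix = TSFlags & X86II::OpPrefixMask;                    // PS/PD/XS/XD or none
// The accessor functions near the end of this header (getBaseOpcodeFor,
// getSizeOfImm, isImmPCRel, ...) follow the same mask-and-shift pattern.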
+ // + OpMapShift = OpPrefixShift + 3, + OpMapMask = 0x7 << OpMapShift, + + // OB - OneByte - Set if this instruction has a one byte opcode. + OB = 0 << OpMapShift, + + // TB - TwoByte - Set if this instruction has a two byte opcode, which + // starts with a 0x0F byte before the real opcode. + TB = 1 << OpMapShift, + + // T8, TA - Prefix after the 0x0F prefix. + T8 = 2 << OpMapShift, TA = 3 << OpMapShift, + + // XOP8 - Prefix to include use of imm byte. + XOP8 = 4 << OpMapShift, + + // XOP9 - Prefix to exclude use of imm byte. + XOP9 = 5 << OpMapShift, + + // XOPA - Prefix to encode 0xA in VEX.MMMM of XOP instructions. + XOPA = 6 << OpMapShift, + + //===------------------------------------------------------------------===// + // REX_W - REX prefixes are instruction prefixes used in 64-bit mode. + // They are used to specify GPRs and SSE registers, 64-bit operand size, + // etc. We only cares about REX.W and REX.R bits and only the former is + // statically determined. + // + REXShift = OpMapShift + 3, + REX_W = 1 << REXShift, + + //===------------------------------------------------------------------===// + // This three-bit field describes the size of an immediate operand. Zero is + // unused so that we can tell if we forgot to set a value. + ImmShift = REXShift + 1, + ImmMask = 15 << ImmShift, + Imm8 = 1 << ImmShift, + Imm8PCRel = 2 << ImmShift, + Imm16 = 3 << ImmShift, + Imm16PCRel = 4 << ImmShift, + Imm32 = 5 << ImmShift, + Imm32PCRel = 6 << ImmShift, + Imm32S = 7 << ImmShift, + Imm64 = 8 << ImmShift, + + //===------------------------------------------------------------------===// + // FP Instruction Classification... Zero is non-fp instruction. + + // FPTypeMask - Mask for all of the FP types... + FPTypeShift = ImmShift + 4, + FPTypeMask = 7 << FPTypeShift, + + // NotFP - The default, set for instructions that do not use FP registers. + NotFP = 0 << FPTypeShift, + + // ZeroArgFP - 0 arg FP instruction which implicitly pushes ST(0), f.e. fld0 + ZeroArgFP = 1 << FPTypeShift, + + // OneArgFP - 1 arg FP instructions which implicitly read ST(0), such as fst + OneArgFP = 2 << FPTypeShift, + + // OneArgFPRW - 1 arg FP instruction which implicitly read ST(0) and write a + // result back to ST(0). For example, fcos, fsqrt, etc. + // + OneArgFPRW = 3 << FPTypeShift, + + // TwoArgFP - 2 arg FP instructions which implicitly read ST(0), and an + // explicit argument, storing the result to either ST(0) or the implicit + // argument. For example: fadd, fsub, fmul, etc... + TwoArgFP = 4 << FPTypeShift, + + // CompareFP - 2 arg FP instructions which implicitly read ST(0) and an + // explicit argument, but have no destination. Example: fucom, fucomi, ... + CompareFP = 5 << FPTypeShift, + + // CondMovFP - "2 operand" floating point conditional move instructions. + CondMovFP = 6 << FPTypeShift, + + // SpecialFP - Special instruction forms. Dispatch by opcode explicitly. + SpecialFP = 7 << FPTypeShift, + + // Lock prefix + LOCKShift = FPTypeShift + 3, + LOCK = 1 << LOCKShift, + + // REP prefix + REPShift = LOCKShift + 1, + REP = 1 << REPShift, + + // Execution domain for SSE instructions. + // 0 means normal, non-SSE instruction. + SSEDomainShift = REPShift + 1, + + // Encoding + EncodingShift = SSEDomainShift + 2, + EncodingMask = 0x3 << EncodingShift, + + // VEX - encoding using 0xC4/0xC5 + VEX = 1 << EncodingShift, + + /// XOP - Opcode prefix used by XOP instructions. 
+ XOP = 2 << EncodingShift, + + // VEX_EVEX - Specifies that this instruction use EVEX form which provides + // syntax support up to 32 512-bit register operands and up to 7 16-bit + // mask operands as well as source operand data swizzling/memory operand + // conversion, eviction hint, and rounding mode. + EVEX = 3 << EncodingShift, + + // Opcode + OpcodeShift = EncodingShift + 2, + + /// VEX_W - Has a opcode specific functionality, but is used in the same + /// way as REX_W is for regular SSE instructions. + VEX_WShift = OpcodeShift + 8, + VEX_W = 1ULL << VEX_WShift, + + /// VEX_4V - Used to specify an additional AVX/SSE register. Several 2 + /// address instructions in SSE are represented as 3 address ones in AVX + /// and the additional register is encoded in VEX_VVVV prefix. + VEX_4VShift = VEX_WShift + 1, + VEX_4V = 1ULL << VEX_4VShift, + + /// VEX_4VOp3 - Similar to VEX_4V, but used on instructions that encode + /// operand 3 with VEX.vvvv. + VEX_4VOp3Shift = VEX_4VShift + 1, + VEX_4VOp3 = 1ULL << VEX_4VOp3Shift, + + /// VEX_I8IMM - Specifies that the last register used in a AVX instruction, + /// must be encoded in the i8 immediate field. This usually happens in + /// instructions with 4 operands. + VEX_I8IMMShift = VEX_4VOp3Shift + 1, + VEX_I8IMM = 1ULL << VEX_I8IMMShift, + + /// VEX_L - Stands for a bit in the VEX opcode prefix meaning the current + /// instruction uses 256-bit wide registers. This is usually auto detected + /// if a VR256 register is used, but some AVX instructions also have this + /// field marked when using a f256 memory references. + VEX_LShift = VEX_I8IMMShift + 1, + VEX_L = 1ULL << VEX_LShift, + + // VEX_LIG - Specifies that this instruction ignores the L-bit in the VEX + // prefix. Usually used for scalar instructions. Needed by disassembler. + VEX_LIGShift = VEX_LShift + 1, + VEX_LIG = 1ULL << VEX_LIGShift, + + // TODO: we should combine VEX_L and VEX_LIG together to form a 2-bit field + // with following encoding: + // - 00 V128 + // - 01 V256 + // - 10 V512 + // - 11 LIG (but, in insn encoding, leave VEX.L and EVEX.L in zeros. + // this will save 1 tsflag bit + + // EVEX_K - Set if this instruction requires masking + EVEX_KShift = VEX_LIGShift + 1, + EVEX_K = 1ULL << EVEX_KShift, + + // EVEX_Z - Set if this instruction has EVEX.Z field set. + EVEX_ZShift = EVEX_KShift + 1, + EVEX_Z = 1ULL << EVEX_ZShift, + + // EVEX_L2 - Set if this instruction has EVEX.L' field set. + EVEX_L2Shift = EVEX_ZShift + 1, + EVEX_L2 = 1ULL << EVEX_L2Shift, + + // EVEX_B - Set if this instruction has EVEX.B field set. + EVEX_BShift = EVEX_L2Shift + 1, + EVEX_B = 1ULL << EVEX_BShift, + + // The scaling factor for the AVX512's 8-bit compressed displacement. + CD8_Scale_Shift = EVEX_BShift + 1, + CD8_Scale_Mask = 127ULL << CD8_Scale_Shift, + + /// Has3DNow0F0FOpcode - This flag indicates that the instruction uses the + /// wacky 0x0F 0x0F prefix for 3DNow! instructions. The manual documents + /// this as having a 0x0F prefix with a 0x0F opcode, and each instruction + /// storing a classifier in the imm8 field. To simplify our implementation, + /// we handle this by storeing the classifier in the opcode field and using + /// this flag to indicate that the encoder should do the wacky 3DNow! thing. + Has3DNow0F0FOpcodeShift = CD8_Scale_Shift + 7, + Has3DNow0F0FOpcode = 1ULL << Has3DNow0F0FOpcodeShift, + + /// MemOp4 - Used to indicate swapping of operand 3 and 4 to be encoded in + /// ModRM or I8IMM. This is used for FMA4 and XOP instructions. 
+ MemOp4Shift = Has3DNow0F0FOpcodeShift + 1, + MemOp4 = 1ULL << MemOp4Shift, + + /// Explicitly specified rounding control + EVEX_RCShift = MemOp4Shift + 1, + EVEX_RC = 1ULL << EVEX_RCShift + }; + + // getBaseOpcodeFor - This function returns the "base" X86 opcode for the + // specified machine instruction. + // + inline unsigned char getBaseOpcodeFor(uint64_t TSFlags) { + return TSFlags >> X86II::OpcodeShift; + } + + inline bool hasImm(uint64_t TSFlags) { + return (TSFlags & X86II::ImmMask) != 0; + } + + /// getSizeOfImm - Decode the "size of immediate" field from the TSFlags field + /// of the specified instruction. + inline unsigned getSizeOfImm(uint64_t TSFlags) { + switch (TSFlags & X86II::ImmMask) { + default: llvm_unreachable("Unknown immediate size"); + case X86II::Imm8: + case X86II::Imm8PCRel: return 1; + case X86II::Imm16: + case X86II::Imm16PCRel: return 2; + case X86II::Imm32: + case X86II::Imm32S: + case X86II::Imm32PCRel: return 4; + case X86II::Imm64: return 8; + } + } + + /// isImmPCRel - Return true if the immediate of the specified instruction's + /// TSFlags indicates that it is pc relative. + inline unsigned isImmPCRel(uint64_t TSFlags) { + switch (TSFlags & X86II::ImmMask) { + default: llvm_unreachable("Unknown immediate size"); + case X86II::Imm8PCRel: + case X86II::Imm16PCRel: + case X86II::Imm32PCRel: + return true; + case X86II::Imm8: + case X86II::Imm16: + case X86II::Imm32: + case X86II::Imm32S: + case X86II::Imm64: + return false; + } + } + + /// isImmSigned - Return true if the immediate of the specified instruction's + /// TSFlags indicates that it is signed. + inline unsigned isImmSigned(uint64_t TSFlags) { + switch (TSFlags & X86II::ImmMask) { + default: llvm_unreachable("Unknown immediate signedness"); + case X86II::Imm32S: + return true; + case X86II::Imm8: + case X86II::Imm8PCRel: + case X86II::Imm16: + case X86II::Imm16PCRel: + case X86II::Imm32: + case X86II::Imm32PCRel: + case X86II::Imm64: + return false; + } + } + + /// getOperandBias - compute any additional adjustment needed to + /// the offset to the start of the memory operand + /// in this instruction. + /// If this is a two-address instruction,skip one of the register operands. + /// FIXME: This should be handled during MCInst lowering. + inline int getOperandBias(const MCInstrDesc& Desc) + { + unsigned NumOps = Desc.getNumOperands(); + unsigned CurOp = 0; + if (NumOps > 1 && Desc.getOperandConstraint(1, MCOI::TIED_TO) == 0) + ++CurOp; + else if (NumOps > 3 && Desc.getOperandConstraint(2, MCOI::TIED_TO) == 0 && + Desc.getOperandConstraint(3, MCOI::TIED_TO) == 1) + // Special case for AVX-512 GATHER with 2 TIED_TO operands + // Skip the first 2 operands: dst, mask_wb + CurOp += 2; + else if (NumOps > 3 && Desc.getOperandConstraint(2, MCOI::TIED_TO) == 0 && + Desc.getOperandConstraint(NumOps - 1, MCOI::TIED_TO) == 1) + // Special case for GATHER with 2 TIED_TO operands + // Skip the first 2 operands: dst, mask_wb + CurOp += 2; + else if (NumOps > 2 && Desc.getOperandConstraint(NumOps - 2, MCOI::TIED_TO) == 0) + // SCATTER + ++CurOp; + return CurOp; + } + + /// getMemoryOperandNo - The function returns the MCInst operand # for the + /// first field of the memory operand. If the instruction doesn't have a + /// memory operand, this returns -1. + /// + /// Note that this ignores tied operands. If there is a tied register which + /// is duplicated in the MCInst (e.g. "EAX = addl EAX, [mem]") it is only + /// counted as one operand. 
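// Typical use (editorial sketch, not part of this header; Desc and MI stand for
// the instruction's MCInstrDesc and MCInst): callers such as the code emitter
// combine getOperandBias and getMemoryOperandNo to locate the 5-operand memory
// reference, then index into it with the X86::Addr* enums defined earlier:
//   int MemOp = X86II::getMemoryOperandNo(Desc.TSFlags, MI.getOpcode());
//   if (MemOp >= 0) {
//     MemOp += X86II::getOperandBias(Desc);       // skip tied defs (GATHER/SCATTER etc.)
//     const MCOperand &Base = MI.getOperand(MemOp + X86::AddrBaseReg);
//     const MCOperand &Disp = MI.getOperand(MemOp + X86::AddrDisp);
//     // ... AddrScaleAmt, AddrIndexReg, AddrSegmentReg likewise.
//   }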
+ /// + inline int getMemoryOperandNo(uint64_t TSFlags, unsigned Opcode) { + bool HasVEX_4V = TSFlags & X86II::VEX_4V; + bool HasMemOp4 = TSFlags & X86II::MemOp4; + bool HasEVEX_K = TSFlags & X86II::EVEX_K; + + switch (TSFlags & X86II::FormMask) { + default: llvm_unreachable("Unknown FormMask value in getMemoryOperandNo!"); + case X86II::Pseudo: + case X86II::RawFrm: + case X86II::AddRegFrm: + case X86II::MRMDestReg: + case X86II::MRMSrcReg: + case X86II::RawFrmImm8: + case X86II::RawFrmImm16: + case X86II::RawFrmMemOffs: + case X86II::RawFrmSrc: + case X86II::RawFrmDst: + case X86II::RawFrmDstSrc: + return -1; + case X86II::MRMDestMem: + return 0; + case X86II::MRMSrcMem: + // Start from 1, skip any registers encoded in VEX_VVVV or I8IMM, or a + // mask register. + return 1 + HasVEX_4V + HasMemOp4 + HasEVEX_K; + case X86II::MRMXr: + case X86II::MRM0r: case X86II::MRM1r: + case X86II::MRM2r: case X86II::MRM3r: + case X86II::MRM4r: case X86II::MRM5r: + case X86II::MRM6r: case X86II::MRM7r: + return -1; + case X86II::MRMXm: + case X86II::MRM0m: case X86II::MRM1m: + case X86II::MRM2m: case X86II::MRM3m: + case X86II::MRM4m: case X86II::MRM5m: + case X86II::MRM6m: case X86II::MRM7m: + // Start from 0, skip registers encoded in VEX_VVVV or a mask register. + return 0 + HasVEX_4V + HasEVEX_K; + case X86II::MRM_C0: case X86II::MRM_C1: case X86II::MRM_C2: + case X86II::MRM_C3: case X86II::MRM_C4: case X86II::MRM_C5: + case X86II::MRM_C6: case X86II::MRM_C7: case X86II::MRM_C8: + case X86II::MRM_C9: case X86II::MRM_CA: case X86II::MRM_CB: + case X86II::MRM_CC: case X86II::MRM_CD: case X86II::MRM_CE: + case X86II::MRM_CF: case X86II::MRM_D0: case X86II::MRM_D1: + case X86II::MRM_D2: case X86II::MRM_D3: case X86II::MRM_D4: + case X86II::MRM_D5: case X86II::MRM_D6: case X86II::MRM_D7: + case X86II::MRM_D8: case X86II::MRM_D9: case X86II::MRM_DA: + case X86II::MRM_DB: case X86II::MRM_DC: case X86II::MRM_DD: + case X86II::MRM_DE: case X86II::MRM_DF: case X86II::MRM_E0: + case X86II::MRM_E1: case X86II::MRM_E2: case X86II::MRM_E3: + case X86II::MRM_E4: case X86II::MRM_E5: case X86II::MRM_E6: + case X86II::MRM_E7: case X86II::MRM_E8: case X86II::MRM_E9: + case X86II::MRM_EA: case X86II::MRM_EB: case X86II::MRM_EC: + case X86II::MRM_ED: case X86II::MRM_EE: case X86II::MRM_EF: + case X86II::MRM_F0: case X86II::MRM_F1: case X86II::MRM_F2: + case X86II::MRM_F3: case X86II::MRM_F4: case X86II::MRM_F5: + case X86II::MRM_F6: case X86II::MRM_F7: case X86II::MRM_F8: + case X86II::MRM_F9: case X86II::MRM_FA: case X86II::MRM_FB: + case X86II::MRM_FC: case X86II::MRM_FD: case X86II::MRM_FE: + case X86II::MRM_FF: + return -1; + } + } + + /// isX86_64ExtendedReg - Is the MachineOperand a x86-64 extended (r8 or + /// higher) register? e.g. r8, xmm8, xmm13, etc. 
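// (Editorial note, not from the original header.) Registers for which this
// predicate returns true are the ones whose encoding needs an extension bit that
// does not fit in the plain ModRM/SIB fields. For example, an encoder sketch:
//   if (X86II::isX86_64ExtendedReg(Reg))
//     REX |= 1 << Bit;   // Bit 0 = REX.B, 1 = REX.X, 2 = REX.R (REX.W is bit 3)
// which is why the check below covers R8-R15 in every width, the upper
// XMM/YMM/ZMM banks, and CR8-CR15.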
+ inline bool isX86_64ExtendedReg(unsigned RegNo) { + if ((RegNo > X86::XMM7 && RegNo <= X86::XMM15) || + (RegNo > X86::XMM23 && RegNo <= X86::XMM31) || + (RegNo > X86::YMM7 && RegNo <= X86::YMM15) || + (RegNo > X86::YMM23 && RegNo <= X86::YMM31) || + (RegNo > X86::ZMM7 && RegNo <= X86::ZMM15) || + (RegNo > X86::ZMM23 && RegNo <= X86::ZMM31)) + return true; + + switch (RegNo) { + default: break; + case X86::R8: case X86::R9: case X86::R10: case X86::R11: + case X86::R12: case X86::R13: case X86::R14: case X86::R15: + case X86::R8D: case X86::R9D: case X86::R10D: case X86::R11D: + case X86::R12D: case X86::R13D: case X86::R14D: case X86::R15D: + case X86::R8W: case X86::R9W: case X86::R10W: case X86::R11W: + case X86::R12W: case X86::R13W: case X86::R14W: case X86::R15W: + case X86::R8B: case X86::R9B: case X86::R10B: case X86::R11B: + case X86::R12B: case X86::R13B: case X86::R14B: case X86::R15B: + case X86::CR8: case X86::CR9: case X86::CR10: case X86::CR11: + case X86::CR12: case X86::CR13: case X86::CR14: case X86::CR15: + return true; + } + return false; + } + + /// is32ExtendedReg - Is the MemoryOperand a 32 extended (zmm16 or higher) + /// registers? e.g. zmm21, etc. + static inline bool is32ExtendedReg(unsigned RegNo) { + return ((RegNo > X86::XMM15 && RegNo <= X86::XMM31) || + (RegNo > X86::YMM15 && RegNo <= X86::YMM31) || + (RegNo > X86::ZMM15 && RegNo <= X86::ZMM31)); + } + + + inline bool isX86_64NonExtLowByteReg(unsigned reg) { + return (reg == X86::SPL || reg == X86::BPL || + reg == X86::SIL || reg == X86::DIL); + } +} + +} // end namespace llvm; + +#endif diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp new file mode 100644 index 0000000..736c39d --- /dev/null +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp @@ -0,0 +1,262 @@ +//===-- X86ELFObjectWriter.cpp - X86 ELF Writer ---------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/X86FixupKinds.h" +#include "MCTargetDesc/X86MCTargetDesc.h" +#include "llvm/MC/MCELFObjectWriter.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCValue.h" +#include "llvm/Support/ELF.h" +#include "llvm/Support/ErrorHandling.h" + +using namespace llvm; + +namespace { + class X86ELFObjectWriter : public MCELFObjectTargetWriter { + public: + X86ELFObjectWriter(bool IsELF64, uint8_t OSABI, uint16_t EMachine); + + ~X86ELFObjectWriter() override; + + protected: + unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup, + bool IsPCRel) const override; + }; +} + +X86ELFObjectWriter::X86ELFObjectWriter(bool IsELF64, uint8_t OSABI, + uint16_t EMachine) + : MCELFObjectTargetWriter(IsELF64, OSABI, EMachine, + // Only i386 and IAMCU use Rel instead of RelA. 
+ /*HasRelocationAddend*/ + (EMachine != ELF::EM_386) && + (EMachine != ELF::EM_IAMCU)) {} + +X86ELFObjectWriter::~X86ELFObjectWriter() +{} + +enum X86_64RelType { RT64_64, RT64_32, RT64_32S, RT64_16, RT64_8 }; + +static X86_64RelType getType64(unsigned Kind, + MCSymbolRefExpr::VariantKind &Modifier, + bool &IsPCRel) { + switch (Kind) { + default: + llvm_unreachable("Unimplemented"); + case X86::reloc_global_offset_table8: + Modifier = MCSymbolRefExpr::VK_GOT; + IsPCRel = true; + return RT64_64; + case FK_Data_8: + return RT64_64; + case X86::reloc_signed_4byte: + if (Modifier == MCSymbolRefExpr::VK_None && !IsPCRel) + return RT64_32S; + return RT64_32; + case X86::reloc_global_offset_table: + Modifier = MCSymbolRefExpr::VK_GOT; + IsPCRel = true; + return RT64_32; + case FK_Data_4: + case FK_PCRel_4: + case X86::reloc_riprel_4byte: + case X86::reloc_riprel_4byte_movq_load: + return RT64_32; + case FK_PCRel_2: + case FK_Data_2: + return RT64_16; + case FK_PCRel_1: + case FK_Data_1: + return RT64_8; + } +} + +static unsigned getRelocType64(MCSymbolRefExpr::VariantKind Modifier, + X86_64RelType Type, bool IsPCRel) { + switch (Modifier) { + default: + llvm_unreachable("Unimplemented"); + case MCSymbolRefExpr::VK_None: + switch (Type) { + case RT64_64: + return IsPCRel ? ELF::R_X86_64_PC64 : ELF::R_X86_64_64; + case RT64_32: + return IsPCRel ? ELF::R_X86_64_PC32 : ELF::R_X86_64_32; + case RT64_32S: + return ELF::R_X86_64_32S; + case RT64_16: + return IsPCRel ? ELF::R_X86_64_PC16 : ELF::R_X86_64_16; + case RT64_8: + return IsPCRel ? ELF::R_X86_64_PC8 : ELF::R_X86_64_8; + } + case MCSymbolRefExpr::VK_GOT: + switch (Type) { + case RT64_64: + return IsPCRel ? ELF::R_X86_64_GOTPC64 : ELF::R_X86_64_GOT64; + case RT64_32: + return IsPCRel ? ELF::R_X86_64_GOTPC32 : ELF::R_X86_64_GOT32; + case RT64_32S: + case RT64_16: + case RT64_8: + llvm_unreachable("Unimplemented"); + } + case MCSymbolRefExpr::VK_GOTOFF: + assert(Type == RT64_64); + assert(!IsPCRel); + return ELF::R_X86_64_GOTOFF64; + case MCSymbolRefExpr::VK_TPOFF: + assert(!IsPCRel); + switch (Type) { + case RT64_64: + return ELF::R_X86_64_TPOFF64; + case RT64_32: + return ELF::R_X86_64_TPOFF32; + case RT64_32S: + case RT64_16: + case RT64_8: + llvm_unreachable("Unimplemented"); + } + case MCSymbolRefExpr::VK_DTPOFF: + assert(!IsPCRel); + switch (Type) { + case RT64_64: + return ELF::R_X86_64_DTPOFF64; + case RT64_32: + return ELF::R_X86_64_DTPOFF32; + case RT64_32S: + case RT64_16: + case RT64_8: + llvm_unreachable("Unimplemented"); + } + case MCSymbolRefExpr::VK_SIZE: + assert(!IsPCRel); + switch (Type) { + case RT64_64: + return ELF::R_X86_64_SIZE64; + case RT64_32: + return ELF::R_X86_64_SIZE32; + case RT64_32S: + case RT64_16: + case RT64_8: + llvm_unreachable("Unimplemented"); + } + case MCSymbolRefExpr::VK_TLSGD: + assert(Type == RT64_32); + return ELF::R_X86_64_TLSGD; + case MCSymbolRefExpr::VK_GOTTPOFF: + assert(Type == RT64_32); + return ELF::R_X86_64_GOTTPOFF; + case MCSymbolRefExpr::VK_TLSLD: + assert(Type == RT64_32); + return ELF::R_X86_64_TLSLD; + case MCSymbolRefExpr::VK_PLT: + assert(Type == RT64_32); + return ELF::R_X86_64_PLT32; + case MCSymbolRefExpr::VK_GOTPCREL: + assert(Type == RT64_32); + return ELF::R_X86_64_GOTPCREL; + } +} + +enum X86_32RelType { RT32_32, RT32_16, RT32_8 }; + +static X86_32RelType getType32(X86_64RelType T) { + switch (T) { + case RT64_64: + llvm_unreachable("Unimplemented"); + case RT64_32: + case RT64_32S: + return RT32_32; + case RT64_16: + return RT32_16; + case RT64_8: + return RT32_8; + } + 
llvm_unreachable("unexpected relocation type!"); +} + +static unsigned getRelocType32(MCSymbolRefExpr::VariantKind Modifier, + X86_32RelType Type, bool IsPCRel) { + switch (Modifier) { + default: + llvm_unreachable("Unimplemented"); + case MCSymbolRefExpr::VK_None: + switch (Type) { + case RT32_32: + return IsPCRel ? ELF::R_386_PC32 : ELF::R_386_32; + case RT32_16: + return IsPCRel ? ELF::R_386_PC16 : ELF::R_386_16; + case RT32_8: + return IsPCRel ? ELF::R_386_PC8 : ELF::R_386_8; + } + case MCSymbolRefExpr::VK_GOT: + assert(Type == RT32_32); + return IsPCRel ? ELF::R_386_GOTPC : ELF::R_386_GOT32; + case MCSymbolRefExpr::VK_GOTOFF: + assert(Type == RT32_32); + assert(!IsPCRel); + return ELF::R_386_GOTOFF; + case MCSymbolRefExpr::VK_TPOFF: + assert(Type == RT32_32); + assert(!IsPCRel); + return ELF::R_386_TLS_LE_32; + case MCSymbolRefExpr::VK_DTPOFF: + assert(Type == RT32_32); + assert(!IsPCRel); + return ELF::R_386_TLS_LDO_32; + case MCSymbolRefExpr::VK_TLSGD: + assert(Type == RT32_32); + assert(!IsPCRel); + return ELF::R_386_TLS_GD; + case MCSymbolRefExpr::VK_GOTTPOFF: + assert(Type == RT32_32); + assert(!IsPCRel); + return ELF::R_386_TLS_IE_32; + case MCSymbolRefExpr::VK_PLT: + assert(Type == RT32_32); + return ELF::R_386_PLT32; + case MCSymbolRefExpr::VK_INDNTPOFF: + assert(Type == RT32_32); + assert(!IsPCRel); + return ELF::R_386_TLS_IE; + case MCSymbolRefExpr::VK_NTPOFF: + assert(Type == RT32_32); + assert(!IsPCRel); + return ELF::R_386_TLS_LE; + case MCSymbolRefExpr::VK_GOTNTPOFF: + assert(Type == RT32_32); + assert(!IsPCRel); + return ELF::R_386_TLS_GOTIE; + case MCSymbolRefExpr::VK_TLSLDM: + assert(Type == RT32_32); + assert(!IsPCRel); + return ELF::R_386_TLS_LDM; + } +} + +unsigned X86ELFObjectWriter::GetRelocType(const MCValue &Target, + const MCFixup &Fixup, + bool IsPCRel) const { + MCSymbolRefExpr::VariantKind Modifier = Target.getAccessVariant(); + X86_64RelType Type = getType64(Fixup.getKind(), Modifier, IsPCRel); + if (getEMachine() == ELF::EM_X86_64) + return getRelocType64(Modifier, Type, IsPCRel); + + assert((getEMachine() == ELF::EM_386 || getEMachine() == ELF::EM_IAMCU) && + "Unsupported ELF machine type."); + return getRelocType32(Modifier, getType32(Type), IsPCRel); +} + +MCObjectWriter *llvm::createX86ELFObjectWriter(raw_pwrite_stream &OS, + bool IsELF64, uint8_t OSABI, + uint16_t EMachine) { + MCELFObjectTargetWriter *MOTW = + new X86ELFObjectWriter(IsELF64, OSABI, EMachine); + return createELFObjectWriter(MOTW, OS, /*IsLittleEndian=*/true); +} diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp new file mode 100644 index 0000000..ddb764f --- /dev/null +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp @@ -0,0 +1,141 @@ +//===-- X86ELFRelocationInfo.cpp ----------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/X86MCTargetDesc.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCRelocationInfo.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/Object/ELFObjectFile.h" +#include "llvm/Support/ELF.h" + +using namespace llvm; +using namespace object; +using namespace ELF; + +namespace { +class X86_64ELFRelocationInfo : public MCRelocationInfo { +public: + X86_64ELFRelocationInfo(MCContext &Ctx) : MCRelocationInfo(Ctx) {} + + const MCExpr *createExprForRelocation(RelocationRef Rel) override { + uint64_t RelType = Rel.getType(); + elf_symbol_iterator SymI = Rel.getSymbol(); + + ErrorOr<StringRef> SymNameOrErr = SymI->getName(); + if (std::error_code EC = SymNameOrErr.getError()) + report_fatal_error(EC.message()); + StringRef SymName = *SymNameOrErr; + + ErrorOr<uint64_t> SymAddr = SymI->getAddress(); + if (std::error_code EC = SymAddr.getError()) + report_fatal_error(EC.message()); + uint64_t SymSize = SymI->getSize(); + int64_t Addend = *ELFRelocationRef(Rel).getAddend(); + + MCSymbol *Sym = Ctx.getOrCreateSymbol(SymName); + // FIXME: check that the value is actually the same. + if (!Sym->isVariable()) + Sym->setVariableValue(MCConstantExpr::create(*SymAddr, Ctx)); + + const MCExpr *Expr = nullptr; + // If hasAddend is true, then we need to add Addend (r_addend) to Expr. + bool hasAddend = false; + + // The AMD64 SysV ABI says: + // A: the addend used to compute the value of the relocatable field. + // B: the base address at which a shared object has been loaded into memory + // during execution. Generally, a shared object is built with a 0 base + // virtual address, but the execution address will be different. + // G: the offset into the global offset table at which the relocation + // entry's symbol will reside during execution. + // GOT: the address of the global offset table. + // L: the place (section offset or address) of the Procedure Linkage Table + // entry for a symbol. + // P: the place (section offset or address) of the storage unit being + // relocated (computed using r_offset). + // S: the value of the symbol whose index resides in the relocation entry. + // Z: the size of the symbol whose index resides in the relocation entry. + + switch(RelType) { + case R_X86_64_NONE: + case R_X86_64_COPY: + // none + break; + case R_X86_64_64: + case R_X86_64_16: + case R_X86_64_8: + // S + A + case R_X86_64_32: + case R_X86_64_32S: + // S + A (We don't care about the result not fitting in 32 bits.) 
+ case R_X86_64_PC32: + case R_X86_64_PC16: + case R_X86_64_PC8: + case R_X86_64_PC64: + // S + A - P (P/pcrel is implicit) + hasAddend = true; + Expr = MCSymbolRefExpr::create(Sym, Ctx); + break; + case R_X86_64_GOT32: + case R_X86_64_GOT64: + case R_X86_64_GOTPC32: + case R_X86_64_GOTPC64: + case R_X86_64_GOTPLT64: + // G + A + hasAddend = true; + Expr = MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_GOT, Ctx); + break; + case R_X86_64_PLT32: + // L + A - P -> S@PLT + A + hasAddend = true; + Expr = MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_PLT, Ctx); + break; + case R_X86_64_GLOB_DAT: + case R_X86_64_JUMP_SLOT: + // S + Expr = MCSymbolRefExpr::create(Sym, Ctx); + break; + case R_X86_64_GOTPCREL: + case R_X86_64_GOTPCREL64: + // G + GOT + A - P -> S@GOTPCREL + A + hasAddend = true; + Expr = MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_GOTPCREL, Ctx); + break; + case R_X86_64_GOTOFF64: + // S + A - GOT + Expr = MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_GOTOFF, Ctx); + break; + case R_X86_64_PLTOFF64: + // L + A - GOT + break; + case R_X86_64_SIZE32: + case R_X86_64_SIZE64: + // Z + A + Expr = MCConstantExpr::create(SymSize, Ctx); + break; + default: + Expr = MCSymbolRefExpr::create(Sym, Ctx); + break; + } + if (Expr && hasAddend && Addend != 0) + Expr = MCBinaryExpr::createAdd(Expr, + MCConstantExpr::create(Addend, Ctx), + Ctx); + return Expr; + } +}; +} // End unnamed namespace + +/// createX86ELFRelocationInfo - Construct an X86 Mach-O RelocationInfo. +MCRelocationInfo *llvm::createX86_64ELFRelocationInfo(MCContext &Ctx) { + // We only handle x86-64 for now. + return new X86_64ELFRelocationInfo(Ctx); +} diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86FixupKinds.h b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86FixupKinds.h new file mode 100644 index 0000000..4899900 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86FixupKinds.h @@ -0,0 +1,34 @@ +//===-- X86FixupKinds.h - X86 Specific Fixup Entries ------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_X86_MCTARGETDESC_X86FIXUPKINDS_H +#define LLVM_LIB_TARGET_X86_MCTARGETDESC_X86FIXUPKINDS_H + +#include "llvm/MC/MCFixup.h" + +namespace llvm { +namespace X86 { +enum Fixups { + reloc_riprel_4byte = FirstTargetFixupKind, // 32-bit rip-relative + reloc_riprel_4byte_movq_load, // 32-bit rip-relative in movq + reloc_signed_4byte, // 32-bit signed. Unlike FK_Data_4 + // this will be sign extended at + // runtime. + reloc_global_offset_table, // 32-bit, relative to the start + // of the instruction. Used only + // for _GLOBAL_OFFSET_TABLE_. + reloc_global_offset_table8, // 64-bit variant. + // Marker + LastTargetFixupKind, + NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind +}; +} +} + +#endif diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp new file mode 100644 index 0000000..fc0b0f8 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp @@ -0,0 +1,172 @@ +//===-- X86MCAsmInfo.cpp - X86 asm properties -----------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This file contains the declarations of the X86MCAsmInfo properties. +// +//===----------------------------------------------------------------------===// + +#include "X86MCAsmInfo.h" +#include "llvm/ADT/Triple.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCSectionELF.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/ELF.h" +using namespace llvm; + +enum AsmWriterFlavorTy { + // Note: This numbering has to match the GCC assembler dialects for inline + // asm alternatives to work right. + ATT = 0, Intel = 1 +}; + +static cl::opt<AsmWriterFlavorTy> +AsmWriterFlavor("x86-asm-syntax", cl::init(ATT), + cl::desc("Choose style of code to emit from X86 backend:"), + cl::values(clEnumValN(ATT, "att", "Emit AT&T-style assembly"), + clEnumValN(Intel, "intel", "Emit Intel-style assembly"), + clEnumValEnd)); + +static cl::opt<bool> +MarkedJTDataRegions("mark-data-regions", cl::init(false), + cl::desc("Mark code section jump table data regions."), + cl::Hidden); + +void X86MCAsmInfoDarwin::anchor() { } + +X86MCAsmInfoDarwin::X86MCAsmInfoDarwin(const Triple &T) { + bool is64Bit = T.getArch() == Triple::x86_64; + if (is64Bit) + PointerSize = CalleeSaveStackSlotSize = 8; + + AssemblerDialect = AsmWriterFlavor; + + TextAlignFillValue = 0x90; + + if (!is64Bit) + Data64bitsDirective = nullptr; // we can't emit a 64-bit unit + + // Use ## as a comment string so that .s files generated by llvm can go + // through the GCC preprocessor without causing an error. This is needed + // because "clang foo.s" runs the C preprocessor, which is usually reserved + // for .S files on other systems. Perhaps this is because the file system + // wasn't always case preserving or something. + CommentString = "##"; + + SupportsDebugInformation = true; + UseDataRegionDirectives = MarkedJTDataRegions; + + // Exceptions handling + ExceptionsType = ExceptionHandling::DwarfCFI; + + // old assembler lacks some directives + // FIXME: this should really be a check on the assembler characteristics + // rather than OS version + if (T.isMacOSX() && T.isMacOSXVersionLT(10, 6)) + HasWeakDefCanBeHiddenDirective = false; + + // Assume ld64 is new enough that the abs-ified FDE relocs may be used + // (actually, must, since otherwise the non-extern relocations we produce + // overwhelm ld64's tiny little mind and it fails). + DwarfFDESymbolsUseAbsDiff = true; + + UseIntegratedAssembler = true; +} + +X86_64MCAsmInfoDarwin::X86_64MCAsmInfoDarwin(const Triple &Triple) + : X86MCAsmInfoDarwin(Triple) { +} + +void X86ELFMCAsmInfo::anchor() { } + +X86ELFMCAsmInfo::X86ELFMCAsmInfo(const Triple &T) { + bool is64Bit = T.getArch() == Triple::x86_64; + bool isX32 = T.getEnvironment() == Triple::GNUX32; + + // For ELF, x86-64 pointer size depends on the ABI. + // For x86-64 without the x32 ABI, pointer size is 8. For x86 and for x86-64 + // with the x32 ABI, pointer size remains the default 4. + PointerSize = (is64Bit && !isX32) ? 8 : 4; + + // OTOH, stack slot size is always 8 for x86-64, even with the x32 ABI. + CalleeSaveStackSlotSize = is64Bit ? 8 : 4; + + AssemblerDialect = AsmWriterFlavor; + + TextAlignFillValue = 0x90; + + // Debug Information + SupportsDebugInformation = true; + + // Exceptions handling + ExceptionsType = ExceptionHandling::DwarfCFI; + + // Always enable the integrated assembler by default. 
+ // Clang also enabled it when the OS is Solaris but that is redundant here. + UseIntegratedAssembler = true; +} + +const MCExpr * +X86_64MCAsmInfoDarwin::getExprForPersonalitySymbol(const MCSymbol *Sym, + unsigned Encoding, + MCStreamer &Streamer) const { + MCContext &Context = Streamer.getContext(); + const MCExpr *Res = + MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_GOTPCREL, Context); + const MCExpr *Four = MCConstantExpr::create(4, Context); + return MCBinaryExpr::createAdd(Res, Four, Context); +} + +void X86MCAsmInfoMicrosoft::anchor() { } + +X86MCAsmInfoMicrosoft::X86MCAsmInfoMicrosoft(const Triple &Triple) { + if (Triple.getArch() == Triple::x86_64) { + PrivateGlobalPrefix = ".L"; + PrivateLabelPrefix = ".L"; + PointerSize = 8; + WinEHEncodingType = WinEH::EncodingType::Itanium; + } else { + // 32-bit X86 doesn't use CFI, so this isn't a real encoding type. It's just + // a place holder that the Windows EHStreamer looks for to suppress CFI + // output. In particular, usesWindowsCFI() returns false. + WinEHEncodingType = WinEH::EncodingType::X86; + } + + ExceptionsType = ExceptionHandling::WinEH; + + AssemblerDialect = AsmWriterFlavor; + + TextAlignFillValue = 0x90; + + AllowAtInName = true; + + UseIntegratedAssembler = true; +} + +void X86MCAsmInfoGNUCOFF::anchor() { } + +X86MCAsmInfoGNUCOFF::X86MCAsmInfoGNUCOFF(const Triple &Triple) { + assert(Triple.isOSWindows() && "Windows is the only supported COFF target"); + if (Triple.getArch() == Triple::x86_64) { + PrivateGlobalPrefix = ".L"; + PrivateLabelPrefix = ".L"; + PointerSize = 8; + WinEHEncodingType = WinEH::EncodingType::Itanium; + ExceptionsType = ExceptionHandling::WinEH; + } else { + ExceptionsType = ExceptionHandling::DwarfCFI; + } + + AssemblerDialect = AsmWriterFlavor; + + TextAlignFillValue = 0x90; + + UseIntegratedAssembler = true; +} diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h new file mode 100644 index 0000000..30d5c80 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h @@ -0,0 +1,61 @@ +//===-- X86MCAsmInfo.h - X86 asm properties --------------------*- C++ -*--===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the declaration of the X86MCAsmInfo class. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_X86_MCTARGETDESC_X86MCASMINFO_H +#define LLVM_LIB_TARGET_X86_MCTARGETDESC_X86MCASMINFO_H + +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCAsmInfoCOFF.h" +#include "llvm/MC/MCAsmInfoDarwin.h" +#include "llvm/MC/MCAsmInfoELF.h" + +namespace llvm { +class Triple; + +class X86MCAsmInfoDarwin : public MCAsmInfoDarwin { + virtual void anchor(); + +public: + explicit X86MCAsmInfoDarwin(const Triple &Triple); +}; + +struct X86_64MCAsmInfoDarwin : public X86MCAsmInfoDarwin { + explicit X86_64MCAsmInfoDarwin(const Triple &Triple); + const MCExpr * + getExprForPersonalitySymbol(const MCSymbol *Sym, unsigned Encoding, + MCStreamer &Streamer) const override; +}; + +class X86ELFMCAsmInfo : public MCAsmInfoELF { + void anchor() override; + +public: + explicit X86ELFMCAsmInfo(const Triple &Triple); +}; + +class X86MCAsmInfoMicrosoft : public MCAsmInfoMicrosoft { + void anchor() override; + +public: + explicit X86MCAsmInfoMicrosoft(const Triple &Triple); +}; + +class X86MCAsmInfoGNUCOFF : public MCAsmInfoGNUCOFF { + void anchor() override; + +public: + explicit X86MCAsmInfoGNUCOFF(const Triple &Triple); +}; +} // namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp new file mode 100644 index 0000000..dfab6ec --- /dev/null +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp @@ -0,0 +1,1506 @@ +//===-- X86MCCodeEmitter.cpp - Convert X86 code to machine code -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the X86MCCodeEmitter class. +// +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/X86MCTargetDesc.h" +#include "MCTargetDesc/X86BaseInfo.h" +#include "MCTargetDesc/X86FixupKinds.h" +#include "llvm/MC/MCCodeEmitter.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +#define DEBUG_TYPE "mccodeemitter" + +namespace { +class X86MCCodeEmitter : public MCCodeEmitter { + X86MCCodeEmitter(const X86MCCodeEmitter &) = delete; + void operator=(const X86MCCodeEmitter &) = delete; + const MCInstrInfo &MCII; + MCContext &Ctx; +public: + X86MCCodeEmitter(const MCInstrInfo &mcii, MCContext &ctx) + : MCII(mcii), Ctx(ctx) { + } + + ~X86MCCodeEmitter() override {} + + bool is64BitMode(const MCSubtargetInfo &STI) const { + return STI.getFeatureBits()[X86::Mode64Bit]; + } + + bool is32BitMode(const MCSubtargetInfo &STI) const { + return STI.getFeatureBits()[X86::Mode32Bit]; + } + + bool is16BitMode(const MCSubtargetInfo &STI) const { + return STI.getFeatureBits()[X86::Mode16Bit]; + } + + /// Is16BitMemOperand - Return true if the specified instruction has + /// a 16-bit memory operand. Op specifies the operand # of the memoperand. 
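+  /// For example, an operand such as (%bx,%si), or a bare disp16 with no
+  /// base register when assembling in 16-bit mode.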
+ bool Is16BitMemOperand(const MCInst &MI, unsigned Op, + const MCSubtargetInfo &STI) const { + const MCOperand &BaseReg = MI.getOperand(Op+X86::AddrBaseReg); + const MCOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg); + const MCOperand &Disp = MI.getOperand(Op+X86::AddrDisp); + + if (is16BitMode(STI) && BaseReg.getReg() == 0 && + Disp.isImm() && Disp.getImm() < 0x10000) + return true; + if ((BaseReg.getReg() != 0 && + X86MCRegisterClasses[X86::GR16RegClassID].contains(BaseReg.getReg())) || + (IndexReg.getReg() != 0 && + X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg.getReg()))) + return true; + return false; + } + + unsigned GetX86RegNum(const MCOperand &MO) const { + return Ctx.getRegisterInfo()->getEncodingValue(MO.getReg()) & 0x7; + } + + // On regular x86, both XMM0-XMM7 and XMM8-XMM15 are encoded in the range + // 0-7 and the difference between the 2 groups is given by the REX prefix. + // In the VEX prefix, registers are seen sequencially from 0-15 and encoded + // in 1's complement form, example: + // + // ModRM field => XMM9 => 1 + // VEX.VVVV => XMM9 => ~9 + // + // See table 4-35 of Intel AVX Programming Reference for details. + unsigned char getVEXRegisterEncoding(const MCInst &MI, + unsigned OpNum) const { + unsigned SrcReg = MI.getOperand(OpNum).getReg(); + unsigned SrcRegNum = GetX86RegNum(MI.getOperand(OpNum)); + if (X86II::isX86_64ExtendedReg(SrcReg)) + SrcRegNum |= 8; + + // The registers represented through VEX_VVVV should + // be encoded in 1's complement form. + return (~SrcRegNum) & 0xf; + } + + unsigned char getWriteMaskRegisterEncoding(const MCInst &MI, + unsigned OpNum) const { + assert(X86::K0 != MI.getOperand(OpNum).getReg() && + "Invalid mask register as write-mask!"); + unsigned MaskRegNum = GetX86RegNum(MI.getOperand(OpNum)); + return MaskRegNum; + } + + void EmitByte(unsigned char C, unsigned &CurByte, raw_ostream &OS) const { + OS << (char)C; + ++CurByte; + } + + void EmitConstant(uint64_t Val, unsigned Size, unsigned &CurByte, + raw_ostream &OS) const { + // Output the constant in little endian byte order. + for (unsigned i = 0; i != Size; ++i) { + EmitByte(Val & 255, CurByte, OS); + Val >>= 8; + } + } + + void EmitImmediate(const MCOperand &Disp, SMLoc Loc, + unsigned ImmSize, MCFixupKind FixupKind, + unsigned &CurByte, raw_ostream &OS, + SmallVectorImpl<MCFixup> &Fixups, + int ImmOffset = 0) const; + + inline static unsigned char ModRMByte(unsigned Mod, unsigned RegOpcode, + unsigned RM) { + assert(Mod < 4 && RegOpcode < 8 && RM < 8 && "ModRM Fields out of range!"); + return RM | (RegOpcode << 3) | (Mod << 6); + } + + void EmitRegModRMByte(const MCOperand &ModRMReg, unsigned RegOpcodeFld, + unsigned &CurByte, raw_ostream &OS) const { + EmitByte(ModRMByte(3, RegOpcodeFld, GetX86RegNum(ModRMReg)), CurByte, OS); + } + + void EmitSIBByte(unsigned SS, unsigned Index, unsigned Base, + unsigned &CurByte, raw_ostream &OS) const { + // SIB byte is in the same format as the ModRMByte. 
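+    // For example, the operand [rax + rcx*4] yields SS=2 (scale 4),
+    // Index=1 (rcx) and Base=0 (rax), so the SIB byte is
+    // (2 << 6) | (1 << 3) | 0 == 0x88.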
+ EmitByte(ModRMByte(SS, Index, Base), CurByte, OS); + } + + + void EmitMemModRMByte(const MCInst &MI, unsigned Op, + unsigned RegOpcodeField, + uint64_t TSFlags, unsigned &CurByte, raw_ostream &OS, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + + void encodeInstruction(const MCInst &MI, raw_ostream &OS, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const override; + + void EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, int MemOperand, + const MCInst &MI, const MCInstrDesc &Desc, + raw_ostream &OS) const; + + void EmitSegmentOverridePrefix(unsigned &CurByte, unsigned SegOperand, + const MCInst &MI, raw_ostream &OS) const; + + void EmitOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, int MemOperand, + const MCInst &MI, const MCInstrDesc &Desc, + const MCSubtargetInfo &STI, + raw_ostream &OS) const; +}; + +} // end anonymous namespace + +MCCodeEmitter *llvm::createX86MCCodeEmitter(const MCInstrInfo &MCII, + const MCRegisterInfo &MRI, + MCContext &Ctx) { + return new X86MCCodeEmitter(MCII, Ctx); +} + +/// isDisp8 - Return true if this signed displacement fits in a 8-bit +/// sign-extended field. +static bool isDisp8(int Value) { + return Value == (signed char)Value; +} + +/// isCDisp8 - Return true if this signed displacement fits in a 8-bit +/// compressed dispacement field. +static bool isCDisp8(uint64_t TSFlags, int Value, int& CValue) { + assert(((TSFlags & X86II::EncodingMask) == X86II::EVEX) && + "Compressed 8-bit displacement is only valid for EVEX inst."); + + unsigned CD8_Scale = + (TSFlags & X86II::CD8_Scale_Mask) >> X86II::CD8_Scale_Shift; + if (CD8_Scale == 0) { + CValue = Value; + return isDisp8(Value); + } + + unsigned Mask = CD8_Scale - 1; + assert((CD8_Scale & Mask) == 0 && "Invalid memory object size."); + if (Value & Mask) // Unaligned offset + return false; + Value /= (int)CD8_Scale; + bool Ret = (Value == (signed char)Value); + + if (Ret) + CValue = Value; + return Ret; +} + +/// getImmFixupKind - Return the appropriate fixup kind to use for an immediate +/// in an instruction with the specified TSFlags. +static MCFixupKind getImmFixupKind(uint64_t TSFlags) { + unsigned Size = X86II::getSizeOfImm(TSFlags); + bool isPCRel = X86II::isImmPCRel(TSFlags); + + if (X86II::isImmSigned(TSFlags)) { + switch (Size) { + default: llvm_unreachable("Unsupported signed fixup size!"); + case 4: return MCFixupKind(X86::reloc_signed_4byte); + } + } + return MCFixup::getKindForSize(Size, isPCRel); +} + +/// Is32BitMemOperand - Return true if the specified instruction has +/// a 32-bit memory operand. Op specifies the operand # of the memoperand. +static bool Is32BitMemOperand(const MCInst &MI, unsigned Op) { + const MCOperand &BaseReg = MI.getOperand(Op+X86::AddrBaseReg); + const MCOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg); + + if ((BaseReg.getReg() != 0 && + X86MCRegisterClasses[X86::GR32RegClassID].contains(BaseReg.getReg())) || + (IndexReg.getReg() != 0 && + X86MCRegisterClasses[X86::GR32RegClassID].contains(IndexReg.getReg()))) + return true; + return false; +} + +/// Is64BitMemOperand - Return true if the specified instruction has +/// a 64-bit memory operand. Op specifies the operand # of the memoperand. 
+#ifndef NDEBUG +static bool Is64BitMemOperand(const MCInst &MI, unsigned Op) { + const MCOperand &BaseReg = MI.getOperand(Op+X86::AddrBaseReg); + const MCOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg); + + if ((BaseReg.getReg() != 0 && + X86MCRegisterClasses[X86::GR64RegClassID].contains(BaseReg.getReg())) || + (IndexReg.getReg() != 0 && + X86MCRegisterClasses[X86::GR64RegClassID].contains(IndexReg.getReg()))) + return true; + return false; +} +#endif + +/// StartsWithGlobalOffsetTable - Check if this expression starts with +/// _GLOBAL_OFFSET_TABLE_ and if it is of the form +/// _GLOBAL_OFFSET_TABLE_-symbol. This is needed to support PIC on ELF +/// i386 as _GLOBAL_OFFSET_TABLE_ is magical. We check only simple case that +/// are know to be used: _GLOBAL_OFFSET_TABLE_ by itself or at the start +/// of a binary expression. +enum GlobalOffsetTableExprKind { + GOT_None, + GOT_Normal, + GOT_SymDiff +}; +static GlobalOffsetTableExprKind +StartsWithGlobalOffsetTable(const MCExpr *Expr) { + const MCExpr *RHS = nullptr; + if (Expr->getKind() == MCExpr::Binary) { + const MCBinaryExpr *BE = static_cast<const MCBinaryExpr *>(Expr); + Expr = BE->getLHS(); + RHS = BE->getRHS(); + } + + if (Expr->getKind() != MCExpr::SymbolRef) + return GOT_None; + + const MCSymbolRefExpr *Ref = static_cast<const MCSymbolRefExpr*>(Expr); + const MCSymbol &S = Ref->getSymbol(); + if (S.getName() != "_GLOBAL_OFFSET_TABLE_") + return GOT_None; + if (RHS && RHS->getKind() == MCExpr::SymbolRef) + return GOT_SymDiff; + return GOT_Normal; +} + +static bool HasSecRelSymbolRef(const MCExpr *Expr) { + if (Expr->getKind() == MCExpr::SymbolRef) { + const MCSymbolRefExpr *Ref = static_cast<const MCSymbolRefExpr*>(Expr); + return Ref->getKind() == MCSymbolRefExpr::VK_SECREL; + } + return false; +} + +void X86MCCodeEmitter:: +EmitImmediate(const MCOperand &DispOp, SMLoc Loc, unsigned Size, + MCFixupKind FixupKind, unsigned &CurByte, raw_ostream &OS, + SmallVectorImpl<MCFixup> &Fixups, int ImmOffset) const { + const MCExpr *Expr = nullptr; + if (DispOp.isImm()) { + // If this is a simple integer displacement that doesn't require a + // relocation, emit it now. + if (FixupKind != FK_PCRel_1 && + FixupKind != FK_PCRel_2 && + FixupKind != FK_PCRel_4) { + EmitConstant(DispOp.getImm()+ImmOffset, Size, CurByte, OS); + return; + } + Expr = MCConstantExpr::create(DispOp.getImm(), Ctx); + } else { + Expr = DispOp.getExpr(); + } + + // If we have an immoffset, add it to the expression. + if ((FixupKind == FK_Data_4 || + FixupKind == FK_Data_8 || + FixupKind == MCFixupKind(X86::reloc_signed_4byte))) { + GlobalOffsetTableExprKind Kind = StartsWithGlobalOffsetTable(Expr); + if (Kind != GOT_None) { + assert(ImmOffset == 0); + + if (Size == 8) { + FixupKind = MCFixupKind(X86::reloc_global_offset_table8); + } else { + assert(Size == 4); + FixupKind = MCFixupKind(X86::reloc_global_offset_table); + } + + if (Kind == GOT_Normal) + ImmOffset = CurByte; + } else if (Expr->getKind() == MCExpr::SymbolRef) { + if (HasSecRelSymbolRef(Expr)) { + FixupKind = MCFixupKind(FK_SecRel_4); + } + } else if (Expr->getKind() == MCExpr::Binary) { + const MCBinaryExpr *Bin = static_cast<const MCBinaryExpr*>(Expr); + if (HasSecRelSymbolRef(Bin->getLHS()) + || HasSecRelSymbolRef(Bin->getRHS())) { + FixupKind = MCFixupKind(FK_SecRel_4); + } + } + } + + // If the fixup is pc-relative, we need to bias the value to be relative to + // the start of the field, not the end of the field. 
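+  // For a 4-byte pc-relative fixup, for instance, the CPU computes the
+  // displacement relative to the byte after the 4-byte field, while the
+  // fixup itself is evaluated at the start of the field, so we fold a -4
+  // bias into the offset below.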
+ if (FixupKind == FK_PCRel_4 || + FixupKind == MCFixupKind(X86::reloc_riprel_4byte) || + FixupKind == MCFixupKind(X86::reloc_riprel_4byte_movq_load)) + ImmOffset -= 4; + if (FixupKind == FK_PCRel_2) + ImmOffset -= 2; + if (FixupKind == FK_PCRel_1) + ImmOffset -= 1; + + if (ImmOffset) + Expr = MCBinaryExpr::createAdd(Expr, MCConstantExpr::create(ImmOffset, Ctx), + Ctx); + + // Emit a symbolic constant as a fixup and 4 zeros. + Fixups.push_back(MCFixup::create(CurByte, Expr, FixupKind, Loc)); + EmitConstant(0, Size, CurByte, OS); +} + +void X86MCCodeEmitter::EmitMemModRMByte(const MCInst &MI, unsigned Op, + unsigned RegOpcodeField, + uint64_t TSFlags, unsigned &CurByte, + raw_ostream &OS, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const{ + const MCOperand &Disp = MI.getOperand(Op+X86::AddrDisp); + const MCOperand &Base = MI.getOperand(Op+X86::AddrBaseReg); + const MCOperand &Scale = MI.getOperand(Op+X86::AddrScaleAmt); + const MCOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg); + unsigned BaseReg = Base.getReg(); + bool HasEVEX = (TSFlags & X86II::EncodingMask) == X86II::EVEX; + + // Handle %rip relative addressing. + if (BaseReg == X86::RIP) { // [disp32+RIP] in X86-64 mode + assert(is64BitMode(STI) && "Rip-relative addressing requires 64-bit mode"); + assert(IndexReg.getReg() == 0 && "Invalid rip-relative address"); + EmitByte(ModRMByte(0, RegOpcodeField, 5), CurByte, OS); + + unsigned FixupKind = X86::reloc_riprel_4byte; + + // movq loads are handled with a special relocation form which allows the + // linker to eliminate some loads for GOT references which end up in the + // same linkage unit. + if (MI.getOpcode() == X86::MOV64rm) + FixupKind = X86::reloc_riprel_4byte_movq_load; + + // rip-relative addressing is actually relative to the *next* instruction. + // Since an immediate can follow the mod/rm byte for an instruction, this + // means that we need to bias the immediate field of the instruction with + // the size of the immediate field. If we have this case, add it into the + // expression to emit. + int ImmSize = X86II::hasImm(TSFlags) ? X86II::getSizeOfImm(TSFlags) : 0; + + EmitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(FixupKind), + CurByte, OS, Fixups, -ImmSize); + return; + } + + unsigned BaseRegNo = BaseReg ? GetX86RegNum(Base) : -1U; + + // 16-bit addressing forms of the ModR/M byte have a different encoding for + // the R/M field and are far more limited in which registers can be used. + if (Is16BitMemOperand(MI, Op, STI)) { + if (BaseReg) { + // For 32-bit addressing, the row and column values in Table 2-2 are + // basically the same. It's AX/CX/DX/BX/SP/BP/SI/DI in that order, with + // some special cases. And GetX86RegNum reflects that numbering. + // For 16-bit addressing it's more fun, as shown in the SDM Vol 2A, + // Table 2-1 "16-Bit Addressing Forms with the ModR/M byte". We can only + // use SI/DI/BP/BX, which have "row" values 4-7 in no particular order, + // while values 0-3 indicate the allowed combinations (base+index) of + // those: 0 for BX+SI, 1 for BX+DI, 2 for BP+SI, 3 for BP+DI. + // + // R16Table[] is a lookup from the normal RegNo, to the row values from + // Table 2-1 for 16-bit addressing modes. Where zero means disallowed. 
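+      // For example, BX (RegNo 3) maps to row 7 and SI (RegNo 6) maps to
+      // row 4, while AX/CX/DX/SP map to 0 because they cannot appear in a
+      // 16-bit base/index combination.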
+ static const unsigned R16Table[] = { 0, 0, 0, 7, 0, 6, 4, 5 }; + unsigned RMfield = R16Table[BaseRegNo]; + + assert(RMfield && "invalid 16-bit base register"); + + if (IndexReg.getReg()) { + unsigned IndexReg16 = R16Table[GetX86RegNum(IndexReg)]; + + assert(IndexReg16 && "invalid 16-bit index register"); + // We must have one of SI/DI (4,5), and one of BP/BX (6,7). + assert(((IndexReg16 ^ RMfield) & 2) && + "invalid 16-bit base/index register combination"); + assert(Scale.getImm() == 1 && + "invalid scale for 16-bit memory reference"); + + // Allow base/index to appear in either order (although GAS doesn't). + if (IndexReg16 & 2) + RMfield = (RMfield & 1) | ((7 - IndexReg16) << 1); + else + RMfield = (IndexReg16 & 1) | ((7 - RMfield) << 1); + } + + if (Disp.isImm() && isDisp8(Disp.getImm())) { + if (Disp.getImm() == 0 && BaseRegNo != N86::EBP) { + // There is no displacement; just the register. + EmitByte(ModRMByte(0, RegOpcodeField, RMfield), CurByte, OS); + return; + } + // Use the [REG]+disp8 form, including for [BP] which cannot be encoded. + EmitByte(ModRMByte(1, RegOpcodeField, RMfield), CurByte, OS); + EmitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups); + return; + } + // This is the [REG]+disp16 case. + EmitByte(ModRMByte(2, RegOpcodeField, RMfield), CurByte, OS); + } else { + // There is no BaseReg; this is the plain [disp16] case. + EmitByte(ModRMByte(0, RegOpcodeField, 6), CurByte, OS); + } + + // Emit 16-bit displacement for plain disp16 or [REG]+disp16 cases. + EmitImmediate(Disp, MI.getLoc(), 2, FK_Data_2, CurByte, OS, Fixups); + return; + } + + // Determine whether a SIB byte is needed. + // If no BaseReg, issue a RIP relative instruction only if the MCE can + // resolve addresses on-the-fly, otherwise use SIB (Intel Manual 2A, table + // 2-7) and absolute references. + + if (// The SIB byte must be used if there is an index register. + IndexReg.getReg() == 0 && + // The SIB byte must be used if the base is ESP/RSP/R12, all of which + // encode to an R/M value of 4, which indicates that a SIB byte is + // present. + BaseRegNo != N86::ESP && + // If there is no base register and we're in 64-bit mode, we need a SIB + // byte to emit an addr that is just 'disp32' (the non-RIP relative form). + (!is64BitMode(STI) || BaseReg != 0)) { + + if (BaseReg == 0) { // [disp32] in X86-32 mode + EmitByte(ModRMByte(0, RegOpcodeField, 5), CurByte, OS); + EmitImmediate(Disp, MI.getLoc(), 4, FK_Data_4, CurByte, OS, Fixups); + return; + } + + // If the base is not EBP/ESP and there is no displacement, use simple + // indirect register encoding, this handles addresses like [EAX]. The + // encoding for [EBP] with no displacement means [disp32] so we handle it + // by emitting a displacement of 0 below. + if (Disp.isImm() && Disp.getImm() == 0 && BaseRegNo != N86::EBP) { + EmitByte(ModRMByte(0, RegOpcodeField, BaseRegNo), CurByte, OS); + return; + } + + // Otherwise, if the displacement fits in a byte, encode as [REG+disp8]. + if (Disp.isImm()) { + if (!HasEVEX && isDisp8(Disp.getImm())) { + EmitByte(ModRMByte(1, RegOpcodeField, BaseRegNo), CurByte, OS); + EmitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups); + return; + } + // Try EVEX compressed 8-bit displacement first; if failed, fall back to + // 32-bit displacement. 
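+      // For instance, with CD8_Scale == 64 a displacement of 640 is stored
+      // as the single byte 10, whereas an unaligned displacement such as 16
+      // cannot be compressed and is emitted as a full disp32.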
+ int CDisp8 = 0; + if (HasEVEX && isCDisp8(TSFlags, Disp.getImm(), CDisp8)) { + EmitByte(ModRMByte(1, RegOpcodeField, BaseRegNo), CurByte, OS); + EmitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups, + CDisp8 - Disp.getImm()); + return; + } + } + + // Otherwise, emit the most general non-SIB encoding: [REG+disp32] + EmitByte(ModRMByte(2, RegOpcodeField, BaseRegNo), CurByte, OS); + EmitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(X86::reloc_signed_4byte), + CurByte, OS, Fixups); + return; + } + + // We need a SIB byte, so start by outputting the ModR/M byte first + assert(IndexReg.getReg() != X86::ESP && + IndexReg.getReg() != X86::RSP && "Cannot use ESP as index reg!"); + + bool ForceDisp32 = false; + bool ForceDisp8 = false; + int CDisp8 = 0; + int ImmOffset = 0; + if (BaseReg == 0) { + // If there is no base register, we emit the special case SIB byte with + // MOD=0, BASE=5, to JUST get the index, scale, and displacement. + EmitByte(ModRMByte(0, RegOpcodeField, 4), CurByte, OS); + ForceDisp32 = true; + } else if (!Disp.isImm()) { + // Emit the normal disp32 encoding. + EmitByte(ModRMByte(2, RegOpcodeField, 4), CurByte, OS); + ForceDisp32 = true; + } else if (Disp.getImm() == 0 && + // Base reg can't be anything that ends up with '5' as the base + // reg, it is the magic [*] nomenclature that indicates no base. + BaseRegNo != N86::EBP) { + // Emit no displacement ModR/M byte + EmitByte(ModRMByte(0, RegOpcodeField, 4), CurByte, OS); + } else if (!HasEVEX && isDisp8(Disp.getImm())) { + // Emit the disp8 encoding. + EmitByte(ModRMByte(1, RegOpcodeField, 4), CurByte, OS); + ForceDisp8 = true; // Make sure to force 8 bit disp if Base=EBP + } else if (HasEVEX && isCDisp8(TSFlags, Disp.getImm(), CDisp8)) { + // Emit the disp8 encoding. + EmitByte(ModRMByte(1, RegOpcodeField, 4), CurByte, OS); + ForceDisp8 = true; // Make sure to force 8 bit disp if Base=EBP + ImmOffset = CDisp8 - Disp.getImm(); + } else { + // Emit the normal disp32 encoding. + EmitByte(ModRMByte(2, RegOpcodeField, 4), CurByte, OS); + } + + // Calculate what the SS field value should be... + static const unsigned SSTable[] = { ~0U, 0, 1, ~0U, 2, ~0U, ~0U, ~0U, 3 }; + unsigned SS = SSTable[Scale.getImm()]; + + if (BaseReg == 0) { + // Handle the SIB byte for the case where there is no base, see Intel + // Manual 2A, table 2-7. The displacement has already been output. + unsigned IndexRegNo; + if (IndexReg.getReg()) + IndexRegNo = GetX86RegNum(IndexReg); + else // Examples: [ESP+1*<noreg>+4] or [scaled idx]+disp32 (MOD=0,BASE=5) + IndexRegNo = 4; + EmitSIBByte(SS, IndexRegNo, 5, CurByte, OS); + } else { + unsigned IndexRegNo; + if (IndexReg.getReg()) + IndexRegNo = GetX86RegNum(IndexReg); + else + IndexRegNo = 4; // For example [ESP+1*<noreg>+4] + EmitSIBByte(SS, IndexRegNo, GetX86RegNum(Base), CurByte, OS); + } + + // Do we need to output a displacement? + if (ForceDisp8) + EmitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups, ImmOffset); + else if (ForceDisp32 || Disp.getImm() != 0) + EmitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(X86::reloc_signed_4byte), + CurByte, OS, Fixups); +} + +/// EmitVEXOpcodePrefix - AVX instructions are encoded using a opcode prefix +/// called VEX. 
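+/// For example, vaddps %ymm2, %ymm1, %ymm0 is encoded as C5 F4 58 C2, where
+/// C5 F4 is the two-byte VEX prefix: R=1, vvvv=~1 (%ymm1), L=1 (256-bit
+/// vector) and pp=00.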
+void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, + int MemOperand, const MCInst &MI, + const MCInstrDesc &Desc, + raw_ostream &OS) const { + assert(!(TSFlags & X86II::LOCK) && "Can't have LOCK VEX."); + + uint64_t Encoding = TSFlags & X86II::EncodingMask; + bool HasEVEX_K = TSFlags & X86II::EVEX_K; + bool HasVEX_4V = TSFlags & X86II::VEX_4V; + bool HasVEX_4VOp3 = TSFlags & X86II::VEX_4VOp3; + bool HasMemOp4 = TSFlags & X86II::MemOp4; + bool HasEVEX_RC = TSFlags & X86II::EVEX_RC; + + // VEX_R: opcode externsion equivalent to REX.R in + // 1's complement (inverted) form + // + // 1: Same as REX_R=0 (must be 1 in 32-bit mode) + // 0: Same as REX_R=1 (64 bit mode only) + // + unsigned char VEX_R = 0x1; + unsigned char EVEX_R2 = 0x1; + + // VEX_X: equivalent to REX.X, only used when a + // register is used for index in SIB Byte. + // + // 1: Same as REX.X=0 (must be 1 in 32-bit mode) + // 0: Same as REX.X=1 (64-bit mode only) + unsigned char VEX_X = 0x1; + + // VEX_B: + // + // 1: Same as REX_B=0 (ignored in 32-bit mode) + // 0: Same as REX_B=1 (64 bit mode only) + // + unsigned char VEX_B = 0x1; + + // VEX_W: opcode specific (use like REX.W, or used for + // opcode extension, or ignored, depending on the opcode byte) + unsigned char VEX_W = 0; + + // VEX_5M (VEX m-mmmmm field): + // + // 0b00000: Reserved for future use + // 0b00001: implied 0F leading opcode + // 0b00010: implied 0F 38 leading opcode bytes + // 0b00011: implied 0F 3A leading opcode bytes + // 0b00100-0b11111: Reserved for future use + // 0b01000: XOP map select - 08h instructions with imm byte + // 0b01001: XOP map select - 09h instructions with no imm byte + // 0b01010: XOP map select - 0Ah instructions with imm dword + unsigned char VEX_5M = 0; + + // VEX_4V (VEX vvvv field): a register specifier + // (in 1's complement form) or 1111 if unused. 
+ unsigned char VEX_4V = 0xf; + unsigned char EVEX_V2 = 0x1; + + // VEX_L (Vector Length): + // + // 0: scalar or 128-bit vector + // 1: 256-bit vector + // + unsigned char VEX_L = 0; + unsigned char EVEX_L2 = 0; + + // VEX_PP: opcode extension providing equivalent + // functionality of a SIMD prefix + // + // 0b00: None + // 0b01: 66 + // 0b10: F3 + // 0b11: F2 + // + unsigned char VEX_PP = 0; + + // EVEX_U + unsigned char EVEX_U = 1; // Always '1' so far + + // EVEX_z + unsigned char EVEX_z = 0; + + // EVEX_b + unsigned char EVEX_b = 0; + + // EVEX_rc + unsigned char EVEX_rc = 0; + + // EVEX_aaa + unsigned char EVEX_aaa = 0; + + bool EncodeRC = false; + + if (TSFlags & X86II::VEX_W) + VEX_W = 1; + + if (TSFlags & X86II::VEX_L) + VEX_L = 1; + if (TSFlags & X86II::EVEX_L2) + EVEX_L2 = 1; + + if (HasEVEX_K && (TSFlags & X86II::EVEX_Z)) + EVEX_z = 1; + + if ((TSFlags & X86II::EVEX_B)) + EVEX_b = 1; + + switch (TSFlags & X86II::OpPrefixMask) { + default: break; // VEX_PP already correct + case X86II::PD: VEX_PP = 0x1; break; // 66 + case X86II::XS: VEX_PP = 0x2; break; // F3 + case X86II::XD: VEX_PP = 0x3; break; // F2 + } + + switch (TSFlags & X86II::OpMapMask) { + default: llvm_unreachable("Invalid prefix!"); + case X86II::TB: VEX_5M = 0x1; break; // 0F + case X86II::T8: VEX_5M = 0x2; break; // 0F 38 + case X86II::TA: VEX_5M = 0x3; break; // 0F 3A + case X86II::XOP8: VEX_5M = 0x8; break; + case X86II::XOP9: VEX_5M = 0x9; break; + case X86II::XOPA: VEX_5M = 0xA; break; + } + + // Classify VEX_B, VEX_4V, VEX_R, VEX_X + unsigned NumOps = Desc.getNumOperands(); + unsigned CurOp = X86II::getOperandBias(Desc); + + switch (TSFlags & X86II::FormMask) { + default: llvm_unreachable("Unexpected form in EmitVEXOpcodePrefix!"); + case X86II::RawFrm: + break; + case X86II::MRMDestMem: { + // MRMDestMem instructions forms: + // MemAddr, src1(ModR/M) + // MemAddr, src1(VEX_4V), src2(ModR/M) + // MemAddr, src1(ModR/M), imm8 + // + if (X86II::isX86_64ExtendedReg(MI.getOperand(MemOperand + + X86::AddrBaseReg).getReg())) + VEX_B = 0x0; + if (X86II::isX86_64ExtendedReg(MI.getOperand(MemOperand + + X86::AddrIndexReg).getReg())) + VEX_X = 0x0; + if (X86II::is32ExtendedReg(MI.getOperand(MemOperand + + X86::AddrIndexReg).getReg())) + EVEX_V2 = 0x0; + + CurOp += X86::AddrNumOperands; + + if (HasEVEX_K) + EVEX_aaa = getWriteMaskRegisterEncoding(MI, CurOp++); + + if (HasVEX_4V) { + VEX_4V = getVEXRegisterEncoding(MI, CurOp); + if (X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg())) + EVEX_V2 = 0x0; + CurOp++; + } + + const MCOperand &MO = MI.getOperand(CurOp); + if (MO.isReg()) { + if (X86II::isX86_64ExtendedReg(MO.getReg())) + VEX_R = 0x0; + if (X86II::is32ExtendedReg(MO.getReg())) + EVEX_R2 = 0x0; + } + break; + } + case X86II::MRMSrcMem: + // MRMSrcMem instructions forms: + // src1(ModR/M), MemAddr + // src1(ModR/M), src2(VEX_4V), MemAddr + // src1(ModR/M), MemAddr, imm8 + // src1(ModR/M), MemAddr, src2(VEX_I8IMM) + // + // FMA4: + // dst(ModR/M.reg), src1(VEX_4V), src2(ModR/M), src3(VEX_I8IMM) + // dst(ModR/M.reg), src1(VEX_4V), src2(VEX_I8IMM), src3(ModR/M), + if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg())) + VEX_R = 0x0; + if (X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg())) + EVEX_R2 = 0x0; + CurOp++; + + if (HasEVEX_K) + EVEX_aaa = getWriteMaskRegisterEncoding(MI, CurOp++); + + if (HasVEX_4V) { + VEX_4V = getVEXRegisterEncoding(MI, CurOp); + if (X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg())) + EVEX_V2 = 0x0; + CurOp++; + } + + if (X86II::isX86_64ExtendedReg( + 
MI.getOperand(MemOperand+X86::AddrBaseReg).getReg())) + VEX_B = 0x0; + if (X86II::isX86_64ExtendedReg( + MI.getOperand(MemOperand+X86::AddrIndexReg).getReg())) + VEX_X = 0x0; + if (X86II::is32ExtendedReg(MI.getOperand(MemOperand + + X86::AddrIndexReg).getReg())) + EVEX_V2 = 0x0; + + if (HasVEX_4VOp3) + // Instruction format for 4VOp3: + // src1(ModR/M), MemAddr, src3(VEX_4V) + // CurOp points to start of the MemoryOperand, + // it skips TIED_TO operands if exist, then increments past src1. + // CurOp + X86::AddrNumOperands will point to src3. + VEX_4V = getVEXRegisterEncoding(MI, CurOp+X86::AddrNumOperands); + break; + case X86II::MRM0m: case X86II::MRM1m: + case X86II::MRM2m: case X86II::MRM3m: + case X86II::MRM4m: case X86II::MRM5m: + case X86II::MRM6m: case X86II::MRM7m: { + // MRM[0-9]m instructions forms: + // MemAddr + // src1(VEX_4V), MemAddr + if (HasVEX_4V) { + VEX_4V = getVEXRegisterEncoding(MI, CurOp); + if (X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg())) + EVEX_V2 = 0x0; + CurOp++; + } + + if (HasEVEX_K) + EVEX_aaa = getWriteMaskRegisterEncoding(MI, CurOp++); + + if (X86II::isX86_64ExtendedReg( + MI.getOperand(MemOperand+X86::AddrBaseReg).getReg())) + VEX_B = 0x0; + if (X86II::isX86_64ExtendedReg( + MI.getOperand(MemOperand+X86::AddrIndexReg).getReg())) + VEX_X = 0x0; + break; + } + case X86II::MRMSrcReg: + // MRMSrcReg instructions forms: + // dst(ModR/M), src1(VEX_4V), src2(ModR/M), src3(VEX_I8IMM) + // dst(ModR/M), src1(ModR/M) + // dst(ModR/M), src1(ModR/M), imm8 + // + // FMA4: + // dst(ModR/M.reg), src1(VEX_4V), src2(ModR/M), src3(VEX_I8IMM) + // dst(ModR/M.reg), src1(VEX_4V), src2(VEX_I8IMM), src3(ModR/M), + if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg())) + VEX_R = 0x0; + if (X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg())) + EVEX_R2 = 0x0; + CurOp++; + + if (HasEVEX_K) + EVEX_aaa = getWriteMaskRegisterEncoding(MI, CurOp++); + + if (HasVEX_4V) { + VEX_4V = getVEXRegisterEncoding(MI, CurOp); + if (X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg())) + EVEX_V2 = 0x0; + CurOp++; + } + + if (HasMemOp4) // Skip second register source (encoded in I8IMM) + CurOp++; + + if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg())) + VEX_B = 0x0; + if (X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg())) + VEX_X = 0x0; + CurOp++; + if (HasVEX_4VOp3) + VEX_4V = getVEXRegisterEncoding(MI, CurOp++); + if (EVEX_b) { + if (HasEVEX_RC) { + unsigned RcOperand = NumOps-1; + assert(RcOperand >= CurOp); + EVEX_rc = MI.getOperand(RcOperand).getImm() & 0x3; + } + EncodeRC = true; + } + break; + case X86II::MRMDestReg: + // MRMDestReg instructions forms: + // dst(ModR/M), src(ModR/M) + // dst(ModR/M), src(ModR/M), imm8 + // dst(ModR/M), src1(VEX_4V), src2(ModR/M) + if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg())) + VEX_B = 0x0; + if (X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg())) + VEX_X = 0x0; + CurOp++; + + if (HasEVEX_K) + EVEX_aaa = getWriteMaskRegisterEncoding(MI, CurOp++); + + if (HasVEX_4V) { + VEX_4V = getVEXRegisterEncoding(MI, CurOp); + if (X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg())) + EVEX_V2 = 0x0; + CurOp++; + } + + if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg())) + VEX_R = 0x0; + if (X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg())) + EVEX_R2 = 0x0; + if (EVEX_b) + EncodeRC = true; + break; + case X86II::MRM0r: case X86II::MRM1r: + case X86II::MRM2r: case X86II::MRM3r: + case X86II::MRM4r: case X86II::MRM5r: + case X86II::MRM6r: case X86II::MRM7r: + // MRM0r-MRM7r instructions forms: + // 
dst(VEX_4V), src(ModR/M), imm8 + if (HasVEX_4V) { + VEX_4V = getVEXRegisterEncoding(MI, CurOp); + if (X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg())) + EVEX_V2 = 0x0; + CurOp++; + } + if (HasEVEX_K) + EVEX_aaa = getWriteMaskRegisterEncoding(MI, CurOp++); + + if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg())) + VEX_B = 0x0; + if (X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg())) + VEX_X = 0x0; + break; + } + + if (Encoding == X86II::VEX || Encoding == X86II::XOP) { + // VEX opcode prefix can have 2 or 3 bytes + // + // 3 bytes: + // +-----+ +--------------+ +-------------------+ + // | C4h | | RXB | m-mmmm | | W | vvvv | L | pp | + // +-----+ +--------------+ +-------------------+ + // 2 bytes: + // +-----+ +-------------------+ + // | C5h | | R | vvvv | L | pp | + // +-----+ +-------------------+ + // + // XOP uses a similar prefix: + // +-----+ +--------------+ +-------------------+ + // | 8Fh | | RXB | m-mmmm | | W | vvvv | L | pp | + // +-----+ +--------------+ +-------------------+ + unsigned char LastByte = VEX_PP | (VEX_L << 2) | (VEX_4V << 3); + + // Can we use the 2 byte VEX prefix? + if (Encoding == X86II::VEX && VEX_B && VEX_X && !VEX_W && (VEX_5M == 1)) { + EmitByte(0xC5, CurByte, OS); + EmitByte(LastByte | (VEX_R << 7), CurByte, OS); + return; + } + + // 3 byte VEX prefix + EmitByte(Encoding == X86II::XOP ? 0x8F : 0xC4, CurByte, OS); + EmitByte(VEX_R << 7 | VEX_X << 6 | VEX_B << 5 | VEX_5M, CurByte, OS); + EmitByte(LastByte | (VEX_W << 7), CurByte, OS); + } else { + assert(Encoding == X86II::EVEX && "unknown encoding!"); + // EVEX opcode prefix can have 4 bytes + // + // +-----+ +--------------+ +-------------------+ +------------------------+ + // | 62h | | RXBR' | 00mm | | W | vvvv | U | pp | | z | L'L | b | v' | aaa | + // +-----+ +--------------+ +-------------------+ +------------------------+ + assert((VEX_5M & 0x3) == VEX_5M + && "More than 2 significant bits in VEX.m-mmmm fields for EVEX!"); + + VEX_5M &= 0x3; + + EmitByte(0x62, CurByte, OS); + EmitByte((VEX_R << 7) | + (VEX_X << 6) | + (VEX_B << 5) | + (EVEX_R2 << 4) | + VEX_5M, CurByte, OS); + EmitByte((VEX_W << 7) | + (VEX_4V << 3) | + (EVEX_U << 2) | + VEX_PP, CurByte, OS); + if (EncodeRC) + EmitByte((EVEX_z << 7) | + (EVEX_rc << 5) | + (EVEX_b << 4) | + (EVEX_V2 << 3) | + EVEX_aaa, CurByte, OS); + else + EmitByte((EVEX_z << 7) | + (EVEX_L2 << 6) | + (VEX_L << 5) | + (EVEX_b << 4) | + (EVEX_V2 << 3) | + EVEX_aaa, CurByte, OS); + } +} + +/// DetermineREXPrefix - Determine if the MCInst has to be encoded with a X86-64 +/// REX prefix which specifies 1) 64-bit instructions, 2) non-default operand +/// size, and 3) use of X86-64 extended registers. +static unsigned DetermineREXPrefix(const MCInst &MI, uint64_t TSFlags, + const MCInstrDesc &Desc) { + unsigned REX = 0; + bool UsesHighByteReg = false; + + if (TSFlags & X86II::REX_W) + REX |= 1 << 3; // set REX.W + + if (MI.getNumOperands() == 0) return REX; + + unsigned NumOps = MI.getNumOperands(); + // FIXME: MCInst should explicitize the two-addrness. + bool isTwoAddr = NumOps > 1 && + Desc.getOperandConstraint(1, MCOI::TIED_TO) != -1; + + // If it accesses SPL, BPL, SIL, or DIL, then it requires a 0x40 REX prefix. + unsigned i = isTwoAddr ? 
1 : 0; + for (; i != NumOps; ++i) { + const MCOperand &MO = MI.getOperand(i); + if (!MO.isReg()) continue; + unsigned Reg = MO.getReg(); + if (Reg == X86::AH || Reg == X86::BH || Reg == X86::CH || Reg == X86::DH) + UsesHighByteReg = true; + if (!X86II::isX86_64NonExtLowByteReg(Reg)) continue; + // FIXME: The caller of DetermineREXPrefix slaps this prefix onto anything + // that returns non-zero. + REX |= 0x40; // REX fixed encoding prefix + break; + } + + switch (TSFlags & X86II::FormMask) { + case X86II::MRMSrcReg: + if (MI.getOperand(0).isReg() && + X86II::isX86_64ExtendedReg(MI.getOperand(0).getReg())) + REX |= 1 << 2; // set REX.R + i = isTwoAddr ? 2 : 1; + for (; i != NumOps; ++i) { + const MCOperand &MO = MI.getOperand(i); + if (MO.isReg() && X86II::isX86_64ExtendedReg(MO.getReg())) + REX |= 1 << 0; // set REX.B + } + break; + case X86II::MRMSrcMem: { + if (MI.getOperand(0).isReg() && + X86II::isX86_64ExtendedReg(MI.getOperand(0).getReg())) + REX |= 1 << 2; // set REX.R + unsigned Bit = 0; + i = isTwoAddr ? 2 : 1; + for (; i != NumOps; ++i) { + const MCOperand &MO = MI.getOperand(i); + if (MO.isReg()) { + if (X86II::isX86_64ExtendedReg(MO.getReg())) + REX |= 1 << Bit; // set REX.B (Bit=0) and REX.X (Bit=1) + Bit++; + } + } + break; + } + case X86II::MRMXm: + case X86II::MRM0m: case X86II::MRM1m: + case X86II::MRM2m: case X86II::MRM3m: + case X86II::MRM4m: case X86II::MRM5m: + case X86II::MRM6m: case X86II::MRM7m: + case X86II::MRMDestMem: { + unsigned e = (isTwoAddr ? X86::AddrNumOperands+1 : X86::AddrNumOperands); + i = isTwoAddr ? 1 : 0; + if (NumOps > e && MI.getOperand(e).isReg() && + X86II::isX86_64ExtendedReg(MI.getOperand(e).getReg())) + REX |= 1 << 2; // set REX.R + unsigned Bit = 0; + for (; i != e; ++i) { + const MCOperand &MO = MI.getOperand(i); + if (MO.isReg()) { + if (X86II::isX86_64ExtendedReg(MO.getReg())) + REX |= 1 << Bit; // REX.B (Bit=0) and REX.X (Bit=1) + Bit++; + } + } + break; + } + default: + if (MI.getOperand(0).isReg() && + X86II::isX86_64ExtendedReg(MI.getOperand(0).getReg())) + REX |= 1 << 0; // set REX.B + i = isTwoAddr ? 2 : 1; + for (unsigned e = NumOps; i != e; ++i) { + const MCOperand &MO = MI.getOperand(i); + if (MO.isReg() && X86II::isX86_64ExtendedReg(MO.getReg())) + REX |= 1 << 2; // set REX.R + } + break; + } + if (REX && UsesHighByteReg) + report_fatal_error("Cannot encode high byte register in REX-prefixed instruction"); + + return REX; +} + +/// EmitSegmentOverridePrefix - Emit segment override opcode prefix as needed +void X86MCCodeEmitter::EmitSegmentOverridePrefix(unsigned &CurByte, + unsigned SegOperand, + const MCInst &MI, + raw_ostream &OS) const { + // Check for explicit segment override on memory operand. + switch (MI.getOperand(SegOperand).getReg()) { + default: llvm_unreachable("Unknown segment register!"); + case 0: break; + case X86::CS: EmitByte(0x2E, CurByte, OS); break; + case X86::SS: EmitByte(0x36, CurByte, OS); break; + case X86::DS: EmitByte(0x3E, CurByte, OS); break; + case X86::ES: EmitByte(0x26, CurByte, OS); break; + case X86::FS: EmitByte(0x64, CurByte, OS); break; + case X86::GS: EmitByte(0x65, CurByte, OS); break; + } +} + +/// EmitOpcodePrefix - Emit all instruction prefixes prior to the opcode. +/// +/// MemOperand is the operand # of the start of a memory operand if present. If +/// Not present, it is -1. 
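+/// This function emits, in order: the operand-size prefix (66) when needed,
+/// LOCK (F0), the mandatory 66/F3/F2 prefix, the REX prefix in 64-bit mode,
+/// and finally the 0F / 0F 38 / 0F 3A escape bytes.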
+void X86MCCodeEmitter::EmitOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, + int MemOperand, const MCInst &MI, + const MCInstrDesc &Desc, + const MCSubtargetInfo &STI, + raw_ostream &OS) const { + + // Emit the operand size opcode prefix as needed. + if ((TSFlags & X86II::OpSizeMask) == (is16BitMode(STI) ? X86II::OpSize32 + : X86II::OpSize16)) + EmitByte(0x66, CurByte, OS); + + // Emit the LOCK opcode prefix. + if (TSFlags & X86II::LOCK) + EmitByte(0xF0, CurByte, OS); + + switch (TSFlags & X86II::OpPrefixMask) { + case X86II::PD: // 66 + EmitByte(0x66, CurByte, OS); + break; + case X86II::XS: // F3 + EmitByte(0xF3, CurByte, OS); + break; + case X86II::XD: // F2 + EmitByte(0xF2, CurByte, OS); + break; + } + + // Handle REX prefix. + // FIXME: Can this come before F2 etc to simplify emission? + if (is64BitMode(STI)) { + if (unsigned REX = DetermineREXPrefix(MI, TSFlags, Desc)) + EmitByte(0x40 | REX, CurByte, OS); + } + + // 0x0F escape code must be emitted just before the opcode. + switch (TSFlags & X86II::OpMapMask) { + case X86II::TB: // Two-byte opcode map + case X86II::T8: // 0F 38 + case X86II::TA: // 0F 3A + EmitByte(0x0F, CurByte, OS); + break; + } + + switch (TSFlags & X86II::OpMapMask) { + case X86II::T8: // 0F 38 + EmitByte(0x38, CurByte, OS); + break; + case X86II::TA: // 0F 3A + EmitByte(0x3A, CurByte, OS); + break; + } +} + +void X86MCCodeEmitter:: +encodeInstruction(const MCInst &MI, raw_ostream &OS, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + unsigned Opcode = MI.getOpcode(); + const MCInstrDesc &Desc = MCII.get(Opcode); + uint64_t TSFlags = Desc.TSFlags; + + // Pseudo instructions don't get encoded. + if ((TSFlags & X86II::FormMask) == X86II::Pseudo) + return; + + unsigned NumOps = Desc.getNumOperands(); + unsigned CurOp = X86II::getOperandBias(Desc); + + // Keep track of the current byte being emitted. + unsigned CurByte = 0; + + // Encoding type for this instruction. + uint64_t Encoding = TSFlags & X86II::EncodingMask; + + // It uses the VEX.VVVV field? + bool HasVEX_4V = TSFlags & X86II::VEX_4V; + bool HasVEX_4VOp3 = TSFlags & X86II::VEX_4VOp3; + bool HasMemOp4 = TSFlags & X86II::MemOp4; + const unsigned MemOp4_I8IMMOperand = 2; + + // It uses the EVEX.aaa field? + bool HasEVEX_K = TSFlags & X86II::EVEX_K; + bool HasEVEX_RC = TSFlags & X86II::EVEX_RC; + + // Determine where the memory operand starts, if present. + int MemoryOperand = X86II::getMemoryOperandNo(TSFlags, Opcode); + if (MemoryOperand != -1) MemoryOperand += CurOp; + + // Emit segment override opcode prefix as needed. + if (MemoryOperand >= 0) + EmitSegmentOverridePrefix(CurByte, MemoryOperand+X86::AddrSegmentReg, + MI, OS); + + // Emit the repeat opcode prefix as needed. + if (TSFlags & X86II::REP) + EmitByte(0xF3, CurByte, OS); + + // Emit the address size opcode prefix as needed. 
+ bool need_address_override; + uint64_t AdSize = TSFlags & X86II::AdSizeMask; + if ((is16BitMode(STI) && AdSize == X86II::AdSize32) || + (is32BitMode(STI) && AdSize == X86II::AdSize16) || + (is64BitMode(STI) && AdSize == X86II::AdSize32)) { + need_address_override = true; + } else if (MemoryOperand < 0) { + need_address_override = false; + } else if (is64BitMode(STI)) { + assert(!Is16BitMemOperand(MI, MemoryOperand, STI)); + need_address_override = Is32BitMemOperand(MI, MemoryOperand); + } else if (is32BitMode(STI)) { + assert(!Is64BitMemOperand(MI, MemoryOperand)); + need_address_override = Is16BitMemOperand(MI, MemoryOperand, STI); + } else { + assert(is16BitMode(STI)); + assert(!Is64BitMemOperand(MI, MemoryOperand)); + need_address_override = !Is16BitMemOperand(MI, MemoryOperand, STI); + } + + if (need_address_override) + EmitByte(0x67, CurByte, OS); + + if (Encoding == 0) + EmitOpcodePrefix(TSFlags, CurByte, MemoryOperand, MI, Desc, STI, OS); + else + EmitVEXOpcodePrefix(TSFlags, CurByte, MemoryOperand, MI, Desc, OS); + + unsigned char BaseOpcode = X86II::getBaseOpcodeFor(TSFlags); + + if (TSFlags & X86II::Has3DNow0F0FOpcode) + BaseOpcode = 0x0F; // Weird 3DNow! encoding. + + unsigned SrcRegNum = 0; + switch (TSFlags & X86II::FormMask) { + default: errs() << "FORM: " << (TSFlags & X86II::FormMask) << "\n"; + llvm_unreachable("Unknown FormMask value in X86MCCodeEmitter!"); + case X86II::Pseudo: + llvm_unreachable("Pseudo instruction shouldn't be emitted"); + case X86II::RawFrmDstSrc: { + unsigned siReg = MI.getOperand(1).getReg(); + assert(((siReg == X86::SI && MI.getOperand(0).getReg() == X86::DI) || + (siReg == X86::ESI && MI.getOperand(0).getReg() == X86::EDI) || + (siReg == X86::RSI && MI.getOperand(0).getReg() == X86::RDI)) && + "SI and DI register sizes do not match"); + // Emit segment override opcode prefix as needed (not for %ds). + if (MI.getOperand(2).getReg() != X86::DS) + EmitSegmentOverridePrefix(CurByte, 2, MI, OS); + // Emit AdSize prefix as needed. + if ((!is32BitMode(STI) && siReg == X86::ESI) || + (is32BitMode(STI) && siReg == X86::SI)) + EmitByte(0x67, CurByte, OS); + CurOp += 3; // Consume operands. + EmitByte(BaseOpcode, CurByte, OS); + break; + } + case X86II::RawFrmSrc: { + unsigned siReg = MI.getOperand(0).getReg(); + // Emit segment override opcode prefix as needed (not for %ds). + if (MI.getOperand(1).getReg() != X86::DS) + EmitSegmentOverridePrefix(CurByte, 1, MI, OS); + // Emit AdSize prefix as needed. + if ((!is32BitMode(STI) && siReg == X86::ESI) || + (is32BitMode(STI) && siReg == X86::SI)) + EmitByte(0x67, CurByte, OS); + CurOp += 2; // Consume operands. + EmitByte(BaseOpcode, CurByte, OS); + break; + } + case X86II::RawFrmDst: { + unsigned siReg = MI.getOperand(0).getReg(); + // Emit AdSize prefix as needed. + if ((!is32BitMode(STI) && siReg == X86::EDI) || + (is32BitMode(STI) && siReg == X86::DI)) + EmitByte(0x67, CurByte, OS); + ++CurOp; // Consume operand. + EmitByte(BaseOpcode, CurByte, OS); + break; + } + case X86II::RawFrm: + EmitByte(BaseOpcode, CurByte, OS); + break; + case X86II::RawFrmMemOffs: + // Emit segment override opcode prefix as needed. 
+ EmitSegmentOverridePrefix(CurByte, 1, MI, OS); + EmitByte(BaseOpcode, CurByte, OS); + EmitImmediate(MI.getOperand(CurOp++), MI.getLoc(), + X86II::getSizeOfImm(TSFlags), getImmFixupKind(TSFlags), + CurByte, OS, Fixups); + ++CurOp; // skip segment operand + break; + case X86II::RawFrmImm8: + EmitByte(BaseOpcode, CurByte, OS); + EmitImmediate(MI.getOperand(CurOp++), MI.getLoc(), + X86II::getSizeOfImm(TSFlags), getImmFixupKind(TSFlags), + CurByte, OS, Fixups); + EmitImmediate(MI.getOperand(CurOp++), MI.getLoc(), 1, FK_Data_1, CurByte, + OS, Fixups); + break; + case X86II::RawFrmImm16: + EmitByte(BaseOpcode, CurByte, OS); + EmitImmediate(MI.getOperand(CurOp++), MI.getLoc(), + X86II::getSizeOfImm(TSFlags), getImmFixupKind(TSFlags), + CurByte, OS, Fixups); + EmitImmediate(MI.getOperand(CurOp++), MI.getLoc(), 2, FK_Data_2, CurByte, + OS, Fixups); + break; + + case X86II::AddRegFrm: + EmitByte(BaseOpcode + GetX86RegNum(MI.getOperand(CurOp++)), CurByte, OS); + break; + + case X86II::MRMDestReg: + EmitByte(BaseOpcode, CurByte, OS); + SrcRegNum = CurOp + 1; + + if (HasEVEX_K) // Skip writemask + SrcRegNum++; + + if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV) + ++SrcRegNum; + + EmitRegModRMByte(MI.getOperand(CurOp), + GetX86RegNum(MI.getOperand(SrcRegNum)), CurByte, OS); + CurOp = SrcRegNum + 1; + break; + + case X86II::MRMDestMem: + EmitByte(BaseOpcode, CurByte, OS); + SrcRegNum = CurOp + X86::AddrNumOperands; + + if (HasEVEX_K) // Skip writemask + SrcRegNum++; + + if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV) + ++SrcRegNum; + + EmitMemModRMByte(MI, CurOp, + GetX86RegNum(MI.getOperand(SrcRegNum)), + TSFlags, CurByte, OS, Fixups, STI); + CurOp = SrcRegNum + 1; + break; + + case X86II::MRMSrcReg: + EmitByte(BaseOpcode, CurByte, OS); + SrcRegNum = CurOp + 1; + + if (HasEVEX_K) // Skip writemask + SrcRegNum++; + + if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV) + ++SrcRegNum; + + if (HasMemOp4) // Skip 2nd src (which is encoded in I8IMM) + ++SrcRegNum; + + EmitRegModRMByte(MI.getOperand(SrcRegNum), + GetX86RegNum(MI.getOperand(CurOp)), CurByte, OS); + + // 2 operands skipped with HasMemOp4, compensate accordingly + CurOp = HasMemOp4 ? SrcRegNum : SrcRegNum + 1; + if (HasVEX_4VOp3) + ++CurOp; + // do not count the rounding control operand + if (HasEVEX_RC) + NumOps--; + break; + + case X86II::MRMSrcMem: { + int AddrOperands = X86::AddrNumOperands; + unsigned FirstMemOp = CurOp+1; + + if (HasEVEX_K) { // Skip writemask + ++AddrOperands; + ++FirstMemOp; + } + + if (HasVEX_4V) { + ++AddrOperands; + ++FirstMemOp; // Skip the register source (which is encoded in VEX_VVVV). + } + if (HasMemOp4) // Skip second register source (encoded in I8IMM) + ++FirstMemOp; + + EmitByte(BaseOpcode, CurByte, OS); + + EmitMemModRMByte(MI, FirstMemOp, GetX86RegNum(MI.getOperand(CurOp)), + TSFlags, CurByte, OS, Fixups, STI); + CurOp += AddrOperands + 1; + if (HasVEX_4VOp3) + ++CurOp; + break; + } + + case X86II::MRMXr: + case X86II::MRM0r: case X86II::MRM1r: + case X86II::MRM2r: case X86II::MRM3r: + case X86II::MRM4r: case X86II::MRM5r: + case X86II::MRM6r: case X86II::MRM7r: { + if (HasVEX_4V) // Skip the register dst (which is encoded in VEX_VVVV). + ++CurOp; + if (HasEVEX_K) // Skip writemask + ++CurOp; + EmitByte(BaseOpcode, CurByte, OS); + uint64_t Form = TSFlags & X86II::FormMask; + EmitRegModRMByte(MI.getOperand(CurOp++), + (Form == X86II::MRMXr) ? 
0 : Form-X86II::MRM0r, + CurByte, OS); + break; + } + + case X86II::MRMXm: + case X86II::MRM0m: case X86II::MRM1m: + case X86II::MRM2m: case X86II::MRM3m: + case X86II::MRM4m: case X86II::MRM5m: + case X86II::MRM6m: case X86II::MRM7m: { + if (HasVEX_4V) // Skip the register dst (which is encoded in VEX_VVVV). + ++CurOp; + if (HasEVEX_K) // Skip writemask + ++CurOp; + EmitByte(BaseOpcode, CurByte, OS); + uint64_t Form = TSFlags & X86II::FormMask; + EmitMemModRMByte(MI, CurOp, (Form == X86II::MRMXm) ? 0 : Form-X86II::MRM0m, + TSFlags, CurByte, OS, Fixups, STI); + CurOp += X86::AddrNumOperands; + break; + } + case X86II::MRM_C0: case X86II::MRM_C1: case X86II::MRM_C2: + case X86II::MRM_C3: case X86II::MRM_C4: case X86II::MRM_C5: + case X86II::MRM_C6: case X86II::MRM_C7: case X86II::MRM_C8: + case X86II::MRM_C9: case X86II::MRM_CA: case X86II::MRM_CB: + case X86II::MRM_CC: case X86II::MRM_CD: case X86II::MRM_CE: + case X86II::MRM_CF: case X86II::MRM_D0: case X86II::MRM_D1: + case X86II::MRM_D2: case X86II::MRM_D3: case X86II::MRM_D4: + case X86II::MRM_D5: case X86II::MRM_D6: case X86II::MRM_D7: + case X86II::MRM_D8: case X86II::MRM_D9: case X86II::MRM_DA: + case X86II::MRM_DB: case X86II::MRM_DC: case X86II::MRM_DD: + case X86II::MRM_DE: case X86II::MRM_DF: case X86II::MRM_E0: + case X86II::MRM_E1: case X86II::MRM_E2: case X86II::MRM_E3: + case X86II::MRM_E4: case X86II::MRM_E5: case X86II::MRM_E6: + case X86II::MRM_E7: case X86II::MRM_E8: case X86II::MRM_E9: + case X86II::MRM_EA: case X86II::MRM_EB: case X86II::MRM_EC: + case X86II::MRM_ED: case X86II::MRM_EE: case X86II::MRM_EF: + case X86II::MRM_F0: case X86II::MRM_F1: case X86II::MRM_F2: + case X86II::MRM_F3: case X86II::MRM_F4: case X86II::MRM_F5: + case X86II::MRM_F6: case X86II::MRM_F7: case X86II::MRM_F8: + case X86II::MRM_F9: case X86II::MRM_FA: case X86II::MRM_FB: + case X86II::MRM_FC: case X86II::MRM_FD: case X86II::MRM_FE: + case X86II::MRM_FF: + EmitByte(BaseOpcode, CurByte, OS); + + uint64_t Form = TSFlags & X86II::FormMask; + EmitByte(0xC0 + Form - X86II::MRM_C0, CurByte, OS); + break; + } + + // If there is a remaining operand, it must be a trailing immediate. Emit it + // according to the right size for the instruction. Some instructions + // (SSE4a extrq and insertq) have two trailing immediates. + while (CurOp != NumOps && NumOps - CurOp <= 2) { + // The last source register of a 4 operand instruction in AVX is encoded + // in bits[7:4] of a immediate byte. + if (TSFlags & X86II::VEX_I8IMM) { + const MCOperand &MO = MI.getOperand(HasMemOp4 ? MemOp4_I8IMMOperand + : CurOp); + ++CurOp; + unsigned RegNum = GetX86RegNum(MO) << 4; + if (X86II::isX86_64ExtendedReg(MO.getReg())) + RegNum |= 1 << 7; + // If there is an additional 5th operand it must be an immediate, which + // is encoded in bits[3:0] + if (CurOp != NumOps) { + const MCOperand &MIMM = MI.getOperand(CurOp++); + if (MIMM.isImm()) { + unsigned Val = MIMM.getImm(); + assert(Val < 16 && "Immediate operand value out of range"); + RegNum |= Val; + } + } + EmitImmediate(MCOperand::createImm(RegNum), MI.getLoc(), 1, FK_Data_1, + CurByte, OS, Fixups); + } else { + EmitImmediate(MI.getOperand(CurOp++), MI.getLoc(), + X86II::getSizeOfImm(TSFlags), getImmFixupKind(TSFlags), + CurByte, OS, Fixups); + } + } + + if (TSFlags & X86II::Has3DNow0F0FOpcode) + EmitByte(X86II::getBaseOpcodeFor(TSFlags), CurByte, OS); + +#ifndef NDEBUG + // FIXME: Verify. 
+ if (/*!Desc.isVariadic() &&*/ CurOp != NumOps) { + errs() << "Cannot encode all operands of: "; + MI.dump(); + errs() << '\n'; + abort(); + } +#endif +} diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp new file mode 100644 index 0000000..53a6550 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp @@ -0,0 +1,451 @@ +//===-- X86MCTargetDesc.cpp - X86 Target Descriptions ---------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file provides X86 specific target descriptions. +// +//===----------------------------------------------------------------------===// + +#include "X86MCTargetDesc.h" +#include "InstPrinter/X86ATTInstPrinter.h" +#include "InstPrinter/X86IntelInstPrinter.h" +#include "X86MCAsmInfo.h" +#include "llvm/ADT/Triple.h" +#include "llvm/MC/MCCodeGenInfo.h" +#include "llvm/MC/MCInstrAnalysis.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/MachineLocation.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/Host.h" +#include "llvm/Support/TargetRegistry.h" + +#if _MSC_VER +#include <intrin.h> +#endif + +using namespace llvm; + +#define GET_REGINFO_MC_DESC +#include "X86GenRegisterInfo.inc" + +#define GET_INSTRINFO_MC_DESC +#include "X86GenInstrInfo.inc" + +#define GET_SUBTARGETINFO_MC_DESC +#include "X86GenSubtargetInfo.inc" + +std::string X86_MC::ParseX86Triple(const Triple &TT) { + std::string FS; + if (TT.getArch() == Triple::x86_64) + FS = "+64bit-mode,-32bit-mode,-16bit-mode"; + else if (TT.getEnvironment() != Triple::CODE16) + FS = "-64bit-mode,+32bit-mode,-16bit-mode"; + else + FS = "-64bit-mode,-32bit-mode,+16bit-mode"; + + return FS; +} + +unsigned X86_MC::getDwarfRegFlavour(const Triple &TT, bool isEH) { + if (TT.getArch() == Triple::x86_64) + return DWARFFlavour::X86_64; + + if (TT.isOSDarwin()) + return isEH ? DWARFFlavour::X86_32_DarwinEH : DWARFFlavour::X86_32_Generic; + if (TT.isOSCygMing()) + // Unsupported by now, just quick fallback + return DWARFFlavour::X86_32_Generic; + return DWARFFlavour::X86_32_Generic; +} + +void X86_MC::InitLLVM2SEHRegisterMapping(MCRegisterInfo *MRI) { + // FIXME: TableGen these. + for (unsigned Reg = X86::NoRegister+1; Reg < X86::NUM_TARGET_REGS; ++Reg) { + unsigned SEH = MRI->getEncodingValue(Reg); + MRI->mapLLVMRegToSEHReg(Reg, SEH); + } +} + +MCSubtargetInfo *X86_MC::createX86MCSubtargetInfo(const Triple &TT, + StringRef CPU, StringRef FS) { + std::string ArchFS = X86_MC::ParseX86Triple(TT); + if (!FS.empty()) { + if (!ArchFS.empty()) + ArchFS = (Twine(ArchFS) + "," + FS).str(); + else + ArchFS = FS; + } + + std::string CPUName = CPU; + if (CPUName.empty()) + CPUName = "generic"; + + return createX86MCSubtargetInfoImpl(TT, CPUName, ArchFS); +} + +static MCInstrInfo *createX86MCInstrInfo() { + MCInstrInfo *X = new MCInstrInfo(); + InitX86MCInstrInfo(X); + return X; +} + +static MCRegisterInfo *createX86MCRegisterInfo(const Triple &TT) { + unsigned RA = (TT.getArch() == Triple::x86_64) + ? X86::RIP // Should have dwarf #16. + : X86::EIP; // Should have dwarf #8. 
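+  // The "dwarf #" remarks above refer to the DWARF register numbering used
+  // for call-frame information: the x86-64 psABI assigns the return-address
+  // column number 16 and the i386 psABI assigns it number 8, which LLVM
+  // models with %rip and %eip respectively. The two getDwarfRegFlavour()
+  // calls below select the debug-info and EH register numbering tables.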
+ + MCRegisterInfo *X = new MCRegisterInfo(); + InitX86MCRegisterInfo(X, RA, X86_MC::getDwarfRegFlavour(TT, false), + X86_MC::getDwarfRegFlavour(TT, true), RA); + X86_MC::InitLLVM2SEHRegisterMapping(X); + return X; +} + +static MCAsmInfo *createX86MCAsmInfo(const MCRegisterInfo &MRI, + const Triple &TheTriple) { + bool is64Bit = TheTriple.getArch() == Triple::x86_64; + + MCAsmInfo *MAI; + if (TheTriple.isOSBinFormatMachO()) { + if (is64Bit) + MAI = new X86_64MCAsmInfoDarwin(TheTriple); + else + MAI = new X86MCAsmInfoDarwin(TheTriple); + } else if (TheTriple.isOSBinFormatELF()) { + // Force the use of an ELF container. + MAI = new X86ELFMCAsmInfo(TheTriple); + } else if (TheTriple.isWindowsMSVCEnvironment() || + TheTriple.isWindowsCoreCLREnvironment()) { + MAI = new X86MCAsmInfoMicrosoft(TheTriple); + } else if (TheTriple.isOSCygMing() || + TheTriple.isWindowsItaniumEnvironment()) { + MAI = new X86MCAsmInfoGNUCOFF(TheTriple); + } else { + // The default is ELF. + MAI = new X86ELFMCAsmInfo(TheTriple); + } + + // Initialize initial frame state. + // Calculate amount of bytes used for return address storing + int stackGrowth = is64Bit ? -8 : -4; + + // Initial state of the frame pointer is esp+stackGrowth. + unsigned StackPtr = is64Bit ? X86::RSP : X86::ESP; + MCCFIInstruction Inst = MCCFIInstruction::createDefCfa( + nullptr, MRI.getDwarfRegNum(StackPtr, true), -stackGrowth); + MAI->addInitialFrameState(Inst); + + // Add return address to move list + unsigned InstPtr = is64Bit ? X86::RIP : X86::EIP; + MCCFIInstruction Inst2 = MCCFIInstruction::createOffset( + nullptr, MRI.getDwarfRegNum(InstPtr, true), stackGrowth); + MAI->addInitialFrameState(Inst2); + + return MAI; +} + +static MCCodeGenInfo *createX86MCCodeGenInfo(const Triple &TT, Reloc::Model RM, + CodeModel::Model CM, + CodeGenOpt::Level OL) { + MCCodeGenInfo *X = new MCCodeGenInfo(); + + bool is64Bit = TT.getArch() == Triple::x86_64; + + if (RM == Reloc::Default) { + // Darwin defaults to PIC in 64 bit mode and dynamic-no-pic in 32 bit mode. + // Win64 requires rip-rel addressing, thus we force it to PIC. Otherwise we + // use static relocation model by default. + if (TT.isOSDarwin()) { + if (is64Bit) + RM = Reloc::PIC_; + else + RM = Reloc::DynamicNoPIC; + } else if (TT.isOSWindows() && is64Bit) + RM = Reloc::PIC_; + else + RM = Reloc::Static; + } + + // ELF and X86-64 don't have a distinct DynamicNoPIC model. DynamicNoPIC + // is defined as a model for code which may be used in static or dynamic + // executables but not necessarily a shared library. On X86-32 we just + // compile in -static mode, in x86-64 we use PIC. + if (RM == Reloc::DynamicNoPIC) { + if (is64Bit) + RM = Reloc::PIC_; + else if (!TT.isOSDarwin()) + RM = Reloc::Static; + } + + // If we are on Darwin, disallow static relocation model in X86-64 mode, since + // the Mach-O file format doesn't support it. + if (RM == Reloc::Static && TT.isOSDarwin() && is64Bit) + RM = Reloc::PIC_; + + // For static codegen, if we're not already set, use Small codegen. + if (CM == CodeModel::Default) + CM = CodeModel::Small; + else if (CM == CodeModel::JITDefault) + // 64-bit JIT places everything in the same buffer except external funcs. + CM = is64Bit ? 
CodeModel::Large : CodeModel::Small; + + X->initMCCodeGenInfo(RM, CM, OL); + return X; +} + +static MCInstPrinter *createX86MCInstPrinter(const Triple &T, + unsigned SyntaxVariant, + const MCAsmInfo &MAI, + const MCInstrInfo &MII, + const MCRegisterInfo &MRI) { + if (SyntaxVariant == 0) + return new X86ATTInstPrinter(MAI, MII, MRI); + if (SyntaxVariant == 1) + return new X86IntelInstPrinter(MAI, MII, MRI); + return nullptr; +} + +static MCRelocationInfo *createX86MCRelocationInfo(const Triple &TheTriple, + MCContext &Ctx) { + if (TheTriple.isOSBinFormatMachO() && TheTriple.getArch() == Triple::x86_64) + return createX86_64MachORelocationInfo(Ctx); + else if (TheTriple.isOSBinFormatELF()) + return createX86_64ELFRelocationInfo(Ctx); + // Default to the stock relocation info. + return llvm::createMCRelocationInfo(TheTriple, Ctx); +} + +static MCInstrAnalysis *createX86MCInstrAnalysis(const MCInstrInfo *Info) { + return new MCInstrAnalysis(Info); +} + +// Force static initialization. +extern "C" void LLVMInitializeX86TargetMC() { + for (Target *T : {&TheX86_32Target, &TheX86_64Target}) { + // Register the MC asm info. + RegisterMCAsmInfoFn X(*T, createX86MCAsmInfo); + + // Register the MC codegen info. + RegisterMCCodeGenInfoFn Y(*T, createX86MCCodeGenInfo); + + // Register the MC instruction info. + TargetRegistry::RegisterMCInstrInfo(*T, createX86MCInstrInfo); + + // Register the MC register info. + TargetRegistry::RegisterMCRegInfo(*T, createX86MCRegisterInfo); + + // Register the MC subtarget info. + TargetRegistry::RegisterMCSubtargetInfo(*T, + X86_MC::createX86MCSubtargetInfo); + + // Register the MC instruction analyzer. + TargetRegistry::RegisterMCInstrAnalysis(*T, createX86MCInstrAnalysis); + + // Register the code emitter. + TargetRegistry::RegisterMCCodeEmitter(*T, createX86MCCodeEmitter); + + // Register the object streamer. + TargetRegistry::RegisterCOFFStreamer(*T, createX86WinCOFFStreamer); + + // Register the MCInstPrinter. + TargetRegistry::RegisterMCInstPrinter(*T, createX86MCInstPrinter); + + // Register the MC relocation info. + TargetRegistry::RegisterMCRelocationInfo(*T, createX86MCRelocationInfo); + } + + // Register the asm backend. 
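+  // (Everything registered in the loop above, and the asm backends below, is
+  // only reached once a client resolves a target through TargetRegistry --
+  // typically via something like
+  // TargetRegistry::lookupTarget("x86_64-unknown-linux-gnu", Error) -- and
+  // then asks that Target for the individual MC components. The backends are
+  // registered outside the loop since the 32- and 64-bit factory functions
+  // differ.)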
+ TargetRegistry::RegisterMCAsmBackend(TheX86_32Target, + createX86_32AsmBackend); + TargetRegistry::RegisterMCAsmBackend(TheX86_64Target, + createX86_64AsmBackend); +} + +unsigned llvm::getX86SubSuperRegisterOrZero(unsigned Reg, unsigned Size, + bool High) { + switch (Size) { + default: return 0; + case 8: + if (High) { + switch (Reg) { + default: return getX86SubSuperRegisterOrZero(Reg, 64); + case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI: + return X86::SI; + case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI: + return X86::DI; + case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP: + return X86::BP; + case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP: + return X86::SP; + case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX: + return X86::AH; + case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX: + return X86::DH; + case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX: + return X86::CH; + case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX: + return X86::BH; + } + } else { + switch (Reg) { + default: return 0; + case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX: + return X86::AL; + case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX: + return X86::DL; + case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX: + return X86::CL; + case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX: + return X86::BL; + case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI: + return X86::SIL; + case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI: + return X86::DIL; + case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP: + return X86::BPL; + case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP: + return X86::SPL; + case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8: + return X86::R8B; + case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9: + return X86::R9B; + case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10: + return X86::R10B; + case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11: + return X86::R11B; + case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12: + return X86::R12B; + case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13: + return X86::R13B; + case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14: + return X86::R14B; + case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15: + return X86::R15B; + } + } + case 16: + switch (Reg) { + default: return 0; + case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX: + return X86::AX; + case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX: + return X86::DX; + case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX: + return X86::CX; + case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX: + return X86::BX; + case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI: + return X86::SI; + case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI: + return X86::DI; + case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP: + return X86::BP; + case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP: + return X86::SP; + case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8: + return X86::R8W; + case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9: + return X86::R9W; + case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10: + return X86::R10W; + case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11: + return 
X86::R11W; + case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12: + return X86::R12W; + case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13: + return X86::R13W; + case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14: + return X86::R14W; + case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15: + return X86::R15W; + } + case 32: + switch (Reg) { + default: return 0; + case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX: + return X86::EAX; + case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX: + return X86::EDX; + case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX: + return X86::ECX; + case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX: + return X86::EBX; + case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI: + return X86::ESI; + case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI: + return X86::EDI; + case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP: + return X86::EBP; + case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP: + return X86::ESP; + case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8: + return X86::R8D; + case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9: + return X86::R9D; + case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10: + return X86::R10D; + case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11: + return X86::R11D; + case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12: + return X86::R12D; + case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13: + return X86::R13D; + case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14: + return X86::R14D; + case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15: + return X86::R15D; + } + case 64: + switch (Reg) { + default: return 0; + case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX: + return X86::RAX; + case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX: + return X86::RDX; + case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX: + return X86::RCX; + case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX: + return X86::RBX; + case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI: + return X86::RSI; + case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI: + return X86::RDI; + case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP: + return X86::RBP; + case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP: + return X86::RSP; + case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8: + return X86::R8; + case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9: + return X86::R9; + case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10: + return X86::R10; + case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11: + return X86::R11; + case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12: + return X86::R12; + case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13: + return X86::R13; + case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14: + return X86::R14; + case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15: + return X86::R15; + } + } +} + +unsigned llvm::getX86SubSuperRegister(unsigned Reg, unsigned Size, bool High) { + unsigned Res = getX86SubSuperRegisterOrZero(Reg, Size, High); + assert(Res != 0 && "Unexpected register or VT"); + return Res; +} + + diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h new file mode 
100644 index 0000000..2d2836f --- /dev/null +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h @@ -0,0 +1,129 @@ +//===-- X86MCTargetDesc.h - X86 Target Descriptions -------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file provides X86 specific target descriptions. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_X86_MCTARGETDESC_X86MCTARGETDESC_H +#define LLVM_LIB_TARGET_X86_MCTARGETDESC_X86MCTARGETDESC_H + +#include "llvm/Support/DataTypes.h" +#include <string> + +namespace llvm { +class MCAsmBackend; +class MCCodeEmitter; +class MCContext; +class MCInstrInfo; +class MCObjectWriter; +class MCRegisterInfo; +class MCSubtargetInfo; +class MCRelocationInfo; +class MCStreamer; +class Target; +class Triple; +class StringRef; +class raw_ostream; +class raw_pwrite_stream; + +extern Target TheX86_32Target, TheX86_64Target; + +/// Flavour of dwarf regnumbers +/// +namespace DWARFFlavour { + enum { + X86_64 = 0, X86_32_DarwinEH = 1, X86_32_Generic = 2 + }; +} + +/// Native X86 register numbers +/// +namespace N86 { + enum { + EAX = 0, ECX = 1, EDX = 2, EBX = 3, ESP = 4, EBP = 5, ESI = 6, EDI = 7 + }; +} + +namespace X86_MC { +std::string ParseX86Triple(const Triple &TT); + +unsigned getDwarfRegFlavour(const Triple &TT, bool isEH); + +void InitLLVM2SEHRegisterMapping(MCRegisterInfo *MRI); + +/// Create a X86 MCSubtargetInfo instance. This is exposed so Asm parser, etc. +/// do not need to go through TargetRegistry. +MCSubtargetInfo *createX86MCSubtargetInfo(const Triple &TT, StringRef CPU, + StringRef FS); +} + +MCCodeEmitter *createX86MCCodeEmitter(const MCInstrInfo &MCII, + const MCRegisterInfo &MRI, + MCContext &Ctx); + +MCAsmBackend *createX86_32AsmBackend(const Target &T, const MCRegisterInfo &MRI, + const Triple &TT, StringRef CPU); +MCAsmBackend *createX86_64AsmBackend(const Target &T, const MCRegisterInfo &MRI, + const Triple &TT, StringRef CPU); + +/// Construct an X86 Windows COFF machine code streamer which will generate +/// PE/COFF format object files. +/// +/// Takes ownership of \p AB and \p CE. +MCStreamer *createX86WinCOFFStreamer(MCContext &C, MCAsmBackend &AB, + raw_pwrite_stream &OS, MCCodeEmitter *CE, + bool RelaxAll, bool IncrementalLinkerCompatible); + +/// Construct an X86 Mach-O object writer. +MCObjectWriter *createX86MachObjectWriter(raw_pwrite_stream &OS, bool Is64Bit, + uint32_t CPUType, + uint32_t CPUSubtype); + +/// Construct an X86 ELF object writer. +MCObjectWriter *createX86ELFObjectWriter(raw_pwrite_stream &OS, bool IsELF64, + uint8_t OSABI, uint16_t EMachine); +/// Construct an X86 Win COFF object writer. +MCObjectWriter *createX86WinCOFFObjectWriter(raw_pwrite_stream &OS, + bool Is64Bit); + +/// Construct X86-64 Mach-O relocation info. +MCRelocationInfo *createX86_64MachORelocationInfo(MCContext &Ctx); + +/// Construct X86-64 ELF relocation info. +MCRelocationInfo *createX86_64ELFRelocationInfo(MCContext &Ctx); + +/// Returns the sub or super register of a specific X86 register. +/// e.g. getX86SubSuperRegister(X86::EAX, 16) returns X86::AX. +/// Aborts on error. +unsigned getX86SubSuperRegister(unsigned, unsigned, bool High=false); + +/// Returns the sub or super register of a specific X86 register. 
+/// Like getX86SubSuperRegister() but returns 0 on error. +unsigned getX86SubSuperRegisterOrZero(unsigned, unsigned, + bool High = false); + +} // End llvm namespace + + +// Defines symbolic names for X86 registers. This defines a mapping from +// register name to register number. +// +#define GET_REGINFO_ENUM +#include "X86GenRegisterInfo.inc" + +// Defines symbolic names for the X86 instructions. +// +#define GET_INSTRINFO_ENUM +#include "X86GenInstrInfo.inc" + +#define GET_SUBTARGETINFO_ENUM +#include "X86GenSubtargetInfo.inc" + +#endif diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MachORelocationInfo.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MachORelocationInfo.cpp new file mode 100644 index 0000000..9bfe999 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MachORelocationInfo.cpp @@ -0,0 +1,119 @@ +//===-- X86MachORelocationInfo.cpp ----------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/X86MCTargetDesc.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCRelocationInfo.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/Object/MachO.h" + +using namespace llvm; +using namespace object; +using namespace MachO; + +namespace { +class X86_64MachORelocationInfo : public MCRelocationInfo { +public: + X86_64MachORelocationInfo(MCContext &Ctx) : MCRelocationInfo(Ctx) {} + + const MCExpr *createExprForRelocation(RelocationRef Rel) override { + const MachOObjectFile *Obj = cast<MachOObjectFile>(Rel.getObject()); + + uint64_t RelType = Rel.getType(); + symbol_iterator SymI = Rel.getSymbol(); + + ErrorOr<StringRef> SymNameOrErr = SymI->getName(); + if (std::error_code EC = SymNameOrErr.getError()) + report_fatal_error(EC.message()); + StringRef SymName = *SymNameOrErr; + uint64_t SymAddr = SymI->getValue(); + + any_relocation_info RE = Obj->getRelocation(Rel.getRawDataRefImpl()); + bool isPCRel = Obj->getAnyRelocationPCRel(RE); + + MCSymbol *Sym = Ctx.getOrCreateSymbol(SymName); + // FIXME: check that the value is actually the same. + if (!Sym->isVariable()) + Sym->setVariableValue(MCConstantExpr::create(SymAddr, Ctx)); + const MCExpr *Expr = nullptr; + + switch(RelType) { + case X86_64_RELOC_TLV: + Expr = MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_TLVP, Ctx); + break; + case X86_64_RELOC_SIGNED_4: + Expr = MCBinaryExpr::createAdd(MCSymbolRefExpr::create(Sym, Ctx), + MCConstantExpr::create(4, Ctx), + Ctx); + break; + case X86_64_RELOC_SIGNED_2: + Expr = MCBinaryExpr::createAdd(MCSymbolRefExpr::create(Sym, Ctx), + MCConstantExpr::create(2, Ctx), + Ctx); + break; + case X86_64_RELOC_SIGNED_1: + Expr = MCBinaryExpr::createAdd(MCSymbolRefExpr::create(Sym, Ctx), + MCConstantExpr::create(1, Ctx), + Ctx); + break; + case X86_64_RELOC_GOT_LOAD: + Expr = MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_GOTPCREL, Ctx); + break; + case X86_64_RELOC_GOT: + Expr = MCSymbolRefExpr::create(Sym, isPCRel ? + MCSymbolRefExpr::VK_GOTPCREL : + MCSymbolRefExpr::VK_GOT, + Ctx); + break; + case X86_64_RELOC_SUBTRACTOR: + { + Rel.moveNext(); + any_relocation_info RENext = + Obj->getRelocation(Rel.getRawDataRefImpl()); + + // X86_64_SUBTRACTOR must be followed by a relocation of type + // X86_64_RELOC_UNSIGNED. + // NOTE: Scattered relocations don't exist on x86_64. 
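+      // (Scattered entries are a 32-bit Mach-O mechanism, flagged by the
+      // R_SCATTERED high bit of r_word0; x86_64 object files only ever use
+      // plain relocation_info records, so a SUBTRACTOR/UNSIGNED pair is how
+      // a difference of two symbols is expressed here.)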
+ unsigned RType = Obj->getAnyRelocationType(RENext); + if (RType != X86_64_RELOC_UNSIGNED) + report_fatal_error("Expected X86_64_RELOC_UNSIGNED after " + "X86_64_RELOC_SUBTRACTOR."); + + const MCExpr *LHS = MCSymbolRefExpr::create(Sym, Ctx); + + symbol_iterator RSymI = Rel.getSymbol(); + uint64_t RSymAddr = RSymI->getValue(); + ErrorOr<StringRef> RSymName = RSymI->getName(); + if (std::error_code EC = RSymName.getError()) + report_fatal_error(EC.message()); + + MCSymbol *RSym = Ctx.getOrCreateSymbol(*RSymName); + if (!RSym->isVariable()) + RSym->setVariableValue(MCConstantExpr::create(RSymAddr, Ctx)); + + const MCExpr *RHS = MCSymbolRefExpr::create(RSym, Ctx); + + Expr = MCBinaryExpr::createSub(LHS, RHS, Ctx); + break; + } + default: + Expr = MCSymbolRefExpr::create(Sym, Ctx); + break; + } + return Expr; + } +}; +} // End unnamed namespace + +/// createX86_64MachORelocationInfo - Construct an X86-64 Mach-O RelocationInfo. +MCRelocationInfo *llvm::createX86_64MachORelocationInfo(MCContext &Ctx) { + return new X86_64MachORelocationInfo(Ctx); +} diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp new file mode 100644 index 0000000..191ebea --- /dev/null +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp @@ -0,0 +1,605 @@ +//===-- X86MachObjectWriter.cpp - X86 Mach-O Writer -----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/X86MCTargetDesc.h" +#include "MCTargetDesc/X86FixupKinds.h" +#include "llvm/ADT/Twine.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCAsmLayout.h" +#include "llvm/MC/MCAssembler.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCMachObjectWriter.h" +#include "llvm/MC/MCSectionMachO.h" +#include "llvm/MC/MCValue.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/Format.h" +#include "llvm/Support/MachO.h" + +using namespace llvm; + +namespace { +class X86MachObjectWriter : public MCMachObjectTargetWriter { + bool recordScatteredRelocation(MachObjectWriter *Writer, + const MCAssembler &Asm, + const MCAsmLayout &Layout, + const MCFragment *Fragment, + const MCFixup &Fixup, + MCValue Target, + unsigned Log2Size, + uint64_t &FixedValue); + void recordTLVPRelocation(MachObjectWriter *Writer, + const MCAssembler &Asm, + const MCAsmLayout &Layout, + const MCFragment *Fragment, + const MCFixup &Fixup, + MCValue Target, + uint64_t &FixedValue); + + void RecordX86Relocation(MachObjectWriter *Writer, + const MCAssembler &Asm, + const MCAsmLayout &Layout, + const MCFragment *Fragment, + const MCFixup &Fixup, + MCValue Target, + uint64_t &FixedValue); + void RecordX86_64Relocation(MachObjectWriter *Writer, MCAssembler &Asm, + const MCAsmLayout &Layout, + const MCFragment *Fragment, const MCFixup &Fixup, + MCValue Target, uint64_t &FixedValue); + +public: + X86MachObjectWriter(bool Is64Bit, uint32_t CPUType, uint32_t CPUSubtype) + : MCMachObjectTargetWriter(Is64Bit, CPUType, CPUSubtype) {} + + void recordRelocation(MachObjectWriter *Writer, MCAssembler &Asm, + const MCAsmLayout &Layout, const MCFragment *Fragment, + const MCFixup &Fixup, MCValue Target, + uint64_t &FixedValue) override { + if (Writer->is64Bit()) + RecordX86_64Relocation(Writer, Asm, Layout, Fragment, Fixup, Target, + FixedValue); + 
else + RecordX86Relocation(Writer, Asm, Layout, Fragment, Fixup, Target, + FixedValue); + } +}; +} + +static bool isFixupKindRIPRel(unsigned Kind) { + return Kind == X86::reloc_riprel_4byte || + Kind == X86::reloc_riprel_4byte_movq_load; +} + +static unsigned getFixupKindLog2Size(unsigned Kind) { + switch (Kind) { + default: + llvm_unreachable("invalid fixup kind!"); + case FK_PCRel_1: + case FK_Data_1: return 0; + case FK_PCRel_2: + case FK_Data_2: return 1; + case FK_PCRel_4: + // FIXME: Remove these!!! + case X86::reloc_riprel_4byte: + case X86::reloc_riprel_4byte_movq_load: + case X86::reloc_signed_4byte: + case FK_Data_4: return 2; + case FK_Data_8: return 3; + } +} + +void X86MachObjectWriter::RecordX86_64Relocation( + MachObjectWriter *Writer, MCAssembler &Asm, const MCAsmLayout &Layout, + const MCFragment *Fragment, const MCFixup &Fixup, MCValue Target, + uint64_t &FixedValue) { + unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind()); + unsigned IsRIPRel = isFixupKindRIPRel(Fixup.getKind()); + unsigned Log2Size = getFixupKindLog2Size(Fixup.getKind()); + + // See <reloc.h>. + uint32_t FixupOffset = + Layout.getFragmentOffset(Fragment) + Fixup.getOffset(); + uint32_t FixupAddress = + Writer->getFragmentAddress(Fragment, Layout) + Fixup.getOffset(); + int64_t Value = 0; + unsigned Index = 0; + unsigned IsExtern = 0; + unsigned Type = 0; + const MCSymbol *RelSymbol = nullptr; + + Value = Target.getConstant(); + + if (IsPCRel) { + // Compensate for the relocation offset, Darwin x86_64 relocations only have + // the addend and appear to have attempted to define it to be the actual + // expression addend without the PCrel bias. However, instructions with data + // following the relocation are not accommodated for (see comment below + // regarding SIGNED{1,2,4}), so it isn't exactly that either. + Value += 1LL << Log2Size; + } + + if (Target.isAbsolute()) { // constant + // SymbolNum of 0 indicates the absolute section. + Type = MachO::X86_64_RELOC_UNSIGNED; + + // FIXME: I believe this is broken, I don't think the linker can understand + // it. I think it would require a local relocation, but I'm not sure if that + // would work either. The official way to get an absolute PCrel relocation + // is to use an absolute symbol (which we don't support yet). + if (IsPCRel) { + IsExtern = 1; + Type = MachO::X86_64_RELOC_BRANCH; + } + } else if (Target.getSymB()) { // A - B + constant + const MCSymbol *A = &Target.getSymA()->getSymbol(); + if (A->isTemporary()) + A = &Writer->findAliasedSymbol(*A); + const MCSymbol *A_Base = Asm.getAtom(*A); + + const MCSymbol *B = &Target.getSymB()->getSymbol(); + if (B->isTemporary()) + B = &Writer->findAliasedSymbol(*B); + const MCSymbol *B_Base = Asm.getAtom(*B); + + // Neither symbol can be modified. + if (Target.getSymA()->getKind() != MCSymbolRefExpr::VK_None || + Target.getSymB()->getKind() != MCSymbolRefExpr::VK_None) { + Asm.getContext().reportError(Fixup.getLoc(), + "unsupported relocation of modified symbol"); + return; + } + + // We don't support PCrel relocations of differences. Darwin 'as' doesn't + // implement most of these correctly. + if (IsPCRel) { + Asm.getContext().reportError( + Fixup.getLoc(), "unsupported pc-relative relocation of difference"); + return; + } + + // The support for the situation where one or both of the symbols would + // require a local relocation is handled just like if the symbols were + // external. 
This is certainly used in the case of debug sections where the + // section has only temporary symbols and thus the symbols don't have base + // symbols. This is encoded using the section ordinal and non-extern + // relocation entries. + + // Darwin 'as' doesn't emit correct relocations for this (it ends up with a + // single SIGNED relocation); reject it for now. Except the case where both + // symbols don't have a base, equal but both NULL. + if (A_Base == B_Base && A_Base) { + Asm.getContext().reportError( + Fixup.getLoc(), "unsupported relocation with identical base"); + return; + } + + // A subtraction expression where either symbol is undefined is a + // non-relocatable expression. + if (A->isUndefined() || B->isUndefined()) { + StringRef Name = A->isUndefined() ? A->getName() : B->getName(); + Asm.getContext().reportError(Fixup.getLoc(), + "unsupported relocation with subtraction expression, symbol '" + + Name + "' can not be undefined in a subtraction expression"); + return; + } + + Value += Writer->getSymbolAddress(*A, Layout) - + (!A_Base ? 0 : Writer->getSymbolAddress(*A_Base, Layout)); + Value -= Writer->getSymbolAddress(*B, Layout) - + (!B_Base ? 0 : Writer->getSymbolAddress(*B_Base, Layout)); + + if (!A_Base) + Index = A->getFragment()->getParent()->getOrdinal() + 1; + Type = MachO::X86_64_RELOC_UNSIGNED; + + MachO::any_relocation_info MRE; + MRE.r_word0 = FixupOffset; + MRE.r_word1 = + (Index << 0) | (IsPCRel << 24) | (Log2Size << 25) | (Type << 28); + Writer->addRelocation(A_Base, Fragment->getParent(), MRE); + + if (B_Base) + RelSymbol = B_Base; + else + Index = B->getFragment()->getParent()->getOrdinal() + 1; + Type = MachO::X86_64_RELOC_SUBTRACTOR; + } else { + const MCSymbol *Symbol = &Target.getSymA()->getSymbol(); + if (Symbol->isTemporary() && Value) { + const MCSection &Sec = Symbol->getSection(); + if (!Asm.getContext().getAsmInfo()->isSectionAtomizableBySymbols(Sec)) + Symbol->setUsedInReloc(); + } + RelSymbol = Asm.getAtom(*Symbol); + + // Relocations inside debug sections always use local relocations when + // possible. This seems to be done because the debugger doesn't fully + // understand x86_64 relocation entries, and expects to find values that + // have already been fixed up. + if (Symbol->isInSection()) { + const MCSectionMachO &Section = + static_cast<const MCSectionMachO &>(*Fragment->getParent()); + if (Section.hasAttribute(MachO::S_ATTR_DEBUG)) + RelSymbol = nullptr; + } + + // x86_64 almost always uses external relocations, except when there is no + // symbol to use as a base address (a local symbol with no preceding + // non-local symbol). + if (RelSymbol) { + // Add the local offset, if needed. + if (RelSymbol != Symbol) + Value += Layout.getSymbolOffset(*Symbol) - + Layout.getSymbolOffset(*RelSymbol); + } else if (Symbol->isInSection() && !Symbol->isVariable()) { + // The index is the section ordinal (1-based). 
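+      // (Ordinal 0 is reserved: an r_symbolnum of zero denotes the absolute
+      // section, hence the +1 below. For reference, the relocation_info
+      // r_word1 fields packed in this function are laid out as
+      //   bits [23:0]  r_symbolnum  -- symbol index or 1-based section ordinal
+      //   bit  [24]    r_pcrel
+      //   bits [26:25] r_length     -- log2 of the fixup size in bytes
+      //   bit  [27]    r_extern
+      //   bits [31:28] r_type
+      // which matches the shift amounts used for MRE.r_word1 here.)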
+ Index = Symbol->getFragment()->getParent()->getOrdinal() + 1; + Value += Writer->getSymbolAddress(*Symbol, Layout); + + if (IsPCRel) + Value -= FixupAddress + (1 << Log2Size); + } else if (Symbol->isVariable()) { + const MCExpr *Value = Symbol->getVariableValue(); + int64_t Res; + bool isAbs = Value->evaluateAsAbsolute(Res, Layout, + Writer->getSectionAddressMap()); + if (isAbs) { + FixedValue = Res; + return; + } else { + Asm.getContext().reportError(Fixup.getLoc(), + "unsupported relocation of variable '" + + Symbol->getName() + "'"); + return; + } + } else { + Asm.getContext().reportError( + Fixup.getLoc(), "unsupported relocation of undefined symbol '" + + Symbol->getName() + "'"); + return; + } + + MCSymbolRefExpr::VariantKind Modifier = Target.getSymA()->getKind(); + if (IsPCRel) { + if (IsRIPRel) { + if (Modifier == MCSymbolRefExpr::VK_GOTPCREL) { + // x86_64 distinguishes movq foo@GOTPCREL so that the linker can + // rewrite the movq to an leaq at link time if the symbol ends up in + // the same linkage unit. + if (unsigned(Fixup.getKind()) == X86::reloc_riprel_4byte_movq_load) + Type = MachO::X86_64_RELOC_GOT_LOAD; + else + Type = MachO::X86_64_RELOC_GOT; + } else if (Modifier == MCSymbolRefExpr::VK_TLVP) { + Type = MachO::X86_64_RELOC_TLV; + } else if (Modifier != MCSymbolRefExpr::VK_None) { + Asm.getContext().reportError( + Fixup.getLoc(), "unsupported symbol modifier in relocation"); + return; + } else { + Type = MachO::X86_64_RELOC_SIGNED; + + // The Darwin x86_64 relocation format has a problem where it cannot + // encode an address (L<foo> + <constant>) which is outside the atom + // containing L<foo>. Generally, this shouldn't occur but it does + // happen when we have a RIPrel instruction with data following the + // relocation entry (e.g., movb $012, L0(%rip)). Even with the PCrel + // adjustment Darwin x86_64 uses, the offset is still negative and the + // linker has no way to recognize this. + // + // To work around this, Darwin uses several special relocation types + // to indicate the offsets. However, the specification or + // implementation of these seems to also be incomplete; they should + // adjust the addend as well based on the actual encoded instruction + // (the additional bias), but instead appear to just look at the final + // offset. + switch (-(Target.getConstant() + (1LL << Log2Size))) { + case 1: Type = MachO::X86_64_RELOC_SIGNED_1; break; + case 2: Type = MachO::X86_64_RELOC_SIGNED_2; break; + case 4: Type = MachO::X86_64_RELOC_SIGNED_4; break; + } + } + } else { + if (Modifier != MCSymbolRefExpr::VK_None) { + Asm.getContext().reportError( + Fixup.getLoc(), + "unsupported symbol modifier in branch relocation"); + return; + } + + Type = MachO::X86_64_RELOC_BRANCH; + } + } else { + if (Modifier == MCSymbolRefExpr::VK_GOT) { + Type = MachO::X86_64_RELOC_GOT; + } else if (Modifier == MCSymbolRefExpr::VK_GOTPCREL) { + // GOTPCREL is allowed as a modifier on non-PCrel instructions, in which + // case all we do is set the PCrel bit in the relocation entry; this is + // used with exception handling, for example. The source is required to + // include any necessary offset directly. 
+ Type = MachO::X86_64_RELOC_GOT; + IsPCRel = 1; + } else if (Modifier == MCSymbolRefExpr::VK_TLVP) { + Asm.getContext().reportError( + Fixup.getLoc(), "TLVP symbol modifier should have been rip-rel"); + return; + } else if (Modifier != MCSymbolRefExpr::VK_None) { + Asm.getContext().reportError( + Fixup.getLoc(), "unsupported symbol modifier in relocation"); + return; + } else { + Type = MachO::X86_64_RELOC_UNSIGNED; + unsigned Kind = Fixup.getKind(); + if (Kind == X86::reloc_signed_4byte) { + Asm.getContext().reportError( + Fixup.getLoc(), + "32-bit absolute addressing is not supported in 64-bit mode"); + return; + } + } + } + } + + // x86_64 always writes custom values into the fixups. + FixedValue = Value; + + // struct relocation_info (8 bytes) + MachO::any_relocation_info MRE; + MRE.r_word0 = FixupOffset; + MRE.r_word1 = (Index << 0) | (IsPCRel << 24) | (Log2Size << 25) | + (IsExtern << 27) | (Type << 28); + Writer->addRelocation(RelSymbol, Fragment->getParent(), MRE); +} + +bool X86MachObjectWriter::recordScatteredRelocation(MachObjectWriter *Writer, + const MCAssembler &Asm, + const MCAsmLayout &Layout, + const MCFragment *Fragment, + const MCFixup &Fixup, + MCValue Target, + unsigned Log2Size, + uint64_t &FixedValue) { + uint64_t OriginalFixedValue = FixedValue; + uint32_t FixupOffset = Layout.getFragmentOffset(Fragment)+Fixup.getOffset(); + unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind()); + unsigned Type = MachO::GENERIC_RELOC_VANILLA; + + // See <reloc.h>. + const MCSymbol *A = &Target.getSymA()->getSymbol(); + + if (!A->getFragment()) { + Asm.getContext().reportError( + Fixup.getLoc(), + "symbol '" + A->getName() + + "' can not be undefined in a subtraction expression"); + return false; + } + + uint32_t Value = Writer->getSymbolAddress(*A, Layout); + uint64_t SecAddr = Writer->getSectionAddress(A->getFragment()->getParent()); + FixedValue += SecAddr; + uint32_t Value2 = 0; + + if (const MCSymbolRefExpr *B = Target.getSymB()) { + const MCSymbol *SB = &B->getSymbol(); + + if (!SB->getFragment()) { + Asm.getContext().reportError( + Fixup.getLoc(), + "symbol '" + B->getSymbol().getName() + + "' can not be undefined in a subtraction expression"); + return false; + } + + // Select the appropriate difference relocation type. + // + // Note that there is no longer any semantic difference between these two + // relocation types from the linkers point of view, this is done solely for + // pedantic compatibility with 'as'. + Type = A->isExternal() ? (unsigned)MachO::GENERIC_RELOC_SECTDIFF + : (unsigned)MachO::GENERIC_RELOC_LOCAL_SECTDIFF; + Value2 = Writer->getSymbolAddress(B->getSymbol(), Layout); + FixedValue -= Writer->getSectionAddress(SB->getFragment()->getParent()); + } + + // Relocations are written out in reverse order, so the PAIR comes first. + if (Type == MachO::GENERIC_RELOC_SECTDIFF || + Type == MachO::GENERIC_RELOC_LOCAL_SECTDIFF) { + // If the offset is too large to fit in a scattered relocation, + // we're hosed. It's an unfortunate limitation of the MachO format. 
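+    // (A scattered_relocation_info packs r_address into a 24-bit bit-field,
+    // so only offsets up to 0xFFFFFF from the start of the section can be
+    // expressed -- hence the check below and the non-scattered fallback in
+    // the other branch.)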
+ if (FixupOffset > 0xffffff) { + char Buffer[32]; + format("0x%x", FixupOffset).print(Buffer, sizeof(Buffer)); + Asm.getContext().reportError(Fixup.getLoc(), + Twine("Section too large, can't encode " + "r_address (") + Buffer + + ") into 24 bits of scattered " + "relocation entry."); + return false; + } + + MachO::any_relocation_info MRE; + MRE.r_word0 = ((0 << 0) | // r_address + (MachO::GENERIC_RELOC_PAIR << 24) | // r_type + (Log2Size << 28) | + (IsPCRel << 30) | + MachO::R_SCATTERED); + MRE.r_word1 = Value2; + Writer->addRelocation(nullptr, Fragment->getParent(), MRE); + } else { + // If the offset is more than 24-bits, it won't fit in a scattered + // relocation offset field, so we fall back to using a non-scattered + // relocation. This is a bit risky, as if the offset reaches out of + // the block and the linker is doing scattered loading on this + // symbol, things can go badly. + // + // Required for 'as' compatibility. + if (FixupOffset > 0xffffff) { + FixedValue = OriginalFixedValue; + return false; + } + } + + MachO::any_relocation_info MRE; + MRE.r_word0 = ((FixupOffset << 0) | + (Type << 24) | + (Log2Size << 28) | + (IsPCRel << 30) | + MachO::R_SCATTERED); + MRE.r_word1 = Value; + Writer->addRelocation(nullptr, Fragment->getParent(), MRE); + return true; +} + +void X86MachObjectWriter::recordTLVPRelocation(MachObjectWriter *Writer, + const MCAssembler &Asm, + const MCAsmLayout &Layout, + const MCFragment *Fragment, + const MCFixup &Fixup, + MCValue Target, + uint64_t &FixedValue) { + assert(Target.getSymA()->getKind() == MCSymbolRefExpr::VK_TLVP && + !is64Bit() && + "Should only be called with a 32-bit TLVP relocation!"); + + unsigned Log2Size = getFixupKindLog2Size(Fixup.getKind()); + uint32_t Value = Layout.getFragmentOffset(Fragment)+Fixup.getOffset(); + unsigned IsPCRel = 0; + + // We're only going to have a second symbol in pic mode and it'll be a + // subtraction from the picbase. For 32-bit pic the addend is the difference + // between the picbase and the next address. For 32-bit static the addend is + // zero. + if (Target.getSymB()) { + // If this is a subtraction then we're pcrel. + uint32_t FixupAddress = + Writer->getFragmentAddress(Fragment, Layout) + Fixup.getOffset(); + IsPCRel = 1; + FixedValue = + FixupAddress - + Writer->getSymbolAddress(Target.getSymB()->getSymbol(), Layout) + + Target.getConstant(); + FixedValue += 1ULL << Log2Size; + } else { + FixedValue = 0; + } + + // struct relocation_info (8 bytes) + MachO::any_relocation_info MRE; + MRE.r_word0 = Value; + MRE.r_word1 = + (IsPCRel << 24) | (Log2Size << 25) | (MachO::GENERIC_RELOC_TLV << 28); + Writer->addRelocation(&Target.getSymA()->getSymbol(), Fragment->getParent(), + MRE); +} + +void X86MachObjectWriter::RecordX86Relocation(MachObjectWriter *Writer, + const MCAssembler &Asm, + const MCAsmLayout &Layout, + const MCFragment *Fragment, + const MCFixup &Fixup, + MCValue Target, + uint64_t &FixedValue) { + unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind()); + unsigned Log2Size = getFixupKindLog2Size(Fixup.getKind()); + + // If this is a 32-bit TLVP reloc it's handled a bit differently. + if (Target.getSymA() && + Target.getSymA()->getKind() == MCSymbolRefExpr::VK_TLVP) { + recordTLVPRelocation(Writer, Asm, Layout, Fragment, Fixup, Target, + FixedValue); + return; + } + + // If this is a difference or a defined symbol plus an offset, then we need a + // scattered relocation entry. Differences always require scattered + // relocations. 
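+  // (A scattered entry is flagged by the R_SCATTERED high bit of r_word0 and
+  // carries the target's address in r_word1 instead of a symbol index, which
+  // is what lets the SECTDIFF record plus its companion PAIR record built by
+  // recordScatteredRelocation() describe an "A - B" expression.)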
+ if (Target.getSymB()) { + recordScatteredRelocation(Writer, Asm, Layout, Fragment, Fixup, + Target, Log2Size, FixedValue); + return; + } + + // Get the symbol data, if any. + const MCSymbol *A = nullptr; + if (Target.getSymA()) + A = &Target.getSymA()->getSymbol(); + + // If this is an internal relocation with an offset, it also needs a scattered + // relocation entry. + uint32_t Offset = Target.getConstant(); + if (IsPCRel) + Offset += 1 << Log2Size; + // Try to record the scattered relocation if needed. Fall back to non + // scattered if necessary (see comments in recordScatteredRelocation() + // for details). + if (Offset && A && !Writer->doesSymbolRequireExternRelocation(*A) && + recordScatteredRelocation(Writer, Asm, Layout, Fragment, Fixup, Target, + Log2Size, FixedValue)) + return; + + // See <reloc.h>. + uint32_t FixupOffset = Layout.getFragmentOffset(Fragment)+Fixup.getOffset(); + unsigned Index = 0; + unsigned Type = 0; + const MCSymbol *RelSymbol = nullptr; + + if (Target.isAbsolute()) { // constant + // SymbolNum of 0 indicates the absolute section. + // + // FIXME: Currently, these are never generated (see code below). I cannot + // find a case where they are actually emitted. + Type = MachO::GENERIC_RELOC_VANILLA; + } else { + // Resolve constant variables. + if (A->isVariable()) { + int64_t Res; + if (A->getVariableValue()->evaluateAsAbsolute( + Res, Layout, Writer->getSectionAddressMap())) { + FixedValue = Res; + return; + } + } + + // Check whether we need an external or internal relocation. + if (Writer->doesSymbolRequireExternRelocation(*A)) { + RelSymbol = A; + // For external relocations, make sure to offset the fixup value to + // compensate for the addend of the symbol address, if it was + // undefined. This occurs with weak definitions, for example. + if (!A->isUndefined()) + FixedValue -= Layout.getSymbolOffset(*A); + } else { + // The index is the section ordinal (1-based). + const MCSection &Sec = A->getSection(); + Index = Sec.getOrdinal() + 1; + FixedValue += Writer->getSectionAddress(&Sec); + } + if (IsPCRel) + FixedValue -= Writer->getSectionAddress(Fragment->getParent()); + + Type = MachO::GENERIC_RELOC_VANILLA; + } + + // struct relocation_info (8 bytes) + MachO::any_relocation_info MRE; + MRE.r_word0 = FixupOffset; + MRE.r_word1 = + (Index << 0) | (IsPCRel << 24) | (Log2Size << 25) | (Type << 28); + Writer->addRelocation(RelSymbol, Fragment->getParent(), MRE); +} + +MCObjectWriter *llvm::createX86MachObjectWriter(raw_pwrite_stream &OS, + bool Is64Bit, uint32_t CPUType, + uint32_t CPUSubtype) { + return createMachObjectWriter(new X86MachObjectWriter(Is64Bit, + CPUType, + CPUSubtype), + OS, /*IsLittleEndian=*/true); +} diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp new file mode 100644 index 0000000..bd1bc99 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp @@ -0,0 +1,97 @@ +//===-- X86WinCOFFObjectWriter.cpp - X86 Win COFF Writer ------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/X86FixupKinds.h" +#include "MCTargetDesc/X86MCTargetDesc.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCValue.h" +#include "llvm/MC/MCWinCOFFObjectWriter.h" +#include "llvm/Support/COFF.h" +#include "llvm/Support/ErrorHandling.h" + +using namespace llvm; + +namespace llvm { + class MCObjectWriter; +} + +namespace { + class X86WinCOFFObjectWriter : public MCWinCOFFObjectTargetWriter { + public: + X86WinCOFFObjectWriter(bool Is64Bit); + ~X86WinCOFFObjectWriter() override; + + unsigned getRelocType(const MCValue &Target, const MCFixup &Fixup, + bool IsCrossSection, + const MCAsmBackend &MAB) const override; + }; +} + +X86WinCOFFObjectWriter::X86WinCOFFObjectWriter(bool Is64Bit) + : MCWinCOFFObjectTargetWriter(Is64Bit ? COFF::IMAGE_FILE_MACHINE_AMD64 + : COFF::IMAGE_FILE_MACHINE_I386) {} + +X86WinCOFFObjectWriter::~X86WinCOFFObjectWriter() {} + +unsigned X86WinCOFFObjectWriter::getRelocType(const MCValue &Target, + const MCFixup &Fixup, + bool IsCrossSection, + const MCAsmBackend &MAB) const { + unsigned FixupKind = IsCrossSection ? FK_PCRel_4 : Fixup.getKind(); + + MCSymbolRefExpr::VariantKind Modifier = Target.isAbsolute() ? + MCSymbolRefExpr::VK_None : Target.getSymA()->getKind(); + + if (getMachine() == COFF::IMAGE_FILE_MACHINE_AMD64) { + switch (FixupKind) { + case FK_PCRel_4: + case X86::reloc_riprel_4byte: + case X86::reloc_riprel_4byte_movq_load: + return COFF::IMAGE_REL_AMD64_REL32; + case FK_Data_4: + case X86::reloc_signed_4byte: + if (Modifier == MCSymbolRefExpr::VK_COFF_IMGREL32) + return COFF::IMAGE_REL_AMD64_ADDR32NB; + return COFF::IMAGE_REL_AMD64_ADDR32; + case FK_Data_8: + return COFF::IMAGE_REL_AMD64_ADDR64; + case FK_SecRel_2: + return COFF::IMAGE_REL_AMD64_SECTION; + case FK_SecRel_4: + return COFF::IMAGE_REL_AMD64_SECREL; + default: + llvm_unreachable("unsupported relocation type"); + } + } else if (getMachine() == COFF::IMAGE_FILE_MACHINE_I386) { + switch (FixupKind) { + case FK_PCRel_4: + case X86::reloc_riprel_4byte: + case X86::reloc_riprel_4byte_movq_load: + return COFF::IMAGE_REL_I386_REL32; + case FK_Data_4: + case X86::reloc_signed_4byte: + if (Modifier == MCSymbolRefExpr::VK_COFF_IMGREL32) + return COFF::IMAGE_REL_I386_DIR32NB; + return COFF::IMAGE_REL_I386_DIR32; + case FK_SecRel_2: + return COFF::IMAGE_REL_I386_SECTION; + case FK_SecRel_4: + return COFF::IMAGE_REL_I386_SECREL; + default: + llvm_unreachable("unsupported relocation type"); + } + } else + llvm_unreachable("Unsupported COFF machine type."); +} + +MCObjectWriter *llvm::createX86WinCOFFObjectWriter(raw_pwrite_stream &OS, + bool Is64Bit) { + MCWinCOFFObjectTargetWriter *MOTW = new X86WinCOFFObjectWriter(Is64Bit); + return createWinCOFFObjectWriter(MOTW, OS); +} diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp new file mode 100644 index 0000000..d045118 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp @@ -0,0 +1,60 @@ +//===-- X86WinCOFFStreamer.cpp - X86 Target WinCOFF Streamer ----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "X86MCTargetDesc.h" +#include "llvm/MC/MCWin64EH.h" +#include "llvm/MC/MCWinCOFFStreamer.h" + +using namespace llvm; + +namespace { +class X86WinCOFFStreamer : public MCWinCOFFStreamer { + Win64EH::UnwindEmitter EHStreamer; +public: + X86WinCOFFStreamer(MCContext &C, MCAsmBackend &AB, MCCodeEmitter *CE, + raw_pwrite_stream &OS) + : MCWinCOFFStreamer(C, AB, *CE, OS) {} + + void EmitWinEHHandlerData() override; + void EmitWindowsUnwindTables() override; + void FinishImpl() override; +}; + +void X86WinCOFFStreamer::EmitWinEHHandlerData() { + MCStreamer::EmitWinEHHandlerData(); + + // We have to emit the unwind info now, because this directive + // actually switches to the .xdata section! + EHStreamer.EmitUnwindInfo(*this, getCurrentWinFrameInfo()); +} + +void X86WinCOFFStreamer::EmitWindowsUnwindTables() { + if (!getNumWinFrameInfos()) + return; + EHStreamer.Emit(*this); +} + +void X86WinCOFFStreamer::FinishImpl() { + EmitFrames(nullptr); + EmitWindowsUnwindTables(); + + MCWinCOFFStreamer::FinishImpl(); +} +} + +MCStreamer *llvm::createX86WinCOFFStreamer(MCContext &C, MCAsmBackend &AB, + raw_pwrite_stream &OS, + MCCodeEmitter *CE, bool RelaxAll, + bool IncrementalLinkerCompatible) { + X86WinCOFFStreamer *S = new X86WinCOFFStreamer(C, AB, CE, OS); + S->getAssembler().setRelaxAll(RelaxAll); + S->getAssembler().setIncrementalLinkerCompatible(IncrementalLinkerCompatible); + return S; +} + diff --git a/contrib/llvm/lib/Target/X86/TargetInfo/X86TargetInfo.cpp b/contrib/llvm/lib/Target/X86/TargetInfo/X86TargetInfo.cpp new file mode 100644 index 0000000..fceb083 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/TargetInfo/X86TargetInfo.cpp @@ -0,0 +1,22 @@ +//===-- X86TargetInfo.cpp - X86 Target Implementation ---------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/X86MCTargetDesc.h" +#include "llvm/Support/TargetRegistry.h" +using namespace llvm; + +Target llvm::TheX86_32Target, llvm::TheX86_64Target; + +extern "C" void LLVMInitializeX86TargetInfo() { + RegisterTarget<Triple::x86, /*HasJIT=*/true> + X(TheX86_32Target, "x86", "32-bit X86: Pentium-Pro and above"); + + RegisterTarget<Triple::x86_64, /*HasJIT=*/true> + Y(TheX86_64Target, "x86-64", "64-bit X86: EM64T and AMD64"); +} diff --git a/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp b/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp new file mode 100644 index 0000000..619f7c8 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp @@ -0,0 +1,464 @@ +//===-- X86ShuffleDecode.cpp - X86 shuffle decode logic -------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Define several functions to decode x86 specific shuffle semantics into a +// generic vector mask. 
+// +//===----------------------------------------------------------------------===// + +#include "X86ShuffleDecode.h" +#include "llvm/CodeGen/MachineValueType.h" + +//===----------------------------------------------------------------------===// +// Vector Mask Decoding +//===----------------------------------------------------------------------===// + +namespace llvm { + +void DecodeINSERTPSMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { + // Defaults the copying the dest value. + ShuffleMask.push_back(0); + ShuffleMask.push_back(1); + ShuffleMask.push_back(2); + ShuffleMask.push_back(3); + + // Decode the immediate. + unsigned ZMask = Imm & 15; + unsigned CountD = (Imm >> 4) & 3; + unsigned CountS = (Imm >> 6) & 3; + + // CountS selects which input element to use. + unsigned InVal = 4 + CountS; + // CountD specifies which element of destination to update. + ShuffleMask[CountD] = InVal; + // ZMask zaps values, potentially overriding the CountD elt. + if (ZMask & 1) ShuffleMask[0] = SM_SentinelZero; + if (ZMask & 2) ShuffleMask[1] = SM_SentinelZero; + if (ZMask & 4) ShuffleMask[2] = SM_SentinelZero; + if (ZMask & 8) ShuffleMask[3] = SM_SentinelZero; +} + +// <3,1> or <6,7,2,3> +void DecodeMOVHLPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask) { + for (unsigned i = NElts / 2; i != NElts; ++i) + ShuffleMask.push_back(NElts + i); + + for (unsigned i = NElts / 2; i != NElts; ++i) + ShuffleMask.push_back(i); +} + +// <0,2> or <0,1,4,5> +void DecodeMOVLHPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask) { + for (unsigned i = 0; i != NElts / 2; ++i) + ShuffleMask.push_back(i); + + for (unsigned i = 0; i != NElts / 2; ++i) + ShuffleMask.push_back(NElts + i); +} + +void DecodeMOVSLDUPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) { + unsigned NumElts = VT.getVectorNumElements(); + for (int i = 0, e = NumElts / 2; i < e; ++i) { + ShuffleMask.push_back(2 * i); + ShuffleMask.push_back(2 * i); + } +} + +void DecodeMOVSHDUPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) { + unsigned NumElts = VT.getVectorNumElements(); + for (int i = 0, e = NumElts / 2; i < e; ++i) { + ShuffleMask.push_back(2 * i + 1); + ShuffleMask.push_back(2 * i + 1); + } +} + +void DecodeMOVDDUPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) { + unsigned VectorSizeInBits = VT.getSizeInBits(); + unsigned ScalarSizeInBits = VT.getScalarSizeInBits(); + unsigned NumElts = VT.getVectorNumElements(); + unsigned NumLanes = VectorSizeInBits / 128; + unsigned NumLaneElts = NumElts / NumLanes; + unsigned NumLaneSubElts = 64 / ScalarSizeInBits; + + for (unsigned l = 0; l < NumElts; l += NumLaneElts) + for (unsigned i = 0; i < NumLaneElts; i += NumLaneSubElts) + for (unsigned s = 0; s != NumLaneSubElts; s++) + ShuffleMask.push_back(l + s); +} + +void DecodePSLLDQMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { + unsigned VectorSizeInBits = VT.getSizeInBits(); + unsigned NumElts = VectorSizeInBits / 8; + unsigned NumLanes = VectorSizeInBits / 128; + unsigned NumLaneElts = NumElts / NumLanes; + + for (unsigned l = 0; l < NumElts; l += NumLaneElts) + for (unsigned i = 0; i < NumLaneElts; ++i) { + int M = SM_SentinelZero; + if (i >= Imm) M = i - Imm + l; + ShuffleMask.push_back(M); + } +} + +void DecodePSRLDQMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { + unsigned VectorSizeInBits = VT.getSizeInBits(); + unsigned NumElts = VectorSizeInBits / 8; + unsigned NumLanes = VectorSizeInBits / 128; + unsigned NumLaneElts = NumElts / NumLanes; + + for (unsigned l = 0; l < NumElts; l += NumLaneElts) + 
for (unsigned i = 0; i < NumLaneElts; ++i) { + unsigned Base = i + Imm; + int M = Base + l; + if (Base >= NumLaneElts) M = SM_SentinelZero; + ShuffleMask.push_back(M); + } +} + +void DecodePALIGNRMask(MVT VT, unsigned Imm, + SmallVectorImpl<int> &ShuffleMask) { + unsigned NumElts = VT.getVectorNumElements(); + unsigned Offset = Imm * (VT.getVectorElementType().getSizeInBits() / 8); + + unsigned NumLanes = VT.getSizeInBits() / 128; + unsigned NumLaneElts = NumElts / NumLanes; + + for (unsigned l = 0; l != NumElts; l += NumLaneElts) { + for (unsigned i = 0; i != NumLaneElts; ++i) { + unsigned Base = i + Offset; + // if i+offset is out of this lane then we actually need the other source + if (Base >= NumLaneElts) Base += NumElts - NumLaneElts; + ShuffleMask.push_back(Base + l); + } + } +} + +/// DecodePSHUFMask - This decodes the shuffle masks for pshufw, pshufd, and vpermilp*. +/// VT indicates the type of the vector allowing it to handle different +/// datatypes and vector widths. +void DecodePSHUFMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { + unsigned NumElts = VT.getVectorNumElements(); + + unsigned NumLanes = VT.getSizeInBits() / 128; + if (NumLanes == 0) NumLanes = 1; // Handle MMX + unsigned NumLaneElts = NumElts / NumLanes; + + unsigned NewImm = Imm; + for (unsigned l = 0; l != NumElts; l += NumLaneElts) { + for (unsigned i = 0; i != NumLaneElts; ++i) { + ShuffleMask.push_back(NewImm % NumLaneElts + l); + NewImm /= NumLaneElts; + } + if (NumLaneElts == 4) NewImm = Imm; // reload imm + } +} + +void DecodePSHUFHWMask(MVT VT, unsigned Imm, + SmallVectorImpl<int> &ShuffleMask) { + unsigned NumElts = VT.getVectorNumElements(); + + for (unsigned l = 0; l != NumElts; l += 8) { + unsigned NewImm = Imm; + for (unsigned i = 0, e = 4; i != e; ++i) { + ShuffleMask.push_back(l + i); + } + for (unsigned i = 4, e = 8; i != e; ++i) { + ShuffleMask.push_back(l + 4 + (NewImm & 3)); + NewImm >>= 2; + } + } +} + +void DecodePSHUFLWMask(MVT VT, unsigned Imm, + SmallVectorImpl<int> &ShuffleMask) { + unsigned NumElts = VT.getVectorNumElements(); + + for (unsigned l = 0; l != NumElts; l += 8) { + unsigned NewImm = Imm; + for (unsigned i = 0, e = 4; i != e; ++i) { + ShuffleMask.push_back(l + (NewImm & 3)); + NewImm >>= 2; + } + for (unsigned i = 4, e = 8; i != e; ++i) { + ShuffleMask.push_back(l + i); + } + } +} + +void DecodePSWAPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) { + unsigned NumElts = VT.getVectorNumElements(); + unsigned NumHalfElts = NumElts / 2; + + for (unsigned l = 0; l != NumHalfElts; ++l) + ShuffleMask.push_back(l + NumHalfElts); + for (unsigned h = 0; h != NumHalfElts; ++h) + ShuffleMask.push_back(h); +} + +/// DecodeSHUFPMask - This decodes the shuffle masks for shufp*. VT indicates +/// the type of the vector allowing it to handle different datatypes and vector +/// widths. 
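+/// As an illustrative example, for MVT::v4f32 and Imm = 0x1B (0b00011011)
+/// the decoded mask is <3, 2, 5, 4>: the low two elements come from the
+/// first source (indices Imm[1:0] and Imm[3:2]) and the high two from the
+/// second source (indices 4 + Imm[5:4] and 4 + Imm[7:6]).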
+void DecodeSHUFPMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { + unsigned NumElts = VT.getVectorNumElements(); + + unsigned NumLanes = VT.getSizeInBits() / 128; + unsigned NumLaneElts = NumElts / NumLanes; + + unsigned NewImm = Imm; + for (unsigned l = 0; l != NumElts; l += NumLaneElts) { + // each half of a lane comes from different source + for (unsigned s = 0; s != NumElts * 2; s += NumElts) { + for (unsigned i = 0; i != NumLaneElts / 2; ++i) { + ShuffleMask.push_back(NewImm % NumLaneElts + s + l); + NewImm /= NumLaneElts; + } + } + if (NumLaneElts == 4) NewImm = Imm; // reload imm + } +} + +/// DecodeUNPCKHMask - This decodes the shuffle masks for unpckhps/unpckhpd +/// and punpckh*. VT indicates the type of the vector allowing it to handle +/// different datatypes and vector widths. +void DecodeUNPCKHMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) { + unsigned NumElts = VT.getVectorNumElements(); + + // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate + // independently on 128-bit lanes. + unsigned NumLanes = VT.getSizeInBits() / 128; + if (NumLanes == 0) NumLanes = 1; // Handle MMX + unsigned NumLaneElts = NumElts / NumLanes; + + for (unsigned l = 0; l != NumElts; l += NumLaneElts) { + for (unsigned i = l + NumLaneElts / 2, e = l + NumLaneElts; i != e; ++i) { + ShuffleMask.push_back(i); // Reads from dest/src1 + ShuffleMask.push_back(i + NumElts); // Reads from src/src2 + } + } +} + +/// DecodeUNPCKLMask - This decodes the shuffle masks for unpcklps/unpcklpd +/// and punpckl*. VT indicates the type of the vector allowing it to handle +/// different datatypes and vector widths. +void DecodeUNPCKLMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) { + unsigned NumElts = VT.getVectorNumElements(); + + // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate + // independently on 128-bit lanes. + unsigned NumLanes = VT.getSizeInBits() / 128; + if (NumLanes == 0 ) NumLanes = 1; // Handle MMX + unsigned NumLaneElts = NumElts / NumLanes; + + for (unsigned l = 0; l != NumElts; l += NumLaneElts) { + for (unsigned i = l, e = l + NumLaneElts / 2; i != e; ++i) { + ShuffleMask.push_back(i); // Reads from dest/src1 + ShuffleMask.push_back(i + NumElts); // Reads from src/src2 + } + } +} + +/// \brief Decode a shuffle packed values at 128-bit granularity +/// (SHUFF32x4/SHUFF64x2/SHUFI32x4/SHUFI64x2) +/// immediate mask into a shuffle mask. +void decodeVSHUF64x2FamilyMask(MVT VT, unsigned Imm, + SmallVectorImpl<int> &ShuffleMask) { + unsigned NumLanes = VT.getSizeInBits() / 128; + unsigned NumElementsInLane = 128 / VT.getScalarSizeInBits(); + unsigned ControlBitsMask = NumLanes - 1; + unsigned NumControlBits = NumLanes / 2; + + for (unsigned l = 0; l != NumLanes; ++l) { + unsigned LaneMask = (Imm >> (l * NumControlBits)) & ControlBitsMask; + // We actually need the other source. + if (l >= NumLanes / 2) + LaneMask += NumLanes; + for (unsigned i = 0; i != NumElementsInLane; ++i) + ShuffleMask.push_back(LaneMask * NumElementsInLane + i); + } +} + +void DecodeVPERM2X128Mask(MVT VT, unsigned Imm, + SmallVectorImpl<int> &ShuffleMask) { + unsigned HalfSize = VT.getVectorNumElements() / 2; + + for (unsigned l = 0; l != 2; ++l) { + unsigned HalfMask = Imm >> (l * 4); + unsigned HalfBegin = (HalfMask & 0x3) * HalfSize; + for (unsigned i = HalfBegin, e = HalfBegin + HalfSize; i != e; ++i) + ShuffleMask.push_back(HalfMask & 8 ? 
SM_SentinelZero : (int)i); + } +} + +void DecodePSHUFBMask(ArrayRef<uint64_t> RawMask, + SmallVectorImpl<int> &ShuffleMask) { + for (int i = 0, e = RawMask.size(); i < e; ++i) { + uint64_t M = RawMask[i]; + if (M == (uint64_t)SM_SentinelUndef) { + ShuffleMask.push_back(M); + continue; + } + // For AVX vectors with 32 bytes the base of the shuffle is the half of + // the vector we're inside. + int Base = i < 16 ? 0 : 16; + // If the high bit (7) of the byte is set, the element is zeroed. + if (M & (1 << 7)) + ShuffleMask.push_back(SM_SentinelZero); + else { + // Only the least significant 4 bits of the byte are used. + int Index = Base + (M & 0xf); + ShuffleMask.push_back(Index); + } + } +} + +void DecodeBLENDMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { + int ElementBits = VT.getScalarSizeInBits(); + int NumElements = VT.getVectorNumElements(); + for (int i = 0; i < NumElements; ++i) { + // If there are more than 8 elements in the vector, then any immediate blend + // mask applies to each 128-bit lane. There can never be more than + // 8 elements in a 128-bit lane with an immediate blend. + int Bit = NumElements > 8 ? i % (128 / ElementBits) : i; + assert(Bit < 8 && + "Immediate blends only operate over 8 elements at a time!"); + ShuffleMask.push_back(((Imm >> Bit) & 1) ? NumElements + i : i); + } +} + +/// DecodeVPERMMask - this decodes the shuffle masks for VPERMQ/VPERMPD. +/// No VT provided since it only works on 256-bit, 4 element vectors. +void DecodeVPERMMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { + for (unsigned i = 0; i != 4; ++i) { + ShuffleMask.push_back((Imm >> (2 * i)) & 3); + } +} + +void DecodeZeroExtendMask(MVT SrcVT, MVT DstVT, SmallVectorImpl<int> &Mask) { + unsigned NumDstElts = DstVT.getVectorNumElements(); + unsigned SrcScalarBits = SrcVT.getScalarSizeInBits(); + unsigned DstScalarBits = DstVT.getScalarSizeInBits(); + unsigned Scale = DstScalarBits / SrcScalarBits; + assert(SrcScalarBits < DstScalarBits && + "Expected zero extension mask to increase scalar size"); + assert(SrcVT.getVectorNumElements() >= NumDstElts && + "Too many zero extension lanes"); + + for (unsigned i = 0; i != NumDstElts; i++) { + Mask.push_back(i); + for (unsigned j = 1; j != Scale; j++) + Mask.push_back(SM_SentinelZero); + } +} + +void DecodeZeroMoveLowMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) { + unsigned NumElts = VT.getVectorNumElements(); + ShuffleMask.push_back(0); + for (unsigned i = 1; i < NumElts; i++) + ShuffleMask.push_back(SM_SentinelZero); +} + +void DecodeScalarMoveMask(MVT VT, bool IsLoad, SmallVectorImpl<int> &Mask) { + // First element comes from the first element of second source. + // Remaining elements: Load zero extends / Move copies from first source. + unsigned NumElts = VT.getVectorNumElements(); + Mask.push_back(NumElts); + for (unsigned i = 1; i < NumElts; i++) + Mask.push_back(IsLoad ? static_cast<int>(SM_SentinelZero) : i); +} + +void DecodeEXTRQIMask(int Len, int Idx, + SmallVectorImpl<int> &ShuffleMask) { + // Only the bottom 6 bits are valid for each immediate. + Len &= 0x3F; + Idx &= 0x3F; + + // We can only decode this bit extraction instruction as a shuffle if both the + // length and index work with whole bytes. + if (0 != (Len % 8) || 0 != (Idx % 8)) + return; + + // A length of zero is equivalent to a bit length of 64. + if (Len == 0) + Len = 64; + + // If the length + index exceeds the bottom 64 bits the result is undefined. 
+ if ((Len + Idx) > 64) { + ShuffleMask.append(16, SM_SentinelUndef); + return; + } + + // Convert index and index to work with bytes. + Len /= 8; + Idx /= 8; + + // EXTRQ: Extract Len bytes starting from Idx. Zero pad the remaining bytes + // of the lower 64-bits. The upper 64-bits are undefined. + for (int i = 0; i != Len; ++i) + ShuffleMask.push_back(i + Idx); + for (int i = Len; i != 8; ++i) + ShuffleMask.push_back(SM_SentinelZero); + for (int i = 8; i != 16; ++i) + ShuffleMask.push_back(SM_SentinelUndef); +} + +void DecodeINSERTQIMask(int Len, int Idx, + SmallVectorImpl<int> &ShuffleMask) { + // Only the bottom 6 bits are valid for each immediate. + Len &= 0x3F; + Idx &= 0x3F; + + // We can only decode this bit insertion instruction as a shuffle if both the + // length and index work with whole bytes. + if (0 != (Len % 8) || 0 != (Idx % 8)) + return; + + // A length of zero is equivalent to a bit length of 64. + if (Len == 0) + Len = 64; + + // If the length + index exceeds the bottom 64 bits the result is undefined. + if ((Len + Idx) > 64) { + ShuffleMask.append(16, SM_SentinelUndef); + return; + } + + // Convert index and index to work with bytes. + Len /= 8; + Idx /= 8; + + // INSERTQ: Extract lowest Len bytes from lower half of second source and + // insert over first source starting at Idx byte. The upper 64-bits are + // undefined. + for (int i = 0; i != Idx; ++i) + ShuffleMask.push_back(i); + for (int i = 0; i != Len; ++i) + ShuffleMask.push_back(i + 16); + for (int i = Idx + Len; i != 8; ++i) + ShuffleMask.push_back(i); + for (int i = 8; i != 16; ++i) + ShuffleMask.push_back(SM_SentinelUndef); +} + +void DecodeVPERMVMask(ArrayRef<uint64_t> RawMask, + SmallVectorImpl<int> &ShuffleMask) { + for (int i = 0, e = RawMask.size(); i < e; ++i) { + uint64_t M = RawMask[i]; + ShuffleMask.push_back((int)M); + } +} + +void DecodeVPERMV3Mask(ArrayRef<uint64_t> RawMask, + SmallVectorImpl<int> &ShuffleMask) { + for (int i = 0, e = RawMask.size(); i < e; ++i) { + uint64_t M = RawMask[i]; + ShuffleMask.push_back((int)M); + } +} + +} // llvm namespace diff --git a/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h b/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h new file mode 100644 index 0000000..72db6a8 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h @@ -0,0 +1,122 @@ +//===-- X86ShuffleDecode.h - X86 shuffle decode logic -----------*-C++-*---===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Define several functions to decode x86 specific shuffle semantics into a +// generic vector mask. 
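+//
+// A minimal usage sketch (illustrative):
+//   SmallVector<int, 8> Mask;
+//   DecodeUNPCKHMask(MVT::v4i32, Mask);   // Mask is now <2, 6, 3, 7>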
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_X86_UTILS_X86SHUFFLEDECODE_H +#define LLVM_LIB_TARGET_X86_UTILS_X86SHUFFLEDECODE_H + +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/ArrayRef.h" + +//===----------------------------------------------------------------------===// +// Vector Mask Decoding +//===----------------------------------------------------------------------===// + +namespace llvm { +class MVT; + +enum { SM_SentinelUndef = -1, SM_SentinelZero = -2 }; + +void DecodeINSERTPSMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask); + +// <3,1> or <6,7,2,3> +void DecodeMOVHLPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask); + +// <0,2> or <0,1,4,5> +void DecodeMOVLHPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask); + +void DecodeMOVSLDUPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask); + +void DecodeMOVSHDUPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask); + +void DecodeMOVDDUPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask); + +void DecodePSLLDQMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask); + +void DecodePSRLDQMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask); + +void DecodePALIGNRMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask); + +void DecodePSHUFMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask); + +void DecodePSHUFHWMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask); + +void DecodePSHUFLWMask(MVT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask); + +/// \brief Decodes a PSWAPD 3DNow! instruction. +void DecodePSWAPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask); + +/// DecodeSHUFPMask - This decodes the shuffle masks for shufp*. VT indicates +/// the type of the vector allowing it to handle different datatypes and vector +/// widths. +void DecodeSHUFPMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask); + +/// DecodeUNPCKHMask - This decodes the shuffle masks for unpckhps/unpckhpd +/// and punpckh*. VT indicates the type of the vector allowing it to handle +/// different datatypes and vector widths. +void DecodeUNPCKHMask(MVT VT, SmallVectorImpl<int> &ShuffleMask); + +/// DecodeUNPCKLMask - This decodes the shuffle masks for unpcklps/unpcklpd +/// and punpckl*. VT indicates the type of the vector allowing it to handle +/// different datatypes and vector widths. +void DecodeUNPCKLMask(MVT VT, SmallVectorImpl<int> &ShuffleMask); + +/// \brief Decode a PSHUFB mask from a raw array of constants such as from +/// BUILD_VECTOR. +void DecodePSHUFBMask(ArrayRef<uint64_t> RawMask, + SmallVectorImpl<int> &ShuffleMask); + +/// \brief Decode a BLEND immediate mask into a shuffle mask. +void DecodeBLENDMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask); + +void DecodeVPERM2X128Mask(MVT VT, unsigned Imm, + SmallVectorImpl<int> &ShuffleMask); + +/// \brief Decode a shuffle packed values at 128-bit granularity +/// immediate mask into a shuffle mask. +void decodeVSHUF64x2FamilyMask(MVT VT, unsigned Imm, + SmallVectorImpl<int> &ShuffleMask); + +/// DecodeVPERMMask - this decodes the shuffle masks for VPERMQ/VPERMPD. +/// No VT provided since it only works on 256-bit, 4 element vectors. +void DecodeVPERMMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask); + +/// \brief Decode a zero extension instruction as a shuffle mask. +void DecodeZeroExtendMask(MVT SrcVT, MVT DstVT, + SmallVectorImpl<int> &ShuffleMask); + +/// \brief Decode a move lower and zero upper instruction as a shuffle mask. 
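+/// For MVT::v4i32, for example, this produces <0, Z, Z, Z>, where Z is
+/// SM_SentinelZero.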
+void DecodeZeroMoveLowMask(MVT VT, SmallVectorImpl<int> &ShuffleMask); + +/// \brief Decode a scalar float move instruction as a shuffle mask. +void DecodeScalarMoveMask(MVT VT, bool IsLoad, + SmallVectorImpl<int> &ShuffleMask); + +/// \brief Decode a SSE4A EXTRQ instruction as a v16i8 shuffle mask. +void DecodeEXTRQIMask(int Len, int Idx, + SmallVectorImpl<int> &ShuffleMask); + +/// \brief Decode a SSE4A INSERTQ instruction as a v16i8 shuffle mask. +void DecodeINSERTQIMask(int Len, int Idx, + SmallVectorImpl<int> &ShuffleMask); + +/// \brief Decode a VPERM W/D/Q/PS/PD mask from a raw array of constants. +void DecodeVPERMVMask(ArrayRef<uint64_t> RawMask, + SmallVectorImpl<int> &ShuffleMask); + +/// \brief Decode a VPERMT2 W/D/Q/PS/PD mask from a raw array of constants. +void DecodeVPERMV3Mask(ArrayRef<uint64_t> RawMask, + SmallVectorImpl<int> &ShuffleMask); +} // llvm namespace + +#endif diff --git a/contrib/llvm/lib/Target/X86/X86.h b/contrib/llvm/lib/Target/X86/X86.h new file mode 100644 index 0000000..fbec662 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86.h @@ -0,0 +1,76 @@ +//===-- X86.h - Top-level interface for X86 representation ------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the entry points for global functions defined in the x86 +// target library, as used by the LLVM JIT. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_X86_X86_H +#define LLVM_LIB_TARGET_X86_X86_H + +#include "llvm/Support/CodeGen.h" + +namespace llvm { + +class FunctionPass; +class ImmutablePass; +class X86TargetMachine; + +/// This pass converts a legalized DAG into a X86-specific DAG, ready for +/// instruction scheduling. +FunctionPass *createX86ISelDag(X86TargetMachine &TM, + CodeGenOpt::Level OptLevel); + +/// This pass initializes a global base register for PIC on x86-32. +FunctionPass* createX86GlobalBaseRegPass(); + +/// This pass combines multiple accesses to local-dynamic TLS variables so that +/// the TLS base address for the module is only fetched once per execution path +/// through the function. +FunctionPass *createCleanupLocalDynamicTLSPass(); + +/// This function returns a pass which converts floating-point register +/// references and pseudo instructions into floating-point stack references and +/// physical instructions. +FunctionPass *createX86FloatingPointStackifierPass(); + +/// This pass inserts AVX vzeroupper instructions before each call to avoid +/// transition penalty between functions encoded with AVX and SSE. +FunctionPass *createX86IssueVZeroUpperPass(); + +/// Return a pass that pads short functions with NOOPs. +/// This will prevent a stall when returning on the Atom. +FunctionPass *createX86PadShortFunctions(); + +/// Return a a pass that selectively replaces certain instructions (like add, +/// sub, inc, dec, some shifts, and some multiplies) by equivalent LEA +/// instructions, in order to eliminate execution delays in some processors. +FunctionPass *createX86FixupLEAs(); + +/// Return a pass that removes redundant address recalculations. +FunctionPass *createX86OptimizeLEAs(); + +/// Return a pass that optimizes the code-size of x86 call sequences. This is +/// done by replacing esp-relative movs with pushes. 
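+/// For example (illustrative), argument stores such as
+///   movl $42, (%esp)
+///   movl $1, 4(%esp)
+///   calll foo
+/// can instead be emitted as
+///   pushl $1
+///   pushl $42
+///   calll foo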
+FunctionPass *createX86CallFrameOptimization(); + +/// Return an IR pass that inserts EH registration stack objects and explicit +/// EH state updates. This pass must run after EH preparation, which does +/// Windows-specific but architecture-neutral preparation. +FunctionPass *createX86WinEHStatePass(); + +/// Return a Machine IR pass that expands X86-specific pseudo +/// instructions into a sequence of actual instructions. This pass +/// must run after prologue/epilogue insertion and before lowering +/// the MachineInstr to MC. +FunctionPass *createX86ExpandPseudoPass(); +} // End llvm namespace + +#endif diff --git a/contrib/llvm/lib/Target/X86/X86.td b/contrib/llvm/lib/Target/X86/X86.td new file mode 100644 index 0000000..8902a85 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86.td @@ -0,0 +1,787 @@ +//===-- X86.td - Target definition file for the Intel X86 --*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This is a target description file for the Intel i386 architecture, referred +// to here as the "X86" architecture. +// +//===----------------------------------------------------------------------===// + +// Get the target-independent interfaces which we are implementing... +// +include "llvm/Target/Target.td" + +//===----------------------------------------------------------------------===// +// X86 Subtarget state +// + +def Mode64Bit : SubtargetFeature<"64bit-mode", "In64BitMode", "true", + "64-bit mode (x86_64)">; +def Mode32Bit : SubtargetFeature<"32bit-mode", "In32BitMode", "true", + "32-bit mode (80386)">; +def Mode16Bit : SubtargetFeature<"16bit-mode", "In16BitMode", "true", + "16-bit mode (i8086)">; + +//===----------------------------------------------------------------------===// +// X86 Subtarget features +//===----------------------------------------------------------------------===// + +def FeatureCMOV : SubtargetFeature<"cmov","HasCMov", "true", + "Enable conditional move instructions">; + +def FeaturePOPCNT : SubtargetFeature<"popcnt", "HasPOPCNT", "true", + "Support POPCNT instruction">; + +def FeatureFXSR : SubtargetFeature<"fxsr", "HasFXSR", "true", + "Support fxsave/fxrestore instructions">; + +def FeatureXSAVE : SubtargetFeature<"xsave", "HasXSAVE", "true", + "Support xsave instructions">; + +def FeatureXSAVEOPT: SubtargetFeature<"xsaveopt", "HasXSAVEOPT", "true", + "Support xsaveopt instructions">; + +def FeatureXSAVEC : SubtargetFeature<"xsavec", "HasXSAVEC", "true", + "Support xsavec instructions">; + +def FeatureXSAVES : SubtargetFeature<"xsaves", "HasXSAVES", "true", + "Support xsaves instructions">; + +def FeatureSSE1 : SubtargetFeature<"sse", "X86SSELevel", "SSE1", + "Enable SSE instructions", + // SSE codegen depends on cmovs, and all + // SSE1+ processors support them. 
+ [FeatureCMOV]>; +def FeatureSSE2 : SubtargetFeature<"sse2", "X86SSELevel", "SSE2", + "Enable SSE2 instructions", + [FeatureSSE1]>; +def FeatureSSE3 : SubtargetFeature<"sse3", "X86SSELevel", "SSE3", + "Enable SSE3 instructions", + [FeatureSSE2]>; +def FeatureSSSE3 : SubtargetFeature<"ssse3", "X86SSELevel", "SSSE3", + "Enable SSSE3 instructions", + [FeatureSSE3]>; +def FeatureSSE41 : SubtargetFeature<"sse4.1", "X86SSELevel", "SSE41", + "Enable SSE 4.1 instructions", + [FeatureSSSE3]>; +def FeatureSSE42 : SubtargetFeature<"sse4.2", "X86SSELevel", "SSE42", + "Enable SSE 4.2 instructions", + [FeatureSSE41]>; +// The MMX subtarget feature is separate from the rest of the SSE features +// because it's important (for odd compatibility reasons) to be able to +// turn it off explicitly while allowing SSE+ to be on. +def FeatureMMX : SubtargetFeature<"mmx","X863DNowLevel", "MMX", + "Enable MMX instructions">; +def Feature3DNow : SubtargetFeature<"3dnow", "X863DNowLevel", "ThreeDNow", + "Enable 3DNow! instructions", + [FeatureMMX]>; +def Feature3DNowA : SubtargetFeature<"3dnowa", "X863DNowLevel", "ThreeDNowA", + "Enable 3DNow! Athlon instructions", + [Feature3DNow]>; +// All x86-64 hardware has SSE2, but we don't mark SSE2 as an implied +// feature, because SSE2 can be disabled (e.g. for compiling OS kernels) +// without disabling 64-bit mode. +def Feature64Bit : SubtargetFeature<"64bit", "HasX86_64", "true", + "Support 64-bit instructions", + [FeatureCMOV]>; +def FeatureCMPXCHG16B : SubtargetFeature<"cx16", "HasCmpxchg16b", "true", + "64-bit with cmpxchg16b", + [Feature64Bit]>; +def FeatureSlowBTMem : SubtargetFeature<"slow-bt-mem", "IsBTMemSlow", "true", + "Bit testing of memory is slow">; +def FeatureSlowSHLD : SubtargetFeature<"slow-shld", "IsSHLDSlow", "true", + "SHLD instruction is slow">; +// FIXME: This should not apply to CPUs that do not have SSE. 
+def FeatureSlowUAMem16 : SubtargetFeature<"slow-unaligned-mem-16", + "IsUAMem16Slow", "true", + "Slow unaligned 16-byte memory access">; +def FeatureSlowUAMem32 : SubtargetFeature<"slow-unaligned-mem-32", + "IsUAMem32Slow", "true", + "Slow unaligned 32-byte memory access">; +def FeatureSSE4A : SubtargetFeature<"sse4a", "HasSSE4A", "true", + "Support SSE 4a instructions", + [FeatureSSE3]>; + +def FeatureAVX : SubtargetFeature<"avx", "X86SSELevel", "AVX", + "Enable AVX instructions", + [FeatureSSE42]>; +def FeatureAVX2 : SubtargetFeature<"avx2", "X86SSELevel", "AVX2", + "Enable AVX2 instructions", + [FeatureAVX]>; +def FeatureAVX512 : SubtargetFeature<"avx512f", "X86SSELevel", "AVX512F", + "Enable AVX-512 instructions", + [FeatureAVX2]>; +def FeatureERI : SubtargetFeature<"avx512er", "HasERI", "true", + "Enable AVX-512 Exponential and Reciprocal Instructions", + [FeatureAVX512]>; +def FeatureCDI : SubtargetFeature<"avx512cd", "HasCDI", "true", + "Enable AVX-512 Conflict Detection Instructions", + [FeatureAVX512]>; +def FeaturePFI : SubtargetFeature<"avx512pf", "HasPFI", "true", + "Enable AVX-512 PreFetch Instructions", + [FeatureAVX512]>; +def FeatureDQI : SubtargetFeature<"avx512dq", "HasDQI", "true", + "Enable AVX-512 Doubleword and Quadword Instructions", + [FeatureAVX512]>; +def FeatureBWI : SubtargetFeature<"avx512bw", "HasBWI", "true", + "Enable AVX-512 Byte and Word Instructions", + [FeatureAVX512]>; +def FeatureVLX : SubtargetFeature<"avx512vl", "HasVLX", "true", + "Enable AVX-512 Vector Length eXtensions", + [FeatureAVX512]>; +def FeaturePKU : SubtargetFeature<"pku", "HasPKU", "true", + "Enable protection keys">; +def FeaturePCLMUL : SubtargetFeature<"pclmul", "HasPCLMUL", "true", + "Enable packed carry-less multiplication instructions", + [FeatureSSE2]>; +def FeatureFMA : SubtargetFeature<"fma", "HasFMA", "true", + "Enable three-operand fused multiple-add", + [FeatureAVX]>; +def FeatureFMA4 : SubtargetFeature<"fma4", "HasFMA4", "true", + "Enable four-operand fused multiple-add", + [FeatureAVX, FeatureSSE4A]>; +def FeatureXOP : SubtargetFeature<"xop", "HasXOP", "true", + "Enable XOP instructions", + [FeatureFMA4]>; +def FeatureSSEUnalignedMem : SubtargetFeature<"sse-unaligned-mem", + "HasSSEUnalignedMem", "true", + "Allow unaligned memory operands with SSE instructions">; +def FeatureAES : SubtargetFeature<"aes", "HasAES", "true", + "Enable AES instructions", + [FeatureSSE2]>; +def FeatureTBM : SubtargetFeature<"tbm", "HasTBM", "true", + "Enable TBM instructions">; +def FeatureMOVBE : SubtargetFeature<"movbe", "HasMOVBE", "true", + "Support MOVBE instruction">; +def FeatureRDRAND : SubtargetFeature<"rdrnd", "HasRDRAND", "true", + "Support RDRAND instruction">; +def FeatureF16C : SubtargetFeature<"f16c", "HasF16C", "true", + "Support 16-bit floating point conversion instructions", + [FeatureAVX]>; +def FeatureFSGSBase : SubtargetFeature<"fsgsbase", "HasFSGSBase", "true", + "Support FS/GS Base instructions">; +def FeatureLZCNT : SubtargetFeature<"lzcnt", "HasLZCNT", "true", + "Support LZCNT instruction">; +def FeatureBMI : SubtargetFeature<"bmi", "HasBMI", "true", + "Support BMI instructions">; +def FeatureBMI2 : SubtargetFeature<"bmi2", "HasBMI2", "true", + "Support BMI2 instructions">; +def FeatureRTM : SubtargetFeature<"rtm", "HasRTM", "true", + "Support RTM instructions">; +def FeatureHLE : SubtargetFeature<"hle", "HasHLE", "true", + "Support HLE">; +def FeatureADX : SubtargetFeature<"adx", "HasADX", "true", + "Support ADX instructions">; +def FeatureSHA : SubtargetFeature<"sha", 
"HasSHA", "true", + "Enable SHA instructions", + [FeatureSSE2]>; +def FeaturePRFCHW : SubtargetFeature<"prfchw", "HasPRFCHW", "true", + "Support PRFCHW instructions">; +def FeatureRDSEED : SubtargetFeature<"rdseed", "HasRDSEED", "true", + "Support RDSEED instruction">; +def FeatureLAHFSAHF : SubtargetFeature<"sahf", "HasLAHFSAHF", "true", + "Support LAHF and SAHF instructions">; +def FeatureMPX : SubtargetFeature<"mpx", "HasMPX", "true", + "Support MPX instructions">; +def FeatureLEAForSP : SubtargetFeature<"lea-sp", "UseLeaForSP", "true", + "Use LEA for adjusting the stack pointer">; +def FeatureSlowDivide32 : SubtargetFeature<"idivl-to-divb", + "HasSlowDivide32", "true", + "Use 8-bit divide for positive values less than 256">; +def FeatureSlowDivide64 : SubtargetFeature<"idivq-to-divw", + "HasSlowDivide64", "true", + "Use 16-bit divide for positive values less than 65536">; +def FeaturePadShortFunctions : SubtargetFeature<"pad-short-functions", + "PadShortFunctions", "true", + "Pad short functions">; +// TODO: This feature ought to be renamed. +// What it really refers to are CPUs for which certain instructions +// (which ones besides the example below?) are microcoded. +// The best examples of this are the memory forms of CALL and PUSH +// instructions, which should be avoided in favor of a MOV + register CALL/PUSH. +def FeatureCallRegIndirect : SubtargetFeature<"call-reg-indirect", + "CallRegIndirect", "true", + "Call register indirect">; +def FeatureLEAUsesAG : SubtargetFeature<"lea-uses-ag", "LEAUsesAG", "true", + "LEA instruction needs inputs at AG stage">; +def FeatureSlowLEA : SubtargetFeature<"slow-lea", "SlowLEA", "true", + "LEA instruction with certain arguments is slow">; +def FeatureSlowIncDec : SubtargetFeature<"slow-incdec", "SlowIncDec", "true", + "INC and DEC instructions are slower than ADD and SUB">; +def FeatureSoftFloat + : SubtargetFeature<"soft-float", "UseSoftFloat", "true", + "Use software floating point features.">; + +//===----------------------------------------------------------------------===// +// X86 processors supported. +//===----------------------------------------------------------------------===// + +include "X86Schedule.td" + +def ProcIntelAtom : SubtargetFeature<"atom", "X86ProcFamily", "IntelAtom", + "Intel Atom processors">; +def ProcIntelSLM : SubtargetFeature<"slm", "X86ProcFamily", "IntelSLM", + "Intel Silvermont processors">; + +class Proc<string Name, list<SubtargetFeature> Features> + : ProcessorModel<Name, GenericModel, Features>; + +def : Proc<"generic", [FeatureSlowUAMem16]>; +def : Proc<"i386", [FeatureSlowUAMem16]>; +def : Proc<"i486", [FeatureSlowUAMem16]>; +def : Proc<"i586", [FeatureSlowUAMem16]>; +def : Proc<"pentium", [FeatureSlowUAMem16]>; +def : Proc<"pentium-mmx", [FeatureSlowUAMem16, FeatureMMX]>; +def : Proc<"i686", [FeatureSlowUAMem16]>; +def : Proc<"pentiumpro", [FeatureSlowUAMem16, FeatureCMOV]>; +def : Proc<"pentium2", [FeatureSlowUAMem16, FeatureMMX, FeatureCMOV, + FeatureFXSR]>; +def : Proc<"pentium3", [FeatureSlowUAMem16, FeatureMMX, FeatureSSE1, + FeatureFXSR]>; +def : Proc<"pentium3m", [FeatureSlowUAMem16, FeatureMMX, FeatureSSE1, + FeatureFXSR, FeatureSlowBTMem]>; +def : Proc<"pentium-m", [FeatureSlowUAMem16, FeatureMMX, FeatureSSE2, + FeatureFXSR, FeatureSlowBTMem]>; +def : Proc<"pentium4", [FeatureSlowUAMem16, FeatureMMX, FeatureSSE2, + FeatureFXSR]>; +def : Proc<"pentium4m", [FeatureSlowUAMem16, FeatureMMX, FeatureSSE2, + FeatureFXSR, FeatureSlowBTMem]>; + +// Intel Core Duo. 
+def : ProcessorModel<"yonah", SandyBridgeModel, + [FeatureSlowUAMem16, FeatureMMX, FeatureSSE3, FeatureFXSR, + FeatureSlowBTMem]>; + +// NetBurst. +def : Proc<"prescott", + [FeatureSlowUAMem16, FeatureMMX, FeatureSSE3, FeatureFXSR, + FeatureSlowBTMem]>; +def : Proc<"nocona", [ + FeatureSlowUAMem16, + FeatureMMX, + FeatureSSE3, + FeatureFXSR, + FeatureCMPXCHG16B, + FeatureSlowBTMem +]>; + +// Intel Core 2 Solo/Duo. +def : ProcessorModel<"core2", SandyBridgeModel, [ + FeatureSlowUAMem16, + FeatureMMX, + FeatureSSSE3, + FeatureFXSR, + FeatureCMPXCHG16B, + FeatureSlowBTMem, + FeatureLAHFSAHF +]>; +def : ProcessorModel<"penryn", SandyBridgeModel, [ + FeatureSlowUAMem16, + FeatureMMX, + FeatureSSE41, + FeatureFXSR, + FeatureCMPXCHG16B, + FeatureSlowBTMem, + FeatureLAHFSAHF +]>; + +// Atom CPUs. +class BonnellProc<string Name> : ProcessorModel<Name, AtomModel, [ + ProcIntelAtom, + FeatureSlowUAMem16, + FeatureMMX, + FeatureSSSE3, + FeatureFXSR, + FeatureCMPXCHG16B, + FeatureMOVBE, + FeatureSlowBTMem, + FeatureLEAForSP, + FeatureSlowDivide32, + FeatureSlowDivide64, + FeatureCallRegIndirect, + FeatureLEAUsesAG, + FeaturePadShortFunctions, + FeatureLAHFSAHF +]>; +def : BonnellProc<"bonnell">; +def : BonnellProc<"atom">; // Pin the generic name to the baseline. + +class SilvermontProc<string Name> : ProcessorModel<Name, SLMModel, [ + ProcIntelSLM, + FeatureMMX, + FeatureSSE42, + FeatureFXSR, + FeatureCMPXCHG16B, + FeatureMOVBE, + FeaturePOPCNT, + FeaturePCLMUL, + FeatureAES, + FeatureSlowDivide64, + FeatureCallRegIndirect, + FeaturePRFCHW, + FeatureSlowLEA, + FeatureSlowIncDec, + FeatureSlowBTMem, + FeatureLAHFSAHF +]>; +def : SilvermontProc<"silvermont">; +def : SilvermontProc<"slm">; // Legacy alias. + +// "Arrandale" along with corei3 and corei5 +class NehalemProc<string Name> : ProcessorModel<Name, SandyBridgeModel, [ + FeatureMMX, + FeatureSSE42, + FeatureFXSR, + FeatureCMPXCHG16B, + FeatureSlowBTMem, + FeaturePOPCNT, + FeatureLAHFSAHF +]>; +def : NehalemProc<"nehalem">; +def : NehalemProc<"corei7">; + +// Westmere is a similar machine to nehalem with some additional features. +// Westmere is the corei3/i5/i7 path from nehalem to sandybridge +class WestmereProc<string Name> : ProcessorModel<Name, SandyBridgeModel, [ + FeatureMMX, + FeatureSSE42, + FeatureFXSR, + FeatureCMPXCHG16B, + FeatureSlowBTMem, + FeaturePOPCNT, + FeatureAES, + FeaturePCLMUL, + FeatureLAHFSAHF +]>; +def : WestmereProc<"westmere">; + +// SSE is not listed here since llvm treats AVX as a reimplementation of SSE, +// rather than a superset. +class SandyBridgeProc<string Name> : ProcessorModel<Name, SandyBridgeModel, [ + FeatureMMX, + FeatureAVX, + FeatureFXSR, + FeatureCMPXCHG16B, + FeatureSlowBTMem, + FeatureSlowUAMem32, + FeaturePOPCNT, + FeatureAES, + FeaturePCLMUL, + FeatureXSAVE, + FeatureXSAVEOPT, + FeatureLAHFSAHF +]>; +def : SandyBridgeProc<"sandybridge">; +def : SandyBridgeProc<"corei7-avx">; // Legacy alias. + +class IvyBridgeProc<string Name> : ProcessorModel<Name, SandyBridgeModel, [ + FeatureMMX, + FeatureAVX, + FeatureFXSR, + FeatureCMPXCHG16B, + FeatureSlowBTMem, + FeatureSlowUAMem32, + FeaturePOPCNT, + FeatureAES, + FeaturePCLMUL, + FeatureXSAVE, + FeatureXSAVEOPT, + FeatureRDRAND, + FeatureF16C, + FeatureFSGSBase, + FeatureLAHFSAHF +]>; +def : IvyBridgeProc<"ivybridge">; +def : IvyBridgeProc<"core-avx-i">; // Legacy alias. 
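+
+// A minimal sketch (hypothetical CPU name, not a real definition) of how a
+// new processor is typically added: pick a scheduling model and list its
+// feature set, or wrap it in a class when several -mcpu aliases share it.
+//   def : ProcessorModel<"my-cpu", SandyBridgeModel,
+//                        [FeatureMMX, FeatureAVX, FeatureFXSR,
+//                         FeatureCMPXCHG16B, FeaturePOPCNT, FeatureLAHFSAHF]>;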
+ +class HaswellProc<string Name> : ProcessorModel<Name, HaswellModel, [ + FeatureMMX, + FeatureAVX2, + FeatureFXSR, + FeatureCMPXCHG16B, + FeatureSlowBTMem, + FeaturePOPCNT, + FeatureAES, + FeaturePCLMUL, + FeatureRDRAND, + FeatureXSAVE, + FeatureXSAVEOPT, + FeatureF16C, + FeatureFSGSBase, + FeatureMOVBE, + FeatureLZCNT, + FeatureBMI, + FeatureBMI2, + FeatureFMA, + FeatureRTM, + FeatureHLE, + FeatureSlowIncDec, + FeatureLAHFSAHF +]>; +def : HaswellProc<"haswell">; +def : HaswellProc<"core-avx2">; // Legacy alias. + +class BroadwellProc<string Name> : ProcessorModel<Name, HaswellModel, [ + FeatureMMX, + FeatureAVX2, + FeatureFXSR, + FeatureCMPXCHG16B, + FeatureSlowBTMem, + FeaturePOPCNT, + FeatureAES, + FeaturePCLMUL, + FeatureXSAVE, + FeatureXSAVEOPT, + FeatureRDRAND, + FeatureF16C, + FeatureFSGSBase, + FeatureMOVBE, + FeatureLZCNT, + FeatureBMI, + FeatureBMI2, + FeatureFMA, + FeatureRTM, + FeatureHLE, + FeatureADX, + FeatureRDSEED, + FeatureSlowIncDec, + FeatureLAHFSAHF +]>; +def : BroadwellProc<"broadwell">; + +// FIXME: define KNL model +class KnightsLandingProc<string Name> : ProcessorModel<Name, HaswellModel, [ + FeatureMMX, + FeatureAVX512, + FeatureFXSR, + FeatureERI, + FeatureCDI, + FeaturePFI, + FeatureCMPXCHG16B, + FeaturePOPCNT, + FeatureAES, + FeaturePCLMUL, + FeatureXSAVE, + FeatureXSAVEOPT, + FeatureRDRAND, + FeatureF16C, + FeatureFSGSBase, + FeatureMOVBE, + FeatureLZCNT, + FeatureBMI, + FeatureBMI2, + FeatureFMA, + FeatureRTM, + FeatureHLE, + FeatureSlowIncDec, + FeatureMPX, + FeatureLAHFSAHF +]>; +def : KnightsLandingProc<"knl">; + +// FIXME: define SKX model +class SkylakeProc<string Name> : ProcessorModel<Name, HaswellModel, [ + FeatureMMX, + FeatureAVX512, + FeatureFXSR, + FeatureCDI, + FeatureDQI, + FeatureBWI, + FeatureVLX, + FeaturePKU, + FeatureCMPXCHG16B, + FeatureSlowBTMem, + FeaturePOPCNT, + FeatureAES, + FeaturePCLMUL, + FeatureXSAVE, + FeatureXSAVEOPT, + FeatureRDRAND, + FeatureF16C, + FeatureFSGSBase, + FeatureMOVBE, + FeatureLZCNT, + FeatureBMI, + FeatureBMI2, + FeatureFMA, + FeatureRTM, + FeatureHLE, + FeatureADX, + FeatureRDSEED, + FeatureSlowIncDec, + FeatureMPX, + FeatureXSAVEC, + FeatureXSAVES, + FeatureLAHFSAHF +]>; +def : SkylakeProc<"skylake">; +def : SkylakeProc<"skx">; // Legacy alias. + + +// AMD CPUs. 
+ +def : Proc<"k6", [FeatureSlowUAMem16, FeatureMMX]>; +def : Proc<"k6-2", [FeatureSlowUAMem16, Feature3DNow]>; +def : Proc<"k6-3", [FeatureSlowUAMem16, Feature3DNow]>; +def : Proc<"athlon", [FeatureSlowUAMem16, Feature3DNowA, + FeatureSlowBTMem, FeatureSlowSHLD]>; +def : Proc<"athlon-tbird", [FeatureSlowUAMem16, Feature3DNowA, + FeatureSlowBTMem, FeatureSlowSHLD]>; +def : Proc<"athlon-4", [FeatureSlowUAMem16, FeatureSSE1, Feature3DNowA, + FeatureFXSR, FeatureSlowBTMem, FeatureSlowSHLD]>; +def : Proc<"athlon-xp", [FeatureSlowUAMem16, FeatureSSE1, Feature3DNowA, + FeatureFXSR, FeatureSlowBTMem, FeatureSlowSHLD]>; +def : Proc<"athlon-mp", [FeatureSlowUAMem16, FeatureSSE1, Feature3DNowA, + FeatureFXSR, FeatureSlowBTMem, FeatureSlowSHLD]>; +def : Proc<"k8", [FeatureSlowUAMem16, FeatureSSE2, Feature3DNowA, + FeatureFXSR, Feature64Bit, FeatureSlowBTMem, + FeatureSlowSHLD]>; +def : Proc<"opteron", [FeatureSlowUAMem16, FeatureSSE2, Feature3DNowA, + FeatureFXSR, Feature64Bit, FeatureSlowBTMem, + FeatureSlowSHLD]>; +def : Proc<"athlon64", [FeatureSlowUAMem16, FeatureSSE2, Feature3DNowA, + FeatureFXSR, Feature64Bit, FeatureSlowBTMem, + FeatureSlowSHLD]>; +def : Proc<"athlon-fx", [FeatureSlowUAMem16, FeatureSSE2, Feature3DNowA, + FeatureFXSR, Feature64Bit, FeatureSlowBTMem, + FeatureSlowSHLD]>; +def : Proc<"k8-sse3", [FeatureSlowUAMem16, FeatureSSE3, Feature3DNowA, + FeatureFXSR, FeatureCMPXCHG16B, FeatureSlowBTMem, + FeatureSlowSHLD]>; +def : Proc<"opteron-sse3", [FeatureSlowUAMem16, FeatureSSE3, Feature3DNowA, + FeatureFXSR, FeatureCMPXCHG16B, FeatureSlowBTMem, + FeatureSlowSHLD]>; +def : Proc<"athlon64-sse3", [FeatureSlowUAMem16, FeatureSSE3, Feature3DNowA, + FeatureFXSR, FeatureCMPXCHG16B, FeatureSlowBTMem, + FeatureSlowSHLD]>; +def : Proc<"amdfam10", [FeatureSSE4A, Feature3DNowA, FeatureFXSR, + FeatureCMPXCHG16B, FeatureLZCNT, FeaturePOPCNT, + FeatureSlowBTMem, FeatureSlowSHLD, FeatureLAHFSAHF]>; +def : Proc<"barcelona", [FeatureSSE4A, Feature3DNowA, FeatureFXSR, + FeatureCMPXCHG16B, FeatureLZCNT, FeaturePOPCNT, + FeatureSlowBTMem, FeatureSlowSHLD, FeatureLAHFSAHF]>; + +// Bobcat +def : Proc<"btver1", [ + FeatureMMX, + FeatureSSSE3, + FeatureSSE4A, + FeatureFXSR, + FeatureCMPXCHG16B, + FeaturePRFCHW, + FeatureLZCNT, + FeaturePOPCNT, + FeatureXSAVE, + FeatureSlowSHLD, + FeatureLAHFSAHF +]>; + +// Jaguar +def : ProcessorModel<"btver2", BtVer2Model, [ + FeatureMMX, + FeatureAVX, + FeatureFXSR, + FeatureSSE4A, + FeatureCMPXCHG16B, + FeaturePRFCHW, + FeatureAES, + FeaturePCLMUL, + FeatureBMI, + FeatureF16C, + FeatureMOVBE, + FeatureLZCNT, + FeaturePOPCNT, + FeatureXSAVE, + FeatureXSAVEOPT, + FeatureSlowSHLD, + FeatureLAHFSAHF +]>; + +// Bulldozer +def : Proc<"bdver1", [ + FeatureXOP, + FeatureFMA4, + FeatureCMPXCHG16B, + FeatureAES, + FeaturePRFCHW, + FeaturePCLMUL, + FeatureMMX, + FeatureAVX, + FeatureFXSR, + FeatureSSE4A, + FeatureLZCNT, + FeaturePOPCNT, + FeatureXSAVE, + FeatureSlowSHLD, + FeatureLAHFSAHF +]>; +// Piledriver +def : Proc<"bdver2", [ + FeatureXOP, + FeatureFMA4, + FeatureCMPXCHG16B, + FeatureAES, + FeaturePRFCHW, + FeaturePCLMUL, + FeatureMMX, + FeatureAVX, + FeatureFXSR, + FeatureSSE4A, + FeatureF16C, + FeatureLZCNT, + FeaturePOPCNT, + FeatureXSAVE, + FeatureBMI, + FeatureTBM, + FeatureFMA, + FeatureSlowSHLD, + FeatureLAHFSAHF +]>; + +// Steamroller +def : Proc<"bdver3", [ + FeatureXOP, + FeatureFMA4, + FeatureCMPXCHG16B, + FeatureAES, + FeaturePRFCHW, + FeaturePCLMUL, + FeatureMMX, + FeatureAVX, + FeatureFXSR, + FeatureSSE4A, + FeatureF16C, + FeatureLZCNT, + FeaturePOPCNT, + 
FeatureXSAVE, + FeatureBMI, + FeatureTBM, + FeatureFMA, + FeatureXSAVEOPT, + FeatureSlowSHLD, + FeatureFSGSBase, + FeatureLAHFSAHF +]>; + +// Excavator +def : Proc<"bdver4", [ + FeatureMMX, + FeatureAVX2, + FeatureFXSR, + FeatureXOP, + FeatureFMA4, + FeatureCMPXCHG16B, + FeatureAES, + FeaturePRFCHW, + FeaturePCLMUL, + FeatureF16C, + FeatureLZCNT, + FeaturePOPCNT, + FeatureXSAVE, + FeatureBMI, + FeatureBMI2, + FeatureTBM, + FeatureFMA, + FeatureXSAVEOPT, + FeatureFSGSBase, + FeatureLAHFSAHF +]>; + +def : Proc<"geode", [FeatureSlowUAMem16, Feature3DNowA]>; + +def : Proc<"winchip-c6", [FeatureSlowUAMem16, FeatureMMX]>; +def : Proc<"winchip2", [FeatureSlowUAMem16, Feature3DNow]>; +def : Proc<"c3", [FeatureSlowUAMem16, Feature3DNow]>; +def : Proc<"c3-2", [FeatureSlowUAMem16, FeatureMMX, FeatureSSE1, FeatureFXSR]>; + +// We also provide a generic 64-bit specific x86 processor model which tries to +// be good for modern chips without enabling instruction set encodings past the +// basic SSE2 and 64-bit ones. It disables slow things from any mainstream and +// modern 64-bit x86 chip, and enables features that are generally beneficial. +// +// We currently use the Sandy Bridge model as the default scheduling model as +// we use it across Nehalem, Westmere, Sandy Bridge, and Ivy Bridge which +// covers a huge swath of x86 processors. If there are specific scheduling +// knobs which need to be tuned differently for AMD chips, we might consider +// forming a common base for them. +def : ProcessorModel<"x86-64", SandyBridgeModel, + [FeatureMMX, FeatureSSE2, FeatureFXSR, Feature64Bit, + FeatureSlowBTMem ]>; + +//===----------------------------------------------------------------------===// +// Register File Description +//===----------------------------------------------------------------------===// + +include "X86RegisterInfo.td" + +//===----------------------------------------------------------------------===// +// Instruction Descriptions +//===----------------------------------------------------------------------===// + +include "X86InstrInfo.td" + +def X86InstrInfo : InstrInfo; + +//===----------------------------------------------------------------------===// +// Calling Conventions +//===----------------------------------------------------------------------===// + +include "X86CallingConv.td" + + +//===----------------------------------------------------------------------===// +// Assembly Parser +//===----------------------------------------------------------------------===// + +def ATTAsmParserVariant : AsmParserVariant { + int Variant = 0; + + // Variant name. + string Name = "att"; + + // Discard comments in assembly strings. + string CommentDelimiter = "#"; + + // Recognize hard coded registers. + string RegisterPrefix = "%"; +} + +def IntelAsmParserVariant : AsmParserVariant { + int Variant = 1; + + // Variant name. + string Name = "intel"; + + // Discard comments in assembly strings. + string CommentDelimiter = ";"; + + // Recognize hard coded registers. + string RegisterPrefix = ""; +} + +//===----------------------------------------------------------------------===// +// Assembly Printers +//===----------------------------------------------------------------------===// + +// The X86 target supports two different syntaxes for emitting machine code. 
+// This is controlled by the -x86-asm-syntax={att|intel} +def ATTAsmWriter : AsmWriter { + string AsmWriterClassName = "ATTInstPrinter"; + int Variant = 0; +} +def IntelAsmWriter : AsmWriter { + string AsmWriterClassName = "IntelInstPrinter"; + int Variant = 1; +} + +def X86 : Target { + // Information about the instructions... + let InstructionSet = X86InstrInfo; + let AssemblyParserVariants = [ATTAsmParserVariant, IntelAsmParserVariant]; + let AssemblyWriters = [ATTAsmWriter, IntelAsmWriter]; +} diff --git a/contrib/llvm/lib/Target/X86/X86AsmPrinter.cpp b/contrib/llvm/lib/Target/X86/X86AsmPrinter.cpp new file mode 100644 index 0000000..2170e62 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86AsmPrinter.cpp @@ -0,0 +1,706 @@ +//===-- X86AsmPrinter.cpp - Convert X86 LLVM code to AT&T assembly --------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains a printer that converts from our internal representation +// of machine-dependent LLVM code to X86 machine code. +// +//===----------------------------------------------------------------------===// + +#include "X86AsmPrinter.h" +#include "InstPrinter/X86ATTInstPrinter.h" +#include "MCTargetDesc/X86BaseInfo.h" +#include "X86InstrInfo.h" +#include "X86MachineFunctionInfo.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/CodeGen/MachineConstantPool.h" +#include "llvm/CodeGen/MachineModuleInfoImpls.h" +#include "llvm/CodeGen/MachineValueType.h" +#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" +#include "llvm/IR/DebugInfo.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Mangler.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Type.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCSectionCOFF.h" +#include "llvm/MC/MCSectionMachO.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/Support/COFF.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/TargetRegistry.h" +using namespace llvm; + +//===----------------------------------------------------------------------===// +// Primitive Helper Functions. +//===----------------------------------------------------------------------===// + +/// runOnMachineFunction - Emit the function body. +/// +bool X86AsmPrinter::runOnMachineFunction(MachineFunction &MF) { + Subtarget = &MF.getSubtarget<X86Subtarget>(); + + SMShadowTracker.startFunction(MF); + + SetupMachineFunction(MF); + + if (Subtarget->isTargetCOFF()) { + bool Intrn = MF.getFunction()->hasInternalLinkage(); + OutStreamer->BeginCOFFSymbolDef(CurrentFnSym); + OutStreamer->EmitCOFFSymbolStorageClass(Intrn ? COFF::IMAGE_SYM_CLASS_STATIC + : COFF::IMAGE_SYM_CLASS_EXTERNAL); + OutStreamer->EmitCOFFSymbolType(COFF::IMAGE_SYM_DTYPE_FUNCTION + << COFF::SCT_COMPLEX_TYPE_SHIFT); + OutStreamer->EndCOFFSymbolDef(); + } + + // Emit the rest of the function body. + EmitFunctionBody(); + + // We didn't modify anything. + return false; +} + +/// printSymbolOperand - Print a raw symbol reference operand. This handles +/// jump tables, constant pools, global address and external symbols, all of +/// which print to a label with various suffixes for relocation types etc. 
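+/// For example, a GOT-relative reference prints with an "@GOTPCREL" suffix,
+/// and a Darwin non-lazy pointer reference prints as "<sym>$non_lazy_ptr"
+/// minus the PIC base label.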
+static void printSymbolOperand(X86AsmPrinter &P, const MachineOperand &MO, + raw_ostream &O) { + switch (MO.getType()) { + default: llvm_unreachable("unknown symbol type!"); + case MachineOperand::MO_ConstantPoolIndex: + P.GetCPISymbol(MO.getIndex())->print(O, P.MAI); + P.printOffset(MO.getOffset(), O); + break; + case MachineOperand::MO_GlobalAddress: { + const GlobalValue *GV = MO.getGlobal(); + + MCSymbol *GVSym; + if (MO.getTargetFlags() == X86II::MO_DARWIN_STUB) + GVSym = P.getSymbolWithGlobalValueBase(GV, "$stub"); + else if (MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY || + MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY_PIC_BASE || + MO.getTargetFlags() == X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE) + GVSym = P.getSymbolWithGlobalValueBase(GV, "$non_lazy_ptr"); + else + GVSym = P.getSymbol(GV); + + // Handle dllimport linkage. + if (MO.getTargetFlags() == X86II::MO_DLLIMPORT) + GVSym = + P.OutContext.getOrCreateSymbol(Twine("__imp_") + GVSym->getName()); + + if (MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY || + MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY_PIC_BASE) { + MCSymbol *Sym = P.getSymbolWithGlobalValueBase(GV, "$non_lazy_ptr"); + MachineModuleInfoImpl::StubValueTy &StubSym = + P.MMI->getObjFileInfo<MachineModuleInfoMachO>().getGVStubEntry(Sym); + if (!StubSym.getPointer()) + StubSym = MachineModuleInfoImpl:: + StubValueTy(P.getSymbol(GV), !GV->hasInternalLinkage()); + } else if (MO.getTargetFlags() == X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE){ + MCSymbol *Sym = P.getSymbolWithGlobalValueBase(GV, "$non_lazy_ptr"); + MachineModuleInfoImpl::StubValueTy &StubSym = + P.MMI->getObjFileInfo<MachineModuleInfoMachO>().getHiddenGVStubEntry( + Sym); + if (!StubSym.getPointer()) + StubSym = MachineModuleInfoImpl:: + StubValueTy(P.getSymbol(GV), !GV->hasInternalLinkage()); + } else if (MO.getTargetFlags() == X86II::MO_DARWIN_STUB) { + MCSymbol *Sym = P.getSymbolWithGlobalValueBase(GV, "$stub"); + MachineModuleInfoImpl::StubValueTy &StubSym = + P.MMI->getObjFileInfo<MachineModuleInfoMachO>().getFnStubEntry(Sym); + if (!StubSym.getPointer()) + StubSym = MachineModuleInfoImpl:: + StubValueTy(P.getSymbol(GV), !GV->hasInternalLinkage()); + } + + // If the name begins with a dollar-sign, enclose it in parens. We do this + // to avoid having it look like an integer immediate to the assembler. + if (GVSym->getName()[0] != '$') + GVSym->print(O, P.MAI); + else { + O << '('; + GVSym->print(O, P.MAI); + O << ')'; + } + P.printOffset(MO.getOffset(), O); + break; + } + } + + switch (MO.getTargetFlags()) { + default: + llvm_unreachable("Unknown target flag on GV operand"); + case X86II::MO_NO_FLAG: // No flag. + break; + case X86II::MO_DARWIN_NONLAZY: + case X86II::MO_DLLIMPORT: + case X86II::MO_DARWIN_STUB: + // These affect the name of the symbol, not any suffix. 
+ break; + case X86II::MO_GOT_ABSOLUTE_ADDRESS: + O << " + [.-"; + P.MF->getPICBaseSymbol()->print(O, P.MAI); + O << ']'; + break; + case X86II::MO_PIC_BASE_OFFSET: + case X86II::MO_DARWIN_NONLAZY_PIC_BASE: + case X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE: + O << '-'; + P.MF->getPICBaseSymbol()->print(O, P.MAI); + break; + case X86II::MO_TLSGD: O << "@TLSGD"; break; + case X86II::MO_TLSLD: O << "@TLSLD"; break; + case X86II::MO_TLSLDM: O << "@TLSLDM"; break; + case X86II::MO_GOTTPOFF: O << "@GOTTPOFF"; break; + case X86II::MO_INDNTPOFF: O << "@INDNTPOFF"; break; + case X86II::MO_TPOFF: O << "@TPOFF"; break; + case X86II::MO_DTPOFF: O << "@DTPOFF"; break; + case X86II::MO_NTPOFF: O << "@NTPOFF"; break; + case X86II::MO_GOTNTPOFF: O << "@GOTNTPOFF"; break; + case X86II::MO_GOTPCREL: O << "@GOTPCREL"; break; + case X86II::MO_GOT: O << "@GOT"; break; + case X86II::MO_GOTOFF: O << "@GOTOFF"; break; + case X86II::MO_PLT: O << "@PLT"; break; + case X86II::MO_TLVP: O << "@TLVP"; break; + case X86II::MO_TLVP_PIC_BASE: + O << "@TLVP" << '-'; + P.MF->getPICBaseSymbol()->print(O, P.MAI); + break; + case X86II::MO_SECREL: O << "@SECREL32"; break; + } +} + +static void printOperand(X86AsmPrinter &P, const MachineInstr *MI, + unsigned OpNo, raw_ostream &O, + const char *Modifier = nullptr, unsigned AsmVariant = 0); + +/// printPCRelImm - This is used to print an immediate value that ends up +/// being encoded as a pc-relative value. These print slightly differently, for +/// example, a $ is not emitted. +static void printPCRelImm(X86AsmPrinter &P, const MachineInstr *MI, + unsigned OpNo, raw_ostream &O) { + const MachineOperand &MO = MI->getOperand(OpNo); + switch (MO.getType()) { + default: llvm_unreachable("Unknown pcrel immediate operand"); + case MachineOperand::MO_Register: + // pc-relativeness was handled when computing the value in the reg. + printOperand(P, MI, OpNo, O); + return; + case MachineOperand::MO_Immediate: + O << MO.getImm(); + return; + case MachineOperand::MO_GlobalAddress: + printSymbolOperand(P, MO, O); + return; + } +} + +static void printOperand(X86AsmPrinter &P, const MachineInstr *MI, + unsigned OpNo, raw_ostream &O, const char *Modifier, + unsigned AsmVariant) { + const MachineOperand &MO = MI->getOperand(OpNo); + switch (MO.getType()) { + default: llvm_unreachable("unknown operand type!"); + case MachineOperand::MO_Register: { + // FIXME: Enumerating AsmVariant, so we can remove magic number. + if (AsmVariant == 0) O << '%'; + unsigned Reg = MO.getReg(); + if (Modifier && strncmp(Modifier, "subreg", strlen("subreg")) == 0) { + unsigned Size = (strcmp(Modifier+6,"64") == 0) ? 64 : + (strcmp(Modifier+6,"32") == 0) ? 32 : + (strcmp(Modifier+6,"16") == 0) ? 16 : 8; + Reg = getX86SubSuperRegister(Reg, Size); + } + O << X86ATTInstPrinter::getRegisterName(Reg); + return; + } + + case MachineOperand::MO_Immediate: + if (AsmVariant == 0) O << '$'; + O << MO.getImm(); + return; + + case MachineOperand::MO_GlobalAddress: { + if (AsmVariant == 0) O << '$'; + printSymbolOperand(P, MO, O); + break; + } + } +} + +static void printLeaMemReference(X86AsmPrinter &P, const MachineInstr *MI, + unsigned Op, raw_ostream &O, + const char *Modifier = nullptr) { + const MachineOperand &BaseReg = MI->getOperand(Op+X86::AddrBaseReg); + const MachineOperand &IndexReg = MI->getOperand(Op+X86::AddrIndexReg); + const MachineOperand &DispSpec = MI->getOperand(Op+X86::AddrDisp); + + // If we really don't want to print out (rip), don't. 
+ bool HasBaseReg = BaseReg.getReg() != 0; + if (HasBaseReg && Modifier && !strcmp(Modifier, "no-rip") && + BaseReg.getReg() == X86::RIP) + HasBaseReg = false; + + // HasParenPart - True if we will print out the () part of the mem ref. + bool HasParenPart = IndexReg.getReg() || HasBaseReg; + + switch (DispSpec.getType()) { + default: + llvm_unreachable("unknown operand type!"); + case MachineOperand::MO_Immediate: { + int DispVal = DispSpec.getImm(); + if (DispVal || !HasParenPart) + O << DispVal; + break; + } + case MachineOperand::MO_GlobalAddress: + case MachineOperand::MO_ConstantPoolIndex: + printSymbolOperand(P, DispSpec, O); + } + + if (Modifier && strcmp(Modifier, "H") == 0) + O << "+8"; + + if (HasParenPart) { + assert(IndexReg.getReg() != X86::ESP && + "X86 doesn't allow scaling by ESP"); + + O << '('; + if (HasBaseReg) + printOperand(P, MI, Op+X86::AddrBaseReg, O, Modifier); + + if (IndexReg.getReg()) { + O << ','; + printOperand(P, MI, Op+X86::AddrIndexReg, O, Modifier); + unsigned ScaleVal = MI->getOperand(Op+X86::AddrScaleAmt).getImm(); + if (ScaleVal != 1) + O << ',' << ScaleVal; + } + O << ')'; + } +} + +static void printMemReference(X86AsmPrinter &P, const MachineInstr *MI, + unsigned Op, raw_ostream &O, + const char *Modifier = nullptr) { + assert(isMem(MI, Op) && "Invalid memory reference!"); + const MachineOperand &Segment = MI->getOperand(Op+X86::AddrSegmentReg); + if (Segment.getReg()) { + printOperand(P, MI, Op+X86::AddrSegmentReg, O, Modifier); + O << ':'; + } + printLeaMemReference(P, MI, Op, O, Modifier); +} + +static void printIntelMemReference(X86AsmPrinter &P, const MachineInstr *MI, + unsigned Op, raw_ostream &O, + const char *Modifier = nullptr, + unsigned AsmVariant = 1) { + const MachineOperand &BaseReg = MI->getOperand(Op+X86::AddrBaseReg); + unsigned ScaleVal = MI->getOperand(Op+X86::AddrScaleAmt).getImm(); + const MachineOperand &IndexReg = MI->getOperand(Op+X86::AddrIndexReg); + const MachineOperand &DispSpec = MI->getOperand(Op+X86::AddrDisp); + const MachineOperand &SegReg = MI->getOperand(Op+X86::AddrSegmentReg); + + // If this has a segment register, print it. + if (SegReg.getReg()) { + printOperand(P, MI, Op+X86::AddrSegmentReg, O, Modifier, AsmVariant); + O << ':'; + } + + O << '['; + + bool NeedPlus = false; + if (BaseReg.getReg()) { + printOperand(P, MI, Op+X86::AddrBaseReg, O, Modifier, AsmVariant); + NeedPlus = true; + } + + if (IndexReg.getReg()) { + if (NeedPlus) O << " + "; + if (ScaleVal != 1) + O << ScaleVal << '*'; + printOperand(P, MI, Op+X86::AddrIndexReg, O, Modifier, AsmVariant); + NeedPlus = true; + } + + if (!DispSpec.isImm()) { + if (NeedPlus) O << " + "; + printOperand(P, MI, Op+X86::AddrDisp, O, Modifier, AsmVariant); + } else { + int64_t DispVal = DispSpec.getImm(); + if (DispVal || (!IndexReg.getReg() && !BaseReg.getReg())) { + if (NeedPlus) { + if (DispVal > 0) + O << " + "; + else { + O << " - "; + DispVal = -DispVal; + } + } + O << DispVal; + } + } + O << ']'; +} + +static bool printAsmMRegister(X86AsmPrinter &P, const MachineOperand &MO, + char Mode, raw_ostream &O) { + unsigned Reg = MO.getReg(); + switch (Mode) { + default: return true; // Unknown mode. 
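// For example, if the operand was allocated to RAX, mode 'b' prints %al,
// 'h' prints %ah, 'w' prints %ax, 'k' prints %eax, and 'q' prints %rax on
// a 64-bit subtarget (%eax otherwise), per the cases below.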
+ case 'b': // Print QImode register + Reg = getX86SubSuperRegister(Reg, 8); + break; + case 'h': // Print QImode high register + Reg = getX86SubSuperRegister(Reg, 8, true); + break; + case 'w': // Print HImode register + Reg = getX86SubSuperRegister(Reg, 16); + break; + case 'k': // Print SImode register + Reg = getX86SubSuperRegister(Reg, 32); + break; + case 'q': + // Print 64-bit register names if 64-bit integer registers are available. + // Otherwise, print 32-bit register names. + Reg = getX86SubSuperRegister(Reg, P.getSubtarget().is64Bit() ? 64 : 32); + break; + } + + O << '%' << X86ATTInstPrinter::getRegisterName(Reg); + return false; +} + +/// PrintAsmOperand - Print out an operand for an inline asm expression. +/// +bool X86AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, + unsigned AsmVariant, + const char *ExtraCode, raw_ostream &O) { + // Does this asm operand have a single letter operand modifier? + if (ExtraCode && ExtraCode[0]) { + if (ExtraCode[1] != 0) return true; // Unknown modifier. + + const MachineOperand &MO = MI->getOperand(OpNo); + + switch (ExtraCode[0]) { + default: + // See if this is a generic print operand + return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O); + case 'a': // This is an address. Currently only 'i' and 'r' are expected. + switch (MO.getType()) { + default: + return true; + case MachineOperand::MO_Immediate: + O << MO.getImm(); + return false; + case MachineOperand::MO_ConstantPoolIndex: + case MachineOperand::MO_JumpTableIndex: + case MachineOperand::MO_ExternalSymbol: + llvm_unreachable("unexpected operand type!"); + case MachineOperand::MO_GlobalAddress: + printSymbolOperand(*this, MO, O); + if (Subtarget->isPICStyleRIPRel()) + O << "(%rip)"; + return false; + case MachineOperand::MO_Register: + O << '('; + printOperand(*this, MI, OpNo, O); + O << ')'; + return false; + } + + case 'c': // Don't print "$" before a global var name or constant. + switch (MO.getType()) { + default: + printOperand(*this, MI, OpNo, O); + break; + case MachineOperand::MO_Immediate: + O << MO.getImm(); + break; + case MachineOperand::MO_ConstantPoolIndex: + case MachineOperand::MO_JumpTableIndex: + case MachineOperand::MO_ExternalSymbol: + llvm_unreachable("unexpected operand type!"); + case MachineOperand::MO_GlobalAddress: + printSymbolOperand(*this, MO, O); + break; + } + return false; + + case 'A': // Print '*' before a register (it must be a register) + if (MO.isReg()) { + O << '*'; + printOperand(*this, MI, OpNo, O); + return false; + } + return true; + + case 'b': // Print QImode register + case 'h': // Print QImode high register + case 'w': // Print HImode register + case 'k': // Print SImode register + case 'q': // Print DImode register + if (MO.isReg()) + return printAsmMRegister(*this, MO, ExtraCode[0], O); + printOperand(*this, MI, OpNo, O); + return false; + + case 'P': // This is the operand of a call, treat specially. + printPCRelImm(*this, MI, OpNo, O); + return false; + + case 'n': // Negate the immediate or print a '-' before the operand. + // Note: this is a temporary solution. It should be handled target + // independently as part of the 'MC' work. 
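  // For example, with modifier 'n' an immediate operand of 4 is printed
  // as -4; for any other operand kind a '-' is emitted and the operand is
  // then printed normally, as the code below implements.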
+ if (MO.isImm()) { + O << -MO.getImm(); + return false; + } + O << '-'; + } + } + + printOperand(*this, MI, OpNo, O, /*Modifier*/ nullptr, AsmVariant); + return false; +} + +bool X86AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, + unsigned OpNo, unsigned AsmVariant, + const char *ExtraCode, + raw_ostream &O) { + if (AsmVariant) { + printIntelMemReference(*this, MI, OpNo, O); + return false; + } + + if (ExtraCode && ExtraCode[0]) { + if (ExtraCode[1] != 0) return true; // Unknown modifier. + + switch (ExtraCode[0]) { + default: return true; // Unknown modifier. + case 'b': // Print QImode register + case 'h': // Print QImode high register + case 'w': // Print HImode register + case 'k': // Print SImode register + case 'q': // Print SImode register + // These only apply to registers, ignore on mem. + break; + case 'H': + printMemReference(*this, MI, OpNo, O, "H"); + return false; + case 'P': // Don't print @PLT, but do print as memory. + printMemReference(*this, MI, OpNo, O, "no-rip"); + return false; + } + } + printMemReference(*this, MI, OpNo, O); + return false; +} + +void X86AsmPrinter::EmitStartOfAsmFile(Module &M) { + const Triple &TT = TM.getTargetTriple(); + + if (TT.isOSBinFormatMachO()) + OutStreamer->SwitchSection(getObjFileLowering().getTextSection()); + + if (TT.isOSBinFormatCOFF()) { + // Emit an absolute @feat.00 symbol. This appears to be some kind of + // compiler features bitfield read by link.exe. + if (TT.getArch() == Triple::x86) { + MCSymbol *S = MMI->getContext().getOrCreateSymbol(StringRef("@feat.00")); + OutStreamer->BeginCOFFSymbolDef(S); + OutStreamer->EmitCOFFSymbolStorageClass(COFF::IMAGE_SYM_CLASS_STATIC); + OutStreamer->EmitCOFFSymbolType(COFF::IMAGE_SYM_DTYPE_NULL); + OutStreamer->EndCOFFSymbolDef(); + // According to the PE-COFF spec, the LSB of this value marks the object + // for "registered SEH". This means that all SEH handler entry points + // must be registered in .sxdata. Use of any unregistered handlers will + // cause the process to terminate immediately. LLVM does not know how to + // register any SEH handlers, so its object files should be safe. + OutStreamer->EmitSymbolAttribute(S, MCSA_Global); + OutStreamer->EmitAssignment( + S, MCConstantExpr::create(int64_t(1), MMI->getContext())); + } + } + OutStreamer->EmitSyntaxDirective(); +} + +static void +emitNonLazySymbolPointer(MCStreamer &OutStreamer, MCSymbol *StubLabel, + MachineModuleInfoImpl::StubValueTy &MCSym) { + // L_foo$stub: + OutStreamer.EmitLabel(StubLabel); + // .indirect_symbol _foo + OutStreamer.EmitSymbolAttribute(MCSym.getPointer(), MCSA_IndirectSymbol); + + if (MCSym.getInt()) + // External to current translation unit. + OutStreamer.EmitIntValue(0, 4/*size*/); + else + // Internal to current translation unit. + // + // When we place the LSDA into the TEXT section, the type info + // pointers need to be indirect and pc-rel. We accomplish this by + // using NLPs; however, sometimes the types are local to the file. + // We need to fill in the value for the NLP in those cases. 
+ OutStreamer.EmitValue( + MCSymbolRefExpr::create(MCSym.getPointer(), OutStreamer.getContext()), + 4 /*size*/); +} + +MCSymbol *X86AsmPrinter::GetCPISymbol(unsigned CPID) const { + if (Subtarget->isTargetKnownWindowsMSVC()) { + const MachineConstantPoolEntry &CPE = + MF->getConstantPool()->getConstants()[CPID]; + if (!CPE.isMachineConstantPoolEntry()) { + const DataLayout &DL = MF->getDataLayout(); + SectionKind Kind = CPE.getSectionKind(&DL); + const Constant *C = CPE.Val.ConstVal; + if (const MCSectionCOFF *S = dyn_cast<MCSectionCOFF>( + getObjFileLowering().getSectionForConstant(DL, Kind, C))) { + if (MCSymbol *Sym = S->getCOMDATSymbol()) { + if (Sym->isUndefined()) + OutStreamer->EmitSymbolAttribute(Sym, MCSA_Global); + return Sym; + } + } + } + } + + return AsmPrinter::GetCPISymbol(CPID); +} + +void X86AsmPrinter::EmitEndOfAsmFile(Module &M) { + const Triple &TT = TM.getTargetTriple(); + + if (TT.isOSBinFormatMachO()) { + // All darwin targets use mach-o. + MachineModuleInfoMachO &MMIMacho = + MMI->getObjFileInfo<MachineModuleInfoMachO>(); + + // Output stubs for dynamically-linked functions. + MachineModuleInfoMachO::SymbolListTy Stubs; + + Stubs = MMIMacho.GetFnStubList(); + if (!Stubs.empty()) { + MCSection *TheSection = OutContext.getMachOSection( + "__IMPORT", "__jump_table", + MachO::S_SYMBOL_STUBS | MachO::S_ATTR_SELF_MODIFYING_CODE | + MachO::S_ATTR_PURE_INSTRUCTIONS, + 5, SectionKind::getMetadata()); + OutStreamer->SwitchSection(TheSection); + + for (const auto &Stub : Stubs) { + // L_foo$stub: + OutStreamer->EmitLabel(Stub.first); + // .indirect_symbol _foo + OutStreamer->EmitSymbolAttribute(Stub.second.getPointer(), + MCSA_IndirectSymbol); + // hlt; hlt; hlt; hlt; hlt hlt = 0xf4. + const char HltInsts[] = "\xf4\xf4\xf4\xf4\xf4"; + OutStreamer->EmitBytes(StringRef(HltInsts, 5)); + } + + Stubs.clear(); + OutStreamer->AddBlankLine(); + } + + // Output stubs for external and common global variables. + Stubs = MMIMacho.GetGVStubList(); + if (!Stubs.empty()) { + MCSection *TheSection = OutContext.getMachOSection( + "__IMPORT", "__pointers", MachO::S_NON_LAZY_SYMBOL_POINTERS, + SectionKind::getMetadata()); + OutStreamer->SwitchSection(TheSection); + + for (auto &Stub : Stubs) + emitNonLazySymbolPointer(*OutStreamer, Stub.first, Stub.second); + + Stubs.clear(); + OutStreamer->AddBlankLine(); + } + + Stubs = MMIMacho.GetHiddenGVStubList(); + if (!Stubs.empty()) { + MCSection *TheSection = OutContext.getMachOSection( + "__IMPORT", "__pointers", MachO::S_NON_LAZY_SYMBOL_POINTERS, + SectionKind::getMetadata()); + OutStreamer->SwitchSection(TheSection); + + for (auto &Stub : Stubs) + emitNonLazySymbolPointer(*OutStreamer, Stub.first, Stub.second); + + Stubs.clear(); + OutStreamer->AddBlankLine(); + } + + SM.serializeToStackMapSection(); + FM.serializeToFaultMapSection(); + + // Funny Darwin hack: This flag tells the linker that no global symbols + // contain code that falls through to other global symbols (e.g. the obvious + // implementation of multiple entry points). If this doesn't occur, the + // linker can safely perform dead code stripping. Since LLVM never + // generates code that does this, it is always safe to set. + OutStreamer->EmitAssemblerFlag(MCAF_SubsectionsViaSymbols); + } + + if (TT.isKnownWindowsMSVCEnvironment() && MMI->usesVAFloatArgument()) { + StringRef SymbolName = + (TT.getArch() == Triple::x86_64) ? 
"_fltused" : "__fltused"; + MCSymbol *S = MMI->getContext().getOrCreateSymbol(SymbolName); + OutStreamer->EmitSymbolAttribute(S, MCSA_Global); + } + + if (TT.isOSBinFormatCOFF()) { + const TargetLoweringObjectFileCOFF &TLOFCOFF = + static_cast<const TargetLoweringObjectFileCOFF&>(getObjFileLowering()); + + std::string Flags; + raw_string_ostream FlagsOS(Flags); + + for (const auto &Function : M) + TLOFCOFF.emitLinkerFlagsForGlobal(FlagsOS, &Function, *Mang); + for (const auto &Global : M.globals()) + TLOFCOFF.emitLinkerFlagsForGlobal(FlagsOS, &Global, *Mang); + for (const auto &Alias : M.aliases()) + TLOFCOFF.emitLinkerFlagsForGlobal(FlagsOS, &Alias, *Mang); + + FlagsOS.flush(); + + // Output collected flags. + if (!Flags.empty()) { + OutStreamer->SwitchSection(TLOFCOFF.getDrectveSection()); + OutStreamer->EmitBytes(Flags); + } + + SM.serializeToStackMapSection(); + } + + if (TT.isOSBinFormatELF()) { + SM.serializeToStackMapSection(); + FM.serializeToFaultMapSection(); + } +} + +//===----------------------------------------------------------------------===// +// Target Registry Stuff +//===----------------------------------------------------------------------===// + +// Force static initialization. +extern "C" void LLVMInitializeX86AsmPrinter() { + RegisterAsmPrinter<X86AsmPrinter> X(TheX86_32Target); + RegisterAsmPrinter<X86AsmPrinter> Y(TheX86_64Target); +} diff --git a/contrib/llvm/lib/Target/X86/X86AsmPrinter.h b/contrib/llvm/lib/Target/X86/X86AsmPrinter.h new file mode 100644 index 0000000..9c8bd98 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86AsmPrinter.h @@ -0,0 +1,131 @@ +//===-- X86AsmPrinter.h - X86 implementation of AsmPrinter ------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_X86_X86ASMPRINTER_H +#define LLVM_LIB_TARGET_X86_X86ASMPRINTER_H + +#include "X86Subtarget.h" +#include "llvm/CodeGen/AsmPrinter.h" +#include "llvm/CodeGen/FaultMaps.h" +#include "llvm/CodeGen/StackMaps.h" +#include "llvm/Target/TargetMachine.h" + +// Implemented in X86MCInstLower.cpp +namespace { + class X86MCInstLower; +} + +namespace llvm { +class MCStreamer; +class MCSymbol; + +class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter { + const X86Subtarget *Subtarget; + StackMaps SM; + FaultMaps FM; + + // This utility class tracks the length of a stackmap instruction's 'shadow'. + // It is used by the X86AsmPrinter to ensure that the stackmap shadow + // invariants (i.e. no other stackmaps, patchpoints, or control flow within + // the shadow) are met, while outputting a minimal number of NOPs for padding. + // + // To minimise the number of NOPs used, the shadow tracker counts the number + // of instruction bytes output since the last stackmap. Only if there are too + // few instruction bytes to cover the shadow are NOPs used for padding. + class StackMapShadowTracker { + public: + StackMapShadowTracker(TargetMachine &TM); + ~StackMapShadowTracker(); + void startFunction(MachineFunction &MF); + void count(MCInst &Inst, const MCSubtargetInfo &STI); + + // Called to signal the start of a shadow of RequiredSize bytes. 
+ void reset(unsigned RequiredSize) { + RequiredShadowSize = RequiredSize; + CurrentShadowSize = 0; + InShadow = true; + } + + // Called before every stackmap/patchpoint, and at the end of basic blocks, + // to emit any necessary padding-NOPs. + void emitShadowPadding(MCStreamer &OutStreamer, const MCSubtargetInfo &STI); + private: + TargetMachine &TM; + const MachineFunction *MF; + std::unique_ptr<MCCodeEmitter> CodeEmitter; + bool InShadow; + + // RequiredShadowSize holds the length of the shadow specified in the most + // recently encountered STACKMAP instruction. + // CurrentShadowSize counts the number of bytes encoded since the most + // recently encountered STACKMAP, stopping when that number is greater than + // or equal to RequiredShadowSize. + unsigned RequiredShadowSize, CurrentShadowSize; + }; + + StackMapShadowTracker SMShadowTracker; + + // All instructions emitted by the X86AsmPrinter should use this helper + // method. + // + // This helper function invokes the SMShadowTracker on each instruction before + // outputting it to the OutStream. This allows the shadow tracker to minimise + // the number of NOPs used for stackmap padding. + void EmitAndCountInstruction(MCInst &Inst); + void LowerSTACKMAP(const MachineInstr &MI); + void LowerPATCHPOINT(const MachineInstr &MI, X86MCInstLower &MCIL); + void LowerSTATEPOINT(const MachineInstr &MI, X86MCInstLower &MCIL); + void LowerFAULTING_LOAD_OP(const MachineInstr &MI, X86MCInstLower &MCIL); + + void LowerTlsAddr(X86MCInstLower &MCInstLowering, const MachineInstr &MI); + + public: + explicit X86AsmPrinter(TargetMachine &TM, + std::unique_ptr<MCStreamer> Streamer) + : AsmPrinter(TM, std::move(Streamer)), SM(*this), FM(*this), + SMShadowTracker(TM) {} + + const char *getPassName() const override { + return "X86 Assembly / Object Emitter"; + } + + const X86Subtarget &getSubtarget() const { return *Subtarget; } + + void EmitStartOfAsmFile(Module &M) override; + + void EmitEndOfAsmFile(Module &M) override; + + void EmitInstruction(const MachineInstr *MI) override; + + void EmitBasicBlockEnd(const MachineBasicBlock &MBB) override { + SMShadowTracker.emitShadowPadding(*OutStreamer, getSubtargetInfo()); + } + + bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, + unsigned AsmVariant, const char *ExtraCode, + raw_ostream &OS) override; + bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo, + unsigned AsmVariant, const char *ExtraCode, + raw_ostream &OS) override; + + /// \brief Return the symbol for the specified constant pool entry. + MCSymbol *GetCPISymbol(unsigned CPID) const override; + + bool doInitialization(Module &M) override { + SMShadowTracker.reset(0); + SM.reset(); + return AsmPrinter::doInitialization(M); + } + + bool runOnMachineFunction(MachineFunction &F) override; +}; + +} // end namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/X86/X86CallFrameOptimization.cpp b/contrib/llvm/lib/Target/X86/X86CallFrameOptimization.cpp new file mode 100644 index 0000000..fc6ee17 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86CallFrameOptimization.cpp @@ -0,0 +1,558 @@ +//===----- X86CallFrameOptimization.cpp - Optimize x86 call sequences -----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines a pass that optimizes call sequences on x86. 
+// Currently, it converts movs of function parameters onto the stack into
+// pushes. This is beneficial for two main reasons:
+// 1) The push instruction encoding is much smaller than an esp-relative mov
+// 2) It is possible to push memory arguments directly. So, if the
+//    transformation is performed pre-reg-alloc, it can help relieve
+//    register pressure.
+//
+//===----------------------------------------------------------------------===//
+
+#include <algorithm>
+
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "X86Subtarget.h"
+#include "X86MachineFunctionInfo.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-cf-opt"
+
+static cl::opt<bool>
+    NoX86CFOpt("no-x86-call-frame-opt",
+               cl::desc("Avoid optimizing x86 call frames for size"),
+               cl::init(false), cl::Hidden);
+
+namespace {
+class X86CallFrameOptimization : public MachineFunctionPass {
+public:
+  X86CallFrameOptimization() : MachineFunctionPass(ID) {}
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+private:
+  // Information we know about a particular call site
+  struct CallContext {
+    CallContext()
+        : FrameSetup(nullptr), Call(nullptr), SPCopy(nullptr), ExpectedDist(0),
+          MovVector(4, nullptr), NoStackParams(false), UsePush(false) {}
+
+    // Iterator referring to the frame setup instruction
+    MachineBasicBlock::iterator FrameSetup;
+
+    // Actual call instruction
+    MachineInstr *Call;
+
+    // A copy of the stack pointer
+    MachineInstr *SPCopy;
+
+    // The total displacement of all passed parameters
+    int64_t ExpectedDist;
+
+    // The sequence of movs used to pass the parameters
+    SmallVector<MachineInstr *, 4> MovVector;
+
+    // True if this call site has no stack parameters
+    bool NoStackParams;
+
+    // True if this callsite can use push instructions
+    bool UsePush;
+  };
+
+  typedef SmallVector<CallContext, 8> ContextVector;
+
+  bool isLegal(MachineFunction &MF);
+
+  bool isProfitable(MachineFunction &MF, ContextVector &CallSeqMap);
+
+  void collectCallInfo(MachineFunction &MF, MachineBasicBlock &MBB,
+                       MachineBasicBlock::iterator I, CallContext &Context);
+
+  bool adjustCallSequence(MachineFunction &MF, const CallContext &Context);
+
+  MachineInstr *canFoldIntoRegPush(MachineBasicBlock::iterator FrameSetup,
+                                   unsigned Reg);
+
+  enum InstClassification { Convert, Skip, Exit };
+
+  InstClassification classifyInstruction(MachineBasicBlock &MBB,
+                                         MachineBasicBlock::iterator MI,
+                                         const X86RegisterInfo &RegInfo,
+                                         DenseSet<unsigned int> &UsedRegs);
+
+  const char *getPassName() const override { return "X86 Optimize Call Frame"; }
+
+  const TargetInstrInfo *TII;
+  const X86FrameLowering *TFL;
+  const X86Subtarget *STI;
+  const MachineRegisterInfo *MRI;
+  static char ID;
+};
+
+char X86CallFrameOptimization::ID = 0;
+}
+
+FunctionPass *llvm::createX86CallFrameOptimization() {
+  return new X86CallFrameOptimization();
+}
+
+// This checks whether the transformation is legal.
+// Also returns false in cases where it's potentially legal, but
+// we don't even want to try.
+bool X86CallFrameOptimization::isLegal(MachineFunction &MF) {
+  if (NoX86CFOpt.getValue())
+    return false;
+
+  // We currently only support call sequences where *all* parameters
+  // are passed on the stack.
+  // No point in running this in 64-bit mode, since some arguments are
+  // passed in-register in all common calling conventions, so the pattern
+  // we're looking for will never match.
+  if (STI->is64Bit())
+    return false;
+
+  // We can't encode multiple DW_CFA_GNU_args_size or DW_CFA_def_cfa_offset
+  // in the compact unwind encoding that Darwin uses. So, bail if there
+  // is a danger of that being generated.
+  if (STI->isTargetDarwin() &&
+      (!MF.getMMI().getLandingPads().empty() ||
+       (MF.getFunction()->needsUnwindTableEntry() && !TFL->hasFP(MF))))
+    return false;
+
+  // You would expect straight-line code between call-frame setup and
+  // call-frame destroy. You would be wrong. There are circumstances (e.g.
+  // CMOV_GR8 expansion of a select that feeds a function call!) where we can
+  // end up with the setup and the destroy in different basic blocks.
+  // This is bad, and breaks SP adjustment.
+  // So, check that all of the frames in the function are closed inside
+  // the same block, and, for good measure, that there are no nested frames.
+  unsigned FrameSetupOpcode = TII->getCallFrameSetupOpcode();
+  unsigned FrameDestroyOpcode = TII->getCallFrameDestroyOpcode();
+  for (MachineBasicBlock &BB : MF) {
+    bool InsideFrameSequence = false;
+    for (MachineInstr &MI : BB) {
+      if (MI.getOpcode() == FrameSetupOpcode) {
+        if (InsideFrameSequence)
+          return false;
+        InsideFrameSequence = true;
+      } else if (MI.getOpcode() == FrameDestroyOpcode) {
+        if (!InsideFrameSequence)
+          return false;
+        InsideFrameSequence = false;
+      }
+    }
+
+    if (InsideFrameSequence)
+      return false;
+  }
+
+  return true;
+}
+
+// Check whether this transformation is profitable for a particular
+// function - in terms of code size.
+bool X86CallFrameOptimization::isProfitable(MachineFunction &MF,
+                                            ContextVector &CallSeqVector) {
+  // This transformation is always a win when we do not expect to have
+  // a reserved call frame. Under other circumstances, it may be either
+  // a win or a loss, and requires a heuristic.
+  bool CannotReserveFrame = MF.getFrameInfo()->hasVarSizedObjects();
+  if (CannotReserveFrame)
+    return true;
+
+  // Don't do this when not optimizing for size.
+  if (!MF.getFunction()->optForSize())
+    return false;
+
+  unsigned StackAlign = TFL->getStackAlignment();
+
+  int64_t Advantage = 0;
+  for (auto CC : CallSeqVector) {
+    // Call sites where no parameters are passed on the stack
+    // do not affect the cost, since there needs to be no
+    // stack adjustment.
+    if (CC.NoStackParams)
+      continue;
+
+    if (!CC.UsePush) {
+      // If we don't use pushes for a particular call site,
+      // we pay for not having a reserved call frame with an
+      // additional sub/add esp pair. The cost is ~3 bytes per instruction,
+      // depending on the size of the constant.
+      // TODO: Callee-pop functions should have a smaller penalty, because
+      // an add is needed even with a reserved call frame.
+      Advantage -= 6;
+    } else {
+      // We can use pushes. First, account for the fixed costs.
+      // We'll need an add after the call.
+      Advantage -= 3;
+      // If we have to realign the stack, we'll also need a sub before it.
+      if (CC.ExpectedDist % StackAlign)
+        Advantage -= 3;
+      // Now, for each push, we save ~3 bytes. For small constants, we actually
+      // save more (up to 5 bytes), but 3 should be a good approximation.
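      // For example, a call site with four 32-bit stack arguments
      // (ExpectedDist == 16) and no realignment penalty nets
      // -3 + (16 / 4) * 3 = +9 bytes in favor of using pushes.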
+ Advantage += (CC.ExpectedDist / 4) * 3; + } + } + + return (Advantage >= 0); +} + +bool X86CallFrameOptimization::runOnMachineFunction(MachineFunction &MF) { + STI = &MF.getSubtarget<X86Subtarget>(); + TII = STI->getInstrInfo(); + TFL = STI->getFrameLowering(); + MRI = &MF.getRegInfo(); + + if (!isLegal(MF)) + return false; + + unsigned FrameSetupOpcode = TII->getCallFrameSetupOpcode(); + + bool Changed = false; + + ContextVector CallSeqVector; + + for (MachineFunction::iterator BB = MF.begin(), E = MF.end(); BB != E; ++BB) + for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ++I) + if (I->getOpcode() == FrameSetupOpcode) { + CallContext Context; + collectCallInfo(MF, *BB, I, Context); + CallSeqVector.push_back(Context); + } + + if (!isProfitable(MF, CallSeqVector)) + return false; + + for (auto CC : CallSeqVector) + if (CC.UsePush) + Changed |= adjustCallSequence(MF, CC); + + return Changed; +} + +X86CallFrameOptimization::InstClassification +X86CallFrameOptimization::classifyInstruction( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, + const X86RegisterInfo &RegInfo, DenseSet<unsigned int> &UsedRegs) { + if (MI == MBB.end()) + return Exit; + + // The instructions we actually care about are movs onto the stack + int Opcode = MI->getOpcode(); + if (Opcode == X86::MOV32mi || Opcode == X86::MOV32mr) + return Convert; + + // Not all calling conventions have only stack MOVs between the stack + // adjust and the call. + + // We want to tolerate other instructions, to cover more cases. + // In particular: + // a) PCrel calls, where we expect an additional COPY of the basereg. + // b) Passing frame-index addresses. + // c) Calling conventions that have inreg parameters. These generate + // both copies and movs into registers. + // To avoid creating lots of special cases, allow any instruction + // that does not write into memory, does not def or use the stack + // pointer, and does not def any register that was used by a preceding + // push. + // (Reading from memory is allowed, even if referenced through a + // frame index, since these will get adjusted properly in PEI) + + // The reason for the last condition is that the pushes can't replace + // the movs in place, because the order must be reversed. + // So if we have a MOV32mr that uses EDX, then an instruction that defs + // EDX, and then the call, after the transformation the push will use + // the modified version of EDX, and not the original one. + // Since we are still in SSA form at this point, we only need to + // make sure we don't clobber any *physical* registers that were + // used by an earlier mov that will become a push. + + if (MI->isCall() || MI->mayStore()) + return Exit; + + for (const MachineOperand &MO : MI->operands()) { + if (!MO.isReg()) + continue; + unsigned int Reg = MO.getReg(); + if (!RegInfo.isPhysicalRegister(Reg)) + continue; + if (RegInfo.regsOverlap(Reg, RegInfo.getStackRegister())) + return Exit; + if (MO.isDef()) { + for (unsigned int U : UsedRegs) + if (RegInfo.regsOverlap(Reg, U)) + return Exit; + } + } + + return Skip; +} + +void X86CallFrameOptimization::collectCallInfo(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + CallContext &Context) { + // Check that this particular call sequence is amenable to the + // transformation. 
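  // For example (schematically), a transformable sequence looks like:
  //   movl $42, (%esp)
  //   movl %eax, 4(%esp)
  //   calll foo
  // and adjustCallSequence later rewrites it, walking the movs in reverse
  // order, as:
  //   pushl %eax
  //   pushl $42
  //   calll foo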
+ const X86RegisterInfo &RegInfo = *static_cast<const X86RegisterInfo *>( + STI->getRegisterInfo()); + unsigned FrameDestroyOpcode = TII->getCallFrameDestroyOpcode(); + + // We expect to enter this at the beginning of a call sequence + assert(I->getOpcode() == TII->getCallFrameSetupOpcode()); + MachineBasicBlock::iterator FrameSetup = I++; + Context.FrameSetup = FrameSetup; + + // How much do we adjust the stack? This puts an upper bound on + // the number of parameters actually passed on it. + unsigned int MaxAdjust = FrameSetup->getOperand(0).getImm() / 4; + + // A zero adjustment means no stack parameters + if (!MaxAdjust) { + Context.NoStackParams = true; + return; + } + + // For globals in PIC mode, we can have some LEAs here. + // Ignore them, they don't bother us. + // TODO: Extend this to something that covers more cases. + while (I->getOpcode() == X86::LEA32r) + ++I; + + // We expect a copy instruction here. + // TODO: The copy instruction is a lowering artifact. + // We should also support a copy-less version, where the stack + // pointer is used directly. + if (!I->isCopy() || !I->getOperand(0).isReg()) + return; + Context.SPCopy = I++; + + unsigned StackPtr = Context.SPCopy->getOperand(0).getReg(); + + // Scan the call setup sequence for the pattern we're looking for. + // We only handle a simple case - a sequence of MOV32mi or MOV32mr + // instructions, that push a sequence of 32-bit values onto the stack, with + // no gaps between them. + if (MaxAdjust > 4) + Context.MovVector.resize(MaxAdjust, nullptr); + + InstClassification Classification; + DenseSet<unsigned int> UsedRegs; + + while ((Classification = classifyInstruction(MBB, I, RegInfo, UsedRegs)) != + Exit) { + if (Classification == Skip) { + ++I; + continue; + } + + // We know the instruction is a MOV32mi/MOV32mr. + // We only want movs of the form: + // movl imm/r32, k(%esp) + // If we run into something else, bail. + // Note that AddrBaseReg may, counter to its name, not be a register, + // but rather a frame index. + // TODO: Support the fi case. This should probably work now that we + // have the infrastructure to track the stack pointer within a call + // sequence. + if (!I->getOperand(X86::AddrBaseReg).isReg() || + (I->getOperand(X86::AddrBaseReg).getReg() != StackPtr) || + !I->getOperand(X86::AddrScaleAmt).isImm() || + (I->getOperand(X86::AddrScaleAmt).getImm() != 1) || + (I->getOperand(X86::AddrIndexReg).getReg() != X86::NoRegister) || + (I->getOperand(X86::AddrSegmentReg).getReg() != X86::NoRegister) || + !I->getOperand(X86::AddrDisp).isImm()) + return; + + int64_t StackDisp = I->getOperand(X86::AddrDisp).getImm(); + assert(StackDisp >= 0 && + "Negative stack displacement when passing parameters"); + + // We really don't want to consider the unaligned case. + if (StackDisp % 4) + return; + StackDisp /= 4; + + assert((size_t)StackDisp < Context.MovVector.size() && + "Function call has more parameters than the stack is adjusted for."); + + // If the same stack slot is being filled twice, something's fishy. + if (Context.MovVector[StackDisp] != nullptr) + return; + Context.MovVector[StackDisp] = I; + + for (const MachineOperand &MO : I->uses()) { + if (!MO.isReg()) + continue; + unsigned int Reg = MO.getReg(); + if (RegInfo.isPhysicalRegister(Reg)) + UsedRegs.insert(Reg); + } + + ++I; + } + + // We now expect the end of the sequence. If we stopped early, + // or reached the end of the block without finding a call, bail. 
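  // Note also that gaps are rejected below: for example, if 12 bytes were
  // adjusted but only (%esp) and 8(%esp) were written, the unwritten slot
  // at 4(%esp) causes the call site to be left untouched.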
+ if (I == MBB.end() || !I->isCall()) + return; + + Context.Call = I; + if ((++I)->getOpcode() != FrameDestroyOpcode) + return; + + // Now, go through the vector, and see that we don't have any gaps, + // but only a series of 32-bit MOVs. + auto MMI = Context.MovVector.begin(), MME = Context.MovVector.end(); + for (; MMI != MME; ++MMI, Context.ExpectedDist += 4) + if (*MMI == nullptr) + break; + + // If the call had no parameters, do nothing + if (MMI == Context.MovVector.begin()) + return; + + // We are either at the last parameter, or a gap. + // Make sure it's not a gap + for (; MMI != MME; ++MMI) + if (*MMI != nullptr) + return; + + Context.UsePush = true; + return; +} + +bool X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF, + const CallContext &Context) { + // Ok, we can in fact do the transformation for this call. + // Do not remove the FrameSetup instruction, but adjust the parameters. + // PEI will end up finalizing the handling of this. + MachineBasicBlock::iterator FrameSetup = Context.FrameSetup; + MachineBasicBlock &MBB = *(FrameSetup->getParent()); + FrameSetup->getOperand(1).setImm(Context.ExpectedDist); + + DebugLoc DL = FrameSetup->getDebugLoc(); + // Now, iterate through the vector in reverse order, and replace the movs + // with pushes. MOVmi/MOVmr doesn't have any defs, so no need to + // replace uses. + for (int Idx = (Context.ExpectedDist / 4) - 1; Idx >= 0; --Idx) { + MachineBasicBlock::iterator MOV = *Context.MovVector[Idx]; + MachineOperand PushOp = MOV->getOperand(X86::AddrNumOperands); + MachineBasicBlock::iterator Push = nullptr; + if (MOV->getOpcode() == X86::MOV32mi) { + unsigned PushOpcode = X86::PUSHi32; + // If the operand is a small (8-bit) immediate, we can use a + // PUSH instruction with a shorter encoding. + // Note that isImm() may fail even though this is a MOVmi, because + // the operand can also be a symbol. + if (PushOp.isImm()) { + int64_t Val = PushOp.getImm(); + if (isInt<8>(Val)) + PushOpcode = X86::PUSH32i8; + } + Push = BuildMI(MBB, Context.Call, DL, TII->get(PushOpcode)) + .addOperand(PushOp); + } else { + unsigned int Reg = PushOp.getReg(); + + // If PUSHrmm is not slow on this target, try to fold the source of the + // push into the instruction. + bool SlowPUSHrmm = STI->isAtom() || STI->isSLM(); + + // Check that this is legal to fold. Right now, we're extremely + // conservative about that. + MachineInstr *DefMov = nullptr; + if (!SlowPUSHrmm && (DefMov = canFoldIntoRegPush(FrameSetup, Reg))) { + Push = BuildMI(MBB, Context.Call, DL, TII->get(X86::PUSH32rmm)); + + unsigned NumOps = DefMov->getDesc().getNumOperands(); + for (unsigned i = NumOps - X86::AddrNumOperands; i != NumOps; ++i) + Push->addOperand(DefMov->getOperand(i)); + + DefMov->eraseFromParent(); + } else { + Push = BuildMI(MBB, Context.Call, DL, TII->get(X86::PUSH32r)) + .addReg(Reg) + .getInstr(); + } + } + + // For debugging, when using SP-based CFA, we need to adjust the CFA + // offset after each push. + // TODO: This is needed only if we require precise CFA. + if (!TFL->hasFP(MF)) + TFL->BuildCFI(MBB, std::next(Push), DL, + MCCFIInstruction::createAdjustCfaOffset(nullptr, 4)); + + MBB.erase(MOV); + } + + // The stack-pointer copy is no longer used in the call sequences. + // There should not be any other users, but we can't commit to that, so: + if (MRI->use_empty(Context.SPCopy->getOperand(0).getReg())) + Context.SPCopy->eraseFromParent(); + + // Once we've done this, we need to make sure PEI doesn't assume a reserved + // frame. 
+ X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); + FuncInfo->setHasPushSequences(true); + + return true; +} + +MachineInstr *X86CallFrameOptimization::canFoldIntoRegPush( + MachineBasicBlock::iterator FrameSetup, unsigned Reg) { + // Do an extremely restricted form of load folding. + // ISel will often create patterns like: + // movl 4(%edi), %eax + // movl 8(%edi), %ecx + // movl 12(%edi), %edx + // movl %edx, 8(%esp) + // movl %ecx, 4(%esp) + // movl %eax, (%esp) + // call + // Get rid of those with prejudice. + if (!TargetRegisterInfo::isVirtualRegister(Reg)) + return nullptr; + + // Make sure this is the only use of Reg. + if (!MRI->hasOneNonDBGUse(Reg)) + return nullptr; + + MachineBasicBlock::iterator DefMI = MRI->getVRegDef(Reg); + + // Make sure the def is a MOV from memory. + // If the def is an another block, give up. + if (DefMI->getOpcode() != X86::MOV32rm || + DefMI->getParent() != FrameSetup->getParent()) + return nullptr; + + // Make sure we don't have any instructions between DefMI and the + // push that make folding the load illegal. + for (auto I = DefMI; I != FrameSetup; ++I) + if (I->isLoadFoldBarrier()) + return nullptr; + + return DefMI; +} diff --git a/contrib/llvm/lib/Target/X86/X86CallingConv.h b/contrib/llvm/lib/Target/X86/X86CallingConv.h new file mode 100644 index 0000000..a08160f --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86CallingConv.h @@ -0,0 +1,107 @@ +//=== X86CallingConv.h - X86 Custom Calling Convention Routines -*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the custom routines for the X86 Calling Convention that +// aren't done by tablegen. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_X86_X86CALLINGCONV_H +#define LLVM_LIB_TARGET_X86_X86CALLINGCONV_H + +#include "MCTargetDesc/X86MCTargetDesc.h" +#include "llvm/CodeGen/CallingConvLower.h" +#include "llvm/IR/CallingConv.h" + +namespace llvm { + +inline bool CC_X86_32_VectorCallIndirect(unsigned &ValNo, MVT &ValVT, + MVT &LocVT, + CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, + CCState &State) { + // Similar to CCPassIndirect, with the addition of inreg. + LocVT = MVT::i32; + LocInfo = CCValAssign::Indirect; + ArgFlags.setInReg(); + return false; // Continue the search, but now for i32. +} + + +inline bool CC_X86_AnyReg_Error(unsigned &, MVT &, MVT &, + CCValAssign::LocInfo &, ISD::ArgFlagsTy &, + CCState &) { + llvm_unreachable("The AnyReg calling convention is only supported by the " \ + "stackmap and patchpoint intrinsics."); + // gracefully fallback to X86 C calling convention on Release builds. + return false; +} + +inline bool CC_X86_32_MCUInReg(unsigned &ValNo, MVT &ValVT, + MVT &LocVT, + CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, + CCState &State) { + // This is similar to CCAssignToReg<[EAX, EDX, ECX]>, but makes sure + // not to split i64 and double between a register and stack + static const MCPhysReg RegList[] = {X86::EAX, X86::EDX, X86::ECX}; + static const unsigned NumRegs = sizeof(RegList)/sizeof(RegList[0]); + + SmallVectorImpl<CCValAssign> &PendingMembers = State.getPendingLocs(); + + // If this is the first part of an double/i64/i128, or if we're already + // in the middle of a split, add to the pending list. 
If this is not + // the end of the split, return, otherwise go on to process the pending + // list + if (ArgFlags.isSplit() || !PendingMembers.empty()) { + PendingMembers.push_back( + CCValAssign::getPending(ValNo, ValVT, LocVT, LocInfo)); + if (!ArgFlags.isSplitEnd()) + return true; + } + + // If there are no pending members, we are not in the middle of a split, + // so do the usual inreg stuff. + if (PendingMembers.empty()) { + if (unsigned Reg = State.AllocateReg(RegList)) { + State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + return true; + } + return false; + } + + assert(ArgFlags.isSplitEnd()); + + // We now have the entire original argument in PendingMembers, so decide + // whether to use registers or the stack. + // Per the MCU ABI: + // a) To use registers, we need to have enough of them free to contain + // the entire argument. + // b) We never want to use more than 2 registers for a single argument. + + unsigned FirstFree = State.getFirstUnallocated(RegList); + bool UseRegs = PendingMembers.size() <= std::min(2U, NumRegs - FirstFree); + + for (auto &It : PendingMembers) { + if (UseRegs) + It.convertToReg(State.AllocateReg(RegList[FirstFree++])); + else + It.convertToMem(State.AllocateStack(4, 4)); + State.addLoc(It); + } + + PendingMembers.clear(); + + return true; +} + +} // End llvm namespace + +#endif + diff --git a/contrib/llvm/lib/Target/X86/X86CallingConv.td b/contrib/llvm/lib/Target/X86/X86CallingConv.td new file mode 100644 index 0000000..54d88cb --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86CallingConv.td @@ -0,0 +1,881 @@ +//===-- X86CallingConv.td - Calling Conventions X86 32/64 --*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This describes the calling conventions for the X86-32 and X86-64 +// architectures. +// +//===----------------------------------------------------------------------===// + +/// CCIfSubtarget - Match if the current subtarget has a feature F. +class CCIfSubtarget<string F, CCAction A> + : CCIf<!strconcat("static_cast<const X86Subtarget&>" + "(State.getMachineFunction().getSubtarget()).", F), + A>; + +//===----------------------------------------------------------------------===// +// Return Value Calling Conventions +//===----------------------------------------------------------------------===// + +// Return-value conventions common to all X86 CC's. +def RetCC_X86Common : CallingConv<[ + // Scalar values are returned in AX first, then DX. For i8, the ABI + // requires the values to be in AL and AH, however this code uses AL and DL + // instead. This is because using AH for the second register conflicts with + // the way LLVM does multiple return values -- a return of {i16,i8} would end + // up in AX and AH, which overlap. Front-ends wishing to conform to the ABI + // for functions that return two i8 values are currently expected to pack the + // values into an i16 (which uses AX, and thus AL:AH). + // + // For code that doesn't care about the ABI, we allow returning more than two + // integer values in registers. 
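// For example, a function returning three i32 values (beyond what the ABI
// itself allows) uses EAX, EDX and ECX, per the assignments below.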
+ CCIfType<[i1], CCPromoteToType<i8>>, + CCIfType<[i8] , CCAssignToReg<[AL, DL, CL]>>, + CCIfType<[i16], CCAssignToReg<[AX, DX, CX]>>, + CCIfType<[i32], CCAssignToReg<[EAX, EDX, ECX]>>, + CCIfType<[i64], CCAssignToReg<[RAX, RDX, RCX]>>, + + // Boolean vectors of AVX-512 are returned in SIMD registers. + // The call from AVX to AVX-512 function should work, + // since the boolean types in AVX/AVX2 are promoted by default. + CCIfType<[v2i1], CCPromoteToType<v2i64>>, + CCIfType<[v4i1], CCPromoteToType<v4i32>>, + CCIfType<[v8i1], CCPromoteToType<v8i16>>, + CCIfType<[v16i1], CCPromoteToType<v16i8>>, + CCIfType<[v32i1], CCPromoteToType<v32i8>>, + CCIfType<[v64i1], CCPromoteToType<v64i8>>, + + // Vector types are returned in XMM0 and XMM1, when they fit. XMM2 and XMM3 + // can only be used by ABI non-compliant code. If the target doesn't have XMM + // registers, it won't have vector types. + CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + CCAssignToReg<[XMM0,XMM1,XMM2,XMM3]>>, + + // 256-bit vectors are returned in YMM0 and XMM1, when they fit. YMM2 and YMM3 + // can only be used by ABI non-compliant code. This vector type is only + // supported while using the AVX target feature. + CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], + CCAssignToReg<[YMM0,YMM1,YMM2,YMM3]>>, + + // 512-bit vectors are returned in ZMM0 and ZMM1, when they fit. ZMM2 and ZMM3 + // can only be used by ABI non-compliant code. This vector type is only + // supported while using the AVX-512 target feature. + CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64], + CCAssignToReg<[ZMM0,ZMM1,ZMM2,ZMM3]>>, + + // MMX vector types are always returned in MM0. If the target doesn't have + // MM0, it doesn't support these vector types. + CCIfType<[x86mmx], CCAssignToReg<[MM0]>>, + + // Long double types are always returned in FP0 (even with SSE). + CCIfType<[f80], CCAssignToReg<[FP0, FP1]>> +]>; + +// X86-32 C return-value convention. +def RetCC_X86_32_C : CallingConv<[ + // The X86-32 calling convention returns FP values in FP0, unless marked + // with "inreg" (used here to distinguish one kind of reg from another, + // weirdly; this is really the sse-regparm calling convention) in which + // case they use XMM0, otherwise it is the same as the common X86 calling + // conv. + CCIfInReg<CCIfSubtarget<"hasSSE2()", + CCIfType<[f32, f64], CCAssignToReg<[XMM0,XMM1,XMM2]>>>>, + CCIfType<[f32,f64], CCAssignToReg<[FP0, FP1]>>, + CCDelegateTo<RetCC_X86Common> +]>; + +// X86-32 FastCC return-value convention. +def RetCC_X86_32_Fast : CallingConv<[ + // The X86-32 fastcc returns 1, 2, or 3 FP values in XMM0-2 if the target has + // SSE2. + // This can happen when a float, 2 x float, or 3 x float vector is split by + // target lowering, and is returned in 1-3 sse regs. + CCIfType<[f32], CCIfSubtarget<"hasSSE2()", CCAssignToReg<[XMM0,XMM1,XMM2]>>>, + CCIfType<[f64], CCIfSubtarget<"hasSSE2()", CCAssignToReg<[XMM0,XMM1,XMM2]>>>, + + // For integers, ECX can be used as an extra return register + CCIfType<[i8], CCAssignToReg<[AL, DL, CL]>>, + CCIfType<[i16], CCAssignToReg<[AX, DX, CX]>>, + CCIfType<[i32], CCAssignToReg<[EAX, EDX, ECX]>>, + + // Otherwise, it is the same as the common X86 calling convention. + CCDelegateTo<RetCC_X86Common> +]>; + +// Intel_OCL_BI return-value convention. +def RetCC_Intel_OCL_BI : CallingConv<[ + // Vector types are returned in XMM0,XMM1,XMMM2 and XMM3. 
+ CCIfType<[f32, f64, v4i32, v2i64, v4f32, v2f64], + CCAssignToReg<[XMM0,XMM1,XMM2,XMM3]>>, + + // 256-bit FP vectors + // No more than 4 registers + CCIfType<[v8f32, v4f64, v8i32, v4i64], + CCAssignToReg<[YMM0,YMM1,YMM2,YMM3]>>, + + // 512-bit FP vectors + CCIfType<[v16f32, v8f64, v16i32, v8i64], + CCAssignToReg<[ZMM0,ZMM1,ZMM2,ZMM3]>>, + + // i32, i64 in the standard way + CCDelegateTo<RetCC_X86Common> +]>; + +// X86-32 HiPE return-value convention. +def RetCC_X86_32_HiPE : CallingConv<[ + // Promote all types to i32 + CCIfType<[i8, i16], CCPromoteToType<i32>>, + + // Return: HP, P, VAL1, VAL2 + CCIfType<[i32], CCAssignToReg<[ESI, EBP, EAX, EDX]>> +]>; + +// X86-32 HiPE return-value convention. +def RetCC_X86_32_VectorCall : CallingConv<[ + // Vector types are returned in XMM0,XMM1,XMMM2 and XMM3. + CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + CCAssignToReg<[XMM0,XMM1,XMM2,XMM3]>>, + + // 256-bit FP vectors + CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], + CCAssignToReg<[YMM0,YMM1,YMM2,YMM3]>>, + + // 512-bit FP vectors + CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64], + CCAssignToReg<[ZMM0,ZMM1,ZMM2,ZMM3]>>, + + // Return integers in the standard way. + CCDelegateTo<RetCC_X86Common> +]>; + +// X86-64 C return-value convention. +def RetCC_X86_64_C : CallingConv<[ + // The X86-64 calling convention always returns FP values in XMM0. + CCIfType<[f32], CCAssignToReg<[XMM0, XMM1]>>, + CCIfType<[f64], CCAssignToReg<[XMM0, XMM1]>>, + CCIfType<[f128], CCAssignToReg<[XMM0, XMM1]>>, + + // MMX vector types are always returned in XMM0. + CCIfType<[x86mmx], CCAssignToReg<[XMM0, XMM1]>>, + CCDelegateTo<RetCC_X86Common> +]>; + +// X86-Win64 C return-value convention. +def RetCC_X86_Win64_C : CallingConv<[ + // The X86-Win64 calling convention always returns __m64 values in RAX. + CCIfType<[x86mmx], CCBitConvertToType<i64>>, + + // Otherwise, everything is the same as 'normal' X86-64 C CC. + CCDelegateTo<RetCC_X86_64_C> +]>; + +// X86-64 HiPE return-value convention. +def RetCC_X86_64_HiPE : CallingConv<[ + // Promote all types to i64 + CCIfType<[i8, i16, i32], CCPromoteToType<i64>>, + + // Return: HP, P, VAL1, VAL2 + CCIfType<[i64], CCAssignToReg<[R15, RBP, RAX, RDX]>> +]>; + +// X86-64 WebKit_JS return-value convention. +def RetCC_X86_64_WebKit_JS : CallingConv<[ + // Promote all types to i64 + CCIfType<[i8, i16, i32], CCPromoteToType<i64>>, + + // Return: RAX + CCIfType<[i64], CCAssignToReg<[RAX]>> +]>; + +// X86-64 AnyReg return-value convention. No explicit register is specified for +// the return-value. The register allocator is allowed and expected to choose +// any free register. +// +// This calling convention is currently only supported by the stackmap and +// patchpoint intrinsics. All other uses will result in an assert on Debug +// builds. On Release builds we fallback to the X86 C calling convention. +def RetCC_X86_64_AnyReg : CallingConv<[ + CCCustom<"CC_X86_AnyReg_Error"> +]>; + +// X86-64 HHVM return-value convention. +def RetCC_X86_64_HHVM: CallingConv<[ + // Promote all types to i64 + CCIfType<[i8, i16, i32], CCPromoteToType<i64>>, + + // Return: could return in any GP register save RSP and R12. + CCIfType<[i64], CCAssignToReg<[RBX, RBP, RDI, RSI, RDX, RCX, R8, R9, + RAX, R10, R11, R13, R14, R15]>> +]>; + +// This is the root return-value convention for the X86-32 backend. +def RetCC_X86_32 : CallingConv<[ + // If FastCC, use RetCC_X86_32_Fast. + CCIfCC<"CallingConv::Fast", CCDelegateTo<RetCC_X86_32_Fast>>, + // If HiPE, use RetCC_X86_32_HiPE. 
+ CCIfCC<"CallingConv::HiPE", CCDelegateTo<RetCC_X86_32_HiPE>>, + CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<RetCC_X86_32_VectorCall>>, + + // Otherwise, use RetCC_X86_32_C. + CCDelegateTo<RetCC_X86_32_C> +]>; + +// This is the root return-value convention for the X86-64 backend. +def RetCC_X86_64 : CallingConv<[ + // HiPE uses RetCC_X86_64_HiPE + CCIfCC<"CallingConv::HiPE", CCDelegateTo<RetCC_X86_64_HiPE>>, + + // Handle JavaScript calls. + CCIfCC<"CallingConv::WebKit_JS", CCDelegateTo<RetCC_X86_64_WebKit_JS>>, + CCIfCC<"CallingConv::AnyReg", CCDelegateTo<RetCC_X86_64_AnyReg>>, + + // Handle explicit CC selection + CCIfCC<"CallingConv::X86_64_Win64", CCDelegateTo<RetCC_X86_Win64_C>>, + CCIfCC<"CallingConv::X86_64_SysV", CCDelegateTo<RetCC_X86_64_C>>, + + // Handle HHVM calls. + CCIfCC<"CallingConv::HHVM", CCDelegateTo<RetCC_X86_64_HHVM>>, + + // Mingw64 and native Win64 use Win64 CC + CCIfSubtarget<"isTargetWin64()", CCDelegateTo<RetCC_X86_Win64_C>>, + + // Otherwise, drop to normal X86-64 CC + CCDelegateTo<RetCC_X86_64_C> +]>; + +// This is the return-value convention used for the entire X86 backend. +def RetCC_X86 : CallingConv<[ + + // Check if this is the Intel OpenCL built-ins calling convention + CCIfCC<"CallingConv::Intel_OCL_BI", CCDelegateTo<RetCC_Intel_OCL_BI>>, + + CCIfSubtarget<"is64Bit()", CCDelegateTo<RetCC_X86_64>>, + CCDelegateTo<RetCC_X86_32> +]>; + +//===----------------------------------------------------------------------===// +// X86-64 Argument Calling Conventions +//===----------------------------------------------------------------------===// + +def CC_X86_64_C : CallingConv<[ + // Handles byval parameters. + CCIfByVal<CCPassByVal<8, 8>>, + + // Promote i1/i8/i16 arguments to i32. + CCIfType<[i1, i8, i16], CCPromoteToType<i32>>, + + // The 'nest' parameter, if any, is passed in R10. + CCIfNest<CCIfSubtarget<"isTarget64BitILP32()", CCAssignToReg<[R10D]>>>, + CCIfNest<CCAssignToReg<[R10]>>, + + // The first 6 integer arguments are passed in integer registers. + CCIfType<[i32], CCAssignToReg<[EDI, ESI, EDX, ECX, R8D, R9D]>>, + CCIfType<[i64], CCAssignToReg<[RDI, RSI, RDX, RCX, R8 , R9 ]>>, + + // The first 8 MMX vector arguments are passed in XMM registers on Darwin. + CCIfType<[x86mmx], + CCIfSubtarget<"isTargetDarwin()", + CCIfSubtarget<"hasSSE2()", + CCPromoteToType<v2i64>>>>, + + // Boolean vectors of AVX-512 are passed in SIMD registers. + // The call from AVX to AVX-512 function should work, + // since the boolean types in AVX/AVX2 are promoted by default. + CCIfType<[v2i1], CCPromoteToType<v2i64>>, + CCIfType<[v4i1], CCPromoteToType<v4i32>>, + CCIfType<[v8i1], CCPromoteToType<v8i16>>, + CCIfType<[v16i1], CCPromoteToType<v16i8>>, + CCIfType<[v32i1], CCPromoteToType<v32i8>>, + CCIfType<[v64i1], CCPromoteToType<v64i8>>, + + // The first 8 FP/Vector arguments are passed in XMM registers. + CCIfType<[f32, f64, f128, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + CCIfSubtarget<"hasSSE1()", + CCAssignToReg<[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7]>>>, + + // The first 8 256-bit vector arguments are passed in YMM registers, unless + // this is a vararg function. + // FIXME: This isn't precisely correct; the x86-64 ABI document says that + // fixed arguments to vararg functions are supposed to be passed in + // registers. Actually modeling that would be a lot of work, though. 
+ CCIfNotVarArg<CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], + CCIfSubtarget<"hasFp256()", + CCAssignToReg<[YMM0, YMM1, YMM2, YMM3, + YMM4, YMM5, YMM6, YMM7]>>>>, + + // The first 8 512-bit vector arguments are passed in ZMM registers. + CCIfNotVarArg<CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64], + CCIfSubtarget<"hasAVX512()", + CCAssignToReg<[ZMM0, ZMM1, ZMM2, ZMM3, ZMM4, ZMM5, ZMM6, ZMM7]>>>>, + + // Integer/FP values get stored in stack slots that are 8 bytes in size and + // 8-byte aligned if there are no more registers to hold them. + CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>, + + // Long doubles get stack slots whose size and alignment depends on the + // subtarget. + CCIfType<[f80, f128], CCAssignToStack<0, 0>>, + + // Vectors get 16-byte stack slots that are 16-byte aligned. + CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCAssignToStack<16, 16>>, + + // 256-bit vectors get 32-byte stack slots that are 32-byte aligned. + CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], + CCAssignToStack<32, 32>>, + + // 512-bit vectors get 64-byte stack slots that are 64-byte aligned. + CCIfType<[v16i32, v8i64, v16f32, v8f64], + CCAssignToStack<64, 64>> +]>; + +// Calling convention for X86-64 HHVM. +def CC_X86_64_HHVM : CallingConv<[ + // Use all/any GP registers for args, except RSP. + CCIfType<[i64], CCAssignToReg<[RBX, R12, RBP, R15, + RDI, RSI, RDX, RCX, R8, R9, + RAX, R10, R11, R13, R14]>> +]>; + +// Calling convention for helper functions in HHVM. +def CC_X86_64_HHVM_C : CallingConv<[ + // Pass the first argument in RBP. + CCIfType<[i64], CCAssignToReg<[RBP]>>, + + // Otherwise it's the same as the regular C calling convention. + CCDelegateTo<CC_X86_64_C> +]>; + +// Calling convention used on Win64 +def CC_X86_Win64_C : CallingConv<[ + // FIXME: Handle byval stuff. + // FIXME: Handle varargs. + + // Promote i1/i8/i16 arguments to i32. + CCIfType<[i1, i8, i16], CCPromoteToType<i32>>, + + // The 'nest' parameter, if any, is passed in R10. + CCIfNest<CCAssignToReg<[R10]>>, + + // 128 bit vectors are passed by pointer + CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCPassIndirect<i64>>, + + + // 256 bit vectors are passed by pointer + CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], CCPassIndirect<i64>>, + + // 512 bit vectors are passed by pointer + CCIfType<[v16i32, v16f32, v8f64, v8i64], CCPassIndirect<i64>>, + + // The first 4 MMX vector arguments are passed in GPRs. + CCIfType<[x86mmx], CCBitConvertToType<i64>>, + + // The first 4 integer arguments are passed in integer registers. + CCIfType<[i32], CCAssignToRegWithShadow<[ECX , EDX , R8D , R9D ], + [XMM0, XMM1, XMM2, XMM3]>>, + + // Do not pass the sret argument in RCX, the Win64 thiscall calling + // convention requires "this" to be passed in RCX. + CCIfCC<"CallingConv::X86_ThisCall", + CCIfSRet<CCIfType<[i64], CCAssignToRegWithShadow<[RDX , R8 , R9 ], + [XMM1, XMM2, XMM3]>>>>, + + CCIfType<[i64], CCAssignToRegWithShadow<[RCX , RDX , R8 , R9 ], + [XMM0, XMM1, XMM2, XMM3]>>, + + // The first 4 FP/Vector arguments are passed in XMM registers. + CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + CCAssignToRegWithShadow<[XMM0, XMM1, XMM2, XMM3], + [RCX , RDX , R8 , R9 ]>>, + + // Integer/FP values get stored in stack slots that are 8 bytes in size and + // 8-byte aligned if there are no more registers to hold them. + CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>, + + // Long doubles get stack slots whose size and alignment depends on the + // subtarget. 
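// For example, under this Win64 convention f(int, double) takes the int in
// ECX and the double in XMM1: each of the first four parameter positions
// shadows both a GPR and an XMM register, so the second position uses
// XMM1/RDX rather than XMM0.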
+ CCIfType<[f80], CCAssignToStack<0, 0>> +]>; + +def CC_X86_Win64_VectorCall : CallingConv<[ + // The first 6 floating point and vector types of 128 bits or less use + // XMM0-XMM5. + CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + CCAssignToReg<[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5]>>, + + // 256-bit vectors use YMM registers. + CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], + CCAssignToReg<[YMM0, YMM1, YMM2, YMM3, YMM4, YMM5]>>, + + // 512-bit vectors use ZMM registers. + CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64], + CCAssignToReg<[ZMM0, ZMM1, ZMM2, ZMM3, ZMM4, ZMM5]>>, + + // Delegate to fastcall to handle integer types. + CCDelegateTo<CC_X86_Win64_C> +]>; + + +def CC_X86_64_GHC : CallingConv<[ + // Promote i8/i16/i32 arguments to i64. + CCIfType<[i8, i16, i32], CCPromoteToType<i64>>, + + // Pass in STG registers: Base, Sp, Hp, R1, R2, R3, R4, R5, R6, SpLim + CCIfType<[i64], + CCAssignToReg<[R13, RBP, R12, RBX, R14, RSI, RDI, R8, R9, R15]>>, + + // Pass in STG registers: F1, F2, F3, F4, D1, D2 + CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + CCIfSubtarget<"hasSSE1()", + CCAssignToReg<[XMM1, XMM2, XMM3, XMM4, XMM5, XMM6]>>> +]>; + +def CC_X86_64_HiPE : CallingConv<[ + // Promote i8/i16/i32 arguments to i64. + CCIfType<[i8, i16, i32], CCPromoteToType<i64>>, + + // Pass in VM's registers: HP, P, ARG0, ARG1, ARG2, ARG3 + CCIfType<[i64], CCAssignToReg<[R15, RBP, RSI, RDX, RCX, R8]>>, + + // Integer/FP values get stored in stack slots that are 8 bytes in size and + // 8-byte aligned if there are no more registers to hold them. + CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>> +]>; + +def CC_X86_64_WebKit_JS : CallingConv<[ + // Promote i8/i16 arguments to i32. + CCIfType<[i8, i16], CCPromoteToType<i32>>, + + // Only the first integer argument is passed in register. + CCIfType<[i32], CCAssignToReg<[EAX]>>, + CCIfType<[i64], CCAssignToReg<[RAX]>>, + + // The remaining integer arguments are passed on the stack. 32bit integer and + // floating-point arguments are aligned to 4 byte and stored in 4 byte slots. + // 64bit integer and floating-point arguments are aligned to 8 byte and stored + // in 8 byte stack slots. + CCIfType<[i32, f32], CCAssignToStack<4, 4>>, + CCIfType<[i64, f64], CCAssignToStack<8, 8>> +]>; + +// No explicit register is specified for the AnyReg calling convention. The +// register allocator may assign the arguments to any free register. +// +// This calling convention is currently only supported by the stackmap and +// patchpoint intrinsics. All other uses will result in an assert on Debug +// builds. On Release builds we fallback to the X86 C calling convention. +def CC_X86_64_AnyReg : CallingConv<[ + CCCustom<"CC_X86_AnyReg_Error"> +]>; + +//===----------------------------------------------------------------------===// +// X86 C Calling Convention +//===----------------------------------------------------------------------===// + +/// CC_X86_32_Vector_Common - In all X86-32 calling conventions, extra vector +/// values are spilled on the stack. +def CC_X86_32_Vector_Common : CallingConv<[ + // Other SSE vectors get 16-byte stack slots that are 16-byte aligned. + CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCAssignToStack<16, 16>>, + + // 256-bit AVX vectors get 32-byte stack slots that are 32-byte aligned. + CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], + CCAssignToStack<32, 32>>, + + // 512-bit AVX 512-bit vectors get 64-byte stack slots that are 64-byte aligned. 
+ CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64], + CCAssignToStack<64, 64>> +]>; + +// CC_X86_32_Vector_Standard - The first 3 vector arguments are passed in +// vector registers +def CC_X86_32_Vector_Standard : CallingConv<[ + // SSE vector arguments are passed in XMM registers. + CCIfNotVarArg<CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + CCAssignToReg<[XMM0, XMM1, XMM2]>>>, + + // AVX 256-bit vector arguments are passed in YMM registers. + CCIfNotVarArg<CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], + CCIfSubtarget<"hasFp256()", + CCAssignToReg<[YMM0, YMM1, YMM2]>>>>, + + // AVX 512-bit vector arguments are passed in ZMM registers. + CCIfNotVarArg<CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64], + CCAssignToReg<[ZMM0, ZMM1, ZMM2]>>>, + + CCDelegateTo<CC_X86_32_Vector_Common> +]>; + +// CC_X86_32_Vector_Darwin - The first 4 vector arguments are passed in +// vector registers. +def CC_X86_32_Vector_Darwin : CallingConv<[ + // SSE vector arguments are passed in XMM registers. + CCIfNotVarArg<CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>>, + + // AVX 256-bit vector arguments are passed in YMM registers. + CCIfNotVarArg<CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], + CCIfSubtarget<"hasFp256()", + CCAssignToReg<[YMM0, YMM1, YMM2, YMM3]>>>>, + + // AVX 512-bit vector arguments are passed in ZMM registers. + CCIfNotVarArg<CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64], + CCAssignToReg<[ZMM0, ZMM1, ZMM2, ZMM3]>>>, + + CCDelegateTo<CC_X86_32_Vector_Common> +]>; + +/// CC_X86_32_Common - In all X86-32 calling conventions, extra integers and FP +/// values are spilled on the stack. +def CC_X86_32_Common : CallingConv<[ + // Handles byval parameters. + CCIfByVal<CCPassByVal<4, 4>>, + + // The first 3 float or double arguments, if marked 'inreg' and if the call + // is not a vararg call and if SSE2 is available, are passed in SSE registers. + CCIfNotVarArg<CCIfInReg<CCIfType<[f32,f64], + CCIfSubtarget<"hasSSE2()", + CCAssignToReg<[XMM0,XMM1,XMM2]>>>>>, + + // The first 3 __m64 vector arguments are passed in mmx registers if the + // call is not a vararg call. + CCIfNotVarArg<CCIfType<[x86mmx], + CCAssignToReg<[MM0, MM1, MM2]>>>, + + // Integer/Float values get stored in stack slots that are 4 bytes in + // size and 4-byte aligned. + CCIfType<[i32, f32], CCAssignToStack<4, 4>>, + + // Doubles get 8-byte slots that are 4-byte aligned. + CCIfType<[f64], CCAssignToStack<8, 4>>, + + // Long doubles get slots whose size depends on the subtarget. + CCIfType<[f80], CCAssignToStack<0, 4>>, + + // Boolean vectors of AVX-512 are passed in SIMD registers. + // The call from AVX to AVX-512 function should work, + // since the boolean types in AVX/AVX2 are promoted by default. + CCIfType<[v2i1], CCPromoteToType<v2i64>>, + CCIfType<[v4i1], CCPromoteToType<v4i32>>, + CCIfType<[v8i1], CCPromoteToType<v8i16>>, + CCIfType<[v16i1], CCPromoteToType<v16i8>>, + CCIfType<[v32i1], CCPromoteToType<v32i8>>, + CCIfType<[v64i1], CCPromoteToType<v64i8>>, + + // __m64 vectors get 8-byte stack slots that are 4-byte aligned. They are + // passed in the parameter area. + CCIfType<[x86mmx], CCAssignToStack<8, 4>>, + + // Darwin passes vectors in a form that differs from the i386 psABI + CCIfSubtarget<"isTargetDarwin()", CCDelegateTo<CC_X86_32_Vector_Darwin>>, + + // Otherwise, drop to 'normal' X86-32 CC + CCDelegateTo<CC_X86_32_Vector_Standard> +]>; + +def CC_X86_32_C : CallingConv<[ + // Promote i1/i8/i16 arguments to i32. 
+ CCIfType<[i1, i8, i16], CCPromoteToType<i32>>, + + // The 'nest' parameter, if any, is passed in ECX. + CCIfNest<CCAssignToReg<[ECX]>>, + + // The first 3 integer arguments, if marked 'inreg' and if the call is not + // a vararg call, are passed in integer registers. + CCIfNotVarArg<CCIfInReg<CCIfType<[i32], CCAssignToReg<[EAX, EDX, ECX]>>>>, + + // Otherwise, same as everything else. + CCDelegateTo<CC_X86_32_Common> +]>; + +def CC_X86_32_MCU : CallingConv<[ + // Handles byval parameters. Note that, like FastCC, we can't rely on + // the delegation to CC_X86_32_Common because that happens after code that + // puts arguments in registers. + CCIfByVal<CCPassByVal<4, 4>>, + + // Promote i1/i8/i16 arguments to i32. + CCIfType<[i1, i8, i16], CCPromoteToType<i32>>, + + // If the call is not a vararg call, some arguments may be passed + // in integer registers. + CCIfNotVarArg<CCIfType<[i32], CCCustom<"CC_X86_32_MCUInReg">>>, + + // Otherwise, same as everything else. + CCDelegateTo<CC_X86_32_Common> +]>; + +def CC_X86_32_FastCall : CallingConv<[ + // Promote i1/i8/i16 arguments to i32. + CCIfType<[i1, i8, i16], CCPromoteToType<i32>>, + + // The 'nest' parameter, if any, is passed in EAX. + CCIfNest<CCAssignToReg<[EAX]>>, + + // The first 2 integer arguments are passed in ECX/EDX + CCIfInReg<CCIfType<[i32], CCAssignToReg<[ECX, EDX]>>>, + + // Otherwise, same as everything else. + CCDelegateTo<CC_X86_32_Common> +]>; + +def CC_X86_32_VectorCall : CallingConv<[ + // The first 6 floating point and vector types of 128 bits or less use + // XMM0-XMM5. + CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + CCAssignToReg<[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5]>>, + + // 256-bit vectors use YMM registers. + CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], + CCAssignToReg<[YMM0, YMM1, YMM2, YMM3, YMM4, YMM5]>>, + + // 512-bit vectors use ZMM registers. + CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64], + CCAssignToReg<[ZMM0, ZMM1, ZMM2, ZMM3, ZMM4, ZMM5]>>, + + // Otherwise, pass it indirectly. + CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64, + v32i8, v16i16, v8i32, v4i64, v8f32, v4f64, + v64i8, v32i16, v16i32, v8i64, v16f32, v8f64], + CCCustom<"CC_X86_32_VectorCallIndirect">>, + + // Delegate to fastcall to handle integer types. + CCDelegateTo<CC_X86_32_FastCall> +]>; + +def CC_X86_32_ThisCall_Common : CallingConv<[ + // The first integer argument is passed in ECX + CCIfType<[i32], CCAssignToReg<[ECX]>>, + + // Otherwise, same as everything else. + CCDelegateTo<CC_X86_32_Common> +]>; + +def CC_X86_32_ThisCall_Mingw : CallingConv<[ + // Promote i1/i8/i16 arguments to i32. + CCIfType<[i1, i8, i16], CCPromoteToType<i32>>, + + CCDelegateTo<CC_X86_32_ThisCall_Common> +]>; + +def CC_X86_32_ThisCall_Win : CallingConv<[ + // Promote i1/i8/i16 arguments to i32. + CCIfType<[i1, i8, i16], CCPromoteToType<i32>>, + + // Pass sret arguments indirectly through stack. + CCIfSRet<CCAssignToStack<4, 4>>, + + CCDelegateTo<CC_X86_32_ThisCall_Common> +]>; + +def CC_X86_32_ThisCall : CallingConv<[ + CCIfSubtarget<"isTargetCygMing()", CCDelegateTo<CC_X86_32_ThisCall_Mingw>>, + CCDelegateTo<CC_X86_32_ThisCall_Win> +]>; + +def CC_X86_32_FastCC : CallingConv<[ + // Handles byval parameters. Note that we can't rely on the delegation + // to CC_X86_32_Common for this because that happens after code that + // puts arguments in registers. + CCIfByVal<CCPassByVal<4, 4>>, + + // Promote i1/i8/i16 arguments to i32. 
+ CCIfType<[i1, i8, i16], CCPromoteToType<i32>>, + + // The 'nest' parameter, if any, is passed in EAX. + CCIfNest<CCAssignToReg<[EAX]>>, + + // The first 2 integer arguments are passed in ECX/EDX + CCIfType<[i32], CCAssignToReg<[ECX, EDX]>>, + + // The first 3 float or double arguments, if the call is not a vararg + // call and if SSE2 is available, are passed in SSE registers. + CCIfNotVarArg<CCIfType<[f32,f64], + CCIfSubtarget<"hasSSE2()", + CCAssignToReg<[XMM0,XMM1,XMM2]>>>>, + + // Doubles get 8-byte slots that are 8-byte aligned. + CCIfType<[f64], CCAssignToStack<8, 8>>, + + // Otherwise, same as everything else. + CCDelegateTo<CC_X86_32_Common> +]>; + +def CC_X86_32_GHC : CallingConv<[ + // Promote i8/i16 arguments to i32. + CCIfType<[i8, i16], CCPromoteToType<i32>>, + + // Pass in STG registers: Base, Sp, Hp, R1 + CCIfType<[i32], CCAssignToReg<[EBX, EBP, EDI, ESI]>> +]>; + +def CC_X86_32_HiPE : CallingConv<[ + // Promote i8/i16 arguments to i32. + CCIfType<[i8, i16], CCPromoteToType<i32>>, + + // Pass in VM's registers: HP, P, ARG0, ARG1, ARG2 + CCIfType<[i32], CCAssignToReg<[ESI, EBP, EAX, EDX, ECX]>>, + + // Integer/Float values get stored in stack slots that are 4 bytes in + // size and 4-byte aligned. + CCIfType<[i32, f32], CCAssignToStack<4, 4>> +]>; + +// X86-64 Intel OpenCL built-ins calling convention. +def CC_Intel_OCL_BI : CallingConv<[ + + CCIfType<[i32], CCIfSubtarget<"isTargetWin64()", CCAssignToReg<[ECX, EDX, R8D, R9D]>>>, + CCIfType<[i64], CCIfSubtarget<"isTargetWin64()", CCAssignToReg<[RCX, RDX, R8, R9 ]>>>, + + CCIfType<[i32], CCIfSubtarget<"is64Bit()", CCAssignToReg<[EDI, ESI, EDX, ECX]>>>, + CCIfType<[i64], CCIfSubtarget<"is64Bit()", CCAssignToReg<[RDI, RSI, RDX, RCX]>>>, + + CCIfType<[i32], CCAssignToStack<4, 4>>, + + // The SSE vector arguments are passed in XMM registers. + CCIfType<[f32, f64, v4i32, v2i64, v4f32, v2f64], + CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>, + + // The 256-bit vector arguments are passed in YMM registers. + CCIfType<[v8f32, v4f64, v8i32, v4i64], + CCAssignToReg<[YMM0, YMM1, YMM2, YMM3]>>, + + // The 512-bit vector arguments are passed in ZMM registers. + CCIfType<[v16f32, v8f64, v16i32, v8i64], + CCAssignToReg<[ZMM0, ZMM1, ZMM2, ZMM3]>>, + + // Pass masks in mask registers + CCIfType<[v16i1, v8i1], CCAssignToReg<[K1]>>, + + CCIfSubtarget<"isTargetWin64()", CCDelegateTo<CC_X86_Win64_C>>, + CCIfSubtarget<"is64Bit()", CCDelegateTo<CC_X86_64_C>>, + CCDelegateTo<CC_X86_32_C> +]>; + +def CC_X86_32_Intr : CallingConv<[ + CCAssignToStack<4, 4> +]>; + +def CC_X86_64_Intr : CallingConv<[ + CCAssignToStack<8, 8> +]>; + +//===----------------------------------------------------------------------===// +// X86 Root Argument Calling Conventions +//===----------------------------------------------------------------------===// + +// This is the root argument convention for the X86-32 backend. 
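+// For illustration (not part of the tables themselves): a plain cdecl
+// function on a 32-bit target falls through to the final
+// CCDelegateTo<CC_X86_32_C> entry below, while a prototype such as
+//   int __attribute__((fastcall)) f(int a, int b);   // a in ECX, b in EDX
+// is routed through CC_X86_32_FastCall by the CallingConv::X86_FastCall line.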
+def CC_X86_32 : CallingConv<[ + CCIfSubtarget<"isTargetMCU()", CCDelegateTo<CC_X86_32_MCU>>, + CCIfCC<"CallingConv::X86_FastCall", CCDelegateTo<CC_X86_32_FastCall>>, + CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<CC_X86_32_VectorCall>>, + CCIfCC<"CallingConv::X86_ThisCall", CCDelegateTo<CC_X86_32_ThisCall>>, + CCIfCC<"CallingConv::Fast", CCDelegateTo<CC_X86_32_FastCC>>, + CCIfCC<"CallingConv::GHC", CCDelegateTo<CC_X86_32_GHC>>, + CCIfCC<"CallingConv::HiPE", CCDelegateTo<CC_X86_32_HiPE>>, + CCIfCC<"CallingConv::X86_INTR", CCDelegateTo<CC_X86_32_Intr>>, + + // Otherwise, drop to normal X86-32 CC + CCDelegateTo<CC_X86_32_C> +]>; + +// This is the root argument convention for the X86-64 backend. +def CC_X86_64 : CallingConv<[ + CCIfCC<"CallingConv::GHC", CCDelegateTo<CC_X86_64_GHC>>, + CCIfCC<"CallingConv::HiPE", CCDelegateTo<CC_X86_64_HiPE>>, + CCIfCC<"CallingConv::WebKit_JS", CCDelegateTo<CC_X86_64_WebKit_JS>>, + CCIfCC<"CallingConv::AnyReg", CCDelegateTo<CC_X86_64_AnyReg>>, + CCIfCC<"CallingConv::X86_64_Win64", CCDelegateTo<CC_X86_Win64_C>>, + CCIfCC<"CallingConv::X86_64_SysV", CCDelegateTo<CC_X86_64_C>>, + CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<CC_X86_Win64_VectorCall>>, + CCIfCC<"CallingConv::HHVM", CCDelegateTo<CC_X86_64_HHVM>>, + CCIfCC<"CallingConv::HHVM_C", CCDelegateTo<CC_X86_64_HHVM_C>>, + CCIfCC<"CallingConv::X86_INTR", CCDelegateTo<CC_X86_64_Intr>>, + + // Mingw64 and native Win64 use Win64 CC + CCIfSubtarget<"isTargetWin64()", CCDelegateTo<CC_X86_Win64_C>>, + + // Otherwise, drop to normal X86-64 CC + CCDelegateTo<CC_X86_64_C> +]>; + +// This is the argument convention used for the entire X86 backend. +def CC_X86 : CallingConv<[ + CCIfCC<"CallingConv::Intel_OCL_BI", CCDelegateTo<CC_Intel_OCL_BI>>, + CCIfSubtarget<"is64Bit()", CCDelegateTo<CC_X86_64>>, + CCDelegateTo<CC_X86_32> +]>; + +//===----------------------------------------------------------------------===// +// Callee-saved Registers. +//===----------------------------------------------------------------------===// + +def CSR_NoRegs : CalleeSavedRegs<(add)>; + +def CSR_32 : CalleeSavedRegs<(add ESI, EDI, EBX, EBP)>; +def CSR_64 : CalleeSavedRegs<(add RBX, R12, R13, R14, R15, RBP)>; + +def CSR_32EHRet : CalleeSavedRegs<(add EAX, EDX, CSR_32)>; +def CSR_64EHRet : CalleeSavedRegs<(add RAX, RDX, CSR_64)>; + +def CSR_Win64 : CalleeSavedRegs<(add RBX, RBP, RDI, RSI, R12, R13, R14, R15, + (sequence "XMM%u", 6, 15))>; + +// The function used by Darwin to obtain the address of a thread-local variable +// uses rdi to pass a single parameter and rax for the return value. All other +// GPRs are preserved. 
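+// For illustration, the Darwin TLS access sequence this models looks roughly
+// like
+//   movq  _var@TLVP(%rip), %rdi
+//   callq *(%rdi)           ## address of the variable comes back in %rax
+// so only RDI and RAX need to be treated as clobbered, which is why they are
+// the GPRs left out of the register list below.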
+def CSR_64_TLS_Darwin : CalleeSavedRegs<(add CSR_64, RCX, RDX, RSI, + R8, R9, R10, R11)>; + +// All GPRs - except r11 +def CSR_64_RT_MostRegs : CalleeSavedRegs<(add CSR_64, RAX, RCX, RDX, RSI, RDI, + R8, R9, R10, RSP)>; + +// All registers - except r11 +def CSR_64_RT_AllRegs : CalleeSavedRegs<(add CSR_64_RT_MostRegs, + (sequence "XMM%u", 0, 15))>; +def CSR_64_RT_AllRegs_AVX : CalleeSavedRegs<(add CSR_64_RT_MostRegs, + (sequence "YMM%u", 0, 15))>; + +def CSR_64_MostRegs : CalleeSavedRegs<(add RBX, RCX, RDX, RSI, RDI, R8, R9, R10, + R11, R12, R13, R14, R15, RBP, + (sequence "XMM%u", 0, 15))>; + +def CSR_32_AllRegs : CalleeSavedRegs<(add EAX, EBX, ECX, EDX, EBP, ESI, + EDI, ESP)>; +def CSR_32_AllRegs_SSE : CalleeSavedRegs<(add CSR_32_AllRegs, + (sequence "XMM%u", 0, 7))>; + +def CSR_64_AllRegs : CalleeSavedRegs<(add CSR_64_MostRegs, RAX, RSP, + (sequence "XMM%u", 16, 31))>; +def CSR_64_AllRegs_AVX : CalleeSavedRegs<(sub (add CSR_64_MostRegs, RAX, RSP, + (sequence "YMM%u", 0, 31)), + (sequence "XMM%u", 0, 15))>; + +// Standard C + YMM6-15 +def CSR_Win64_Intel_OCL_BI_AVX : CalleeSavedRegs<(add RBX, RBP, RDI, RSI, R12, + R13, R14, R15, + (sequence "YMM%u", 6, 15))>; + +def CSR_Win64_Intel_OCL_BI_AVX512 : CalleeSavedRegs<(add RBX, RBP, RDI, RSI, + R12, R13, R14, R15, + (sequence "ZMM%u", 6, 21), + K4, K5, K6, K7)>; +//Standard C + XMM 8-15 +def CSR_64_Intel_OCL_BI : CalleeSavedRegs<(add CSR_64, + (sequence "XMM%u", 8, 15))>; + +//Standard C + YMM 8-15 +def CSR_64_Intel_OCL_BI_AVX : CalleeSavedRegs<(add CSR_64, + (sequence "YMM%u", 8, 15))>; + +def CSR_64_Intel_OCL_BI_AVX512 : CalleeSavedRegs<(add RBX, RDI, RSI, R14, R15, + (sequence "ZMM%u", 16, 31), + K4, K5, K6, K7)>; + +// Only R12 is preserved for PHP calls in HHVM. +def CSR_64_HHVM : CalleeSavedRegs<(add R12)>; diff --git a/contrib/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/contrib/llvm/lib/Target/X86/X86ExpandPseudo.cpp new file mode 100644 index 0000000..a09d065 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86ExpandPseudo.cpp @@ -0,0 +1,198 @@ +//===------- X86ExpandPseudo.cpp - Expand pseudo instructions -------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains a pass that expands pseudo instructions into target +// instructions to allow proper scheduling, if-conversion, other late +// optimizations, or simply the encoding of the instructions. +// +//===----------------------------------------------------------------------===// + +#include "X86.h" +#include "X86FrameLowering.h" +#include "X86InstrBuilder.h" +#include "X86InstrInfo.h" +#include "X86MachineFunctionInfo.h" +#include "X86Subtarget.h" +#include "llvm/Analysis/EHPersonalities.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/Passes.h" // For IDs of passes that are preserved. 
+#include "llvm/IR/GlobalValue.h" +using namespace llvm; + +#define DEBUG_TYPE "x86-pseudo" + +namespace { +class X86ExpandPseudo : public MachineFunctionPass { +public: + static char ID; + X86ExpandPseudo() : MachineFunctionPass(ID) {} + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addPreservedID(MachineLoopInfoID); + AU.addPreservedID(MachineDominatorsID); + MachineFunctionPass::getAnalysisUsage(AU); + } + + const X86Subtarget *STI; + const X86InstrInfo *TII; + const X86RegisterInfo *TRI; + const X86FrameLowering *X86FL; + + bool runOnMachineFunction(MachineFunction &Fn) override; + + const char *getPassName() const override { + return "X86 pseudo instruction expansion pass"; + } + +private: + bool ExpandMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI); + bool ExpandMBB(MachineBasicBlock &MBB); +}; +char X86ExpandPseudo::ID = 0; +} // End anonymous namespace. + +/// If \p MBBI is a pseudo instruction, this method expands +/// it to the corresponding (sequence of) actual instruction(s). +/// \returns true if \p MBBI has been expanded. +bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI) { + MachineInstr &MI = *MBBI; + unsigned Opcode = MI.getOpcode(); + DebugLoc DL = MBBI->getDebugLoc(); + switch (Opcode) { + default: + return false; + case X86::TCRETURNdi: + case X86::TCRETURNri: + case X86::TCRETURNmi: + case X86::TCRETURNdi64: + case X86::TCRETURNri64: + case X86::TCRETURNmi64: { + bool isMem = Opcode == X86::TCRETURNmi || Opcode == X86::TCRETURNmi64; + MachineOperand &JumpTarget = MBBI->getOperand(0); + MachineOperand &StackAdjust = MBBI->getOperand(isMem ? 5 : 1); + assert(StackAdjust.isImm() && "Expecting immediate value."); + + // Adjust stack pointer. + int StackAdj = StackAdjust.getImm(); + + if (StackAdj) { + // Check for possible merge with preceding ADD instruction. + StackAdj += X86FL->mergeSPUpdates(MBB, MBBI, true); + X86FL->emitSPUpdate(MBB, MBBI, StackAdj, /*InEpilogue=*/true); + } + + // Jump to label or value in register. + bool IsWin64 = STI->isTargetWin64(); + if (Opcode == X86::TCRETURNdi || Opcode == X86::TCRETURNdi64) { + unsigned Op = (Opcode == X86::TCRETURNdi) + ? X86::TAILJMPd + : (IsWin64 ? X86::TAILJMPd64_REX : X86::TAILJMPd64); + MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(Op)); + if (JumpTarget.isGlobal()) + MIB.addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset(), + JumpTarget.getTargetFlags()); + else { + assert(JumpTarget.isSymbol()); + MIB.addExternalSymbol(JumpTarget.getSymbolName(), + JumpTarget.getTargetFlags()); + } + } else if (Opcode == X86::TCRETURNmi || Opcode == X86::TCRETURNmi64) { + unsigned Op = (Opcode == X86::TCRETURNmi) + ? X86::TAILJMPm + : (IsWin64 ? X86::TAILJMPm64_REX : X86::TAILJMPm64); + MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(Op)); + for (unsigned i = 0; i != 5; ++i) + MIB.addOperand(MBBI->getOperand(i)); + } else if (Opcode == X86::TCRETURNri64) { + BuildMI(MBB, MBBI, DL, + TII->get(IsWin64 ? X86::TAILJMPr64_REX : X86::TAILJMPr64)) + .addReg(JumpTarget.getReg(), RegState::Kill); + } else { + BuildMI(MBB, MBBI, DL, TII->get(X86::TAILJMPr)) + .addReg(JumpTarget.getReg(), RegState::Kill); + } + + MachineInstr *NewMI = std::prev(MBBI); + NewMI->copyImplicitOps(*MBBI->getParent()->getParent(), MBBI); + + // Delete the pseudo instruction TCRETURN. 
+ MBB.erase(MBBI); + + return true; + } + case X86::EH_RETURN: + case X86::EH_RETURN64: { + MachineOperand &DestAddr = MBBI->getOperand(0); + assert(DestAddr.isReg() && "Offset should be in register!"); + const bool Uses64BitFramePtr = + STI->isTarget64BitLP64() || STI->isTargetNaCl64(); + unsigned StackPtr = TRI->getStackRegister(); + BuildMI(MBB, MBBI, DL, + TII->get(Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr), StackPtr) + .addReg(DestAddr.getReg()); + // The EH_RETURN pseudo is really removed during the MC Lowering. + return true; + } + case X86::IRET: { + // Adjust stack to erase error code + int64_t StackAdj = MBBI->getOperand(0).getImm(); + X86FL->emitSPUpdate(MBB, MBBI, StackAdj, true); + // Replace pseudo with machine iret + BuildMI(MBB, MBBI, DL, + TII->get(STI->is64Bit() ? X86::IRET64 : X86::IRET32)); + MBB.erase(MBBI); + return true; + } + case X86::EH_RESTORE: { + // Restore ESP and EBP, and optionally ESI if required. + bool IsSEH = isAsynchronousEHPersonality(classifyEHPersonality( + MBB.getParent()->getFunction()->getPersonalityFn())); + X86FL->restoreWin32EHStackPointers(MBB, MBBI, DL, /*RestoreSP=*/IsSEH); + MBBI->eraseFromParent(); + return true; + } + } + llvm_unreachable("Previous switch has a fallthrough?"); +} + +/// Expand all pseudo instructions contained in \p MBB. +/// \returns true if any expansion occurred for \p MBB. +bool X86ExpandPseudo::ExpandMBB(MachineBasicBlock &MBB) { + bool Modified = false; + + // MBBI may be invalidated by the expansion. + MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); + while (MBBI != E) { + MachineBasicBlock::iterator NMBBI = std::next(MBBI); + Modified |= ExpandMI(MBB, MBBI); + MBBI = NMBBI; + } + + return Modified; +} + +bool X86ExpandPseudo::runOnMachineFunction(MachineFunction &MF) { + STI = &static_cast<const X86Subtarget &>(MF.getSubtarget()); + TII = STI->getInstrInfo(); + TRI = STI->getRegisterInfo(); + X86FL = STI->getFrameLowering(); + + bool Modified = false; + for (MachineBasicBlock &MBB : MF) + Modified |= ExpandMBB(MBB); + return Modified; +} + +/// Returns an instance of the pseudo instruction expansion pass. +FunctionPass *llvm::createX86ExpandPseudoPass() { + return new X86ExpandPseudo(); +} diff --git a/contrib/llvm/lib/Target/X86/X86FastISel.cpp b/contrib/llvm/lib/Target/X86/X86FastISel.cpp new file mode 100644 index 0000000..629d4d3 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86FastISel.cpp @@ -0,0 +1,3607 @@ +//===-- X86FastISel.cpp - X86 FastISel implementation ---------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the X86-specific support for the FastISel class. Much +// of the target-specific code is generated by tablegen in the file +// X86GenFastISel.inc, which is #included here. 
+// +//===----------------------------------------------------------------------===// + +#include "X86.h" +#include "X86CallingConv.h" +#include "X86InstrBuilder.h" +#include "X86InstrInfo.h" +#include "X86MachineFunctionInfo.h" +#include "X86RegisterInfo.h" +#include "X86Subtarget.h" +#include "X86TargetMachine.h" +#include "llvm/Analysis/BranchProbabilityInfo.h" +#include "llvm/CodeGen/Analysis.h" +#include "llvm/CodeGen/FastISel.h" +#include "llvm/CodeGen/FunctionLoweringInfo.h" +#include "llvm/CodeGen/MachineConstantPool.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/CallSite.h" +#include "llvm/IR/CallingConv.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/GetElementPtrTypeIterator.h" +#include "llvm/IR/GlobalAlias.h" +#include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Operator.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Target/TargetOptions.h" +using namespace llvm; + +namespace { + +class X86FastISel final : public FastISel { + /// Subtarget - Keep a pointer to the X86Subtarget around so that we can + /// make the right decision when generating code for different targets. + const X86Subtarget *Subtarget; + + /// X86ScalarSSEf32, X86ScalarSSEf64 - Select between SSE or x87 + /// floating point ops. + /// When SSE is available, use it for f32 operations. + /// When SSE2 is available, use it for f64 operations. + bool X86ScalarSSEf64; + bool X86ScalarSSEf32; + +public: + explicit X86FastISel(FunctionLoweringInfo &funcInfo, + const TargetLibraryInfo *libInfo) + : FastISel(funcInfo, libInfo) { + Subtarget = &funcInfo.MF->getSubtarget<X86Subtarget>(); + X86ScalarSSEf64 = Subtarget->hasSSE2(); + X86ScalarSSEf32 = Subtarget->hasSSE1(); + } + + bool fastSelectInstruction(const Instruction *I) override; + + /// \brief The specified machine instr operand is a vreg, and that + /// vreg is being provided by the specified load instruction. If possible, + /// try to fold the load as an operand to the instruction, returning true if + /// possible. 
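+  ///
+  /// For illustration (a hypothetical MIR sequence), a load such as
+  ///   %vreg1 = MOV32rm <fi#0>, 1, %noreg, 0, %noreg
+  ///   %vreg2 = ADD32rr %vreg0, %vreg1
+  /// can, when %vreg1 has no other use, be folded into
+  ///   %vreg2 = ADD32rm %vreg0, <fi#0>, 1, %noreg, 0, %noreg
+  /// so that the separate load disappears.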
+ bool tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo, + const LoadInst *LI) override; + + bool fastLowerArguments() override; + bool fastLowerCall(CallLoweringInfo &CLI) override; + bool fastLowerIntrinsicCall(const IntrinsicInst *II) override; + +#include "X86GenFastISel.inc" + +private: + bool X86FastEmitCompare(const Value *LHS, const Value *RHS, EVT VT, DebugLoc DL); + + bool X86FastEmitLoad(EVT VT, X86AddressMode &AM, MachineMemOperand *MMO, + unsigned &ResultReg, unsigned Alignment = 1); + + bool X86FastEmitStore(EVT VT, const Value *Val, X86AddressMode &AM, + MachineMemOperand *MMO = nullptr, bool Aligned = false); + bool X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill, + X86AddressMode &AM, + MachineMemOperand *MMO = nullptr, bool Aligned = false); + + bool X86FastEmitExtend(ISD::NodeType Opc, EVT DstVT, unsigned Src, EVT SrcVT, + unsigned &ResultReg); + + bool X86SelectAddress(const Value *V, X86AddressMode &AM); + bool X86SelectCallAddress(const Value *V, X86AddressMode &AM); + + bool X86SelectLoad(const Instruction *I); + + bool X86SelectStore(const Instruction *I); + + bool X86SelectRet(const Instruction *I); + + bool X86SelectCmp(const Instruction *I); + + bool X86SelectZExt(const Instruction *I); + + bool X86SelectBranch(const Instruction *I); + + bool X86SelectShift(const Instruction *I); + + bool X86SelectDivRem(const Instruction *I); + + bool X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I); + + bool X86FastEmitSSESelect(MVT RetVT, const Instruction *I); + + bool X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I); + + bool X86SelectSelect(const Instruction *I); + + bool X86SelectTrunc(const Instruction *I); + + bool X86SelectFPExtOrFPTrunc(const Instruction *I, unsigned Opc, + const TargetRegisterClass *RC); + + bool X86SelectFPExt(const Instruction *I); + bool X86SelectFPTrunc(const Instruction *I); + bool X86SelectSIToFP(const Instruction *I); + + const X86InstrInfo *getInstrInfo() const { + return Subtarget->getInstrInfo(); + } + const X86TargetMachine *getTargetMachine() const { + return static_cast<const X86TargetMachine *>(&TM); + } + + bool handleConstantAddresses(const Value *V, X86AddressMode &AM); + + unsigned X86MaterializeInt(const ConstantInt *CI, MVT VT); + unsigned X86MaterializeFP(const ConstantFP *CFP, MVT VT); + unsigned X86MaterializeGV(const GlobalValue *GV, MVT VT); + unsigned fastMaterializeConstant(const Constant *C) override; + + unsigned fastMaterializeAlloca(const AllocaInst *C) override; + + unsigned fastMaterializeFloatZero(const ConstantFP *CF) override; + + /// isScalarFPTypeInSSEReg - Return true if the specified scalar FP type is + /// computed in an SSE register, not on the X87 floating point stack. + bool isScalarFPTypeInSSEReg(EVT VT) const { + return (VT == MVT::f64 && X86ScalarSSEf64) || // f64 is when SSE2 + (VT == MVT::f32 && X86ScalarSSEf32); // f32 is when SSE1 + } + + bool isTypeLegal(Type *Ty, MVT &VT, bool AllowI1 = false); + + bool IsMemcpySmall(uint64_t Len); + + bool TryEmitSmallMemcpy(X86AddressMode DestAM, + X86AddressMode SrcAM, uint64_t Len); + + bool foldX86XALUIntrinsic(X86::CondCode &CC, const Instruction *I, + const Value *Cond); + + const MachineInstrBuilder &addFullAddress(const MachineInstrBuilder &MIB, + X86AddressMode &AM); +}; + +} // end anonymous namespace. 
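+
+// For illustration (hypothetical IR), getX86ConditionCode below maps
+//   %c = icmp ugt i32 %a, %b
+// to {X86::COND_A, /*NeedSwap=*/false}, while
+//   %c = fcmp olt double %x, %y
+// yields {X86::COND_A, /*NeedSwap=*/true}: the operands are exchanged so that
+// the "above" form of the predicate can be used after the UCOMISD.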
+ +static std::pair<X86::CondCode, bool> +getX86ConditionCode(CmpInst::Predicate Predicate) { + X86::CondCode CC = X86::COND_INVALID; + bool NeedSwap = false; + switch (Predicate) { + default: break; + // Floating-point Predicates + case CmpInst::FCMP_UEQ: CC = X86::COND_E; break; + case CmpInst::FCMP_OLT: NeedSwap = true; // fall-through + case CmpInst::FCMP_OGT: CC = X86::COND_A; break; + case CmpInst::FCMP_OLE: NeedSwap = true; // fall-through + case CmpInst::FCMP_OGE: CC = X86::COND_AE; break; + case CmpInst::FCMP_UGT: NeedSwap = true; // fall-through + case CmpInst::FCMP_ULT: CC = X86::COND_B; break; + case CmpInst::FCMP_UGE: NeedSwap = true; // fall-through + case CmpInst::FCMP_ULE: CC = X86::COND_BE; break; + case CmpInst::FCMP_ONE: CC = X86::COND_NE; break; + case CmpInst::FCMP_UNO: CC = X86::COND_P; break; + case CmpInst::FCMP_ORD: CC = X86::COND_NP; break; + case CmpInst::FCMP_OEQ: // fall-through + case CmpInst::FCMP_UNE: CC = X86::COND_INVALID; break; + + // Integer Predicates + case CmpInst::ICMP_EQ: CC = X86::COND_E; break; + case CmpInst::ICMP_NE: CC = X86::COND_NE; break; + case CmpInst::ICMP_UGT: CC = X86::COND_A; break; + case CmpInst::ICMP_UGE: CC = X86::COND_AE; break; + case CmpInst::ICMP_ULT: CC = X86::COND_B; break; + case CmpInst::ICMP_ULE: CC = X86::COND_BE; break; + case CmpInst::ICMP_SGT: CC = X86::COND_G; break; + case CmpInst::ICMP_SGE: CC = X86::COND_GE; break; + case CmpInst::ICMP_SLT: CC = X86::COND_L; break; + case CmpInst::ICMP_SLE: CC = X86::COND_LE; break; + } + + return std::make_pair(CC, NeedSwap); +} + +static std::pair<unsigned, bool> +getX86SSEConditionCode(CmpInst::Predicate Predicate) { + unsigned CC; + bool NeedSwap = false; + + // SSE Condition code mapping: + // 0 - EQ + // 1 - LT + // 2 - LE + // 3 - UNORD + // 4 - NEQ + // 5 - NLT + // 6 - NLE + // 7 - ORD + switch (Predicate) { + default: llvm_unreachable("Unexpected predicate"); + case CmpInst::FCMP_OEQ: CC = 0; break; + case CmpInst::FCMP_OGT: NeedSwap = true; // fall-through + case CmpInst::FCMP_OLT: CC = 1; break; + case CmpInst::FCMP_OGE: NeedSwap = true; // fall-through + case CmpInst::FCMP_OLE: CC = 2; break; + case CmpInst::FCMP_UNO: CC = 3; break; + case CmpInst::FCMP_UNE: CC = 4; break; + case CmpInst::FCMP_ULE: NeedSwap = true; // fall-through + case CmpInst::FCMP_UGE: CC = 5; break; + case CmpInst::FCMP_ULT: NeedSwap = true; // fall-through + case CmpInst::FCMP_UGT: CC = 6; break; + case CmpInst::FCMP_ORD: CC = 7; break; + case CmpInst::FCMP_UEQ: + case CmpInst::FCMP_ONE: CC = 8; break; + } + + return std::make_pair(CC, NeedSwap); +} + +/// \brief Adds a complex addressing mode to the given machine instr builder. +/// Note, this will constrain the index register. If its not possible to +/// constrain the given index register, then a new one will be created. The +/// IndexReg field of the addressing mode will be updated to match in this case. +const MachineInstrBuilder & +X86FastISel::addFullAddress(const MachineInstrBuilder &MIB, + X86AddressMode &AM) { + // First constrain the index register. It needs to be a GR64_NOSP. + AM.IndexReg = constrainOperandRegClass(MIB->getDesc(), AM.IndexReg, + MIB->getNumOperands() + + X86::AddrIndexReg); + return ::addFullAddress(MIB, AM); +} + +/// \brief Check if it is possible to fold the condition from the XALU intrinsic +/// into the user. The condition code will only be updated on success. 
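+/// For illustration, the IR shape being matched looks like (hypothetical
+/// values):
+///   %res  = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b)
+///   %sum  = extractvalue { i32, i1 } %res, 0
+///   %obit = extractvalue { i32, i1 } %res, 1
+///   br i1 %obit, label %overflow, label %cont
+/// Here the overflow bit can be read directly from EFLAGS (X86::COND_O), so
+/// the branch does not need a separate SETO/TEST pair.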
+bool X86FastISel::foldX86XALUIntrinsic(X86::CondCode &CC, const Instruction *I, + const Value *Cond) { + if (!isa<ExtractValueInst>(Cond)) + return false; + + const auto *EV = cast<ExtractValueInst>(Cond); + if (!isa<IntrinsicInst>(EV->getAggregateOperand())) + return false; + + const auto *II = cast<IntrinsicInst>(EV->getAggregateOperand()); + MVT RetVT; + const Function *Callee = II->getCalledFunction(); + Type *RetTy = + cast<StructType>(Callee->getReturnType())->getTypeAtIndex(0U); + if (!isTypeLegal(RetTy, RetVT)) + return false; + + if (RetVT != MVT::i32 && RetVT != MVT::i64) + return false; + + X86::CondCode TmpCC; + switch (II->getIntrinsicID()) { + default: return false; + case Intrinsic::sadd_with_overflow: + case Intrinsic::ssub_with_overflow: + case Intrinsic::smul_with_overflow: + case Intrinsic::umul_with_overflow: TmpCC = X86::COND_O; break; + case Intrinsic::uadd_with_overflow: + case Intrinsic::usub_with_overflow: TmpCC = X86::COND_B; break; + } + + // Check if both instructions are in the same basic block. + if (II->getParent() != I->getParent()) + return false; + + // Make sure nothing is in the way + BasicBlock::const_iterator Start(I); + BasicBlock::const_iterator End(II); + for (auto Itr = std::prev(Start); Itr != End; --Itr) { + // We only expect extractvalue instructions between the intrinsic and the + // instruction to be selected. + if (!isa<ExtractValueInst>(Itr)) + return false; + + // Check that the extractvalue operand comes from the intrinsic. + const auto *EVI = cast<ExtractValueInst>(Itr); + if (EVI->getAggregateOperand() != II) + return false; + } + + CC = TmpCC; + return true; +} + +bool X86FastISel::isTypeLegal(Type *Ty, MVT &VT, bool AllowI1) { + EVT evt = TLI.getValueType(DL, Ty, /*HandleUnknown=*/true); + if (evt == MVT::Other || !evt.isSimple()) + // Unhandled type. Halt "fast" selection and bail. + return false; + + VT = evt.getSimpleVT(); + // For now, require SSE/SSE2 for performing floating-point operations, + // since x87 requires additional work. + if (VT == MVT::f64 && !X86ScalarSSEf64) + return false; + if (VT == MVT::f32 && !X86ScalarSSEf32) + return false; + // Similarly, no f80 support yet. + if (VT == MVT::f80) + return false; + // We only handle legal types. For example, on x86-32 the instruction + // selector contains all of the 64-bit instructions from x86-64, + // under the assumption that i64 won't be used if the target doesn't + // support it. + return (AllowI1 && VT == MVT::i1) || TLI.isTypeLegal(VT); +} + +#include "X86GenCallingConv.inc" + +/// X86FastEmitLoad - Emit a machine instruction to load a value of type VT. +/// The address is either pre-computed, i.e. Ptr, or a GlobalAddress, i.e. GV. +/// Return true and the result register by reference if it is possible. +bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM, + MachineMemOperand *MMO, unsigned &ResultReg, + unsigned Alignment) { + // Get opcode and regclass of the output for the given load instruction. + unsigned Opc = 0; + const TargetRegisterClass *RC = nullptr; + switch (VT.getSimpleVT().SimpleTy) { + default: return false; + case MVT::i1: + case MVT::i8: + Opc = X86::MOV8rm; + RC = &X86::GR8RegClass; + break; + case MVT::i16: + Opc = X86::MOV16rm; + RC = &X86::GR16RegClass; + break; + case MVT::i32: + Opc = X86::MOV32rm; + RC = &X86::GR32RegClass; + break; + case MVT::i64: + // Must be in x86-64 mode. + Opc = X86::MOV64rm; + RC = &X86::GR64RegClass; + break; + case MVT::f32: + if (X86ScalarSSEf32) { + Opc = Subtarget->hasAVX() ? 
X86::VMOVSSrm : X86::MOVSSrm; + RC = &X86::FR32RegClass; + } else { + Opc = X86::LD_Fp32m; + RC = &X86::RFP32RegClass; + } + break; + case MVT::f64: + if (X86ScalarSSEf64) { + Opc = Subtarget->hasAVX() ? X86::VMOVSDrm : X86::MOVSDrm; + RC = &X86::FR64RegClass; + } else { + Opc = X86::LD_Fp64m; + RC = &X86::RFP64RegClass; + } + break; + case MVT::f80: + // No f80 support yet. + return false; + case MVT::v4f32: + if (Alignment >= 16) + Opc = Subtarget->hasAVX() ? X86::VMOVAPSrm : X86::MOVAPSrm; + else + Opc = Subtarget->hasAVX() ? X86::VMOVUPSrm : X86::MOVUPSrm; + RC = &X86::VR128RegClass; + break; + case MVT::v2f64: + if (Alignment >= 16) + Opc = Subtarget->hasAVX() ? X86::VMOVAPDrm : X86::MOVAPDrm; + else + Opc = Subtarget->hasAVX() ? X86::VMOVUPDrm : X86::MOVUPDrm; + RC = &X86::VR128RegClass; + break; + case MVT::v4i32: + case MVT::v2i64: + case MVT::v8i16: + case MVT::v16i8: + if (Alignment >= 16) + Opc = Subtarget->hasAVX() ? X86::VMOVDQArm : X86::MOVDQArm; + else + Opc = Subtarget->hasAVX() ? X86::VMOVDQUrm : X86::MOVDQUrm; + RC = &X86::VR128RegClass; + break; + } + + ResultReg = createResultReg(RC); + MachineInstrBuilder MIB = + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg); + addFullAddress(MIB, AM); + if (MMO) + MIB->addMemOperand(*FuncInfo.MF, MMO); + return true; +} + +/// X86FastEmitStore - Emit a machine instruction to store a value Val of +/// type VT. The address is either pre-computed, consisted of a base ptr, Ptr +/// and a displacement offset, or a GlobalAddress, +/// i.e. V. Return true if it is possible. +bool X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill, + X86AddressMode &AM, + MachineMemOperand *MMO, bool Aligned) { + bool HasSSE2 = Subtarget->hasSSE2(); + bool HasSSE4A = Subtarget->hasSSE4A(); + bool HasAVX = Subtarget->hasAVX(); + bool IsNonTemporal = MMO && MMO->isNonTemporal(); + + // Get opcode and regclass of the output for the given store instruction. + unsigned Opc = 0; + switch (VT.getSimpleVT().SimpleTy) { + case MVT::f80: // No f80 support yet. + default: return false; + case MVT::i1: { + // Mask out all but lowest bit. + unsigned AndResult = createResultReg(&X86::GR8RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(X86::AND8ri), AndResult) + .addReg(ValReg, getKillRegState(ValIsKill)).addImm(1); + ValReg = AndResult; + } + // FALLTHROUGH, handling i1 as i8. + case MVT::i8: Opc = X86::MOV8mr; break; + case MVT::i16: Opc = X86::MOV16mr; break; + case MVT::i32: + Opc = (IsNonTemporal && HasSSE2) ? X86::MOVNTImr : X86::MOV32mr; + break; + case MVT::i64: + // Must be in x86-64 mode. + Opc = (IsNonTemporal && HasSSE2) ? X86::MOVNTI_64mr : X86::MOV64mr; + break; + case MVT::f32: + if (X86ScalarSSEf32) { + if (IsNonTemporal && HasSSE4A) + Opc = X86::MOVNTSS; + else + Opc = HasAVX ? X86::VMOVSSmr : X86::MOVSSmr; + } else + Opc = X86::ST_Fp32m; + break; + case MVT::f64: + if (X86ScalarSSEf32) { + if (IsNonTemporal && HasSSE4A) + Opc = X86::MOVNTSD; + else + Opc = HasAVX ? X86::VMOVSDmr : X86::MOVSDmr; + } else + Opc = X86::ST_Fp64m; + break; + case MVT::v4f32: + if (Aligned) { + if (IsNonTemporal) + Opc = HasAVX ? X86::VMOVNTPSmr : X86::MOVNTPSmr; + else + Opc = HasAVX ? X86::VMOVAPSmr : X86::MOVAPSmr; + } else + Opc = HasAVX ? X86::VMOVUPSmr : X86::MOVUPSmr; + break; + case MVT::v2f64: + if (Aligned) { + if (IsNonTemporal) + Opc = HasAVX ? X86::VMOVNTPDmr : X86::MOVNTPDmr; + else + Opc = HasAVX ? X86::VMOVAPDmr : X86::MOVAPDmr; + } else + Opc = HasAVX ? 
X86::VMOVUPDmr : X86::MOVUPDmr; + break; + case MVT::v4i32: + case MVT::v2i64: + case MVT::v8i16: + case MVT::v16i8: + if (Aligned) { + if (IsNonTemporal) + Opc = HasAVX ? X86::VMOVNTDQmr : X86::MOVNTDQmr; + else + Opc = HasAVX ? X86::VMOVDQAmr : X86::MOVDQAmr; + } else + Opc = Subtarget->hasAVX() ? X86::VMOVDQUmr : X86::MOVDQUmr; + break; + } + + MachineInstrBuilder MIB = + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc)); + addFullAddress(MIB, AM).addReg(ValReg, getKillRegState(ValIsKill)); + if (MMO) + MIB->addMemOperand(*FuncInfo.MF, MMO); + + return true; +} + +bool X86FastISel::X86FastEmitStore(EVT VT, const Value *Val, + X86AddressMode &AM, + MachineMemOperand *MMO, bool Aligned) { + // Handle 'null' like i32/i64 0. + if (isa<ConstantPointerNull>(Val)) + Val = Constant::getNullValue(DL.getIntPtrType(Val->getContext())); + + // If this is a store of a simple constant, fold the constant into the store. + if (const ConstantInt *CI = dyn_cast<ConstantInt>(Val)) { + unsigned Opc = 0; + bool Signed = true; + switch (VT.getSimpleVT().SimpleTy) { + default: break; + case MVT::i1: Signed = false; // FALLTHROUGH to handle as i8. + case MVT::i8: Opc = X86::MOV8mi; break; + case MVT::i16: Opc = X86::MOV16mi; break; + case MVT::i32: Opc = X86::MOV32mi; break; + case MVT::i64: + // Must be a 32-bit sign extended value. + if (isInt<32>(CI->getSExtValue())) + Opc = X86::MOV64mi32; + break; + } + + if (Opc) { + MachineInstrBuilder MIB = + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc)); + addFullAddress(MIB, AM).addImm(Signed ? (uint64_t) CI->getSExtValue() + : CI->getZExtValue()); + if (MMO) + MIB->addMemOperand(*FuncInfo.MF, MMO); + return true; + } + } + + unsigned ValReg = getRegForValue(Val); + if (ValReg == 0) + return false; + + bool ValKill = hasTrivialKill(Val); + return X86FastEmitStore(VT, ValReg, ValKill, AM, MMO, Aligned); +} + +/// X86FastEmitExtend - Emit a machine instruction to extend a value Src of +/// type SrcVT to type DstVT using the specified extension opcode Opc (e.g. +/// ISD::SIGN_EXTEND). +bool X86FastISel::X86FastEmitExtend(ISD::NodeType Opc, EVT DstVT, + unsigned Src, EVT SrcVT, + unsigned &ResultReg) { + unsigned RR = fastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(), Opc, + Src, /*TODO: Kill=*/false); + if (RR == 0) + return false; + + ResultReg = RR; + return true; +} + +bool X86FastISel::handleConstantAddresses(const Value *V, X86AddressMode &AM) { + // Handle constant address. + if (const GlobalValue *GV = dyn_cast<GlobalValue>(V)) { + // Can't handle alternate code models yet. + if (TM.getCodeModel() != CodeModel::Small) + return false; + + // Can't handle TLS yet. + if (GV->isThreadLocal()) + return false; + + // RIP-relative addresses can't have additional register operands, so if + // we've already folded stuff into the addressing mode, just force the + // global value into its own register, which we can use as the basereg. + if (!Subtarget->isPICStyleRIPRel() || + (AM.Base.Reg == 0 && AM.IndexReg == 0)) { + // Okay, we've committed to selecting this global. Set up the address. + AM.GV = GV; + + // Allow the subtarget to classify the global. + unsigned char GVFlags = Subtarget->ClassifyGlobalReference(GV, TM); + + // If this reference is relative to the pic base, set it now. + if (isGlobalRelativeToPICBase(GVFlags)) { + // FIXME: How do we know Base.Reg is free?? + AM.Base.Reg = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF); + } + + // Unless the ABI requires an extra load, return a direct reference to + // the global. 
+ if (!isGlobalStubReference(GVFlags)) { + if (Subtarget->isPICStyleRIPRel()) { + // Use rip-relative addressing if we can. Above we verified that the + // base and index registers are unused. + assert(AM.Base.Reg == 0 && AM.IndexReg == 0); + AM.Base.Reg = X86::RIP; + } + AM.GVOpFlags = GVFlags; + return true; + } + + // Ok, we need to do a load from a stub. If we've already loaded from + // this stub, reuse the loaded pointer, otherwise emit the load now. + DenseMap<const Value *, unsigned>::iterator I = LocalValueMap.find(V); + unsigned LoadReg; + if (I != LocalValueMap.end() && I->second != 0) { + LoadReg = I->second; + } else { + // Issue load from stub. + unsigned Opc = 0; + const TargetRegisterClass *RC = nullptr; + X86AddressMode StubAM; + StubAM.Base.Reg = AM.Base.Reg; + StubAM.GV = GV; + StubAM.GVOpFlags = GVFlags; + + // Prepare for inserting code in the local-value area. + SavePoint SaveInsertPt = enterLocalValueArea(); + + if (TLI.getPointerTy(DL) == MVT::i64) { + Opc = X86::MOV64rm; + RC = &X86::GR64RegClass; + + if (Subtarget->isPICStyleRIPRel()) + StubAM.Base.Reg = X86::RIP; + } else { + Opc = X86::MOV32rm; + RC = &X86::GR32RegClass; + } + + LoadReg = createResultReg(RC); + MachineInstrBuilder LoadMI = + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), LoadReg); + addFullAddress(LoadMI, StubAM); + + // Ok, back to normal mode. + leaveLocalValueArea(SaveInsertPt); + + // Prevent loading GV stub multiple times in same MBB. + LocalValueMap[V] = LoadReg; + } + + // Now construct the final address. Note that the Disp, Scale, + // and Index values may already be set here. + AM.Base.Reg = LoadReg; + AM.GV = nullptr; + return true; + } + } + + // If all else fails, try to materialize the value in a register. + if (!AM.GV || !Subtarget->isPICStyleRIPRel()) { + if (AM.Base.Reg == 0) { + AM.Base.Reg = getRegForValue(V); + return AM.Base.Reg != 0; + } + if (AM.IndexReg == 0) { + assert(AM.Scale == 1 && "Scale with no index!"); + AM.IndexReg = getRegForValue(V); + return AM.IndexReg != 0; + } + } + + return false; +} + +/// X86SelectAddress - Attempt to fill in an address from the given value. +/// +bool X86FastISel::X86SelectAddress(const Value *V, X86AddressMode &AM) { + SmallVector<const Value *, 32> GEPs; +redo_gep: + const User *U = nullptr; + unsigned Opcode = Instruction::UserOp1; + if (const Instruction *I = dyn_cast<Instruction>(V)) { + // Don't walk into other basic blocks; it's possible we haven't + // visited them yet, so the instructions may not yet be assigned + // virtual registers. + if (FuncInfo.StaticAllocaMap.count(static_cast<const AllocaInst *>(V)) || + FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB) { + Opcode = I->getOpcode(); + U = I; + } + } else if (const ConstantExpr *C = dyn_cast<ConstantExpr>(V)) { + Opcode = C->getOpcode(); + U = C; + } + + if (PointerType *Ty = dyn_cast<PointerType>(V->getType())) + if (Ty->getAddressSpace() > 255) + // Fast instruction selection doesn't support the special + // address spaces. + return false; + + switch (Opcode) { + default: break; + case Instruction::BitCast: + // Look past bitcasts. + return X86SelectAddress(U->getOperand(0), AM); + + case Instruction::IntToPtr: + // Look past no-op inttoptrs. + if (TLI.getValueType(DL, U->getOperand(0)->getType()) == + TLI.getPointerTy(DL)) + return X86SelectAddress(U->getOperand(0), AM); + break; + + case Instruction::PtrToInt: + // Look past no-op ptrtoints. 
+ if (TLI.getValueType(DL, U->getType()) == TLI.getPointerTy(DL)) + return X86SelectAddress(U->getOperand(0), AM); + break; + + case Instruction::Alloca: { + // Do static allocas. + const AllocaInst *A = cast<AllocaInst>(V); + DenseMap<const AllocaInst *, int>::iterator SI = + FuncInfo.StaticAllocaMap.find(A); + if (SI != FuncInfo.StaticAllocaMap.end()) { + AM.BaseType = X86AddressMode::FrameIndexBase; + AM.Base.FrameIndex = SI->second; + return true; + } + break; + } + + case Instruction::Add: { + // Adds of constants are common and easy enough. + if (const ConstantInt *CI = dyn_cast<ConstantInt>(U->getOperand(1))) { + uint64_t Disp = (int32_t)AM.Disp + (uint64_t)CI->getSExtValue(); + // They have to fit in the 32-bit signed displacement field though. + if (isInt<32>(Disp)) { + AM.Disp = (uint32_t)Disp; + return X86SelectAddress(U->getOperand(0), AM); + } + } + break; + } + + case Instruction::GetElementPtr: { + X86AddressMode SavedAM = AM; + + // Pattern-match simple GEPs. + uint64_t Disp = (int32_t)AM.Disp; + unsigned IndexReg = AM.IndexReg; + unsigned Scale = AM.Scale; + gep_type_iterator GTI = gep_type_begin(U); + // Iterate through the indices, folding what we can. Constants can be + // folded, and one dynamic index can be handled, if the scale is supported. + for (User::const_op_iterator i = U->op_begin() + 1, e = U->op_end(); + i != e; ++i, ++GTI) { + const Value *Op = *i; + if (StructType *STy = dyn_cast<StructType>(*GTI)) { + const StructLayout *SL = DL.getStructLayout(STy); + Disp += SL->getElementOffset(cast<ConstantInt>(Op)->getZExtValue()); + continue; + } + + // A array/variable index is always of the form i*S where S is the + // constant scale size. See if we can push the scale into immediates. + uint64_t S = DL.getTypeAllocSize(GTI.getIndexedType()); + for (;;) { + if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) { + // Constant-offset addressing. + Disp += CI->getSExtValue() * S; + break; + } + if (canFoldAddIntoGEP(U, Op)) { + // A compatible add with a constant operand. Fold the constant. + ConstantInt *CI = + cast<ConstantInt>(cast<AddOperator>(Op)->getOperand(1)); + Disp += CI->getSExtValue() * S; + // Iterate on the other operand. + Op = cast<AddOperator>(Op)->getOperand(0); + continue; + } + if (IndexReg == 0 && + (!AM.GV || !Subtarget->isPICStyleRIPRel()) && + (S == 1 || S == 2 || S == 4 || S == 8)) { + // Scaled-index addressing. + Scale = S; + IndexReg = getRegForGEPIndex(Op).first; + if (IndexReg == 0) + return false; + break; + } + // Unsupported. + goto unsupported_gep; + } + } + + // Check for displacement overflow. + if (!isInt<32>(Disp)) + break; + + AM.IndexReg = IndexReg; + AM.Scale = Scale; + AM.Disp = (uint32_t)Disp; + GEPs.push_back(V); + + if (const GetElementPtrInst *GEP = + dyn_cast<GetElementPtrInst>(U->getOperand(0))) { + // Ok, the GEP indices were covered by constant-offset and scaled-index + // addressing. Update the address state and move on to examining the base. + V = GEP; + goto redo_gep; + } else if (X86SelectAddress(U->getOperand(0), AM)) { + return true; + } + + // If we couldn't merge the gep value into this addr mode, revert back to + // our address and just match the value instead of completely failing. + AM = SavedAM; + + for (SmallVectorImpl<const Value *>::reverse_iterator + I = GEPs.rbegin(), E = GEPs.rend(); I != E; ++I) + if (handleConstantAddresses(*I, AM)) + return true; + + return false; + unsupported_gep: + // Ok, the GEP indices weren't all covered. 
+      break;
+  }
+  }
+
+  return handleConstantAddresses(V, AM);
+}
+
+/// X86SelectCallAddress - Attempt to fill in an address from the given value.
+///
+bool X86FastISel::X86SelectCallAddress(const Value *V, X86AddressMode &AM) {
+  const User *U = nullptr;
+  unsigned Opcode = Instruction::UserOp1;
+  const Instruction *I = dyn_cast<Instruction>(V);
+  // Record if the value is defined in the same basic block.
+  //
+  // This information is crucial to know whether or not folding an
+  // operand is valid.
+  // Indeed, FastISel generates or reuses a virtual register for all
+  // operands of all instructions it selects. Obviously, the definition and
+  // its uses must use the same virtual register, otherwise the produced
+  // code is incorrect.
+  // Before instruction selection, FunctionLoweringInfo::set sets the virtual
+  // registers for values that are alive across basic blocks. This ensures
+  // that the values are set consistently across basic blocks, even if
+  // different instruction selection mechanisms are used (e.g., a mix of
+  // SDISel and FastISel).
+  // For values local to a basic block, the instruction selection process
+  // generates these virtual registers with whatever method is appropriate
+  // for its needs. In particular, FastISel and SDISel do not share the way
+  // local virtual registers are set.
+  // Therefore, it is impossible (or at least unsafe) to share values
+  // between basic blocks unless they use the same instruction selection
+  // method, which is not guaranteed for X86.
+  // Moreover, things like hasOneUse cannot be used accurately if we allow
+  // references to values across basic blocks when those values are not
+  // alive across basic blocks to begin with.
+  bool InMBB = true;
+  if (I) {
+    Opcode = I->getOpcode();
+    U = I;
+    InMBB = I->getParent() == FuncInfo.MBB->getBasicBlock();
+  } else if (const ConstantExpr *C = dyn_cast<ConstantExpr>(V)) {
+    Opcode = C->getOpcode();
+    U = C;
+  }
+
+  switch (Opcode) {
+  default: break;
+  case Instruction::BitCast:
+    // Look past bitcasts if the operand is in the same BB.
+    if (InMBB)
+      return X86SelectCallAddress(U->getOperand(0), AM);
+    break;
+
+  case Instruction::IntToPtr:
+    // Look past no-op inttoptrs if the operand is in the same BB.
+    if (InMBB &&
+        TLI.getValueType(DL, U->getOperand(0)->getType()) ==
+            TLI.getPointerTy(DL))
+      return X86SelectCallAddress(U->getOperand(0), AM);
+    break;
+
+  case Instruction::PtrToInt:
+    // Look past no-op ptrtoints if the operand is in the same BB.
+    if (InMBB && TLI.getValueType(DL, U->getType()) == TLI.getPointerTy(DL))
+      return X86SelectCallAddress(U->getOperand(0), AM);
+    break;
+  }
+
+  // Handle constant address.
+  if (const GlobalValue *GV = dyn_cast<GlobalValue>(V)) {
+    // Can't handle alternate code models yet.
+    if (TM.getCodeModel() != CodeModel::Small)
+      return false;
+
+    // RIP-relative addresses can't have additional register operands.
+    if (Subtarget->isPICStyleRIPRel() &&
+        (AM.Base.Reg != 0 || AM.IndexReg != 0))
+      return false;
+
+    // Can't handle DLL Import.
+    if (GV->hasDLLImportStorageClass())
+      return false;
+
+    // Can't handle TLS.
+    if (const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV))
+      if (GVar->isThreadLocal())
+        return false;
+
+    // Okay, we've committed to selecting this global. Set up the basic address.
+    AM.GV = GV;
+
+    // No ABI requires an extra load for anything other than DLLImport, which
+    // we rejected above. Return a direct reference to the global.
+    if (Subtarget->isPICStyleRIPRel()) {
+      // Use rip-relative addressing if we can.
Above we verified that the + // base and index registers are unused. + assert(AM.Base.Reg == 0 && AM.IndexReg == 0); + AM.Base.Reg = X86::RIP; + } else if (Subtarget->isPICStyleStubPIC()) { + AM.GVOpFlags = X86II::MO_PIC_BASE_OFFSET; + } else if (Subtarget->isPICStyleGOT()) { + AM.GVOpFlags = X86II::MO_GOTOFF; + } + + return true; + } + + // If all else fails, try to materialize the value in a register. + if (!AM.GV || !Subtarget->isPICStyleRIPRel()) { + if (AM.Base.Reg == 0) { + AM.Base.Reg = getRegForValue(V); + return AM.Base.Reg != 0; + } + if (AM.IndexReg == 0) { + assert(AM.Scale == 1 && "Scale with no index!"); + AM.IndexReg = getRegForValue(V); + return AM.IndexReg != 0; + } + } + + return false; +} + + +/// X86SelectStore - Select and emit code to implement store instructions. +bool X86FastISel::X86SelectStore(const Instruction *I) { + // Atomic stores need special handling. + const StoreInst *S = cast<StoreInst>(I); + + if (S->isAtomic()) + return false; + + const Value *Val = S->getValueOperand(); + const Value *Ptr = S->getPointerOperand(); + + MVT VT; + if (!isTypeLegal(Val->getType(), VT, /*AllowI1=*/true)) + return false; + + unsigned Alignment = S->getAlignment(); + unsigned ABIAlignment = DL.getABITypeAlignment(Val->getType()); + if (Alignment == 0) // Ensure that codegen never sees alignment 0 + Alignment = ABIAlignment; + bool Aligned = Alignment >= ABIAlignment; + + X86AddressMode AM; + if (!X86SelectAddress(Ptr, AM)) + return false; + + return X86FastEmitStore(VT, Val, AM, createMachineMemOperandFor(I), Aligned); +} + +/// X86SelectRet - Select and emit code to implement ret instructions. +bool X86FastISel::X86SelectRet(const Instruction *I) { + const ReturnInst *Ret = cast<ReturnInst>(I); + const Function &F = *I->getParent()->getParent(); + const X86MachineFunctionInfo *X86MFInfo = + FuncInfo.MF->getInfo<X86MachineFunctionInfo>(); + + if (!FuncInfo.CanLowerReturn) + return false; + + CallingConv::ID CC = F.getCallingConv(); + if (CC != CallingConv::C && + CC != CallingConv::Fast && + CC != CallingConv::X86_FastCall && + CC != CallingConv::X86_64_SysV) + return false; + + if (Subtarget->isCallingConvWin64(CC)) + return false; + + // Don't handle popping bytes on return for now. + if (X86MFInfo->getBytesToPopOnReturn() != 0) + return false; + + // fastcc with -tailcallopt is intended to provide a guaranteed + // tail call optimization. Fastisel doesn't know how to do that. + if (CC == CallingConv::Fast && TM.Options.GuaranteedTailCallOpt) + return false; + + // Let SDISel handle vararg functions. + if (F.isVarArg()) + return false; + + // Build a list of return value registers. + SmallVector<unsigned, 4> RetRegs; + + if (Ret->getNumOperands() > 0) { + SmallVector<ISD::OutputArg, 4> Outs; + GetReturnInfo(F.getReturnType(), F.getAttributes(), Outs, TLI, DL); + + // Analyze operands of the call, assigning locations to each operand. + SmallVector<CCValAssign, 16> ValLocs; + CCState CCInfo(CC, F.isVarArg(), *FuncInfo.MF, ValLocs, I->getContext()); + CCInfo.AnalyzeReturn(Outs, RetCC_X86); + + const Value *RV = Ret->getOperand(0); + unsigned Reg = getRegForValue(RV); + if (Reg == 0) + return false; + + // Only handle a single return value for now. + if (ValLocs.size() != 1) + return false; + + CCValAssign &VA = ValLocs[0]; + + // Don't bother handling odd stuff for now. + if (VA.getLocInfo() != CCValAssign::Full) + return false; + // Only handle register returns for now. 
+ if (!VA.isRegLoc()) + return false; + + // The calling-convention tables for x87 returns don't tell + // the whole story. + if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) + return false; + + unsigned SrcReg = Reg + VA.getValNo(); + EVT SrcVT = TLI.getValueType(DL, RV->getType()); + EVT DstVT = VA.getValVT(); + // Special handling for extended integers. + if (SrcVT != DstVT) { + if (SrcVT != MVT::i1 && SrcVT != MVT::i8 && SrcVT != MVT::i16) + return false; + + if (!Outs[0].Flags.isZExt() && !Outs[0].Flags.isSExt()) + return false; + + assert(DstVT == MVT::i32 && "X86 should always ext to i32"); + + if (SrcVT == MVT::i1) { + if (Outs[0].Flags.isSExt()) + return false; + SrcReg = fastEmitZExtFromI1(MVT::i8, SrcReg, /*TODO: Kill=*/false); + SrcVT = MVT::i8; + } + unsigned Op = Outs[0].Flags.isZExt() ? ISD::ZERO_EXTEND : + ISD::SIGN_EXTEND; + SrcReg = fastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(), Op, + SrcReg, /*TODO: Kill=*/false); + } + + // Make the copy. + unsigned DstReg = VA.getLocReg(); + const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg); + // Avoid a cross-class copy. This is very unlikely. + if (!SrcRC->contains(DstReg)) + return false; + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), DstReg).addReg(SrcReg); + + // Add register to return instruction. + RetRegs.push_back(VA.getLocReg()); + } + + // All x86 ABIs require that for returning structs by value we copy + // the sret argument into %rax/%eax (depending on ABI) for the return. + // We saved the argument into a virtual register in the entry block, + // so now we copy the value out and into %rax/%eax. + if (F.hasStructRetAttr()) { + unsigned Reg = X86MFInfo->getSRetReturnReg(); + assert(Reg && + "SRetReturnReg should have been set in LowerFormalArguments()!"); + unsigned RetReg = Subtarget->is64Bit() ? X86::RAX : X86::EAX; + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), RetReg).addReg(Reg); + RetRegs.push_back(RetReg); + } + + // Now emit the RET. + MachineInstrBuilder MIB = + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(Subtarget->is64Bit() ? X86::RETQ : X86::RETL)); + for (unsigned i = 0, e = RetRegs.size(); i != e; ++i) + MIB.addReg(RetRegs[i], RegState::Implicit); + return true; +} + +/// X86SelectLoad - Select and emit code to implement load instructions. +/// +bool X86FastISel::X86SelectLoad(const Instruction *I) { + const LoadInst *LI = cast<LoadInst>(I); + + // Atomic loads need special handling. 
+ if (LI->isAtomic()) + return false; + + MVT VT; + if (!isTypeLegal(LI->getType(), VT, /*AllowI1=*/true)) + return false; + + const Value *Ptr = LI->getPointerOperand(); + + X86AddressMode AM; + if (!X86SelectAddress(Ptr, AM)) + return false; + + unsigned Alignment = LI->getAlignment(); + unsigned ABIAlignment = DL.getABITypeAlignment(LI->getType()); + if (Alignment == 0) // Ensure that codegen never sees alignment 0 + Alignment = ABIAlignment; + + unsigned ResultReg = 0; + if (!X86FastEmitLoad(VT, AM, createMachineMemOperandFor(LI), ResultReg, + Alignment)) + return false; + + updateValueMap(I, ResultReg); + return true; +} + +static unsigned X86ChooseCmpOpcode(EVT VT, const X86Subtarget *Subtarget) { + bool HasAVX = Subtarget->hasAVX(); + bool X86ScalarSSEf32 = Subtarget->hasSSE1(); + bool X86ScalarSSEf64 = Subtarget->hasSSE2(); + + switch (VT.getSimpleVT().SimpleTy) { + default: return 0; + case MVT::i8: return X86::CMP8rr; + case MVT::i16: return X86::CMP16rr; + case MVT::i32: return X86::CMP32rr; + case MVT::i64: return X86::CMP64rr; + case MVT::f32: + return X86ScalarSSEf32 ? (HasAVX ? X86::VUCOMISSrr : X86::UCOMISSrr) : 0; + case MVT::f64: + return X86ScalarSSEf64 ? (HasAVX ? X86::VUCOMISDrr : X86::UCOMISDrr) : 0; + } +} + +/// If we have a comparison with RHS as the RHS of the comparison, return an +/// opcode that works for the compare (e.g. CMP32ri) otherwise return 0. +static unsigned X86ChooseCmpImmediateOpcode(EVT VT, const ConstantInt *RHSC) { + int64_t Val = RHSC->getSExtValue(); + switch (VT.getSimpleVT().SimpleTy) { + // Otherwise, we can't fold the immediate into this comparison. + default: + return 0; + case MVT::i8: + return X86::CMP8ri; + case MVT::i16: + if (isInt<8>(Val)) + return X86::CMP16ri8; + return X86::CMP16ri; + case MVT::i32: + if (isInt<8>(Val)) + return X86::CMP32ri8; + return X86::CMP32ri; + case MVT::i64: + if (isInt<8>(Val)) + return X86::CMP64ri8; + // 64-bit comparisons are only valid if the immediate fits in a 32-bit sext + // field. + if (isInt<32>(Val)) + return X86::CMP64ri32; + return 0; + } +} + +bool X86FastISel::X86FastEmitCompare(const Value *Op0, const Value *Op1, + EVT VT, DebugLoc CurDbgLoc) { + unsigned Op0Reg = getRegForValue(Op0); + if (Op0Reg == 0) return false; + + // Handle 'null' like i32/i64 0. + if (isa<ConstantPointerNull>(Op1)) + Op1 = Constant::getNullValue(DL.getIntPtrType(Op0->getContext())); + + // We have two options: compare with register or immediate. If the RHS of + // the compare is an immediate that we can fold into this compare, use + // CMPri, otherwise use CMPrr. + if (const ConstantInt *Op1C = dyn_cast<ConstantInt>(Op1)) { + if (unsigned CompareImmOpc = X86ChooseCmpImmediateOpcode(VT, Op1C)) { + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, CurDbgLoc, TII.get(CompareImmOpc)) + .addReg(Op0Reg) + .addImm(Op1C->getSExtValue()); + return true; + } + } + + unsigned CompareOpc = X86ChooseCmpOpcode(VT, Subtarget); + if (CompareOpc == 0) return false; + + unsigned Op1Reg = getRegForValue(Op1); + if (Op1Reg == 0) return false; + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, CurDbgLoc, TII.get(CompareOpc)) + .addReg(Op0Reg) + .addReg(Op1Reg); + + return true; +} + +bool X86FastISel::X86SelectCmp(const Instruction *I) { + const CmpInst *CI = cast<CmpInst>(I); + + MVT VT; + if (!isTypeLegal(I->getOperand(0)->getType(), VT)) + return false; + + // Try to optimize or fold the cmp. 
+ CmpInst::Predicate Predicate = optimizeCmpPredicate(CI); + unsigned ResultReg = 0; + switch (Predicate) { + default: break; + case CmpInst::FCMP_FALSE: { + ResultReg = createResultReg(&X86::GR32RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV32r0), + ResultReg); + ResultReg = fastEmitInst_extractsubreg(MVT::i8, ResultReg, /*Kill=*/true, + X86::sub_8bit); + if (!ResultReg) + return false; + break; + } + case CmpInst::FCMP_TRUE: { + ResultReg = createResultReg(&X86::GR8RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV8ri), + ResultReg).addImm(1); + break; + } + } + + if (ResultReg) { + updateValueMap(I, ResultReg); + return true; + } + + const Value *LHS = CI->getOperand(0); + const Value *RHS = CI->getOperand(1); + + // The optimizer might have replaced fcmp oeq %x, %x with fcmp ord %x, 0.0. + // We don't have to materialize a zero constant for this case and can just use + // %x again on the RHS. + if (Predicate == CmpInst::FCMP_ORD || Predicate == CmpInst::FCMP_UNO) { + const auto *RHSC = dyn_cast<ConstantFP>(RHS); + if (RHSC && RHSC->isNullValue()) + RHS = LHS; + } + + // FCMP_OEQ and FCMP_UNE cannot be checked with a single instruction. + static unsigned SETFOpcTable[2][3] = { + { X86::SETEr, X86::SETNPr, X86::AND8rr }, + { X86::SETNEr, X86::SETPr, X86::OR8rr } + }; + unsigned *SETFOpc = nullptr; + switch (Predicate) { + default: break; + case CmpInst::FCMP_OEQ: SETFOpc = &SETFOpcTable[0][0]; break; + case CmpInst::FCMP_UNE: SETFOpc = &SETFOpcTable[1][0]; break; + } + + ResultReg = createResultReg(&X86::GR8RegClass); + if (SETFOpc) { + if (!X86FastEmitCompare(LHS, RHS, VT, I->getDebugLoc())) + return false; + + unsigned FlagReg1 = createResultReg(&X86::GR8RegClass); + unsigned FlagReg2 = createResultReg(&X86::GR8RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[0]), + FlagReg1); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[1]), + FlagReg2); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[2]), + ResultReg).addReg(FlagReg1).addReg(FlagReg2); + updateValueMap(I, ResultReg); + return true; + } + + X86::CondCode CC; + bool SwapArgs; + std::tie(CC, SwapArgs) = getX86ConditionCode(Predicate); + assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code."); + unsigned Opc = X86::getSETFromCond(CC); + + if (SwapArgs) + std::swap(LHS, RHS); + + // Emit a compare of LHS/RHS. + if (!X86FastEmitCompare(LHS, RHS, VT, I->getDebugLoc())) + return false; + + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg); + updateValueMap(I, ResultReg); + return true; +} + +bool X86FastISel::X86SelectZExt(const Instruction *I) { + EVT DstVT = TLI.getValueType(DL, I->getType()); + if (!TLI.isTypeLegal(DstVT)) + return false; + + unsigned ResultReg = getRegForValue(I->getOperand(0)); + if (ResultReg == 0) + return false; + + // Handle zero-extension from i1 to i8, which is common. + MVT SrcVT = TLI.getSimpleValueType(DL, I->getOperand(0)->getType()); + if (SrcVT.SimpleTy == MVT::i1) { + // Set the high bits to zero. + ResultReg = fastEmitZExtFromI1(MVT::i8, ResultReg, /*TODO: Kill=*/false); + SrcVT = MVT::i8; + + if (ResultReg == 0) + return false; + } + + if (DstVT == MVT::i64) { + // Handle extension to 64-bits via sub-register shenanigans. 
+ unsigned MovInst; + + switch (SrcVT.SimpleTy) { + case MVT::i8: MovInst = X86::MOVZX32rr8; break; + case MVT::i16: MovInst = X86::MOVZX32rr16; break; + case MVT::i32: MovInst = X86::MOV32rr; break; + default: llvm_unreachable("Unexpected zext to i64 source type"); + } + + unsigned Result32 = createResultReg(&X86::GR32RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(MovInst), Result32) + .addReg(ResultReg); + + ResultReg = createResultReg(&X86::GR64RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::SUBREG_TO_REG), + ResultReg) + .addImm(0).addReg(Result32).addImm(X86::sub_32bit); + } else if (DstVT != MVT::i8) { + ResultReg = fastEmit_r(MVT::i8, DstVT.getSimpleVT(), ISD::ZERO_EXTEND, + ResultReg, /*Kill=*/true); + if (ResultReg == 0) + return false; + } + + updateValueMap(I, ResultReg); + return true; +} + +bool X86FastISel::X86SelectBranch(const Instruction *I) { + // Unconditional branches are selected by tablegen-generated code. + // Handle a conditional branch. + const BranchInst *BI = cast<BranchInst>(I); + MachineBasicBlock *TrueMBB = FuncInfo.MBBMap[BI->getSuccessor(0)]; + MachineBasicBlock *FalseMBB = FuncInfo.MBBMap[BI->getSuccessor(1)]; + + // Fold the common case of a conditional branch with a comparison + // in the same block (values defined on other blocks may not have + // initialized registers). + X86::CondCode CC; + if (const CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition())) { + if (CI->hasOneUse() && CI->getParent() == I->getParent()) { + EVT VT = TLI.getValueType(DL, CI->getOperand(0)->getType()); + + // Try to optimize or fold the cmp. + CmpInst::Predicate Predicate = optimizeCmpPredicate(CI); + switch (Predicate) { + default: break; + case CmpInst::FCMP_FALSE: fastEmitBranch(FalseMBB, DbgLoc); return true; + case CmpInst::FCMP_TRUE: fastEmitBranch(TrueMBB, DbgLoc); return true; + } + + const Value *CmpLHS = CI->getOperand(0); + const Value *CmpRHS = CI->getOperand(1); + + // The optimizer might have replaced fcmp oeq %x, %x with fcmp ord %x, + // 0.0. + // We don't have to materialize a zero constant for this case and can just + // use %x again on the RHS. + if (Predicate == CmpInst::FCMP_ORD || Predicate == CmpInst::FCMP_UNO) { + const auto *CmpRHSC = dyn_cast<ConstantFP>(CmpRHS); + if (CmpRHSC && CmpRHSC->isNullValue()) + CmpRHS = CmpLHS; + } + + // Try to take advantage of fallthrough opportunities. + if (FuncInfo.MBB->isLayoutSuccessor(TrueMBB)) { + std::swap(TrueMBB, FalseMBB); + Predicate = CmpInst::getInversePredicate(Predicate); + } + + // FCMP_OEQ and FCMP_UNE cannot be expressed with a single flag/condition + // code check. Instead two branch instructions are required to check all + // the flags. First we change the predicate to a supported condition code, + // which will be the first branch. Later one we will emit the second + // branch. + bool NeedExtraBranch = false; + switch (Predicate) { + default: break; + case CmpInst::FCMP_OEQ: + std::swap(TrueMBB, FalseMBB); // fall-through + case CmpInst::FCMP_UNE: + NeedExtraBranch = true; + Predicate = CmpInst::FCMP_ONE; + break; + } + + bool SwapArgs; + unsigned BranchOpc; + std::tie(CC, SwapArgs) = getX86ConditionCode(Predicate); + assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code."); + + BranchOpc = X86::GetCondBranchFromCond(CC); + if (SwapArgs) + std::swap(CmpLHS, CmpRHS); + + // Emit a compare of the LHS and RHS, setting the flags. 
+ if (!X86FastEmitCompare(CmpLHS, CmpRHS, VT, CI->getDebugLoc())) + return false; + + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(BranchOpc)) + .addMBB(TrueMBB); + + // X86 requires a second branch to handle UNE (and OEQ, which is mapped + // to UNE above). + if (NeedExtraBranch) { + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JP_1)) + .addMBB(TrueMBB); + } + + finishCondBranch(BI->getParent(), TrueMBB, FalseMBB); + return true; + } + } else if (TruncInst *TI = dyn_cast<TruncInst>(BI->getCondition())) { + // Handle things like "%cond = trunc i32 %X to i1 / br i1 %cond", which + // typically happen for _Bool and C++ bools. + MVT SourceVT; + if (TI->hasOneUse() && TI->getParent() == I->getParent() && + isTypeLegal(TI->getOperand(0)->getType(), SourceVT)) { + unsigned TestOpc = 0; + switch (SourceVT.SimpleTy) { + default: break; + case MVT::i8: TestOpc = X86::TEST8ri; break; + case MVT::i16: TestOpc = X86::TEST16ri; break; + case MVT::i32: TestOpc = X86::TEST32ri; break; + case MVT::i64: TestOpc = X86::TEST64ri32; break; + } + if (TestOpc) { + unsigned OpReg = getRegForValue(TI->getOperand(0)); + if (OpReg == 0) return false; + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TestOpc)) + .addReg(OpReg).addImm(1); + + unsigned JmpOpc = X86::JNE_1; + if (FuncInfo.MBB->isLayoutSuccessor(TrueMBB)) { + std::swap(TrueMBB, FalseMBB); + JmpOpc = X86::JE_1; + } + + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(JmpOpc)) + .addMBB(TrueMBB); + + finishCondBranch(BI->getParent(), TrueMBB, FalseMBB); + return true; + } + } + } else if (foldX86XALUIntrinsic(CC, BI, BI->getCondition())) { + // Fake request the condition, otherwise the intrinsic might be completely + // optimized away. + unsigned TmpReg = getRegForValue(BI->getCondition()); + if (TmpReg == 0) + return false; + + unsigned BranchOpc = X86::GetCondBranchFromCond(CC); + + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(BranchOpc)) + .addMBB(TrueMBB); + finishCondBranch(BI->getParent(), TrueMBB, FalseMBB); + return true; + } + + // Otherwise do a clumsy setcc and re-test it. + // Note that i1 essentially gets ANY_EXTEND'ed to i8 where it isn't used + // in an explicit cast, so make sure to handle that correctly. 
+ unsigned OpReg = getRegForValue(BI->getCondition()); + if (OpReg == 0) return false; + + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri)) + .addReg(OpReg).addImm(1); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JNE_1)) + .addMBB(TrueMBB); + finishCondBranch(BI->getParent(), TrueMBB, FalseMBB); + return true; +} + +bool X86FastISel::X86SelectShift(const Instruction *I) { + unsigned CReg = 0, OpReg = 0; + const TargetRegisterClass *RC = nullptr; + if (I->getType()->isIntegerTy(8)) { + CReg = X86::CL; + RC = &X86::GR8RegClass; + switch (I->getOpcode()) { + case Instruction::LShr: OpReg = X86::SHR8rCL; break; + case Instruction::AShr: OpReg = X86::SAR8rCL; break; + case Instruction::Shl: OpReg = X86::SHL8rCL; break; + default: return false; + } + } else if (I->getType()->isIntegerTy(16)) { + CReg = X86::CX; + RC = &X86::GR16RegClass; + switch (I->getOpcode()) { + case Instruction::LShr: OpReg = X86::SHR16rCL; break; + case Instruction::AShr: OpReg = X86::SAR16rCL; break; + case Instruction::Shl: OpReg = X86::SHL16rCL; break; + default: return false; + } + } else if (I->getType()->isIntegerTy(32)) { + CReg = X86::ECX; + RC = &X86::GR32RegClass; + switch (I->getOpcode()) { + case Instruction::LShr: OpReg = X86::SHR32rCL; break; + case Instruction::AShr: OpReg = X86::SAR32rCL; break; + case Instruction::Shl: OpReg = X86::SHL32rCL; break; + default: return false; + } + } else if (I->getType()->isIntegerTy(64)) { + CReg = X86::RCX; + RC = &X86::GR64RegClass; + switch (I->getOpcode()) { + case Instruction::LShr: OpReg = X86::SHR64rCL; break; + case Instruction::AShr: OpReg = X86::SAR64rCL; break; + case Instruction::Shl: OpReg = X86::SHL64rCL; break; + default: return false; + } + } else { + return false; + } + + MVT VT; + if (!isTypeLegal(I->getType(), VT)) + return false; + + unsigned Op0Reg = getRegForValue(I->getOperand(0)); + if (Op0Reg == 0) return false; + + unsigned Op1Reg = getRegForValue(I->getOperand(1)); + if (Op1Reg == 0) return false; + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), + CReg).addReg(Op1Reg); + + // The shift instruction uses X86::CL. If we defined a super-register + // of X86::CL, emit a subreg KILL to precisely describe what we're doing here. + if (CReg != X86::CL) + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::KILL), X86::CL) + .addReg(CReg, RegState::Kill); + + unsigned ResultReg = createResultReg(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(OpReg), ResultReg) + .addReg(Op0Reg); + updateValueMap(I, ResultReg); + return true; +} + +bool X86FastISel::X86SelectDivRem(const Instruction *I) { + const static unsigned NumTypes = 4; // i8, i16, i32, i64 + const static unsigned NumOps = 4; // SDiv, SRem, UDiv, URem + const static bool S = true; // IsSigned + const static bool U = false; // !IsSigned + const static unsigned Copy = TargetOpcode::COPY; + // For the X86 DIV/IDIV instruction, in most cases the dividend + // (numerator) must be in a specific register pair highreg:lowreg, + // producing the quotient in lowreg and the remainder in highreg. + // For most data types, to set up the instruction, the dividend is + // copied into lowreg, and lowreg is sign-extended or zero-extended + // into highreg. The exception is i8, where the dividend is defined + // as a single register rather than a register pair, and we + // therefore directly sign-extend or zero-extend the dividend into + // lowreg, instead of copying, and ignore the highreg. 
+ const static struct DivRemEntry { + // The following portion depends only on the data type. + const TargetRegisterClass *RC; + unsigned LowInReg; // low part of the register pair + unsigned HighInReg; // high part of the register pair + // The following portion depends on both the data type and the operation. + struct DivRemResult { + unsigned OpDivRem; // The specific DIV/IDIV opcode to use. + unsigned OpSignExtend; // Opcode for sign-extending lowreg into + // highreg, or copying a zero into highreg. + unsigned OpCopy; // Opcode for copying dividend into lowreg, or + // zero/sign-extending into lowreg for i8. + unsigned DivRemResultReg; // Register containing the desired result. + bool IsOpSigned; // Whether to use signed or unsigned form. + } ResultTable[NumOps]; + } OpTable[NumTypes] = { + { &X86::GR8RegClass, X86::AX, 0, { + { X86::IDIV8r, 0, X86::MOVSX16rr8, X86::AL, S }, // SDiv + { X86::IDIV8r, 0, X86::MOVSX16rr8, X86::AH, S }, // SRem + { X86::DIV8r, 0, X86::MOVZX16rr8, X86::AL, U }, // UDiv + { X86::DIV8r, 0, X86::MOVZX16rr8, X86::AH, U }, // URem + } + }, // i8 + { &X86::GR16RegClass, X86::AX, X86::DX, { + { X86::IDIV16r, X86::CWD, Copy, X86::AX, S }, // SDiv + { X86::IDIV16r, X86::CWD, Copy, X86::DX, S }, // SRem + { X86::DIV16r, X86::MOV32r0, Copy, X86::AX, U }, // UDiv + { X86::DIV16r, X86::MOV32r0, Copy, X86::DX, U }, // URem + } + }, // i16 + { &X86::GR32RegClass, X86::EAX, X86::EDX, { + { X86::IDIV32r, X86::CDQ, Copy, X86::EAX, S }, // SDiv + { X86::IDIV32r, X86::CDQ, Copy, X86::EDX, S }, // SRem + { X86::DIV32r, X86::MOV32r0, Copy, X86::EAX, U }, // UDiv + { X86::DIV32r, X86::MOV32r0, Copy, X86::EDX, U }, // URem + } + }, // i32 + { &X86::GR64RegClass, X86::RAX, X86::RDX, { + { X86::IDIV64r, X86::CQO, Copy, X86::RAX, S }, // SDiv + { X86::IDIV64r, X86::CQO, Copy, X86::RDX, S }, // SRem + { X86::DIV64r, X86::MOV32r0, Copy, X86::RAX, U }, // UDiv + { X86::DIV64r, X86::MOV32r0, Copy, X86::RDX, U }, // URem + } + }, // i64 + }; + + MVT VT; + if (!isTypeLegal(I->getType(), VT)) + return false; + + unsigned TypeIndex, OpIndex; + switch (VT.SimpleTy) { + default: return false; + case MVT::i8: TypeIndex = 0; break; + case MVT::i16: TypeIndex = 1; break; + case MVT::i32: TypeIndex = 2; break; + case MVT::i64: TypeIndex = 3; + if (!Subtarget->is64Bit()) + return false; + break; + } + + switch (I->getOpcode()) { + default: llvm_unreachable("Unexpected div/rem opcode"); + case Instruction::SDiv: OpIndex = 0; break; + case Instruction::SRem: OpIndex = 1; break; + case Instruction::UDiv: OpIndex = 2; break; + case Instruction::URem: OpIndex = 3; break; + } + + const DivRemEntry &TypeEntry = OpTable[TypeIndex]; + const DivRemEntry::DivRemResult &OpEntry = TypeEntry.ResultTable[OpIndex]; + unsigned Op0Reg = getRegForValue(I->getOperand(0)); + if (Op0Reg == 0) + return false; + unsigned Op1Reg = getRegForValue(I->getOperand(1)); + if (Op1Reg == 0) + return false; + + // Move op0 into low-order input register. + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(OpEntry.OpCopy), TypeEntry.LowInReg).addReg(Op0Reg); + // Zero-extend or sign-extend into high-order input register. + if (OpEntry.OpSignExtend) { + if (OpEntry.IsOpSigned) + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(OpEntry.OpSignExtend)); + else { + unsigned Zero32 = createResultReg(&X86::GR32RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(X86::MOV32r0), Zero32); + + // Copy the zero into the appropriate sub/super/identical physical + // register. 
Unfortunately the operations needed are not uniform enough + // to fit neatly into the table above. + if (VT.SimpleTy == MVT::i16) { + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(Copy), TypeEntry.HighInReg) + .addReg(Zero32, 0, X86::sub_16bit); + } else if (VT.SimpleTy == MVT::i32) { + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(Copy), TypeEntry.HighInReg) + .addReg(Zero32); + } else if (VT.SimpleTy == MVT::i64) { + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::SUBREG_TO_REG), TypeEntry.HighInReg) + .addImm(0).addReg(Zero32).addImm(X86::sub_32bit); + } + } + } + // Generate the DIV/IDIV instruction. + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(OpEntry.OpDivRem)).addReg(Op1Reg); + // For i8 remainder, we can't reference AH directly, as we'll end + // up with bogus copies like %R9B = COPY %AH. Reference AX + // instead to prevent AH references in a REX instruction. + // + // The current assumption of the fast register allocator is that isel + // won't generate explicit references to the GPR8_NOREX registers. If + // the allocator and/or the backend get enhanced to be more robust in + // that regard, this can be, and should be, removed. + unsigned ResultReg = 0; + if ((I->getOpcode() == Instruction::SRem || + I->getOpcode() == Instruction::URem) && + OpEntry.DivRemResultReg == X86::AH && Subtarget->is64Bit()) { + unsigned SourceSuperReg = createResultReg(&X86::GR16RegClass); + unsigned ResultSuperReg = createResultReg(&X86::GR16RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(Copy), SourceSuperReg).addReg(X86::AX); + + // Shift AX right by 8 bits instead of using AH. + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::SHR16ri), + ResultSuperReg).addReg(SourceSuperReg).addImm(8); + + // Now reference the 8-bit subreg of the result. + ResultReg = fastEmitInst_extractsubreg(MVT::i8, ResultSuperReg, + /*Kill=*/true, X86::sub_8bit); + } + // Copy the result out of the physreg if we haven't already. + if (!ResultReg) { + ResultReg = createResultReg(TypeEntry.RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Copy), ResultReg) + .addReg(OpEntry.DivRemResultReg); + } + updateValueMap(I, ResultReg); + + return true; +} + +/// \brief Emit a conditional move instruction (if the are supported) to lower +/// the select. +bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) { + // Check if the subtarget supports these instructions. + if (!Subtarget->hasCMov()) + return false; + + // FIXME: Add support for i8. + if (RetVT < MVT::i16 || RetVT > MVT::i64) + return false; + + const Value *Cond = I->getOperand(0); + const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT); + bool NeedTest = true; + X86::CondCode CC = X86::COND_NE; + + // Optimize conditions coming from a compare if both instructions are in the + // same basic block (values defined in other basic blocks may not have + // initialized registers). + const auto *CI = dyn_cast<CmpInst>(Cond); + if (CI && (CI->getParent() == I->getParent())) { + CmpInst::Predicate Predicate = optimizeCmpPredicate(CI); + + // FCMP_OEQ and FCMP_UNE cannot be checked with a single instruction. 
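+ // Recall that UCOMISS/UCOMISD signal an unordered result (a NaN operand) by
+ // setting ZF, PF and CF all to 1, so:
+ //   oeq  <=>  ZF == 1 && PF == 0
+ //   une  <=>  ZF == 0 || PF == 1
+ // which is why each of these predicates needs a pair of setcc instructions.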
+ static unsigned SETFOpcTable[2][3] = { + { X86::SETNPr, X86::SETEr , X86::TEST8rr }, + { X86::SETPr, X86::SETNEr, X86::OR8rr } + }; + unsigned *SETFOpc = nullptr; + switch (Predicate) { + default: break; + case CmpInst::FCMP_OEQ: + SETFOpc = &SETFOpcTable[0][0]; + Predicate = CmpInst::ICMP_NE; + break; + case CmpInst::FCMP_UNE: + SETFOpc = &SETFOpcTable[1][0]; + Predicate = CmpInst::ICMP_NE; + break; + } + + bool NeedSwap; + std::tie(CC, NeedSwap) = getX86ConditionCode(Predicate); + assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code."); + + const Value *CmpLHS = CI->getOperand(0); + const Value *CmpRHS = CI->getOperand(1); + if (NeedSwap) + std::swap(CmpLHS, CmpRHS); + + EVT CmpVT = TLI.getValueType(DL, CmpLHS->getType()); + // Emit a compare of the LHS and RHS, setting the flags. + if (!X86FastEmitCompare(CmpLHS, CmpRHS, CmpVT, CI->getDebugLoc())) + return false; + + if (SETFOpc) { + unsigned FlagReg1 = createResultReg(&X86::GR8RegClass); + unsigned FlagReg2 = createResultReg(&X86::GR8RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[0]), + FlagReg1); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[1]), + FlagReg2); + auto const &II = TII.get(SETFOpc[2]); + if (II.getNumDefs()) { + unsigned TmpReg = createResultReg(&X86::GR8RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, TmpReg) + .addReg(FlagReg2).addReg(FlagReg1); + } else { + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II) + .addReg(FlagReg2).addReg(FlagReg1); + } + } + NeedTest = false; + } else if (foldX86XALUIntrinsic(CC, I, Cond)) { + // Fake request the condition, otherwise the intrinsic might be completely + // optimized away. + unsigned TmpReg = getRegForValue(Cond); + if (TmpReg == 0) + return false; + + NeedTest = false; + } + + if (NeedTest) { + // Selects operate on i1, however, CondReg is 8 bits width and may contain + // garbage. Indeed, only the less significant bit is supposed to be + // accurate. If we read more than the lsb, we may see non-zero values + // whereas lsb is zero. Therefore, we have to truncate Op0Reg to i1 for + // the select. This is achieved by performing TEST against 1. + unsigned CondReg = getRegForValue(Cond); + if (CondReg == 0) + return false; + bool CondIsKill = hasTrivialKill(Cond); + + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri)) + .addReg(CondReg, getKillRegState(CondIsKill)).addImm(1); + } + + const Value *LHS = I->getOperand(1); + const Value *RHS = I->getOperand(2); + + unsigned RHSReg = getRegForValue(RHS); + bool RHSIsKill = hasTrivialKill(RHS); + + unsigned LHSReg = getRegForValue(LHS); + bool LHSIsKill = hasTrivialKill(LHS); + + if (!LHSReg || !RHSReg) + return false; + + unsigned Opc = X86::getCMovFromCond(CC, RC->getSize()); + unsigned ResultReg = fastEmitInst_rr(Opc, RC, RHSReg, RHSIsKill, + LHSReg, LHSIsKill); + updateValueMap(I, ResultReg); + return true; +} + +/// \brief Emit SSE or AVX instructions to lower the select. +/// +/// Try to use SSE1/SSE2 instructions to simulate a select without branches. +/// This lowers fp selects into a CMP/AND/ANDN/OR sequence when the necessary +/// SSE instructions are available. If AVX is available, try to use a VBLENDV. +bool X86FastISel::X86FastEmitSSESelect(MVT RetVT, const Instruction *I) { + // Optimize conditions coming from a compare if both instructions are in the + // same basic block (values defined in other basic blocks may not have + // initialized registers). 
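+ // Without AVX, the body below builds the classic branchless select, roughly:
+ //   %mask = cmp<cc>ss/sd %cmpLHS, %cmpRHS     ; all-ones or all-zeros
+ //   %res  = (%mask & %trueVal) | (~%mask & %falseVal)
+ // using the CMP/AND/ANDN/OR opcodes from the table further down.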
+ const auto *CI = dyn_cast<FCmpInst>(I->getOperand(0)); + if (!CI || (CI->getParent() != I->getParent())) + return false; + + if (I->getType() != CI->getOperand(0)->getType() || + !((Subtarget->hasSSE1() && RetVT == MVT::f32) || + (Subtarget->hasSSE2() && RetVT == MVT::f64))) + return false; + + const Value *CmpLHS = CI->getOperand(0); + const Value *CmpRHS = CI->getOperand(1); + CmpInst::Predicate Predicate = optimizeCmpPredicate(CI); + + // The optimizer might have replaced fcmp oeq %x, %x with fcmp ord %x, 0.0. + // We don't have to materialize a zero constant for this case and can just use + // %x again on the RHS. + if (Predicate == CmpInst::FCMP_ORD || Predicate == CmpInst::FCMP_UNO) { + const auto *CmpRHSC = dyn_cast<ConstantFP>(CmpRHS); + if (CmpRHSC && CmpRHSC->isNullValue()) + CmpRHS = CmpLHS; + } + + unsigned CC; + bool NeedSwap; + std::tie(CC, NeedSwap) = getX86SSEConditionCode(Predicate); + if (CC > 7) + return false; + + if (NeedSwap) + std::swap(CmpLHS, CmpRHS); + + // Choose the SSE instruction sequence based on data type (float or double). + static unsigned OpcTable[2][4] = { + { X86::CMPSSrr, X86::FsANDPSrr, X86::FsANDNPSrr, X86::FsORPSrr }, + { X86::CMPSDrr, X86::FsANDPDrr, X86::FsANDNPDrr, X86::FsORPDrr } + }; + + unsigned *Opc = nullptr; + switch (RetVT.SimpleTy) { + default: return false; + case MVT::f32: Opc = &OpcTable[0][0]; break; + case MVT::f64: Opc = &OpcTable[1][0]; break; + } + + const Value *LHS = I->getOperand(1); + const Value *RHS = I->getOperand(2); + + unsigned LHSReg = getRegForValue(LHS); + bool LHSIsKill = hasTrivialKill(LHS); + + unsigned RHSReg = getRegForValue(RHS); + bool RHSIsKill = hasTrivialKill(RHS); + + unsigned CmpLHSReg = getRegForValue(CmpLHS); + bool CmpLHSIsKill = hasTrivialKill(CmpLHS); + + unsigned CmpRHSReg = getRegForValue(CmpRHS); + bool CmpRHSIsKill = hasTrivialKill(CmpRHS); + + if (!LHSReg || !RHSReg || !CmpLHS || !CmpRHS) + return false; + + const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT); + unsigned ResultReg; + + if (Subtarget->hasAVX()) { + const TargetRegisterClass *FR32 = &X86::FR32RegClass; + const TargetRegisterClass *VR128 = &X86::VR128RegClass; + + // If we have AVX, create 1 blendv instead of 3 logic instructions. + // Blendv was introduced with SSE 4.1, but the 2 register form implicitly + // uses XMM0 as the selection register. That may need just as many + // instructions as the AND/ANDN/OR sequence due to register moves, so + // don't bother. + unsigned CmpOpcode = + (RetVT.SimpleTy == MVT::f32) ? X86::VCMPSSrr : X86::VCMPSDrr; + unsigned BlendOpcode = + (RetVT.SimpleTy == MVT::f32) ? 
X86::VBLENDVPSrr : X86::VBLENDVPDrr; + + unsigned CmpReg = fastEmitInst_rri(CmpOpcode, FR32, CmpLHSReg, CmpLHSIsKill, + CmpRHSReg, CmpRHSIsKill, CC); + unsigned VBlendReg = fastEmitInst_rrr(BlendOpcode, VR128, RHSReg, RHSIsKill, + LHSReg, LHSIsKill, CmpReg, true); + ResultReg = createResultReg(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), ResultReg).addReg(VBlendReg); + } else { + unsigned CmpReg = fastEmitInst_rri(Opc[0], RC, CmpLHSReg, CmpLHSIsKill, + CmpRHSReg, CmpRHSIsKill, CC); + unsigned AndReg = fastEmitInst_rr(Opc[1], RC, CmpReg, /*IsKill=*/false, + LHSReg, LHSIsKill); + unsigned AndNReg = fastEmitInst_rr(Opc[2], RC, CmpReg, /*IsKill=*/true, + RHSReg, RHSIsKill); + ResultReg = fastEmitInst_rr(Opc[3], RC, AndNReg, /*IsKill=*/true, + AndReg, /*IsKill=*/true); + } + updateValueMap(I, ResultReg); + return true; +} + +bool X86FastISel::X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I) { + // These are pseudo CMOV instructions and will be later expanded into control- + // flow. + unsigned Opc; + switch (RetVT.SimpleTy) { + default: return false; + case MVT::i8: Opc = X86::CMOV_GR8; break; + case MVT::i16: Opc = X86::CMOV_GR16; break; + case MVT::i32: Opc = X86::CMOV_GR32; break; + case MVT::f32: Opc = X86::CMOV_FR32; break; + case MVT::f64: Opc = X86::CMOV_FR64; break; + } + + const Value *Cond = I->getOperand(0); + X86::CondCode CC = X86::COND_NE; + + // Optimize conditions coming from a compare if both instructions are in the + // same basic block (values defined in other basic blocks may not have + // initialized registers). + const auto *CI = dyn_cast<CmpInst>(Cond); + if (CI && (CI->getParent() == I->getParent())) { + bool NeedSwap; + std::tie(CC, NeedSwap) = getX86ConditionCode(CI->getPredicate()); + if (CC > X86::LAST_VALID_COND) + return false; + + const Value *CmpLHS = CI->getOperand(0); + const Value *CmpRHS = CI->getOperand(1); + + if (NeedSwap) + std::swap(CmpLHS, CmpRHS); + + EVT CmpVT = TLI.getValueType(DL, CmpLHS->getType()); + if (!X86FastEmitCompare(CmpLHS, CmpRHS, CmpVT, CI->getDebugLoc())) + return false; + } else { + unsigned CondReg = getRegForValue(Cond); + if (CondReg == 0) + return false; + bool CondIsKill = hasTrivialKill(Cond); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri)) + .addReg(CondReg, getKillRegState(CondIsKill)).addImm(1); + } + + const Value *LHS = I->getOperand(1); + const Value *RHS = I->getOperand(2); + + unsigned LHSReg = getRegForValue(LHS); + bool LHSIsKill = hasTrivialKill(LHS); + + unsigned RHSReg = getRegForValue(RHS); + bool RHSIsKill = hasTrivialKill(RHS); + + if (!LHSReg || !RHSReg) + return false; + + const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT); + + unsigned ResultReg = + fastEmitInst_rri(Opc, RC, RHSReg, RHSIsKill, LHSReg, LHSIsKill, CC); + updateValueMap(I, ResultReg); + return true; +} + +bool X86FastISel::X86SelectSelect(const Instruction *I) { + MVT RetVT; + if (!isTypeLegal(I->getType(), RetVT)) + return false; + + // Check if we can fold the select. + if (const auto *CI = dyn_cast<CmpInst>(I->getOperand(0))) { + CmpInst::Predicate Predicate = optimizeCmpPredicate(CI); + const Value *Opnd = nullptr; + switch (Predicate) { + default: break; + case CmpInst::FCMP_FALSE: Opnd = I->getOperand(2); break; + case CmpInst::FCMP_TRUE: Opnd = I->getOperand(1); break; + } + // No need for a select anymore - this is an unconditional move. 
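+ // e.g. 'select (fcmp false ...), %a, %b' simply becomes a copy of %b, and
+ // 'select (fcmp true ...), %a, %b' a copy of %a.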
+ if (Opnd) { + unsigned OpReg = getRegForValue(Opnd); + if (OpReg == 0) + return false; + bool OpIsKill = hasTrivialKill(Opnd); + const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT); + unsigned ResultReg = createResultReg(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), ResultReg) + .addReg(OpReg, getKillRegState(OpIsKill)); + updateValueMap(I, ResultReg); + return true; + } + } + + // First try to use real conditional move instructions. + if (X86FastEmitCMoveSelect(RetVT, I)) + return true; + + // Try to use a sequence of SSE instructions to simulate a conditional move. + if (X86FastEmitSSESelect(RetVT, I)) + return true; + + // Fall-back to pseudo conditional move instructions, which will be later + // converted to control-flow. + if (X86FastEmitPseudoSelect(RetVT, I)) + return true; + + return false; +} + +bool X86FastISel::X86SelectSIToFP(const Instruction *I) { + // The target-independent selection algorithm in FastISel already knows how + // to select a SINT_TO_FP if the target is SSE but not AVX. + // Early exit if the subtarget doesn't have AVX. + if (!Subtarget->hasAVX()) + return false; + + if (!I->getOperand(0)->getType()->isIntegerTy(32)) + return false; + + // Select integer to float/double conversion. + unsigned OpReg = getRegForValue(I->getOperand(0)); + if (OpReg == 0) + return false; + + const TargetRegisterClass *RC = nullptr; + unsigned Opcode; + + if (I->getType()->isDoubleTy()) { + // sitofp int -> double + Opcode = X86::VCVTSI2SDrr; + RC = &X86::FR64RegClass; + } else if (I->getType()->isFloatTy()) { + // sitofp int -> float + Opcode = X86::VCVTSI2SSrr; + RC = &X86::FR32RegClass; + } else + return false; + + unsigned ImplicitDefReg = createResultReg(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::IMPLICIT_DEF), ImplicitDefReg); + unsigned ResultReg = + fastEmitInst_rr(Opcode, RC, ImplicitDefReg, true, OpReg, false); + updateValueMap(I, ResultReg); + return true; +} + +// Helper method used by X86SelectFPExt and X86SelectFPTrunc. +bool X86FastISel::X86SelectFPExtOrFPTrunc(const Instruction *I, + unsigned TargetOpc, + const TargetRegisterClass *RC) { + assert((I->getOpcode() == Instruction::FPExt || + I->getOpcode() == Instruction::FPTrunc) && + "Instruction must be an FPExt or FPTrunc!"); + + unsigned OpReg = getRegForValue(I->getOperand(0)); + if (OpReg == 0) + return false; + + unsigned ResultReg = createResultReg(RC); + MachineInstrBuilder MIB; + MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpc), + ResultReg); + if (Subtarget->hasAVX()) + MIB.addReg(OpReg); + MIB.addReg(OpReg); + updateValueMap(I, ResultReg); + return true; +} + +bool X86FastISel::X86SelectFPExt(const Instruction *I) { + if (X86ScalarSSEf64 && I->getType()->isDoubleTy() && + I->getOperand(0)->getType()->isFloatTy()) { + // fpext from float to double. + unsigned Opc = Subtarget->hasAVX() ? X86::VCVTSS2SDrr : X86::CVTSS2SDrr; + return X86SelectFPExtOrFPTrunc(I, Opc, &X86::FR64RegClass); + } + + return false; +} + +bool X86FastISel::X86SelectFPTrunc(const Instruction *I) { + if (X86ScalarSSEf64 && I->getType()->isFloatTy() && + I->getOperand(0)->getType()->isDoubleTy()) { + // fptrunc from double to float. + unsigned Opc = Subtarget->hasAVX() ? 
X86::VCVTSD2SSrr : X86::CVTSD2SSrr; + return X86SelectFPExtOrFPTrunc(I, Opc, &X86::FR32RegClass); + } + + return false; +} + +bool X86FastISel::X86SelectTrunc(const Instruction *I) { + EVT SrcVT = TLI.getValueType(DL, I->getOperand(0)->getType()); + EVT DstVT = TLI.getValueType(DL, I->getType()); + + // This code only handles truncation to byte. + if (DstVT != MVT::i8 && DstVT != MVT::i1) + return false; + if (!TLI.isTypeLegal(SrcVT)) + return false; + + unsigned InputReg = getRegForValue(I->getOperand(0)); + if (!InputReg) + // Unhandled operand. Halt "fast" selection and bail. + return false; + + if (SrcVT == MVT::i8) { + // Truncate from i8 to i1; no code needed. + updateValueMap(I, InputReg); + return true; + } + + bool KillInputReg = false; + if (!Subtarget->is64Bit()) { + // If we're on x86-32; we can't extract an i8 from a general register. + // First issue a copy to GR16_ABCD or GR32_ABCD. + const TargetRegisterClass *CopyRC = + (SrcVT == MVT::i16) ? &X86::GR16_ABCDRegClass : &X86::GR32_ABCDRegClass; + unsigned CopyReg = createResultReg(CopyRC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), CopyReg).addReg(InputReg); + InputReg = CopyReg; + KillInputReg = true; + } + + // Issue an extract_subreg. + unsigned ResultReg = fastEmitInst_extractsubreg(MVT::i8, + InputReg, KillInputReg, + X86::sub_8bit); + if (!ResultReg) + return false; + + updateValueMap(I, ResultReg); + return true; +} + +bool X86FastISel::IsMemcpySmall(uint64_t Len) { + return Len <= (Subtarget->is64Bit() ? 32 : 16); +} + +bool X86FastISel::TryEmitSmallMemcpy(X86AddressMode DestAM, + X86AddressMode SrcAM, uint64_t Len) { + + // Make sure we don't bloat code by inlining very large memcpy's. + if (!IsMemcpySmall(Len)) + return false; + + bool i64Legal = Subtarget->is64Bit(); + + // We don't care about alignment here since we just emit integer accesses. + while (Len) { + MVT VT; + if (Len >= 8 && i64Legal) + VT = MVT::i64; + else if (Len >= 4) + VT = MVT::i32; + else if (Len >= 2) + VT = MVT::i16; + else + VT = MVT::i8; + + unsigned Reg; + bool RV = X86FastEmitLoad(VT, SrcAM, nullptr, Reg); + RV &= X86FastEmitStore(VT, Reg, /*Kill=*/true, DestAM); + assert(RV && "Failed to emit load or store??"); + + unsigned Size = VT.getSizeInBits()/8; + Len -= Size; + DestAM.Disp += Size; + SrcAM.Disp += Size; + } + + return true; +} + +bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) { + // FIXME: Handle more intrinsics. + switch (II->getIntrinsicID()) { + default: return false; + case Intrinsic::convert_from_fp16: + case Intrinsic::convert_to_fp16: { + if (Subtarget->useSoftFloat() || !Subtarget->hasF16C()) + return false; + + const Value *Op = II->getArgOperand(0); + unsigned InputReg = getRegForValue(Op); + if (InputReg == 0) + return false; + + // F16C only allows converting from float to half and from half to float. + bool IsFloatToHalf = II->getIntrinsicID() == Intrinsic::convert_to_fp16; + if (IsFloatToHalf) { + if (!Op->getType()->isFloatTy()) + return false; + } else { + if (!II->getType()->isFloatTy()) + return false; + } + + unsigned ResultReg = 0; + const TargetRegisterClass *RC = TLI.getRegClassFor(MVT::v8i16); + if (IsFloatToHalf) { + // 'InputReg' is implicitly promoted from register class FR32 to + // register class VR128 by method 'constrainOperandRegClass' which is + // directly called by 'fastEmitInst_ri'. + // Instruction VCVTPS2PHrr takes an extra immediate operand which is + // used to provide rounding control. 
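+ // (An immediate of 0, as passed below, encodes round-to-nearest-even rather
+ // than deferring to MXCSR.RC.)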
+ InputReg = fastEmitInst_ri(X86::VCVTPS2PHrr, RC, InputReg, false, 0); + + // Move the lower 32-bits of ResultReg to another register of class GR32. + ResultReg = createResultReg(&X86::GR32RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(X86::VMOVPDI2DIrr), ResultReg) + .addReg(InputReg, RegState::Kill); + + // The result value is in the lower 16-bits of ResultReg. + unsigned RegIdx = X86::sub_16bit; + ResultReg = fastEmitInst_extractsubreg(MVT::i16, ResultReg, true, RegIdx); + } else { + assert(Op->getType()->isIntegerTy(16) && "Expected a 16-bit integer!"); + // Explicitly sign-extend the input to 32-bit. + InputReg = fastEmit_r(MVT::i16, MVT::i32, ISD::SIGN_EXTEND, InputReg, + /*Kill=*/false); + + // The following SCALAR_TO_VECTOR will be expanded into a VMOVDI2PDIrr. + InputReg = fastEmit_r(MVT::i32, MVT::v4i32, ISD::SCALAR_TO_VECTOR, + InputReg, /*Kill=*/true); + + InputReg = fastEmitInst_r(X86::VCVTPH2PSrr, RC, InputReg, /*Kill=*/true); + + // The result value is in the lower 32-bits of ResultReg. + // Emit an explicit copy from register class VR128 to register class FR32. + ResultReg = createResultReg(&X86::FR32RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), ResultReg) + .addReg(InputReg, RegState::Kill); + } + + updateValueMap(II, ResultReg); + return true; + } + case Intrinsic::frameaddress: { + MachineFunction *MF = FuncInfo.MF; + if (MF->getTarget().getMCAsmInfo()->usesWindowsCFI()) + return false; + + Type *RetTy = II->getCalledFunction()->getReturnType(); + + MVT VT; + if (!isTypeLegal(RetTy, VT)) + return false; + + unsigned Opc; + const TargetRegisterClass *RC = nullptr; + + switch (VT.SimpleTy) { + default: llvm_unreachable("Invalid result type for frameaddress."); + case MVT::i32: Opc = X86::MOV32rm; RC = &X86::GR32RegClass; break; + case MVT::i64: Opc = X86::MOV64rm; RC = &X86::GR64RegClass; break; + } + + // This needs to be set before we call getPtrSizedFrameRegister, otherwise + // we get the wrong frame register. + MachineFrameInfo *MFI = MF->getFrameInfo(); + MFI->setFrameAddressIsTaken(true); + + const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); + unsigned FrameReg = RegInfo->getPtrSizedFrameRegister(*MF); + assert(((FrameReg == X86::RBP && VT == MVT::i64) || + (FrameReg == X86::EBP && VT == MVT::i32)) && + "Invalid Frame Register!"); + + // Always make a copy of the frame register to to a vreg first, so that we + // never directly reference the frame register (the TwoAddressInstruction- + // Pass doesn't like that). + unsigned SrcReg = createResultReg(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), SrcReg).addReg(FrameReg); + + // Now recursively load from the frame address. + // movq (%rbp), %rax + // movq (%rax), %rax + // movq (%rax), %rax + // ... + unsigned DestReg; + unsigned Depth = cast<ConstantInt>(II->getOperand(0))->getZExtValue(); + while (Depth--) { + DestReg = createResultReg(RC); + addDirectMem(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(Opc), DestReg), SrcReg); + SrcReg = DestReg; + } + + updateValueMap(II, SrcReg); + return true; + } + case Intrinsic::memcpy: { + const MemCpyInst *MCI = cast<MemCpyInst>(II); + // Don't handle volatile or variable length memcpys. + if (MCI->isVolatile()) + return false; + + if (isa<ConstantInt>(MCI->getLength())) { + // Small memcpy's are common enough that we want to do them + // without a call if possible. 
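+ // e.g. a constant 16-byte memcpy on x86-64 is expanded by TryEmitSmallMemcpy
+ // into two i64 load/store pairs instead of a library call.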
+ uint64_t Len = cast<ConstantInt>(MCI->getLength())->getZExtValue(); + if (IsMemcpySmall(Len)) { + X86AddressMode DestAM, SrcAM; + if (!X86SelectAddress(MCI->getRawDest(), DestAM) || + !X86SelectAddress(MCI->getRawSource(), SrcAM)) + return false; + TryEmitSmallMemcpy(DestAM, SrcAM, Len); + return true; + } + } + + unsigned SizeWidth = Subtarget->is64Bit() ? 64 : 32; + if (!MCI->getLength()->getType()->isIntegerTy(SizeWidth)) + return false; + + if (MCI->getSourceAddressSpace() > 255 || MCI->getDestAddressSpace() > 255) + return false; + + return lowerCallTo(II, "memcpy", II->getNumArgOperands() - 2); + } + case Intrinsic::memset: { + const MemSetInst *MSI = cast<MemSetInst>(II); + + if (MSI->isVolatile()) + return false; + + unsigned SizeWidth = Subtarget->is64Bit() ? 64 : 32; + if (!MSI->getLength()->getType()->isIntegerTy(SizeWidth)) + return false; + + if (MSI->getDestAddressSpace() > 255) + return false; + + return lowerCallTo(II, "memset", II->getNumArgOperands() - 2); + } + case Intrinsic::stackprotector: { + // Emit code to store the stack guard onto the stack. + EVT PtrTy = TLI.getPointerTy(DL); + + const Value *Op1 = II->getArgOperand(0); // The guard's value. + const AllocaInst *Slot = cast<AllocaInst>(II->getArgOperand(1)); + + MFI.setStackProtectorIndex(FuncInfo.StaticAllocaMap[Slot]); + + // Grab the frame index. + X86AddressMode AM; + if (!X86SelectAddress(Slot, AM)) return false; + if (!X86FastEmitStore(PtrTy, Op1, AM)) return false; + return true; + } + case Intrinsic::dbg_declare: { + const DbgDeclareInst *DI = cast<DbgDeclareInst>(II); + X86AddressMode AM; + assert(DI->getAddress() && "Null address should be checked earlier!"); + if (!X86SelectAddress(DI->getAddress(), AM)) + return false; + const MCInstrDesc &II = TII.get(TargetOpcode::DBG_VALUE); + // FIXME may need to add RegState::Debug to any registers produced, + // although ESP/EBP should be the only ones at the moment. + assert(DI->getVariable()->isValidLocationForIntrinsic(DbgLoc) && + "Expected inlined-at fields to agree"); + addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II), AM) + .addImm(0) + .addMetadata(DI->getVariable()) + .addMetadata(DI->getExpression()); + return true; + } + case Intrinsic::trap: { + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TRAP)); + return true; + } + case Intrinsic::sqrt: { + if (!Subtarget->hasSSE1()) + return false; + + Type *RetTy = II->getCalledFunction()->getReturnType(); + + MVT VT; + if (!isTypeLegal(RetTy, VT)) + return false; + + // Unfortunately we can't use fastEmit_r, because the AVX version of FSQRT + // is not generated by FastISel yet. + // FIXME: Update this code once tablegen can handle it. 
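+ // Note that the AVX forms take an extra source register that supplies the
+ // untouched upper vector lanes, which is why an IMPLICIT_DEF is fed in as
+ // the first operand below when HasAVX is set.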
+ static const unsigned SqrtOpc[2][2] = { + {X86::SQRTSSr, X86::VSQRTSSr}, + {X86::SQRTSDr, X86::VSQRTSDr} + }; + bool HasAVX = Subtarget->hasAVX(); + unsigned Opc; + const TargetRegisterClass *RC; + switch (VT.SimpleTy) { + default: return false; + case MVT::f32: Opc = SqrtOpc[0][HasAVX]; RC = &X86::FR32RegClass; break; + case MVT::f64: Opc = SqrtOpc[1][HasAVX]; RC = &X86::FR64RegClass; break; + } + + const Value *SrcVal = II->getArgOperand(0); + unsigned SrcReg = getRegForValue(SrcVal); + + if (SrcReg == 0) + return false; + + unsigned ImplicitDefReg = 0; + if (HasAVX) { + ImplicitDefReg = createResultReg(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::IMPLICIT_DEF), ImplicitDefReg); + } + + unsigned ResultReg = createResultReg(RC); + MachineInstrBuilder MIB; + MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), + ResultReg); + + if (ImplicitDefReg) + MIB.addReg(ImplicitDefReg); + + MIB.addReg(SrcReg); + + updateValueMap(II, ResultReg); + return true; + } + case Intrinsic::sadd_with_overflow: + case Intrinsic::uadd_with_overflow: + case Intrinsic::ssub_with_overflow: + case Intrinsic::usub_with_overflow: + case Intrinsic::smul_with_overflow: + case Intrinsic::umul_with_overflow: { + // This implements the basic lowering of the xalu with overflow intrinsics + // into add/sub/mul followed by either seto or setb. + const Function *Callee = II->getCalledFunction(); + auto *Ty = cast<StructType>(Callee->getReturnType()); + Type *RetTy = Ty->getTypeAtIndex(0U); + Type *CondTy = Ty->getTypeAtIndex(1); + + MVT VT; + if (!isTypeLegal(RetTy, VT)) + return false; + + if (VT < MVT::i8 || VT > MVT::i64) + return false; + + const Value *LHS = II->getArgOperand(0); + const Value *RHS = II->getArgOperand(1); + + // Canonicalize immediate to the RHS. + if (isa<ConstantInt>(LHS) && !isa<ConstantInt>(RHS) && + isCommutativeIntrinsic(II)) + std::swap(LHS, RHS); + + bool UseIncDec = false; + if (isa<ConstantInt>(RHS) && cast<ConstantInt>(RHS)->isOne()) + UseIncDec = true; + + unsigned BaseOpc, CondOpc; + switch (II->getIntrinsicID()) { + default: llvm_unreachable("Unexpected intrinsic!"); + case Intrinsic::sadd_with_overflow: + BaseOpc = UseIncDec ? unsigned(X86ISD::INC) : unsigned(ISD::ADD); + CondOpc = X86::SETOr; + break; + case Intrinsic::uadd_with_overflow: + BaseOpc = ISD::ADD; CondOpc = X86::SETBr; break; + case Intrinsic::ssub_with_overflow: + BaseOpc = UseIncDec ? unsigned(X86ISD::DEC) : unsigned(ISD::SUB); + CondOpc = X86::SETOr; + break; + case Intrinsic::usub_with_overflow: + BaseOpc = ISD::SUB; CondOpc = X86::SETBr; break; + case Intrinsic::smul_with_overflow: + BaseOpc = X86ISD::SMUL; CondOpc = X86::SETOr; break; + case Intrinsic::umul_with_overflow: + BaseOpc = X86ISD::UMUL; CondOpc = X86::SETOr; break; + } + + unsigned LHSReg = getRegForValue(LHS); + if (LHSReg == 0) + return false; + bool LHSIsKill = hasTrivialKill(LHS); + + unsigned ResultReg = 0; + // Check if we have an immediate version. 
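+ // For instance '@llvm.sadd.with.overflow.i32(%x, 1)' takes the INC path
+ // below and becomes roughly:
+ //   INC32r %x  -> %val
+ //   SETOr      -> %ovf
+ // with the two results returned in consecutive result registers.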
+ if (const auto *CI = dyn_cast<ConstantInt>(RHS)) { + static const unsigned Opc[2][4] = { + { X86::INC8r, X86::INC16r, X86::INC32r, X86::INC64r }, + { X86::DEC8r, X86::DEC16r, X86::DEC32r, X86::DEC64r } + }; + + if (BaseOpc == X86ISD::INC || BaseOpc == X86ISD::DEC) { + ResultReg = createResultReg(TLI.getRegClassFor(VT)); + bool IsDec = BaseOpc == X86ISD::DEC; + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(Opc[IsDec][VT.SimpleTy-MVT::i8]), ResultReg) + .addReg(LHSReg, getKillRegState(LHSIsKill)); + } else + ResultReg = fastEmit_ri(VT, VT, BaseOpc, LHSReg, LHSIsKill, + CI->getZExtValue()); + } + + unsigned RHSReg; + bool RHSIsKill; + if (!ResultReg) { + RHSReg = getRegForValue(RHS); + if (RHSReg == 0) + return false; + RHSIsKill = hasTrivialKill(RHS); + ResultReg = fastEmit_rr(VT, VT, BaseOpc, LHSReg, LHSIsKill, RHSReg, + RHSIsKill); + } + + // FastISel doesn't have a pattern for all X86::MUL*r and X86::IMUL*r. Emit + // it manually. + if (BaseOpc == X86ISD::UMUL && !ResultReg) { + static const unsigned MULOpc[] = + { X86::MUL8r, X86::MUL16r, X86::MUL32r, X86::MUL64r }; + static const unsigned Reg[] = { X86::AL, X86::AX, X86::EAX, X86::RAX }; + // First copy the first operand into RAX, which is an implicit input to + // the X86::MUL*r instruction. + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), Reg[VT.SimpleTy-MVT::i8]) + .addReg(LHSReg, getKillRegState(LHSIsKill)); + ResultReg = fastEmitInst_r(MULOpc[VT.SimpleTy-MVT::i8], + TLI.getRegClassFor(VT), RHSReg, RHSIsKill); + } else if (BaseOpc == X86ISD::SMUL && !ResultReg) { + static const unsigned MULOpc[] = + { X86::IMUL8r, X86::IMUL16rr, X86::IMUL32rr, X86::IMUL64rr }; + if (VT == MVT::i8) { + // Copy the first operand into AL, which is an implicit input to the + // X86::IMUL8r instruction. 
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), X86::AL) + .addReg(LHSReg, getKillRegState(LHSIsKill)); + ResultReg = fastEmitInst_r(MULOpc[0], TLI.getRegClassFor(VT), RHSReg, + RHSIsKill); + } else + ResultReg = fastEmitInst_rr(MULOpc[VT.SimpleTy-MVT::i8], + TLI.getRegClassFor(VT), LHSReg, LHSIsKill, + RHSReg, RHSIsKill); + } + + if (!ResultReg) + return false; + + unsigned ResultReg2 = FuncInfo.CreateRegs(CondTy); + assert((ResultReg+1) == ResultReg2 && "Nonconsecutive result registers."); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CondOpc), + ResultReg2); + + updateValueMap(II, ResultReg, 2); + return true; + } + case Intrinsic::x86_sse_cvttss2si: + case Intrinsic::x86_sse_cvttss2si64: + case Intrinsic::x86_sse2_cvttsd2si: + case Intrinsic::x86_sse2_cvttsd2si64: { + bool IsInputDouble; + switch (II->getIntrinsicID()) { + default: llvm_unreachable("Unexpected intrinsic."); + case Intrinsic::x86_sse_cvttss2si: + case Intrinsic::x86_sse_cvttss2si64: + if (!Subtarget->hasSSE1()) + return false; + IsInputDouble = false; + break; + case Intrinsic::x86_sse2_cvttsd2si: + case Intrinsic::x86_sse2_cvttsd2si64: + if (!Subtarget->hasSSE2()) + return false; + IsInputDouble = true; + break; + } + + Type *RetTy = II->getCalledFunction()->getReturnType(); + MVT VT; + if (!isTypeLegal(RetTy, VT)) + return false; + + static const unsigned CvtOpc[2][2][2] = { + { { X86::CVTTSS2SIrr, X86::VCVTTSS2SIrr }, + { X86::CVTTSS2SI64rr, X86::VCVTTSS2SI64rr } }, + { { X86::CVTTSD2SIrr, X86::VCVTTSD2SIrr }, + { X86::CVTTSD2SI64rr, X86::VCVTTSD2SI64rr } } + }; + bool HasAVX = Subtarget->hasAVX(); + unsigned Opc; + switch (VT.SimpleTy) { + default: llvm_unreachable("Unexpected result type."); + case MVT::i32: Opc = CvtOpc[IsInputDouble][0][HasAVX]; break; + case MVT::i64: Opc = CvtOpc[IsInputDouble][1][HasAVX]; break; + } + + // Check if we can fold insertelement instructions into the convert. + const Value *Op = II->getArgOperand(0); + while (auto *IE = dyn_cast<InsertElementInst>(Op)) { + const Value *Index = IE->getOperand(2); + if (!isa<ConstantInt>(Index)) + break; + unsigned Idx = cast<ConstantInt>(Index)->getZExtValue(); + + if (Idx == 0) { + Op = IE->getOperand(1); + break; + } + Op = IE->getOperand(0); + } + + unsigned Reg = getRegForValue(Op); + if (Reg == 0) + return false; + + unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT)); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg) + .addReg(Reg); + + updateValueMap(II, ResultReg); + return true; + } + } +} + +bool X86FastISel::fastLowerArguments() { + if (!FuncInfo.CanLowerReturn) + return false; + + const Function *F = FuncInfo.Fn; + if (F->isVarArg()) + return false; + + CallingConv::ID CC = F->getCallingConv(); + if (CC != CallingConv::C) + return false; + + if (Subtarget->isCallingConvWin64(CC)) + return false; + + if (!Subtarget->is64Bit()) + return false; + + // Only handle simple cases. i.e. Up to 6 i32/i64 scalar arguments. + unsigned GPRCnt = 0; + unsigned FPRCnt = 0; + unsigned Idx = 0; + for (auto const &Arg : F->args()) { + // The first argument is at index 1. 
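+ // (Attribute index 0 refers to the return value, so parameter attributes are
+ // queried with 1-based indices, hence the pre-increment below.)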
+ ++Idx; + if (F->getAttributes().hasAttribute(Idx, Attribute::ByVal) || + F->getAttributes().hasAttribute(Idx, Attribute::InReg) || + F->getAttributes().hasAttribute(Idx, Attribute::StructRet) || + F->getAttributes().hasAttribute(Idx, Attribute::Nest)) + return false; + + Type *ArgTy = Arg.getType(); + if (ArgTy->isStructTy() || ArgTy->isArrayTy() || ArgTy->isVectorTy()) + return false; + + EVT ArgVT = TLI.getValueType(DL, ArgTy); + if (!ArgVT.isSimple()) return false; + switch (ArgVT.getSimpleVT().SimpleTy) { + default: return false; + case MVT::i32: + case MVT::i64: + ++GPRCnt; + break; + case MVT::f32: + case MVT::f64: + if (!Subtarget->hasSSE1()) + return false; + ++FPRCnt; + break; + } + + if (GPRCnt > 6) + return false; + + if (FPRCnt > 8) + return false; + } + + static const MCPhysReg GPR32ArgRegs[] = { + X86::EDI, X86::ESI, X86::EDX, X86::ECX, X86::R8D, X86::R9D + }; + static const MCPhysReg GPR64ArgRegs[] = { + X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8 , X86::R9 + }; + static const MCPhysReg XMMArgRegs[] = { + X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, + X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 + }; + + unsigned GPRIdx = 0; + unsigned FPRIdx = 0; + for (auto const &Arg : F->args()) { + MVT VT = TLI.getSimpleValueType(DL, Arg.getType()); + const TargetRegisterClass *RC = TLI.getRegClassFor(VT); + unsigned SrcReg; + switch (VT.SimpleTy) { + default: llvm_unreachable("Unexpected value type."); + case MVT::i32: SrcReg = GPR32ArgRegs[GPRIdx++]; break; + case MVT::i64: SrcReg = GPR64ArgRegs[GPRIdx++]; break; + case MVT::f32: // fall-through + case MVT::f64: SrcReg = XMMArgRegs[FPRIdx++]; break; + } + unsigned DstReg = FuncInfo.MF->addLiveIn(SrcReg, RC); + // FIXME: Unfortunately it's necessary to emit a copy from the livein copy. + // Without this, EmitLiveInCopies may eliminate the livein if its only + // use is a bitcast (which isn't turned into an instruction). + unsigned ResultReg = createResultReg(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), ResultReg) + .addReg(DstReg, getKillRegState(true)); + updateValueMap(&Arg, ResultReg); + } + return true; +} + +static unsigned computeBytesPoppedByCallee(const X86Subtarget *Subtarget, + CallingConv::ID CC, + ImmutableCallSite *CS) { + if (Subtarget->is64Bit()) + return 0; + if (Subtarget->getTargetTriple().isOSMSVCRT()) + return 0; + if (CC == CallingConv::Fast || CC == CallingConv::GHC || + CC == CallingConv::HiPE) + return 0; + + if (CS) + if (CS->arg_empty() || !CS->paramHasAttr(1, Attribute::StructRet) || + CS->paramHasAttr(1, Attribute::InReg) || Subtarget->isTargetMCU()) + return 0; + + return 4; +} + +bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) { + auto &OutVals = CLI.OutVals; + auto &OutFlags = CLI.OutFlags; + auto &OutRegs = CLI.OutRegs; + auto &Ins = CLI.Ins; + auto &InRegs = CLI.InRegs; + CallingConv::ID CC = CLI.CallConv; + bool &IsTailCall = CLI.IsTailCall; + bool IsVarArg = CLI.IsVarArg; + const Value *Callee = CLI.Callee; + MCSymbol *Symbol = CLI.Symbol; + + bool Is64Bit = Subtarget->is64Bit(); + bool IsWin64 = Subtarget->isCallingConvWin64(CC); + + // Handle only C, fastcc, and webkit_js calling conventions for now. + switch (CC) { + default: return false; + case CallingConv::C: + case CallingConv::Fast: + case CallingConv::WebKit_JS: + case CallingConv::X86_FastCall: + case CallingConv::X86_64_Win64: + case CallingConv::X86_64_SysV: + break; + } + + // Allow SelectionDAG isel to handle tail calls. 
+ if (IsTailCall) + return false; + + // fastcc with -tailcallopt is intended to provide a guaranteed + // tail call optimization. Fastisel doesn't know how to do that. + if (CC == CallingConv::Fast && TM.Options.GuaranteedTailCallOpt) + return false; + + // Don't know how to handle Win64 varargs yet. Nothing special needed for + // x86-32. Special handling for x86-64 is implemented. + if (IsVarArg && IsWin64) + return false; + + // Don't know about inalloca yet. + if (CLI.CS && CLI.CS->hasInAllocaArgument()) + return false; + + // Fast-isel doesn't know about callee-pop yet. + if (X86::isCalleePop(CC, Subtarget->is64Bit(), IsVarArg, + TM.Options.GuaranteedTailCallOpt)) + return false; + + SmallVector<MVT, 16> OutVTs; + SmallVector<unsigned, 16> ArgRegs; + + // If this is a constant i1/i8/i16 argument, promote to i32 to avoid an extra + // instruction. This is safe because it is common to all FastISel supported + // calling conventions on x86. + for (int i = 0, e = OutVals.size(); i != e; ++i) { + Value *&Val = OutVals[i]; + ISD::ArgFlagsTy Flags = OutFlags[i]; + if (auto *CI = dyn_cast<ConstantInt>(Val)) { + if (CI->getBitWidth() < 32) { + if (Flags.isSExt()) + Val = ConstantExpr::getSExt(CI, Type::getInt32Ty(CI->getContext())); + else + Val = ConstantExpr::getZExt(CI, Type::getInt32Ty(CI->getContext())); + } + } + + // Passing bools around ends up doing a trunc to i1 and passing it. + // Codegen this as an argument + "and 1". + MVT VT; + auto *TI = dyn_cast<TruncInst>(Val); + unsigned ResultReg; + if (TI && TI->getType()->isIntegerTy(1) && CLI.CS && + (TI->getParent() == CLI.CS->getInstruction()->getParent()) && + TI->hasOneUse()) { + Value *PrevVal = TI->getOperand(0); + ResultReg = getRegForValue(PrevVal); + + if (!ResultReg) + return false; + + if (!isTypeLegal(PrevVal->getType(), VT)) + return false; + + ResultReg = + fastEmit_ri(VT, VT, ISD::AND, ResultReg, hasTrivialKill(PrevVal), 1); + } else { + if (!isTypeLegal(Val->getType(), VT)) + return false; + ResultReg = getRegForValue(Val); + } + + if (!ResultReg) + return false; + + ArgRegs.push_back(ResultReg); + OutVTs.push_back(VT); + } + + // Analyze operands of the call, assigning locations to each operand. + SmallVector<CCValAssign, 16> ArgLocs; + CCState CCInfo(CC, IsVarArg, *FuncInfo.MF, ArgLocs, CLI.RetTy->getContext()); + + // Allocate shadow area for Win64 + if (IsWin64) + CCInfo.AllocateStack(32, 8); + + CCInfo.AnalyzeCallOperands(OutVTs, OutFlags, CC_X86); + + // Get a count of how many bytes are to be pushed on the stack. + unsigned NumBytes = CCInfo.getAlignedCallFrameSize(); + + // Issue CALLSEQ_START + unsigned AdjStackDown = TII.getCallFrameSetupOpcode(); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackDown)) + .addImm(NumBytes).addImm(0); + + // Walk the register/memloc assignments, inserting copies/loads. + const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); + for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { + CCValAssign const &VA = ArgLocs[i]; + const Value *ArgVal = OutVals[VA.getValNo()]; + MVT ArgVT = OutVTs[VA.getValNo()]; + + if (ArgVT == MVT::x86mmx) + return false; + + unsigned ArgReg = ArgRegs[VA.getValNo()]; + + // Promote the value if needed. 
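+    // For example, an i8 value assigned to a 32-bit location must first be
+    // sign-, zero-, or any-extended to the location type before the register
+    // copy or stack store below.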
+ switch (VA.getLocInfo()) { + case CCValAssign::Full: break; + case CCValAssign::SExt: { + assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() && + "Unexpected extend"); + bool Emitted = X86FastEmitExtend(ISD::SIGN_EXTEND, VA.getLocVT(), ArgReg, + ArgVT, ArgReg); + assert(Emitted && "Failed to emit a sext!"); (void)Emitted; + ArgVT = VA.getLocVT(); + break; + } + case CCValAssign::ZExt: { + assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() && + "Unexpected extend"); + bool Emitted = X86FastEmitExtend(ISD::ZERO_EXTEND, VA.getLocVT(), ArgReg, + ArgVT, ArgReg); + assert(Emitted && "Failed to emit a zext!"); (void)Emitted; + ArgVT = VA.getLocVT(); + break; + } + case CCValAssign::AExt: { + assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() && + "Unexpected extend"); + bool Emitted = X86FastEmitExtend(ISD::ANY_EXTEND, VA.getLocVT(), ArgReg, + ArgVT, ArgReg); + if (!Emitted) + Emitted = X86FastEmitExtend(ISD::ZERO_EXTEND, VA.getLocVT(), ArgReg, + ArgVT, ArgReg); + if (!Emitted) + Emitted = X86FastEmitExtend(ISD::SIGN_EXTEND, VA.getLocVT(), ArgReg, + ArgVT, ArgReg); + + assert(Emitted && "Failed to emit a aext!"); (void)Emitted; + ArgVT = VA.getLocVT(); + break; + } + case CCValAssign::BCvt: { + ArgReg = fastEmit_r(ArgVT, VA.getLocVT(), ISD::BITCAST, ArgReg, + /*TODO: Kill=*/false); + assert(ArgReg && "Failed to emit a bitcast!"); + ArgVT = VA.getLocVT(); + break; + } + case CCValAssign::VExt: + // VExt has not been implemented, so this should be impossible to reach + // for now. However, fallback to Selection DAG isel once implemented. + return false; + case CCValAssign::AExtUpper: + case CCValAssign::SExtUpper: + case CCValAssign::ZExtUpper: + case CCValAssign::FPExt: + llvm_unreachable("Unexpected loc info!"); + case CCValAssign::Indirect: + // FIXME: Indirect doesn't need extending, but fast-isel doesn't fully + // support this. + return false; + } + + if (VA.isRegLoc()) { + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), VA.getLocReg()).addReg(ArgReg); + OutRegs.push_back(VA.getLocReg()); + } else { + assert(VA.isMemLoc()); + + // Don't emit stores for undef values. + if (isa<UndefValue>(ArgVal)) + continue; + + unsigned LocMemOffset = VA.getLocMemOffset(); + X86AddressMode AM; + AM.Base.Reg = RegInfo->getStackRegister(); + AM.Disp = LocMemOffset; + ISD::ArgFlagsTy Flags = OutFlags[VA.getValNo()]; + unsigned Alignment = DL.getABITypeAlignment(ArgVal->getType()); + MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand( + MachinePointerInfo::getStack(*FuncInfo.MF, LocMemOffset), + MachineMemOperand::MOStore, ArgVT.getStoreSize(), Alignment); + if (Flags.isByVal()) { + X86AddressMode SrcAM; + SrcAM.Base.Reg = ArgReg; + if (!TryEmitSmallMemcpy(AM, SrcAM, Flags.getByValSize())) + return false; + } else if (isa<ConstantInt>(ArgVal) || isa<ConstantPointerNull>(ArgVal)) { + // If this is a really simple value, emit this with the Value* version + // of X86FastEmitStore. If it isn't simple, we don't want to do this, + // as it can cause us to reevaluate the argument. + if (!X86FastEmitStore(ArgVT, ArgVal, AM, MMO)) + return false; + } else { + bool ValIsKill = hasTrivialKill(ArgVal); + if (!X86FastEmitStore(ArgVT, ArgReg, ValIsKill, AM, MMO)) + return false; + } + } + } + + // ELF / PIC requires GOT in the EBX register before function calls via PLT + // GOT pointer. 
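+  // (This only applies to 32-bit ELF PIC code; x86-64 PIC addressing is
+  // RIP-relative and does not dedicate EBX to the GOT.)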
+ if (Subtarget->isPICStyleGOT()) { + unsigned Base = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), X86::EBX).addReg(Base); + } + + if (Is64Bit && IsVarArg && !IsWin64) { + // From AMD64 ABI document: + // For calls that may call functions that use varargs or stdargs + // (prototype-less calls or calls to functions containing ellipsis (...) in + // the declaration) %al is used as hidden argument to specify the number + // of SSE registers used. The contents of %al do not need to match exactly + // the number of registers, but must be an ubound on the number of SSE + // registers used and is in the range 0 - 8 inclusive. + + // Count the number of XMM registers allocated. + static const MCPhysReg XMMArgRegs[] = { + X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, + X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 + }; + unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs); + assert((Subtarget->hasSSE1() || !NumXMMRegs) + && "SSE registers cannot be used when SSE is disabled"); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV8ri), + X86::AL).addImm(NumXMMRegs); + } + + // Materialize callee address in a register. FIXME: GV address can be + // handled with a CALLpcrel32 instead. + X86AddressMode CalleeAM; + if (!X86SelectCallAddress(Callee, CalleeAM)) + return false; + + unsigned CalleeOp = 0; + const GlobalValue *GV = nullptr; + if (CalleeAM.GV != nullptr) { + GV = CalleeAM.GV; + } else if (CalleeAM.Base.Reg != 0) { + CalleeOp = CalleeAM.Base.Reg; + } else + return false; + + // Issue the call. + MachineInstrBuilder MIB; + if (CalleeOp) { + // Register-indirect call. + unsigned CallOpc = Is64Bit ? X86::CALL64r : X86::CALL32r; + MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CallOpc)) + .addReg(CalleeOp); + } else { + // Direct call. + assert(GV && "Not a direct call"); + unsigned CallOpc = Is64Bit ? X86::CALL64pcrel32 : X86::CALLpcrel32; + + // See if we need any target-specific flags on the GV operand. + unsigned char OpFlags = 0; + + // On ELF targets, in both X86-64 and X86-32 mode, direct calls to + // external symbols most go through the PLT in PIC mode. If the symbol + // has hidden or protected visibility, or if it is static or local, then + // we don't need to use the PLT - we can directly call it. + if (Subtarget->isTargetELF() && + TM.getRelocationModel() == Reloc::PIC_ && + GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) { + OpFlags = X86II::MO_PLT; + } else if (Subtarget->isPICStyleStubAny() && + !GV->isStrongDefinitionForLinker() && + (!Subtarget->getTargetTriple().isMacOSX() || + Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) { + // PC-relative references to external symbols should go through $stub, + // unless we're building with the leopard linker or later, which + // automatically synthesizes these stubs. + OpFlags = X86II::MO_DARWIN_STUB; + } + + MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CallOpc)); + if (Symbol) + MIB.addSym(Symbol, OpFlags); + else + MIB.addGlobalAddress(GV, 0, OpFlags); + } + + // Add a register mask operand representing the call-preserved registers. + // Proper defs for return values will be added by setPhysRegsDeadExcept(). + MIB.addRegMask(TRI.getCallPreservedMask(*FuncInfo.MF, CC)); + + // Add an implicit use GOT pointer in EBX. 
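+  // (EBX was loaded with the GOT base before the call sequence above; the
+  // implicit use keeps that copy live across the call.)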
+ if (Subtarget->isPICStyleGOT()) + MIB.addReg(X86::EBX, RegState::Implicit); + + if (Is64Bit && IsVarArg && !IsWin64) + MIB.addReg(X86::AL, RegState::Implicit); + + // Add implicit physical register uses to the call. + for (auto Reg : OutRegs) + MIB.addReg(Reg, RegState::Implicit); + + // Issue CALLSEQ_END + unsigned NumBytesForCalleeToPop = + computeBytesPoppedByCallee(Subtarget, CC, CLI.CS); + unsigned AdjStackUp = TII.getCallFrameDestroyOpcode(); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackUp)) + .addImm(NumBytes).addImm(NumBytesForCalleeToPop); + + // Now handle call return values. + SmallVector<CCValAssign, 16> RVLocs; + CCState CCRetInfo(CC, IsVarArg, *FuncInfo.MF, RVLocs, + CLI.RetTy->getContext()); + CCRetInfo.AnalyzeCallResult(Ins, RetCC_X86); + + // Copy all of the result registers out of their specified physreg. + unsigned ResultReg = FuncInfo.CreateRegs(CLI.RetTy); + for (unsigned i = 0; i != RVLocs.size(); ++i) { + CCValAssign &VA = RVLocs[i]; + EVT CopyVT = VA.getValVT(); + unsigned CopyReg = ResultReg + i; + + // If this is x86-64, and we disabled SSE, we can't return FP values + if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) && + ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) { + report_fatal_error("SSE register return with SSE disabled"); + } + + // If we prefer to use the value in xmm registers, copy it out as f80 and + // use a truncate to move it from fp stack reg to xmm reg. + if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) && + isScalarFPTypeInSSEReg(VA.getValVT())) { + CopyVT = MVT::f80; + CopyReg = createResultReg(&X86::RFP80RegClass); + } + + // Copy out the result. + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), CopyReg).addReg(VA.getLocReg()); + InRegs.push_back(VA.getLocReg()); + + // Round the f80 to the right size, which also moves it to the appropriate + // xmm register. This is accomplished by storing the f80 value in memory + // and then loading it back. + if (CopyVT != VA.getValVT()) { + EVT ResVT = VA.getValVT(); + unsigned Opc = ResVT == MVT::f32 ? X86::ST_Fp80m32 : X86::ST_Fp80m64; + unsigned MemSize = ResVT.getSizeInBits()/8; + int FI = MFI.CreateStackObject(MemSize, MemSize, false); + addFrameReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(Opc)), FI) + .addReg(CopyReg); + Opc = ResVT == MVT::f32 ? 
X86::MOVSSrm : X86::MOVSDrm; + addFrameReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(Opc), ResultReg + i), FI); + } + } + + CLI.ResultReg = ResultReg; + CLI.NumResultRegs = RVLocs.size(); + CLI.Call = MIB; + + return true; +} + +bool +X86FastISel::fastSelectInstruction(const Instruction *I) { + switch (I->getOpcode()) { + default: break; + case Instruction::Load: + return X86SelectLoad(I); + case Instruction::Store: + return X86SelectStore(I); + case Instruction::Ret: + return X86SelectRet(I); + case Instruction::ICmp: + case Instruction::FCmp: + return X86SelectCmp(I); + case Instruction::ZExt: + return X86SelectZExt(I); + case Instruction::Br: + return X86SelectBranch(I); + case Instruction::LShr: + case Instruction::AShr: + case Instruction::Shl: + return X86SelectShift(I); + case Instruction::SDiv: + case Instruction::UDiv: + case Instruction::SRem: + case Instruction::URem: + return X86SelectDivRem(I); + case Instruction::Select: + return X86SelectSelect(I); + case Instruction::Trunc: + return X86SelectTrunc(I); + case Instruction::FPExt: + return X86SelectFPExt(I); + case Instruction::FPTrunc: + return X86SelectFPTrunc(I); + case Instruction::SIToFP: + return X86SelectSIToFP(I); + case Instruction::IntToPtr: // Deliberate fall-through. + case Instruction::PtrToInt: { + EVT SrcVT = TLI.getValueType(DL, I->getOperand(0)->getType()); + EVT DstVT = TLI.getValueType(DL, I->getType()); + if (DstVT.bitsGT(SrcVT)) + return X86SelectZExt(I); + if (DstVT.bitsLT(SrcVT)) + return X86SelectTrunc(I); + unsigned Reg = getRegForValue(I->getOperand(0)); + if (Reg == 0) return false; + updateValueMap(I, Reg); + return true; + } + case Instruction::BitCast: { + // Select SSE2/AVX bitcasts between 128/256 bit vector types. + if (!Subtarget->hasSSE2()) + return false; + + EVT SrcVT = TLI.getValueType(DL, I->getOperand(0)->getType()); + EVT DstVT = TLI.getValueType(DL, I->getType()); + + if (!SrcVT.isSimple() || !DstVT.isSimple()) + return false; + + if (!SrcVT.is128BitVector() && + !(Subtarget->hasAVX() && SrcVT.is256BitVector())) + return false; + + unsigned Reg = getRegForValue(I->getOperand(0)); + if (Reg == 0) + return false; + + // No instruction is needed for conversion. Reuse the register used by + // the fist operand. 
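+    // (Bitcasts between legal 128/256-bit vector types only reinterpret the
+    // bits; source and result share the same register, so no copy is needed.)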
+ updateValueMap(I, Reg); + return true; + } + } + + return false; +} + +unsigned X86FastISel::X86MaterializeInt(const ConstantInt *CI, MVT VT) { + if (VT > MVT::i64) + return 0; + + uint64_t Imm = CI->getZExtValue(); + if (Imm == 0) { + unsigned SrcReg = fastEmitInst_(X86::MOV32r0, &X86::GR32RegClass); + switch (VT.SimpleTy) { + default: llvm_unreachable("Unexpected value type"); + case MVT::i1: + case MVT::i8: + return fastEmitInst_extractsubreg(MVT::i8, SrcReg, /*Kill=*/true, + X86::sub_8bit); + case MVT::i16: + return fastEmitInst_extractsubreg(MVT::i16, SrcReg, /*Kill=*/true, + X86::sub_16bit); + case MVT::i32: + return SrcReg; + case MVT::i64: { + unsigned ResultReg = createResultReg(&X86::GR64RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::SUBREG_TO_REG), ResultReg) + .addImm(0).addReg(SrcReg).addImm(X86::sub_32bit); + return ResultReg; + } + } + } + + unsigned Opc = 0; + switch (VT.SimpleTy) { + default: llvm_unreachable("Unexpected value type"); + case MVT::i1: VT = MVT::i8; // fall-through + case MVT::i8: Opc = X86::MOV8ri; break; + case MVT::i16: Opc = X86::MOV16ri; break; + case MVT::i32: Opc = X86::MOV32ri; break; + case MVT::i64: { + if (isUInt<32>(Imm)) + Opc = X86::MOV32ri; + else if (isInt<32>(Imm)) + Opc = X86::MOV64ri32; + else + Opc = X86::MOV64ri; + break; + } + } + if (VT == MVT::i64 && Opc == X86::MOV32ri) { + unsigned SrcReg = fastEmitInst_i(Opc, &X86::GR32RegClass, Imm); + unsigned ResultReg = createResultReg(&X86::GR64RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::SUBREG_TO_REG), ResultReg) + .addImm(0).addReg(SrcReg).addImm(X86::sub_32bit); + return ResultReg; + } + return fastEmitInst_i(Opc, TLI.getRegClassFor(VT), Imm); +} + +unsigned X86FastISel::X86MaterializeFP(const ConstantFP *CFP, MVT VT) { + if (CFP->isNullValue()) + return fastMaterializeFloatZero(CFP); + + // Can't handle alternate code models yet. + CodeModel::Model CM = TM.getCodeModel(); + if (CM != CodeModel::Small && CM != CodeModel::Large) + return 0; + + // Get opcode and regclass of the output for the given load instruction. + unsigned Opc = 0; + const TargetRegisterClass *RC = nullptr; + switch (VT.SimpleTy) { + default: return 0; + case MVT::f32: + if (X86ScalarSSEf32) { + Opc = Subtarget->hasAVX() ? X86::VMOVSSrm : X86::MOVSSrm; + RC = &X86::FR32RegClass; + } else { + Opc = X86::LD_Fp32m; + RC = &X86::RFP32RegClass; + } + break; + case MVT::f64: + if (X86ScalarSSEf64) { + Opc = Subtarget->hasAVX() ? X86::VMOVSDrm : X86::MOVSDrm; + RC = &X86::FR64RegClass; + } else { + Opc = X86::LD_Fp64m; + RC = &X86::RFP64RegClass; + } + break; + case MVT::f80: + // No f80 support yet. + return 0; + } + + // MachineConstantPool wants an explicit alignment. + unsigned Align = DL.getPrefTypeAlignment(CFP->getType()); + if (Align == 0) { + // Alignment of vector types. FIXME! + Align = DL.getTypeAllocSize(CFP->getType()); + } + + // x86-32 PIC requires a PIC base register for constant pools. + unsigned PICBase = 0; + unsigned char OpFlag = 0; + if (Subtarget->isPICStyleStubPIC()) { // Not dynamic-no-pic + OpFlag = X86II::MO_PIC_BASE_OFFSET; + PICBase = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF); + } else if (Subtarget->isPICStyleGOT()) { + OpFlag = X86II::MO_GOTOFF; + PICBase = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF); + } else if (Subtarget->isPICStyleRIPRel() && + TM.getCodeModel() == CodeModel::Small) { + PICBase = X86::RIP; + } + + // Create the load from the constant pool. 
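+  // With the small code model this is a single load from the constant pool
+  // (PIC-base- or RIP-relative when needed), e.g.
+  //   movsd .LCPI0_0(%rip), %xmm0
+  // With the large code model the pool address is first materialized into a
+  // GR64 register and the value is then loaded through that register.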
+ unsigned CPI = MCP.getConstantPoolIndex(CFP, Align); + unsigned ResultReg = createResultReg(RC); + + if (CM == CodeModel::Large) { + unsigned AddrReg = createResultReg(&X86::GR64RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV64ri), + AddrReg) + .addConstantPoolIndex(CPI, 0, OpFlag); + MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(Opc), ResultReg); + addDirectMem(MIB, AddrReg); + MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand( + MachinePointerInfo::getConstantPool(*FuncInfo.MF), + MachineMemOperand::MOLoad, DL.getPointerSize(), Align); + MIB->addMemOperand(*FuncInfo.MF, MMO); + return ResultReg; + } + + addConstantPoolReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(Opc), ResultReg), + CPI, PICBase, OpFlag); + return ResultReg; +} + +unsigned X86FastISel::X86MaterializeGV(const GlobalValue *GV, MVT VT) { + // Can't handle alternate code models yet. + if (TM.getCodeModel() != CodeModel::Small) + return 0; + + // Materialize addresses with LEA/MOV instructions. + X86AddressMode AM; + if (X86SelectAddress(GV, AM)) { + // If the expression is just a basereg, then we're done, otherwise we need + // to emit an LEA. + if (AM.BaseType == X86AddressMode::RegBase && + AM.IndexReg == 0 && AM.Disp == 0 && AM.GV == nullptr) + return AM.Base.Reg; + + unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT)); + if (TM.getRelocationModel() == Reloc::Static && + TLI.getPointerTy(DL) == MVT::i64) { + // The displacement code could be more than 32 bits away so we need to use + // an instruction with a 64 bit immediate + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV64ri), + ResultReg) + .addGlobalAddress(GV); + } else { + unsigned Opc = + TLI.getPointerTy(DL) == MVT::i32 + ? (Subtarget->isTarget64BitILP32() ? X86::LEA64_32r : X86::LEA32r) + : X86::LEA64r; + addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(Opc), ResultReg), AM); + } + return ResultReg; + } + return 0; +} + +unsigned X86FastISel::fastMaterializeConstant(const Constant *C) { + EVT CEVT = TLI.getValueType(DL, C->getType(), true); + + // Only handle simple types. + if (!CEVT.isSimple()) + return 0; + MVT VT = CEVT.getSimpleVT(); + + if (const auto *CI = dyn_cast<ConstantInt>(C)) + return X86MaterializeInt(CI, VT); + else if (const ConstantFP *CFP = dyn_cast<ConstantFP>(C)) + return X86MaterializeFP(CFP, VT); + else if (const GlobalValue *GV = dyn_cast<GlobalValue>(C)) + return X86MaterializeGV(GV, VT); + + return 0; +} + +unsigned X86FastISel::fastMaterializeAlloca(const AllocaInst *C) { + // Fail on dynamic allocas. At this point, getRegForValue has already + // checked its CSE maps, so if we're here trying to handle a dynamic + // alloca, we're not going to succeed. X86SelectAddress has a + // check for dynamic allocas, because it's called directly from + // various places, but targetMaterializeAlloca also needs a check + // in order to avoid recursion between getRegForValue, + // X86SelectAddrss, and targetMaterializeAlloca. + if (!FuncInfo.StaticAllocaMap.count(C)) + return 0; + assert(C->isStaticAlloca() && "dynamic alloca in the static alloca map?"); + + X86AddressMode AM; + if (!X86SelectAddress(C, AM)) + return 0; + unsigned Opc = + TLI.getPointerTy(DL) == MVT::i32 + ? (Subtarget->isTarget64BitILP32() ? 
X86::LEA64_32r : X86::LEA32r) + : X86::LEA64r; + const TargetRegisterClass *RC = TLI.getRegClassFor(TLI.getPointerTy(DL)); + unsigned ResultReg = createResultReg(RC); + addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(Opc), ResultReg), AM); + return ResultReg; +} + +unsigned X86FastISel::fastMaterializeFloatZero(const ConstantFP *CF) { + MVT VT; + if (!isTypeLegal(CF->getType(), VT)) + return 0; + + // Get opcode and regclass for the given zero. + unsigned Opc = 0; + const TargetRegisterClass *RC = nullptr; + switch (VT.SimpleTy) { + default: return 0; + case MVT::f32: + if (X86ScalarSSEf32) { + Opc = X86::FsFLD0SS; + RC = &X86::FR32RegClass; + } else { + Opc = X86::LD_Fp032; + RC = &X86::RFP32RegClass; + } + break; + case MVT::f64: + if (X86ScalarSSEf64) { + Opc = X86::FsFLD0SD; + RC = &X86::FR64RegClass; + } else { + Opc = X86::LD_Fp064; + RC = &X86::RFP64RegClass; + } + break; + case MVT::f80: + // No f80 support yet. + return 0; + } + + unsigned ResultReg = createResultReg(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg); + return ResultReg; +} + + +bool X86FastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo, + const LoadInst *LI) { + const Value *Ptr = LI->getPointerOperand(); + X86AddressMode AM; + if (!X86SelectAddress(Ptr, AM)) + return false; + + const X86InstrInfo &XII = (const X86InstrInfo &)TII; + + unsigned Size = DL.getTypeAllocSize(LI->getType()); + unsigned Alignment = LI->getAlignment(); + + if (Alignment == 0) // Ensure that codegen never sees alignment 0 + Alignment = DL.getABITypeAlignment(LI->getType()); + + SmallVector<MachineOperand, 8> AddrOps; + AM.getFullAddress(AddrOps); + + MachineInstr *Result = XII.foldMemoryOperandImpl( + *FuncInfo.MF, MI, OpNo, AddrOps, FuncInfo.InsertPt, Size, Alignment, + /*AllowCommute=*/true); + if (!Result) + return false; + + // The index register could be in the wrong register class. Unfortunately, + // foldMemoryOperandImpl could have commuted the instruction so its not enough + // to just look at OpNo + the offset to the index reg. We actually need to + // scan the instruction to find the index reg and see if its the correct reg + // class. + unsigned OperandNo = 0; + for (MachineInstr::mop_iterator I = Result->operands_begin(), + E = Result->operands_end(); I != E; ++I, ++OperandNo) { + MachineOperand &MO = *I; + if (!MO.isReg() || MO.isDef() || MO.getReg() != AM.IndexReg) + continue; + // Found the index reg, now try to rewrite it. + unsigned IndexReg = constrainOperandRegClass(Result->getDesc(), + MO.getReg(), OperandNo); + if (IndexReg == MO.getReg()) + continue; + MO.setReg(IndexReg); + } + + Result->addMemOperand(*FuncInfo.MF, createMachineMemOperandFor(LI)); + MI->eraseFromParent(); + return true; +} + + +namespace llvm { + FastISel *X86::createFastISel(FunctionLoweringInfo &funcInfo, + const TargetLibraryInfo *libInfo) { + return new X86FastISel(funcInfo, libInfo); + } +} diff --git a/contrib/llvm/lib/Target/X86/X86FixupLEAs.cpp b/contrib/llvm/lib/Target/X86/X86FixupLEAs.cpp new file mode 100644 index 0000000..1dd69e8 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86FixupLEAs.cpp @@ -0,0 +1,410 @@ +//===-- X86FixupLEAs.cpp - use or replace LEA instructions -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This file defines the pass that finds instructions that can be +// re-written as LEA instructions in order to reduce pipeline delays. +// When optimizing for size it replaces suitable LEAs with INC or DEC. +// +//===----------------------------------------------------------------------===// + +#include "X86.h" +#include "X86InstrInfo.h" +#include "X86Subtarget.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/LiveVariables.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetInstrInfo.h" +using namespace llvm; + +#define DEBUG_TYPE "x86-fixup-LEAs" + +STATISTIC(NumLEAs, "Number of LEA instructions created"); + +namespace { +class FixupLEAPass : public MachineFunctionPass { + enum RegUsageState { RU_NotUsed, RU_Write, RU_Read }; + static char ID; + /// \brief Loop over all of the instructions in the basic block + /// replacing applicable instructions with LEA instructions, + /// where appropriate. + bool processBasicBlock(MachineFunction &MF, MachineFunction::iterator MFI); + + const char *getPassName() const override { return "X86 LEA Fixup"; } + + /// \brief Given a machine register, look for the instruction + /// which writes it in the current basic block. If found, + /// try to replace it with an equivalent LEA instruction. + /// If replacement succeeds, then also process the newly created + /// instruction. + void seekLEAFixup(MachineOperand &p, MachineBasicBlock::iterator &I, + MachineFunction::iterator MFI); + + /// \brief Given a memory access or LEA instruction + /// whose address mode uses a base and/or index register, look for + /// an opportunity to replace the instruction which sets the base or index + /// register with an equivalent LEA instruction. + void processInstruction(MachineBasicBlock::iterator &I, + MachineFunction::iterator MFI); + + /// \brief Given a LEA instruction which is unprofitable + /// on Silvermont try to replace it with an equivalent ADD instruction + void processInstructionForSLM(MachineBasicBlock::iterator &I, + MachineFunction::iterator MFI); + + /// \brief Look for LEAs that add 1 to reg or subtract 1 from reg + /// and convert them to INC or DEC respectively. + bool fixupIncDec(MachineBasicBlock::iterator &I, + MachineFunction::iterator MFI) const; + + /// \brief Determine if an instruction references a machine register + /// and, if so, whether it reads or writes the register. + RegUsageState usesRegister(MachineOperand &p, MachineBasicBlock::iterator I); + + /// \brief Step backwards through a basic block, looking + /// for an instruction which writes a register within + /// a maximum of INSTR_DISTANCE_THRESHOLD instruction latency cycles. + MachineBasicBlock::iterator searchBackwards(MachineOperand &p, + MachineBasicBlock::iterator &I, + MachineFunction::iterator MFI); + + /// \brief if an instruction can be converted to an + /// equivalent LEA, insert the new instruction into the basic block + /// and return a pointer to it. Otherwise, return zero. 
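+  /// For example, "movl %ecx, %eax" becomes "leal (%ecx), %eax" and
+  /// "addl $8, %eax" becomes "leal 8(%eax), %eax".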
+ MachineInstr *postRAConvertToLEA(MachineFunction::iterator &MFI, + MachineBasicBlock::iterator &MBBI) const; + +public: + FixupLEAPass() : MachineFunctionPass(ID) {} + + /// \brief Loop over all of the basic blocks, + /// replacing instructions by equivalent LEA instructions + /// if needed and when possible. + bool runOnMachineFunction(MachineFunction &MF) override; + +private: + MachineFunction *MF; + const X86InstrInfo *TII; // Machine instruction info. + bool OptIncDec; + bool OptLEA; +}; +char FixupLEAPass::ID = 0; +} + +MachineInstr * +FixupLEAPass::postRAConvertToLEA(MachineFunction::iterator &MFI, + MachineBasicBlock::iterator &MBBI) const { + MachineInstr *MI = MBBI; + MachineInstr *NewMI; + switch (MI->getOpcode()) { + case X86::MOV32rr: + case X86::MOV64rr: { + const MachineOperand &Src = MI->getOperand(1); + const MachineOperand &Dest = MI->getOperand(0); + NewMI = BuildMI(*MF, MI->getDebugLoc(), + TII->get(MI->getOpcode() == X86::MOV32rr ? X86::LEA32r + : X86::LEA64r)) + .addOperand(Dest) + .addOperand(Src) + .addImm(1) + .addReg(0) + .addImm(0) + .addReg(0); + MFI->insert(MBBI, NewMI); // Insert the new inst + return NewMI; + } + case X86::ADD64ri32: + case X86::ADD64ri8: + case X86::ADD64ri32_DB: + case X86::ADD64ri8_DB: + case X86::ADD32ri: + case X86::ADD32ri8: + case X86::ADD32ri_DB: + case X86::ADD32ri8_DB: + case X86::ADD16ri: + case X86::ADD16ri8: + case X86::ADD16ri_DB: + case X86::ADD16ri8_DB: + if (!MI->getOperand(2).isImm()) { + // convertToThreeAddress will call getImm() + // which requires isImm() to be true + return nullptr; + } + break; + case X86::ADD16rr: + case X86::ADD16rr_DB: + if (MI->getOperand(1).getReg() != MI->getOperand(2).getReg()) { + // if src1 != src2, then convertToThreeAddress will + // need to create a Virtual register, which we cannot do + // after register allocation. + return nullptr; + } + } + return TII->convertToThreeAddress(MFI, MBBI, nullptr); +} + +FunctionPass *llvm::createX86FixupLEAs() { return new FixupLEAPass(); } + +bool FixupLEAPass::runOnMachineFunction(MachineFunction &Func) { + MF = &Func; + const X86Subtarget &ST = Func.getSubtarget<X86Subtarget>(); + OptIncDec = !ST.slowIncDec() || Func.getFunction()->optForMinSize(); + OptLEA = ST.LEAusesAG() || ST.slowLEA(); + + if (!OptLEA && !OptIncDec) + return false; + + TII = ST.getInstrInfo(); + + DEBUG(dbgs() << "Start X86FixupLEAs\n";); + // Process all basic blocks. + for (MachineFunction::iterator I = Func.begin(), E = Func.end(); I != E; ++I) + processBasicBlock(Func, I); + DEBUG(dbgs() << "End X86FixupLEAs\n";); + + return true; +} + +FixupLEAPass::RegUsageState +FixupLEAPass::usesRegister(MachineOperand &p, MachineBasicBlock::iterator I) { + RegUsageState RegUsage = RU_NotUsed; + MachineInstr *MI = I; + + for (unsigned int i = 0; i < MI->getNumOperands(); ++i) { + MachineOperand &opnd = MI->getOperand(i); + if (opnd.isReg() && opnd.getReg() == p.getReg()) { + if (opnd.isDef()) + return RU_Write; + RegUsage = RU_Read; + } + } + return RegUsage; +} + +/// getPreviousInstr - Given a reference to an instruction in a basic +/// block, return a reference to the previous instruction in the block, +/// wrapping around to the last instruction of the block if the block +/// branches to itself. 
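+/// Returns false when there is no previous instruction to step to, i.e. when
+/// I is at the beginning of a block that does not branch to itself.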
+static inline bool getPreviousInstr(MachineBasicBlock::iterator &I, + MachineFunction::iterator MFI) { + if (I == MFI->begin()) { + if (MFI->isPredecessor(&*MFI)) { + I = --MFI->end(); + return true; + } else + return false; + } + --I; + return true; +} + +MachineBasicBlock::iterator +FixupLEAPass::searchBackwards(MachineOperand &p, MachineBasicBlock::iterator &I, + MachineFunction::iterator MFI) { + int InstrDistance = 1; + MachineBasicBlock::iterator CurInst; + static const int INSTR_DISTANCE_THRESHOLD = 5; + + CurInst = I; + bool Found; + Found = getPreviousInstr(CurInst, MFI); + while (Found && I != CurInst) { + if (CurInst->isCall() || CurInst->isInlineAsm()) + break; + if (InstrDistance > INSTR_DISTANCE_THRESHOLD) + break; // too far back to make a difference + if (usesRegister(p, CurInst) == RU_Write) { + return CurInst; + } + InstrDistance += TII->getInstrLatency( + MF->getSubtarget().getInstrItineraryData(), CurInst); + Found = getPreviousInstr(CurInst, MFI); + } + return nullptr; +} + +static inline bool isLEA(const int opcode) { + return opcode == X86::LEA16r || opcode == X86::LEA32r || + opcode == X86::LEA64r || opcode == X86::LEA64_32r; +} + +/// isLEASimpleIncOrDec - Does this LEA have one these forms: +/// lea %reg, 1(%reg) +/// lea %reg, -1(%reg) +static inline bool isLEASimpleIncOrDec(MachineInstr *LEA) { + unsigned SrcReg = LEA->getOperand(1 + X86::AddrBaseReg).getReg(); + unsigned DstReg = LEA->getOperand(0).getReg(); + unsigned AddrDispOp = 1 + X86::AddrDisp; + return SrcReg == DstReg && + LEA->getOperand(1 + X86::AddrIndexReg).getReg() == 0 && + LEA->getOperand(1 + X86::AddrSegmentReg).getReg() == 0 && + LEA->getOperand(AddrDispOp).isImm() && + (LEA->getOperand(AddrDispOp).getImm() == 1 || + LEA->getOperand(AddrDispOp).getImm() == -1); +} + +bool FixupLEAPass::fixupIncDec(MachineBasicBlock::iterator &I, + MachineFunction::iterator MFI) const { + MachineInstr *MI = I; + int Opcode = MI->getOpcode(); + if (!isLEA(Opcode)) + return false; + + if (isLEASimpleIncOrDec(MI) && TII->isSafeToClobberEFLAGS(*MFI, I)) { + int NewOpcode; + bool isINC = MI->getOperand(4).getImm() == 1; + switch (Opcode) { + case X86::LEA16r: + NewOpcode = isINC ? X86::INC16r : X86::DEC16r; + break; + case X86::LEA32r: + case X86::LEA64_32r: + NewOpcode = isINC ? X86::INC32r : X86::DEC32r; + break; + case X86::LEA64r: + NewOpcode = isINC ? X86::INC64r : X86::DEC64r; + break; + } + + MachineInstr *NewMI = + BuildMI(*MFI, I, MI->getDebugLoc(), TII->get(NewOpcode)) + .addOperand(MI->getOperand(0)) + .addOperand(MI->getOperand(1)); + MFI->erase(I); + I = static_cast<MachineBasicBlock::iterator>(NewMI); + return true; + } + return false; +} + +void FixupLEAPass::processInstruction(MachineBasicBlock::iterator &I, + MachineFunction::iterator MFI) { + // Process a load, store, or LEA instruction. 
+ MachineInstr *MI = I; + int opcode = MI->getOpcode(); + const MCInstrDesc &Desc = MI->getDesc(); + int AddrOffset = X86II::getMemoryOperandNo(Desc.TSFlags, opcode); + if (AddrOffset >= 0) { + AddrOffset += X86II::getOperandBias(Desc); + MachineOperand &p = MI->getOperand(AddrOffset + X86::AddrBaseReg); + if (p.isReg() && p.getReg() != X86::ESP) { + seekLEAFixup(p, I, MFI); + } + MachineOperand &q = MI->getOperand(AddrOffset + X86::AddrIndexReg); + if (q.isReg() && q.getReg() != X86::ESP) { + seekLEAFixup(q, I, MFI); + } + } +} + +void FixupLEAPass::seekLEAFixup(MachineOperand &p, + MachineBasicBlock::iterator &I, + MachineFunction::iterator MFI) { + MachineBasicBlock::iterator MBI = searchBackwards(p, I, MFI); + if (MBI) { + MachineInstr *NewMI = postRAConvertToLEA(MFI, MBI); + if (NewMI) { + ++NumLEAs; + DEBUG(dbgs() << "FixLEA: Candidate to replace:"; MBI->dump();); + // now to replace with an equivalent LEA... + DEBUG(dbgs() << "FixLEA: Replaced by: "; NewMI->dump();); + MFI->erase(MBI); + MachineBasicBlock::iterator J = + static_cast<MachineBasicBlock::iterator>(NewMI); + processInstruction(J, MFI); + } + } +} + +void FixupLEAPass::processInstructionForSLM(MachineBasicBlock::iterator &I, + MachineFunction::iterator MFI) { + MachineInstr *MI = I; + const int opcode = MI->getOpcode(); + if (!isLEA(opcode)) + return; + if (MI->getOperand(5).getReg() != 0 || !MI->getOperand(4).isImm() || + !TII->isSafeToClobberEFLAGS(*MFI, I)) + return; + const unsigned DstR = MI->getOperand(0).getReg(); + const unsigned SrcR1 = MI->getOperand(1).getReg(); + const unsigned SrcR2 = MI->getOperand(3).getReg(); + if ((SrcR1 == 0 || SrcR1 != DstR) && (SrcR2 == 0 || SrcR2 != DstR)) + return; + if (MI->getOperand(2).getImm() > 1) + return; + int addrr_opcode, addri_opcode; + switch (opcode) { + default: + llvm_unreachable("Unexpected LEA instruction"); + case X86::LEA16r: + addrr_opcode = X86::ADD16rr; + addri_opcode = X86::ADD16ri; + break; + case X86::LEA32r: + addrr_opcode = X86::ADD32rr; + addri_opcode = X86::ADD32ri; + break; + case X86::LEA64_32r: + case X86::LEA64r: + addrr_opcode = X86::ADD64rr; + addri_opcode = X86::ADD64ri32; + break; + } + DEBUG(dbgs() << "FixLEA: Candidate to replace:"; I->dump();); + DEBUG(dbgs() << "FixLEA: Replaced by: ";); + MachineInstr *NewMI = nullptr; + const MachineOperand &Dst = MI->getOperand(0); + // Make ADD instruction for two registers writing to LEA's destination + if (SrcR1 != 0 && SrcR2 != 0) { + const MachineOperand &Src1 = MI->getOperand(SrcR1 == DstR ? 1 : 3); + const MachineOperand &Src2 = MI->getOperand(SrcR1 == DstR ? 3 : 1); + NewMI = BuildMI(*MF, MI->getDebugLoc(), TII->get(addrr_opcode)) + .addOperand(Dst) + .addOperand(Src1) + .addOperand(Src2); + MFI->insert(I, NewMI); + DEBUG(NewMI->dump();); + } + // Make ADD instruction for immediate + if (MI->getOperand(4).getImm() != 0) { + const MachineOperand &SrcR = MI->getOperand(SrcR1 == DstR ? 
1 : 3); + NewMI = BuildMI(*MF, MI->getDebugLoc(), TII->get(addri_opcode)) + .addOperand(Dst) + .addOperand(SrcR) + .addImm(MI->getOperand(4).getImm()); + MFI->insert(I, NewMI); + DEBUG(NewMI->dump();); + } + if (NewMI) { + MFI->erase(I); + I = static_cast<MachineBasicBlock::iterator>(NewMI); + } +} + +bool FixupLEAPass::processBasicBlock(MachineFunction &MF, + MachineFunction::iterator MFI) { + + for (MachineBasicBlock::iterator I = MFI->begin(); I != MFI->end(); ++I) { + if (OptIncDec) + if (fixupIncDec(I, MFI)) + continue; + + if (OptLEA) { + if (MF.getSubtarget<X86Subtarget>().isSLM()) + processInstructionForSLM(I, MFI); + else + processInstruction(I, MFI); + } + } + return false; +} diff --git a/contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp b/contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp new file mode 100644 index 0000000..97bb8ab --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp @@ -0,0 +1,1651 @@ +//===-- X86FloatingPoint.cpp - Floating point Reg -> Stack converter ------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the pass which converts floating point instructions from +// pseudo registers into register stack instructions. This pass uses live +// variable information to indicate where the FPn registers are used and their +// lifetimes. +// +// The x87 hardware tracks liveness of the stack registers, so it is necessary +// to implement exact liveness tracking between basic blocks. The CFG edges are +// partitioned into bundles where the same FP registers must be live in +// identical stack positions. Instructions are inserted at the end of each basic +// block to rearrange the live registers to match the outgoing bundle. +// +// This approach avoids splitting critical edges at the potential cost of more +// live register shuffling instructions when critical edges are present. +// +//===----------------------------------------------------------------------===// + +#include "X86.h" +#include "X86InstrInfo.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/EdgeBundles.h" +#include "llvm/CodeGen/LivePhysRegs.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/IR/InlineAsm.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetSubtargetInfo.h" +#include <algorithm> +#include <bitset> +using namespace llvm; + +#define DEBUG_TYPE "x86-codegen" + +STATISTIC(NumFXCH, "Number of fxch instructions inserted"); +STATISTIC(NumFP , "Number of floating point instructions"); + +namespace { + const unsigned ScratchFPReg = 7; + + struct FPS : public MachineFunctionPass { + static char ID; + FPS() : MachineFunctionPass(ID) { + initializeEdgeBundlesPass(*PassRegistry::getPassRegistry()); + // This is really only to keep valgrind quiet. + // The logic in isLive() is too much for it. 
+ memset(Stack, 0, sizeof(Stack)); + memset(RegMap, 0, sizeof(RegMap)); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addRequired<EdgeBundles>(); + AU.addPreservedID(MachineLoopInfoID); + AU.addPreservedID(MachineDominatorsID); + MachineFunctionPass::getAnalysisUsage(AU); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + const char *getPassName() const override { return "X86 FP Stackifier"; } + + private: + const TargetInstrInfo *TII; // Machine instruction info. + + // Two CFG edges are related if they leave the same block, or enter the same + // block. The transitive closure of an edge under this relation is a + // LiveBundle. It represents a set of CFG edges where the live FP stack + // registers must be allocated identically in the x87 stack. + // + // A LiveBundle is usually all the edges leaving a block, or all the edges + // entering a block, but it can contain more edges if critical edges are + // present. + // + // The set of live FP registers in a LiveBundle is calculated by bundleCFG, + // but the exact mapping of FP registers to stack slots is fixed later. + struct LiveBundle { + // Bit mask of live FP registers. Bit 0 = FP0, bit 1 = FP1, &c. + unsigned Mask; + + // Number of pre-assigned live registers in FixStack. This is 0 when the + // stack order has not yet been fixed. + unsigned FixCount; + + // Assigned stack order for live-in registers. + // FixStack[i] == getStackEntry(i) for all i < FixCount. + unsigned char FixStack[8]; + + LiveBundle() : Mask(0), FixCount(0) {} + + // Have the live registers been assigned a stack order yet? + bool isFixed() const { return !Mask || FixCount; } + }; + + // Numbered LiveBundle structs. LiveBundles[0] is used for all CFG edges + // with no live FP registers. + SmallVector<LiveBundle, 8> LiveBundles; + + // The edge bundle analysis provides indices into the LiveBundles vector. + EdgeBundles *Bundles; + + // Return a bitmask of FP registers in block's live-in list. + static unsigned calcLiveInMask(MachineBasicBlock *MBB) { + unsigned Mask = 0; + for (const auto &LI : MBB->liveins()) { + if (LI.PhysReg < X86::FP0 || LI.PhysReg > X86::FP6) + continue; + Mask |= 1 << (LI.PhysReg - X86::FP0); + } + return Mask; + } + + // Partition all the CFG edges into LiveBundles. + void bundleCFG(MachineFunction &MF); + + MachineBasicBlock *MBB; // Current basic block + + // The hardware keeps track of how many FP registers are live, so we have + // to model that exactly. Usually, each live register corresponds to an + // FP<n> register, but when dealing with calls, returns, and inline + // assembly, it is sometimes necessary to have live scratch registers. + unsigned Stack[8]; // FP<n> Registers in each stack slot... + unsigned StackTop; // The current top of the FP stack. + + enum { + NumFPRegs = 8 // Including scratch pseudo-registers. + }; + + // For each live FP<n> register, point to its Stack[] entry. + // The first entries correspond to FP0-FP6, the rest are scratch registers + // used when we need slightly different live registers than what the + // register allocator thinks. + unsigned RegMap[NumFPRegs]; + + // Set up our stack model to match the incoming registers to MBB. + void setupBlockStack(); + + // Shuffle live registers to match the expectations of successor blocks. 
+ void finishBlockStack(); + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + void dumpStack() const { + dbgs() << "Stack contents:"; + for (unsigned i = 0; i != StackTop; ++i) { + dbgs() << " FP" << Stack[i]; + assert(RegMap[Stack[i]] == i && "Stack[] doesn't match RegMap[]!"); + } + } +#endif + + /// getSlot - Return the stack slot number a particular register number is + /// in. + unsigned getSlot(unsigned RegNo) const { + assert(RegNo < NumFPRegs && "Regno out of range!"); + return RegMap[RegNo]; + } + + /// isLive - Is RegNo currently live in the stack? + bool isLive(unsigned RegNo) const { + unsigned Slot = getSlot(RegNo); + return Slot < StackTop && Stack[Slot] == RegNo; + } + + /// getStackEntry - Return the X86::FP<n> register in register ST(i). + unsigned getStackEntry(unsigned STi) const { + if (STi >= StackTop) + report_fatal_error("Access past stack top!"); + return Stack[StackTop-1-STi]; + } + + /// getSTReg - Return the X86::ST(i) register which contains the specified + /// FP<RegNo> register. + unsigned getSTReg(unsigned RegNo) const { + return StackTop - 1 - getSlot(RegNo) + X86::ST0; + } + + // pushReg - Push the specified FP<n> register onto the stack. + void pushReg(unsigned Reg) { + assert(Reg < NumFPRegs && "Register number out of range!"); + if (StackTop >= 8) + report_fatal_error("Stack overflow!"); + Stack[StackTop] = Reg; + RegMap[Reg] = StackTop++; + } + + bool isAtTop(unsigned RegNo) const { return getSlot(RegNo) == StackTop-1; } + void moveToTop(unsigned RegNo, MachineBasicBlock::iterator I) { + DebugLoc dl = I == MBB->end() ? DebugLoc() : I->getDebugLoc(); + if (isAtTop(RegNo)) return; + + unsigned STReg = getSTReg(RegNo); + unsigned RegOnTop = getStackEntry(0); + + // Swap the slots the regs are in. + std::swap(RegMap[RegNo], RegMap[RegOnTop]); + + // Swap stack slot contents. + if (RegMap[RegOnTop] >= StackTop) + report_fatal_error("Access past stack top!"); + std::swap(Stack[RegMap[RegOnTop]], Stack[StackTop-1]); + + // Emit an fxch to update the runtime processors version of the state. + BuildMI(*MBB, I, dl, TII->get(X86::XCH_F)).addReg(STReg); + ++NumFXCH; + } + + void duplicateToTop(unsigned RegNo, unsigned AsReg, MachineInstr *I) { + DebugLoc dl = I == MBB->end() ? DebugLoc() : I->getDebugLoc(); + unsigned STReg = getSTReg(RegNo); + pushReg(AsReg); // New register on top of stack + + BuildMI(*MBB, I, dl, TII->get(X86::LD_Frr)).addReg(STReg); + } + + /// popStackAfter - Pop the current value off of the top of the FP stack + /// after the specified instruction. + void popStackAfter(MachineBasicBlock::iterator &I); + + /// freeStackSlotAfter - Free the specified register from the register + /// stack, so that it is no longer in a register. If the register is + /// currently at the top of the stack, we just pop the current instruction, + /// otherwise we store the current top-of-stack into the specified slot, + /// then pop the top of stack. + void freeStackSlotAfter(MachineBasicBlock::iterator &I, unsigned Reg); + + /// freeStackSlotBefore - Just the pop, no folding. Return the inserted + /// instruction. + MachineBasicBlock::iterator + freeStackSlotBefore(MachineBasicBlock::iterator I, unsigned FPRegNo); + + /// Adjust the live registers to be the set in Mask. + void adjustLiveRegs(unsigned Mask, MachineBasicBlock::iterator I); + + /// Shuffle the top FixCount stack entries such that FP reg FixStack[0] is + /// st(0), FP reg FixStack[1] is st(1) etc. 
+ void shuffleStackTop(const unsigned char *FixStack, unsigned FixCount, + MachineBasicBlock::iterator I); + + bool processBasicBlock(MachineFunction &MF, MachineBasicBlock &MBB); + + void handleCall(MachineBasicBlock::iterator &I); + void handleZeroArgFP(MachineBasicBlock::iterator &I); + void handleOneArgFP(MachineBasicBlock::iterator &I); + void handleOneArgFPRW(MachineBasicBlock::iterator &I); + void handleTwoArgFP(MachineBasicBlock::iterator &I); + void handleCompareFP(MachineBasicBlock::iterator &I); + void handleCondMovFP(MachineBasicBlock::iterator &I); + void handleSpecialFP(MachineBasicBlock::iterator &I); + + // Check if a COPY instruction is using FP registers. + static bool isFPCopy(MachineInstr *MI) { + unsigned DstReg = MI->getOperand(0).getReg(); + unsigned SrcReg = MI->getOperand(1).getReg(); + + return X86::RFP80RegClass.contains(DstReg) || + X86::RFP80RegClass.contains(SrcReg); + } + + void setKillFlags(MachineBasicBlock &MBB) const; + }; + char FPS::ID = 0; +} + +FunctionPass *llvm::createX86FloatingPointStackifierPass() { return new FPS(); } + +/// getFPReg - Return the X86::FPx register number for the specified operand. +/// For example, this returns 3 for X86::FP3. +static unsigned getFPReg(const MachineOperand &MO) { + assert(MO.isReg() && "Expected an FP register!"); + unsigned Reg = MO.getReg(); + assert(Reg >= X86::FP0 && Reg <= X86::FP6 && "Expected FP register!"); + return Reg - X86::FP0; +} + +/// runOnMachineFunction - Loop over all of the basic blocks, transforming FP +/// register references into FP stack references. +/// +bool FPS::runOnMachineFunction(MachineFunction &MF) { + // We only need to run this pass if there are any FP registers used in this + // function. If it is all integer, there is nothing for us to do! + bool FPIsUsed = false; + + static_assert(X86::FP6 == X86::FP0+6, "Register enums aren't sorted right!"); + const MachineRegisterInfo &MRI = MF.getRegInfo(); + for (unsigned i = 0; i <= 6; ++i) + if (!MRI.reg_nodbg_empty(X86::FP0 + i)) { + FPIsUsed = true; + break; + } + + // Early exit. + if (!FPIsUsed) return false; + + Bundles = &getAnalysis<EdgeBundles>(); + TII = MF.getSubtarget().getInstrInfo(); + + // Prepare cross-MBB liveness. + bundleCFG(MF); + + StackTop = 0; + + // Process the function in depth first order so that we process at least one + // of the predecessors for every reachable block in the function. + SmallPtrSet<MachineBasicBlock*, 8> Processed; + MachineBasicBlock *Entry = &MF.front(); + + bool Changed = false; + for (MachineBasicBlock *BB : depth_first_ext(Entry, Processed)) + Changed |= processBasicBlock(MF, *BB); + + // Process any unreachable blocks in arbitrary order now. + if (MF.size() != Processed.size()) + for (MachineBasicBlock &BB : MF) + if (Processed.insert(&BB).second) + Changed |= processBasicBlock(MF, BB); + + LiveBundles.clear(); + + return Changed; +} + +/// bundleCFG - Scan all the basic blocks to determine consistent live-in and +/// live-out sets for the FP registers. Consistent means that the set of +/// registers live-out from a block is identical to the live-in set of all +/// successors. This is not enforced by the normal live-in lists since +/// registers may be implicitly defined, or not used by all successors. +void FPS::bundleCFG(MachineFunction &MF) { + assert(LiveBundles.empty() && "Stale data in LiveBundles"); + LiveBundles.resize(Bundles->getNumBundles()); + + // Gather the actual live-in masks for all MBBs. 
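+  // Bit i of a mask is set when FP<i> is live into the block; masks of blocks
+  // whose incoming edges fall in the same bundle are ORed together below.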
+ for (MachineBasicBlock &MBB : MF) { + const unsigned Mask = calcLiveInMask(&MBB); + if (!Mask) + continue; + // Update MBB ingoing bundle mask. + LiveBundles[Bundles->getBundle(MBB.getNumber(), false)].Mask |= Mask; + } +} + +/// processBasicBlock - Loop over all of the instructions in the basic block, +/// transforming FP instructions into their stack form. +/// +bool FPS::processBasicBlock(MachineFunction &MF, MachineBasicBlock &BB) { + bool Changed = false; + MBB = &BB; + + setKillFlags(BB); + setupBlockStack(); + + for (MachineBasicBlock::iterator I = BB.begin(); I != BB.end(); ++I) { + MachineInstr *MI = I; + uint64_t Flags = MI->getDesc().TSFlags; + + unsigned FPInstClass = Flags & X86II::FPTypeMask; + if (MI->isInlineAsm()) + FPInstClass = X86II::SpecialFP; + + if (MI->isCopy() && isFPCopy(MI)) + FPInstClass = X86II::SpecialFP; + + if (MI->isImplicitDef() && + X86::RFP80RegClass.contains(MI->getOperand(0).getReg())) + FPInstClass = X86II::SpecialFP; + + if (MI->isCall()) + FPInstClass = X86II::SpecialFP; + + if (FPInstClass == X86II::NotFP) + continue; // Efficiently ignore non-fp insts! + + MachineInstr *PrevMI = nullptr; + if (I != BB.begin()) + PrevMI = std::prev(I); + + ++NumFP; // Keep track of # of pseudo instrs + DEBUG(dbgs() << "\nFPInst:\t" << *MI); + + // Get dead variables list now because the MI pointer may be deleted as part + // of processing! + SmallVector<unsigned, 8> DeadRegs; + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + const MachineOperand &MO = MI->getOperand(i); + if (MO.isReg() && MO.isDead()) + DeadRegs.push_back(MO.getReg()); + } + + switch (FPInstClass) { + case X86II::ZeroArgFP: handleZeroArgFP(I); break; + case X86II::OneArgFP: handleOneArgFP(I); break; // fstp ST(0) + case X86II::OneArgFPRW: handleOneArgFPRW(I); break; // ST(0) = fsqrt(ST(0)) + case X86II::TwoArgFP: handleTwoArgFP(I); break; + case X86II::CompareFP: handleCompareFP(I); break; + case X86II::CondMovFP: handleCondMovFP(I); break; + case X86II::SpecialFP: handleSpecialFP(I); break; + default: llvm_unreachable("Unknown FP Type!"); + } + + // Check to see if any of the values defined by this instruction are dead + // after definition. If so, pop them. + for (unsigned i = 0, e = DeadRegs.size(); i != e; ++i) { + unsigned Reg = DeadRegs[i]; + // Check if Reg is live on the stack. An inline-asm register operand that + // is in the clobber list and marked dead might not be live on the stack. + if (Reg >= X86::FP0 && Reg <= X86::FP6 && isLive(Reg-X86::FP0)) { + DEBUG(dbgs() << "Register FP#" << Reg-X86::FP0 << " is dead!\n"); + freeStackSlotAfter(I, Reg-X86::FP0); + } + } + + // Print out all of the instructions expanded to if -debug + DEBUG( + MachineBasicBlock::iterator PrevI(PrevMI); + if (I == PrevI) { + dbgs() << "Just deleted pseudo instruction\n"; + } else { + MachineBasicBlock::iterator Start = I; + // Rewind to first instruction newly inserted. + while (Start != BB.begin() && std::prev(Start) != PrevI) --Start; + dbgs() << "Inserted instructions:\n\t"; + Start->print(dbgs()); + while (++Start != std::next(I)) {} + } + dumpStack(); + ); + (void)PrevMI; + + Changed = true; + } + + finishBlockStack(); + + return Changed; +} + +/// setupBlockStack - Use the live bundles to set up our model of the stack +/// to match predecessors' live out stack. +void FPS::setupBlockStack() { + DEBUG(dbgs() << "\nSetting up live-ins for BB#" << MBB->getNumber() + << " derived from " << MBB->getName() << ".\n"); + StackTop = 0; + // Get the live-in bundle for MBB. 
+ const LiveBundle &Bundle = + LiveBundles[Bundles->getBundle(MBB->getNumber(), false)]; + + if (!Bundle.Mask) { + DEBUG(dbgs() << "Block has no FP live-ins.\n"); + return; + } + + // Depth-first iteration should ensure that we always have an assigned stack. + assert(Bundle.isFixed() && "Reached block before any predecessors"); + + // Push the fixed live-in registers. + for (unsigned i = Bundle.FixCount; i > 0; --i) { + MBB->addLiveIn(X86::ST0+i-1); + DEBUG(dbgs() << "Live-in st(" << (i-1) << "): %FP" + << unsigned(Bundle.FixStack[i-1]) << '\n'); + pushReg(Bundle.FixStack[i-1]); + } + + // Kill off unwanted live-ins. This can happen with a critical edge. + // FIXME: We could keep these live registers around as zombies. They may need + // to be revived at the end of a short block. It might save a few instrs. + adjustLiveRegs(calcLiveInMask(MBB), MBB->begin()); + DEBUG(MBB->dump()); +} + +/// finishBlockStack - Revive live-outs that are implicitly defined out of +/// MBB. Shuffle live registers to match the expected fixed stack of any +/// predecessors, and ensure that all predecessors are expecting the same +/// stack. +void FPS::finishBlockStack() { + // The RET handling below takes care of return blocks for us. + if (MBB->succ_empty()) + return; + + DEBUG(dbgs() << "Setting up live-outs for BB#" << MBB->getNumber() + << " derived from " << MBB->getName() << ".\n"); + + // Get MBB's live-out bundle. + unsigned BundleIdx = Bundles->getBundle(MBB->getNumber(), true); + LiveBundle &Bundle = LiveBundles[BundleIdx]; + + // We may need to kill and define some registers to match successors. + // FIXME: This can probably be combined with the shuffle below. + MachineBasicBlock::iterator Term = MBB->getFirstTerminator(); + adjustLiveRegs(Bundle.Mask, Term); + + if (!Bundle.Mask) { + DEBUG(dbgs() << "No live-outs.\n"); + return; + } + + // Has the stack order been fixed yet? + DEBUG(dbgs() << "LB#" << BundleIdx << ": "); + if (Bundle.isFixed()) { + DEBUG(dbgs() << "Shuffling stack to match.\n"); + shuffleStackTop(Bundle.FixStack, Bundle.FixCount, Term); + } else { + // Not fixed yet, we get to choose. 
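+    // Record the current stack order as this bundle's canonical layout; the
+    // other blocks entering or leaving through this bundle will be shuffled
+    // (or set up) to match it.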
+ DEBUG(dbgs() << "Fixing stack order now.\n"); + Bundle.FixCount = StackTop; + for (unsigned i = 0; i < StackTop; ++i) + Bundle.FixStack[i] = getStackEntry(i); + } +} + + +//===----------------------------------------------------------------------===// +// Efficient Lookup Table Support +//===----------------------------------------------------------------------===// + +namespace { + struct TableEntry { + uint16_t from; + uint16_t to; + bool operator<(const TableEntry &TE) const { return from < TE.from; } + friend bool operator<(const TableEntry &TE, unsigned V) { + return TE.from < V; + } + friend bool LLVM_ATTRIBUTE_UNUSED operator<(unsigned V, + const TableEntry &TE) { + return V < TE.from; + } + }; +} + +static int Lookup(ArrayRef<TableEntry> Table, unsigned Opcode) { + const TableEntry *I = std::lower_bound(Table.begin(), Table.end(), Opcode); + if (I != Table.end() && I->from == Opcode) + return I->to; + return -1; +} + +#ifdef NDEBUG +#define ASSERT_SORTED(TABLE) +#else +#define ASSERT_SORTED(TABLE) \ + { static bool TABLE##Checked = false; \ + if (!TABLE##Checked) { \ + assert(std::is_sorted(std::begin(TABLE), std::end(TABLE)) && \ + "All lookup tables must be sorted for efficient access!"); \ + TABLE##Checked = true; \ + } \ + } +#endif + +//===----------------------------------------------------------------------===// +// Register File -> Register Stack Mapping Methods +//===----------------------------------------------------------------------===// + +// OpcodeTable - Sorted map of register instructions to their stack version. +// The first element is an register file pseudo instruction, the second is the +// concrete X86 instruction which uses the register stack. +// +static const TableEntry OpcodeTable[] = { + { X86::ABS_Fp32 , X86::ABS_F }, + { X86::ABS_Fp64 , X86::ABS_F }, + { X86::ABS_Fp80 , X86::ABS_F }, + { X86::ADD_Fp32m , X86::ADD_F32m }, + { X86::ADD_Fp64m , X86::ADD_F64m }, + { X86::ADD_Fp64m32 , X86::ADD_F32m }, + { X86::ADD_Fp80m32 , X86::ADD_F32m }, + { X86::ADD_Fp80m64 , X86::ADD_F64m }, + { X86::ADD_FpI16m32 , X86::ADD_FI16m }, + { X86::ADD_FpI16m64 , X86::ADD_FI16m }, + { X86::ADD_FpI16m80 , X86::ADD_FI16m }, + { X86::ADD_FpI32m32 , X86::ADD_FI32m }, + { X86::ADD_FpI32m64 , X86::ADD_FI32m }, + { X86::ADD_FpI32m80 , X86::ADD_FI32m }, + { X86::CHS_Fp32 , X86::CHS_F }, + { X86::CHS_Fp64 , X86::CHS_F }, + { X86::CHS_Fp80 , X86::CHS_F }, + { X86::CMOVBE_Fp32 , X86::CMOVBE_F }, + { X86::CMOVBE_Fp64 , X86::CMOVBE_F }, + { X86::CMOVBE_Fp80 , X86::CMOVBE_F }, + { X86::CMOVB_Fp32 , X86::CMOVB_F }, + { X86::CMOVB_Fp64 , X86::CMOVB_F }, + { X86::CMOVB_Fp80 , X86::CMOVB_F }, + { X86::CMOVE_Fp32 , X86::CMOVE_F }, + { X86::CMOVE_Fp64 , X86::CMOVE_F }, + { X86::CMOVE_Fp80 , X86::CMOVE_F }, + { X86::CMOVNBE_Fp32 , X86::CMOVNBE_F }, + { X86::CMOVNBE_Fp64 , X86::CMOVNBE_F }, + { X86::CMOVNBE_Fp80 , X86::CMOVNBE_F }, + { X86::CMOVNB_Fp32 , X86::CMOVNB_F }, + { X86::CMOVNB_Fp64 , X86::CMOVNB_F }, + { X86::CMOVNB_Fp80 , X86::CMOVNB_F }, + { X86::CMOVNE_Fp32 , X86::CMOVNE_F }, + { X86::CMOVNE_Fp64 , X86::CMOVNE_F }, + { X86::CMOVNE_Fp80 , X86::CMOVNE_F }, + { X86::CMOVNP_Fp32 , X86::CMOVNP_F }, + { X86::CMOVNP_Fp64 , X86::CMOVNP_F }, + { X86::CMOVNP_Fp80 , X86::CMOVNP_F }, + { X86::CMOVP_Fp32 , X86::CMOVP_F }, + { X86::CMOVP_Fp64 , X86::CMOVP_F }, + { X86::CMOVP_Fp80 , X86::CMOVP_F }, + { X86::COS_Fp32 , X86::COS_F }, + { X86::COS_Fp64 , X86::COS_F }, + { X86::COS_Fp80 , X86::COS_F }, + { X86::DIVR_Fp32m , X86::DIVR_F32m }, + { X86::DIVR_Fp64m , X86::DIVR_F64m }, + { 
X86::DIVR_Fp64m32 , X86::DIVR_F32m }, + { X86::DIVR_Fp80m32 , X86::DIVR_F32m }, + { X86::DIVR_Fp80m64 , X86::DIVR_F64m }, + { X86::DIVR_FpI16m32, X86::DIVR_FI16m}, + { X86::DIVR_FpI16m64, X86::DIVR_FI16m}, + { X86::DIVR_FpI16m80, X86::DIVR_FI16m}, + { X86::DIVR_FpI32m32, X86::DIVR_FI32m}, + { X86::DIVR_FpI32m64, X86::DIVR_FI32m}, + { X86::DIVR_FpI32m80, X86::DIVR_FI32m}, + { X86::DIV_Fp32m , X86::DIV_F32m }, + { X86::DIV_Fp64m , X86::DIV_F64m }, + { X86::DIV_Fp64m32 , X86::DIV_F32m }, + { X86::DIV_Fp80m32 , X86::DIV_F32m }, + { X86::DIV_Fp80m64 , X86::DIV_F64m }, + { X86::DIV_FpI16m32 , X86::DIV_FI16m }, + { X86::DIV_FpI16m64 , X86::DIV_FI16m }, + { X86::DIV_FpI16m80 , X86::DIV_FI16m }, + { X86::DIV_FpI32m32 , X86::DIV_FI32m }, + { X86::DIV_FpI32m64 , X86::DIV_FI32m }, + { X86::DIV_FpI32m80 , X86::DIV_FI32m }, + { X86::ILD_Fp16m32 , X86::ILD_F16m }, + { X86::ILD_Fp16m64 , X86::ILD_F16m }, + { X86::ILD_Fp16m80 , X86::ILD_F16m }, + { X86::ILD_Fp32m32 , X86::ILD_F32m }, + { X86::ILD_Fp32m64 , X86::ILD_F32m }, + { X86::ILD_Fp32m80 , X86::ILD_F32m }, + { X86::ILD_Fp64m32 , X86::ILD_F64m }, + { X86::ILD_Fp64m64 , X86::ILD_F64m }, + { X86::ILD_Fp64m80 , X86::ILD_F64m }, + { X86::ISTT_Fp16m32 , X86::ISTT_FP16m}, + { X86::ISTT_Fp16m64 , X86::ISTT_FP16m}, + { X86::ISTT_Fp16m80 , X86::ISTT_FP16m}, + { X86::ISTT_Fp32m32 , X86::ISTT_FP32m}, + { X86::ISTT_Fp32m64 , X86::ISTT_FP32m}, + { X86::ISTT_Fp32m80 , X86::ISTT_FP32m}, + { X86::ISTT_Fp64m32 , X86::ISTT_FP64m}, + { X86::ISTT_Fp64m64 , X86::ISTT_FP64m}, + { X86::ISTT_Fp64m80 , X86::ISTT_FP64m}, + { X86::IST_Fp16m32 , X86::IST_F16m }, + { X86::IST_Fp16m64 , X86::IST_F16m }, + { X86::IST_Fp16m80 , X86::IST_F16m }, + { X86::IST_Fp32m32 , X86::IST_F32m }, + { X86::IST_Fp32m64 , X86::IST_F32m }, + { X86::IST_Fp32m80 , X86::IST_F32m }, + { X86::IST_Fp64m32 , X86::IST_FP64m }, + { X86::IST_Fp64m64 , X86::IST_FP64m }, + { X86::IST_Fp64m80 , X86::IST_FP64m }, + { X86::LD_Fp032 , X86::LD_F0 }, + { X86::LD_Fp064 , X86::LD_F0 }, + { X86::LD_Fp080 , X86::LD_F0 }, + { X86::LD_Fp132 , X86::LD_F1 }, + { X86::LD_Fp164 , X86::LD_F1 }, + { X86::LD_Fp180 , X86::LD_F1 }, + { X86::LD_Fp32m , X86::LD_F32m }, + { X86::LD_Fp32m64 , X86::LD_F32m }, + { X86::LD_Fp32m80 , X86::LD_F32m }, + { X86::LD_Fp64m , X86::LD_F64m }, + { X86::LD_Fp64m80 , X86::LD_F64m }, + { X86::LD_Fp80m , X86::LD_F80m }, + { X86::MUL_Fp32m , X86::MUL_F32m }, + { X86::MUL_Fp64m , X86::MUL_F64m }, + { X86::MUL_Fp64m32 , X86::MUL_F32m }, + { X86::MUL_Fp80m32 , X86::MUL_F32m }, + { X86::MUL_Fp80m64 , X86::MUL_F64m }, + { X86::MUL_FpI16m32 , X86::MUL_FI16m }, + { X86::MUL_FpI16m64 , X86::MUL_FI16m }, + { X86::MUL_FpI16m80 , X86::MUL_FI16m }, + { X86::MUL_FpI32m32 , X86::MUL_FI32m }, + { X86::MUL_FpI32m64 , X86::MUL_FI32m }, + { X86::MUL_FpI32m80 , X86::MUL_FI32m }, + { X86::SIN_Fp32 , X86::SIN_F }, + { X86::SIN_Fp64 , X86::SIN_F }, + { X86::SIN_Fp80 , X86::SIN_F }, + { X86::SQRT_Fp32 , X86::SQRT_F }, + { X86::SQRT_Fp64 , X86::SQRT_F }, + { X86::SQRT_Fp80 , X86::SQRT_F }, + { X86::ST_Fp32m , X86::ST_F32m }, + { X86::ST_Fp64m , X86::ST_F64m }, + { X86::ST_Fp64m32 , X86::ST_F32m }, + { X86::ST_Fp80m32 , X86::ST_F32m }, + { X86::ST_Fp80m64 , X86::ST_F64m }, + { X86::ST_FpP80m , X86::ST_FP80m }, + { X86::SUBR_Fp32m , X86::SUBR_F32m }, + { X86::SUBR_Fp64m , X86::SUBR_F64m }, + { X86::SUBR_Fp64m32 , X86::SUBR_F32m }, + { X86::SUBR_Fp80m32 , X86::SUBR_F32m }, + { X86::SUBR_Fp80m64 , X86::SUBR_F64m }, + { X86::SUBR_FpI16m32, X86::SUBR_FI16m}, + { X86::SUBR_FpI16m64, X86::SUBR_FI16m}, + { X86::SUBR_FpI16m80, 
X86::SUBR_FI16m}, + { X86::SUBR_FpI32m32, X86::SUBR_FI32m}, + { X86::SUBR_FpI32m64, X86::SUBR_FI32m}, + { X86::SUBR_FpI32m80, X86::SUBR_FI32m}, + { X86::SUB_Fp32m , X86::SUB_F32m }, + { X86::SUB_Fp64m , X86::SUB_F64m }, + { X86::SUB_Fp64m32 , X86::SUB_F32m }, + { X86::SUB_Fp80m32 , X86::SUB_F32m }, + { X86::SUB_Fp80m64 , X86::SUB_F64m }, + { X86::SUB_FpI16m32 , X86::SUB_FI16m }, + { X86::SUB_FpI16m64 , X86::SUB_FI16m }, + { X86::SUB_FpI16m80 , X86::SUB_FI16m }, + { X86::SUB_FpI32m32 , X86::SUB_FI32m }, + { X86::SUB_FpI32m64 , X86::SUB_FI32m }, + { X86::SUB_FpI32m80 , X86::SUB_FI32m }, + { X86::TST_Fp32 , X86::TST_F }, + { X86::TST_Fp64 , X86::TST_F }, + { X86::TST_Fp80 , X86::TST_F }, + { X86::UCOM_FpIr32 , X86::UCOM_FIr }, + { X86::UCOM_FpIr64 , X86::UCOM_FIr }, + { X86::UCOM_FpIr80 , X86::UCOM_FIr }, + { X86::UCOM_Fpr32 , X86::UCOM_Fr }, + { X86::UCOM_Fpr64 , X86::UCOM_Fr }, + { X86::UCOM_Fpr80 , X86::UCOM_Fr }, +}; + +static unsigned getConcreteOpcode(unsigned Opcode) { + ASSERT_SORTED(OpcodeTable); + int Opc = Lookup(OpcodeTable, Opcode); + assert(Opc != -1 && "FP Stack instruction not in OpcodeTable!"); + return Opc; +} + +//===----------------------------------------------------------------------===// +// Helper Methods +//===----------------------------------------------------------------------===// + +// PopTable - Sorted map of instructions to their popping version. The first +// element is an instruction, the second is the version which pops. +// +static const TableEntry PopTable[] = { + { X86::ADD_FrST0 , X86::ADD_FPrST0 }, + + { X86::DIVR_FrST0, X86::DIVR_FPrST0 }, + { X86::DIV_FrST0 , X86::DIV_FPrST0 }, + + { X86::IST_F16m , X86::IST_FP16m }, + { X86::IST_F32m , X86::IST_FP32m }, + + { X86::MUL_FrST0 , X86::MUL_FPrST0 }, + + { X86::ST_F32m , X86::ST_FP32m }, + { X86::ST_F64m , X86::ST_FP64m }, + { X86::ST_Frr , X86::ST_FPrr }, + + { X86::SUBR_FrST0, X86::SUBR_FPrST0 }, + { X86::SUB_FrST0 , X86::SUB_FPrST0 }, + + { X86::UCOM_FIr , X86::UCOM_FIPr }, + + { X86::UCOM_FPr , X86::UCOM_FPPr }, + { X86::UCOM_Fr , X86::UCOM_FPr }, +}; + +/// popStackAfter - Pop the current value off of the top of the FP stack after +/// the specified instruction. This attempts to be sneaky and combine the pop +/// into the instruction itself if possible. The iterator is left pointing to +/// the last instruction, be it a new pop instruction inserted, or the old +/// instruction if it was modified in place. +/// +void FPS::popStackAfter(MachineBasicBlock::iterator &I) { + MachineInstr* MI = I; + DebugLoc dl = MI->getDebugLoc(); + ASSERT_SORTED(PopTable); + if (StackTop == 0) + report_fatal_error("Cannot pop empty stack!"); + RegMap[Stack[--StackTop]] = ~0; // Update state + + // Check to see if there is a popping version of this instruction... + int Opcode = Lookup(PopTable, I->getOpcode()); + if (Opcode != -1) { + I->setDesc(TII->get(Opcode)); + if (Opcode == X86::UCOM_FPPr) + I->RemoveOperand(0); + } else { // Insert an explicit pop + I = BuildMI(*MBB, ++I, dl, TII->get(X86::ST_FPrr)).addReg(X86::ST0); + } +} + +/// freeStackSlotAfter - Free the specified register from the register stack, so +/// that it is no longer in a register. If the register is currently at the top +/// of the stack, we just pop the current instruction, otherwise we store the +/// current top-of-stack into the specified slot, then pop the top of stack. +void FPS::freeStackSlotAfter(MachineBasicBlock::iterator &I, unsigned FPRegNo) { + if (getStackEntry(0) == FPRegNo) { // already at the top of stack? easy. 
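+    // Note that popStackAfter may fold the pop into the instruction itself
+    // via PopTable above, e.g. rewriting ST_F64m into its popping form
+    // ST_FP64m.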
+ popStackAfter(I); + return; + } + + // Otherwise, store the top of stack into the dead slot, killing the operand + // without having to add in an explicit xchg then pop. + // + I = freeStackSlotBefore(++I, FPRegNo); +} + +/// freeStackSlotBefore - Free the specified register without trying any +/// folding. +MachineBasicBlock::iterator +FPS::freeStackSlotBefore(MachineBasicBlock::iterator I, unsigned FPRegNo) { + unsigned STReg = getSTReg(FPRegNo); + unsigned OldSlot = getSlot(FPRegNo); + unsigned TopReg = Stack[StackTop-1]; + Stack[OldSlot] = TopReg; + RegMap[TopReg] = OldSlot; + RegMap[FPRegNo] = ~0; + Stack[--StackTop] = ~0; + return BuildMI(*MBB, I, DebugLoc(), TII->get(X86::ST_FPrr)) + .addReg(STReg) + .getInstr(); +} + +/// adjustLiveRegs - Kill and revive registers such that exactly the FP +/// registers with a bit in Mask are live. +void FPS::adjustLiveRegs(unsigned Mask, MachineBasicBlock::iterator I) { + unsigned Defs = Mask; + unsigned Kills = 0; + for (unsigned i = 0; i < StackTop; ++i) { + unsigned RegNo = Stack[i]; + if (!(Defs & (1 << RegNo))) + // This register is live, but we don't want it. + Kills |= (1 << RegNo); + else + // We don't need to imp-def this live register. + Defs &= ~(1 << RegNo); + } + assert((Kills & Defs) == 0 && "Register needs killing and def'ing?"); + + // Produce implicit-defs for free by using killed registers. + while (Kills && Defs) { + unsigned KReg = countTrailingZeros(Kills); + unsigned DReg = countTrailingZeros(Defs); + DEBUG(dbgs() << "Renaming %FP" << KReg << " as imp %FP" << DReg << "\n"); + std::swap(Stack[getSlot(KReg)], Stack[getSlot(DReg)]); + std::swap(RegMap[KReg], RegMap[DReg]); + Kills &= ~(1 << KReg); + Defs &= ~(1 << DReg); + } + + // Kill registers by popping. + if (Kills && I != MBB->begin()) { + MachineBasicBlock::iterator I2 = std::prev(I); + while (StackTop) { + unsigned KReg = getStackEntry(0); + if (!(Kills & (1 << KReg))) + break; + DEBUG(dbgs() << "Popping %FP" << KReg << "\n"); + popStackAfter(I2); + Kills &= ~(1 << KReg); + } + } + + // Manually kill the rest. + while (Kills) { + unsigned KReg = countTrailingZeros(Kills); + DEBUG(dbgs() << "Killing %FP" << KReg << "\n"); + freeStackSlotBefore(I, KReg); + Kills &= ~(1 << KReg); + } + + // Load zeros for all the imp-defs. + while(Defs) { + unsigned DReg = countTrailingZeros(Defs); + DEBUG(dbgs() << "Defining %FP" << DReg << " as 0\n"); + BuildMI(*MBB, I, DebugLoc(), TII->get(X86::LD_F0)); + pushReg(DReg); + Defs &= ~(1 << DReg); + } + + // Now we should have the correct registers live. + DEBUG(dumpStack()); + assert(StackTop == countPopulation(Mask) && "Live count mismatch"); +} + +/// shuffleStackTop - emit fxch instructions before I to shuffle the top +/// FixCount entries into the order given by FixStack. +/// FIXME: Is there a better algorithm than insertion sort? +void FPS::shuffleStackTop(const unsigned char *FixStack, + unsigned FixCount, + MachineBasicBlock::iterator I) { + // Move items into place, starting from the desired stack bottom. + while (FixCount--) { + // Old register at position FixCount. + unsigned OldReg = getStackEntry(FixCount); + // Desired register at position FixCount. 
+    unsigned Reg = FixStack[FixCount];
+    if (Reg == OldReg)
+      continue;
+    // (Reg st0) (OldReg st0) = (Reg OldReg st0)
+    moveToTop(Reg, I);
+    if (FixCount > 0)
+      moveToTop(OldReg, I);
+  }
+  DEBUG(dumpStack());
+}
+
+
+//===----------------------------------------------------------------------===//
+// Instruction transformation implementation
+//===----------------------------------------------------------------------===//
+
+void FPS::handleCall(MachineBasicBlock::iterator &I) {
+  unsigned STReturns = 0;
+
+  for (const auto &MO : I->operands()) {
+    if (!MO.isReg())
+      continue;
+
+    unsigned R = MO.getReg() - X86::FP0;
+
+    if (R < 8) {
+      assert(MO.isDef() && MO.isImplicit());
+      STReturns |= 1 << R;
+    }
+  }
+
+  unsigned N = countTrailingOnes(STReturns);
+
+  // FP registers used for function return must be consecutive starting at
+  // FP0.
+  assert(STReturns == 0 || (isMask_32(STReturns) && N <= 2));
+
+  for (unsigned I = 0; I < N; ++I)
+    pushReg(N - I - 1);
+}
+
+/// handleZeroArgFP - ST(0) = fld0    ST(0) = flds <mem>
+///
+void FPS::handleZeroArgFP(MachineBasicBlock::iterator &I) {
+  MachineInstr *MI = I;
+  unsigned DestReg = getFPReg(MI->getOperand(0));
+
+  // Change from the pseudo instruction to the concrete instruction.
+  MI->RemoveOperand(0);   // Remove the explicit ST(0) operand
+  MI->setDesc(TII->get(getConcreteOpcode(MI->getOpcode())));
+
+  // Result gets pushed on the stack.
+  pushReg(DestReg);
+}
+
+/// handleOneArgFP - fst <mem>, ST(0)
+///
+void FPS::handleOneArgFP(MachineBasicBlock::iterator &I) {
+  MachineInstr *MI = I;
+  unsigned NumOps = MI->getDesc().getNumOperands();
+  assert((NumOps == X86::AddrNumOperands + 1 || NumOps == 1) &&
+         "Can only handle fst* & ftst instructions!");
+
+  // Is this the last use of the source register?
+  unsigned Reg = getFPReg(MI->getOperand(NumOps-1));
+  bool KillsSrc = MI->killsRegister(X86::FP0+Reg);
+
+  // FISTP64m is strange because there isn't a non-popping version.
+  // If we have one _and_ we don't want to pop the operand, duplicate the value
+  // on the stack instead of moving it. This ensures that popping the value is
+  // always ok.
+  // Ditto FISTTP16m, FISTTP32m, FISTTP64m, ST_FpP80m.
+  //
+  if (!KillsSrc &&
+      (MI->getOpcode() == X86::IST_Fp64m32 ||
+       MI->getOpcode() == X86::ISTT_Fp16m32 ||
+       MI->getOpcode() == X86::ISTT_Fp32m32 ||
+       MI->getOpcode() == X86::ISTT_Fp64m32 ||
+       MI->getOpcode() == X86::IST_Fp64m64 ||
+       MI->getOpcode() == X86::ISTT_Fp16m64 ||
+       MI->getOpcode() == X86::ISTT_Fp32m64 ||
+       MI->getOpcode() == X86::ISTT_Fp64m64 ||
+       MI->getOpcode() == X86::IST_Fp64m80 ||
+       MI->getOpcode() == X86::ISTT_Fp16m80 ||
+       MI->getOpcode() == X86::ISTT_Fp32m80 ||
+       MI->getOpcode() == X86::ISTT_Fp64m80 ||
+       MI->getOpcode() == X86::ST_FpP80m)) {
+    duplicateToTop(Reg, ScratchFPReg, I);
+  } else {
+    moveToTop(Reg, I);            // Move to the top of the stack...
+  }
+
+  // Convert from the pseudo instruction to the concrete instruction.
+  MI->RemoveOperand(NumOps-1);    // Remove explicit ST(0) operand
+  MI->setDesc(TII->get(getConcreteOpcode(MI->getOpcode())));
+
+  if (MI->getOpcode() == X86::IST_FP64m ||
+      MI->getOpcode() == X86::ISTT_FP16m ||
+      MI->getOpcode() == X86::ISTT_FP32m ||
+      MI->getOpcode() == X86::ISTT_FP64m ||
+      MI->getOpcode() == X86::ST_FP80m) {
+    if (StackTop == 0)
+      report_fatal_error("Stack empty??");
+    --StackTop;
+  } else if (KillsSrc) { // Last use of operand?
+    popStackAfter(I);
+  }
+}
+
+
+/// handleOneArgFPRW: Handle instructions that read from the top of stack and
+/// replace the value with a newly computed value.
These instructions may have +/// non-fp operands after their FP operands. +/// +/// Examples: +/// R1 = fchs R2 +/// R1 = fadd R2, [mem] +/// +void FPS::handleOneArgFPRW(MachineBasicBlock::iterator &I) { + MachineInstr *MI = I; +#ifndef NDEBUG + unsigned NumOps = MI->getDesc().getNumOperands(); + assert(NumOps >= 2 && "FPRW instructions must have 2 ops!!"); +#endif + + // Is this the last use of the source register? + unsigned Reg = getFPReg(MI->getOperand(1)); + bool KillsSrc = MI->killsRegister(X86::FP0+Reg); + + if (KillsSrc) { + // If this is the last use of the source register, just make sure it's on + // the top of the stack. + moveToTop(Reg, I); + if (StackTop == 0) + report_fatal_error("Stack cannot be empty!"); + --StackTop; + pushReg(getFPReg(MI->getOperand(0))); + } else { + // If this is not the last use of the source register, _copy_ it to the top + // of the stack. + duplicateToTop(Reg, getFPReg(MI->getOperand(0)), I); + } + + // Change from the pseudo instruction to the concrete instruction. + MI->RemoveOperand(1); // Drop the source operand. + MI->RemoveOperand(0); // Drop the destination operand. + MI->setDesc(TII->get(getConcreteOpcode(MI->getOpcode()))); +} + + +//===----------------------------------------------------------------------===// +// Define tables of various ways to map pseudo instructions +// + +// ForwardST0Table - Map: A = B op C into: ST(0) = ST(0) op ST(i) +static const TableEntry ForwardST0Table[] = { + { X86::ADD_Fp32 , X86::ADD_FST0r }, + { X86::ADD_Fp64 , X86::ADD_FST0r }, + { X86::ADD_Fp80 , X86::ADD_FST0r }, + { X86::DIV_Fp32 , X86::DIV_FST0r }, + { X86::DIV_Fp64 , X86::DIV_FST0r }, + { X86::DIV_Fp80 , X86::DIV_FST0r }, + { X86::MUL_Fp32 , X86::MUL_FST0r }, + { X86::MUL_Fp64 , X86::MUL_FST0r }, + { X86::MUL_Fp80 , X86::MUL_FST0r }, + { X86::SUB_Fp32 , X86::SUB_FST0r }, + { X86::SUB_Fp64 , X86::SUB_FST0r }, + { X86::SUB_Fp80 , X86::SUB_FST0r }, +}; + +// ReverseST0Table - Map: A = B op C into: ST(0) = ST(i) op ST(0) +static const TableEntry ReverseST0Table[] = { + { X86::ADD_Fp32 , X86::ADD_FST0r }, // commutative + { X86::ADD_Fp64 , X86::ADD_FST0r }, // commutative + { X86::ADD_Fp80 , X86::ADD_FST0r }, // commutative + { X86::DIV_Fp32 , X86::DIVR_FST0r }, + { X86::DIV_Fp64 , X86::DIVR_FST0r }, + { X86::DIV_Fp80 , X86::DIVR_FST0r }, + { X86::MUL_Fp32 , X86::MUL_FST0r }, // commutative + { X86::MUL_Fp64 , X86::MUL_FST0r }, // commutative + { X86::MUL_Fp80 , X86::MUL_FST0r }, // commutative + { X86::SUB_Fp32 , X86::SUBR_FST0r }, + { X86::SUB_Fp64 , X86::SUBR_FST0r }, + { X86::SUB_Fp80 , X86::SUBR_FST0r }, +}; + +// ForwardSTiTable - Map: A = B op C into: ST(i) = ST(0) op ST(i) +static const TableEntry ForwardSTiTable[] = { + { X86::ADD_Fp32 , X86::ADD_FrST0 }, // commutative + { X86::ADD_Fp64 , X86::ADD_FrST0 }, // commutative + { X86::ADD_Fp80 , X86::ADD_FrST0 }, // commutative + { X86::DIV_Fp32 , X86::DIVR_FrST0 }, + { X86::DIV_Fp64 , X86::DIVR_FrST0 }, + { X86::DIV_Fp80 , X86::DIVR_FrST0 }, + { X86::MUL_Fp32 , X86::MUL_FrST0 }, // commutative + { X86::MUL_Fp64 , X86::MUL_FrST0 }, // commutative + { X86::MUL_Fp80 , X86::MUL_FrST0 }, // commutative + { X86::SUB_Fp32 , X86::SUBR_FrST0 }, + { X86::SUB_Fp64 , X86::SUBR_FrST0 }, + { X86::SUB_Fp80 , X86::SUBR_FrST0 }, +}; + +// ReverseSTiTable - Map: A = B op C into: ST(i) = ST(i) op ST(0) +static const TableEntry ReverseSTiTable[] = { + { X86::ADD_Fp32 , X86::ADD_FrST0 }, + { X86::ADD_Fp64 , X86::ADD_FrST0 }, + { X86::ADD_Fp80 , X86::ADD_FrST0 }, + { X86::DIV_Fp32 , X86::DIV_FrST0 }, + { X86::DIV_Fp64 , 
X86::DIV_FrST0 }, + { X86::DIV_Fp80 , X86::DIV_FrST0 }, + { X86::MUL_Fp32 , X86::MUL_FrST0 }, + { X86::MUL_Fp64 , X86::MUL_FrST0 }, + { X86::MUL_Fp80 , X86::MUL_FrST0 }, + { X86::SUB_Fp32 , X86::SUB_FrST0 }, + { X86::SUB_Fp64 , X86::SUB_FrST0 }, + { X86::SUB_Fp80 , X86::SUB_FrST0 }, +}; + + +/// handleTwoArgFP - Handle instructions like FADD and friends which are virtual +/// instructions which need to be simplified and possibly transformed. +/// +/// Result: ST(0) = fsub ST(0), ST(i) +/// ST(i) = fsub ST(0), ST(i) +/// ST(0) = fsubr ST(0), ST(i) +/// ST(i) = fsubr ST(0), ST(i) +/// +void FPS::handleTwoArgFP(MachineBasicBlock::iterator &I) { + ASSERT_SORTED(ForwardST0Table); ASSERT_SORTED(ReverseST0Table); + ASSERT_SORTED(ForwardSTiTable); ASSERT_SORTED(ReverseSTiTable); + MachineInstr *MI = I; + + unsigned NumOperands = MI->getDesc().getNumOperands(); + assert(NumOperands == 3 && "Illegal TwoArgFP instruction!"); + unsigned Dest = getFPReg(MI->getOperand(0)); + unsigned Op0 = getFPReg(MI->getOperand(NumOperands-2)); + unsigned Op1 = getFPReg(MI->getOperand(NumOperands-1)); + bool KillsOp0 = MI->killsRegister(X86::FP0+Op0); + bool KillsOp1 = MI->killsRegister(X86::FP0+Op1); + DebugLoc dl = MI->getDebugLoc(); + + unsigned TOS = getStackEntry(0); + + // One of our operands must be on the top of the stack. If neither is yet, we + // need to move one. + if (Op0 != TOS && Op1 != TOS) { // No operand at TOS? + // We can choose to move either operand to the top of the stack. If one of + // the operands is killed by this instruction, we want that one so that we + // can update right on top of the old version. + if (KillsOp0) { + moveToTop(Op0, I); // Move dead operand to TOS. + TOS = Op0; + } else if (KillsOp1) { + moveToTop(Op1, I); + TOS = Op1; + } else { + // All of the operands are live after this instruction executes, so we + // cannot update on top of any operand. Because of this, we must + // duplicate one of the stack elements to the top. It doesn't matter + // which one we pick. + // + duplicateToTop(Op0, Dest, I); + Op0 = TOS = Dest; + KillsOp0 = true; + } + } else if (!KillsOp0 && !KillsOp1) { + // If we DO have one of our operands at the top of the stack, but we don't + // have a dead operand, we must duplicate one of the operands to a new slot + // on the stack. + duplicateToTop(Op0, Dest, I); + Op0 = TOS = Dest; + KillsOp0 = true; + } + + // Now we know that one of our operands is on the top of the stack, and at + // least one of our operands is killed by this instruction. + assert((TOS == Op0 || TOS == Op1) && (KillsOp0 || KillsOp1) && + "Stack conditions not set up right!"); + + // We decide which form to use based on what is on the top of the stack, and + // which operand is killed by this instruction. + ArrayRef<TableEntry> InstTable; + bool isForward = TOS == Op0; + bool updateST0 = (TOS == Op0 && !KillsOp1) || (TOS == Op1 && !KillsOp0); + if (updateST0) { + if (isForward) + InstTable = ForwardST0Table; + else + InstTable = ReverseST0Table; + } else { + if (isForward) + InstTable = ForwardSTiTable; + else + InstTable = ReverseSTiTable; + } + + int Opcode = Lookup(InstTable, MI->getOpcode()); + assert(Opcode != -1 && "Unknown TwoArgFP pseudo instruction!"); + + // NotTOS - The register which is not on the top of stack... + unsigned NotTOS = (TOS == Op0) ? 
Op1 : Op0; + + // Replace the old instruction with a new instruction + MBB->remove(I++); + I = BuildMI(*MBB, I, dl, TII->get(Opcode)).addReg(getSTReg(NotTOS)); + + // If both operands are killed, pop one off of the stack in addition to + // overwriting the other one. + if (KillsOp0 && KillsOp1 && Op0 != Op1) { + assert(!updateST0 && "Should have updated other operand!"); + popStackAfter(I); // Pop the top of stack + } + + // Update stack information so that we know the destination register is now on + // the stack. + unsigned UpdatedSlot = getSlot(updateST0 ? TOS : NotTOS); + assert(UpdatedSlot < StackTop && Dest < 7); + Stack[UpdatedSlot] = Dest; + RegMap[Dest] = UpdatedSlot; + MBB->getParent()->DeleteMachineInstr(MI); // Remove the old instruction +} + +/// handleCompareFP - Handle FUCOM and FUCOMI instructions, which have two FP +/// register arguments and no explicit destinations. +/// +void FPS::handleCompareFP(MachineBasicBlock::iterator &I) { + ASSERT_SORTED(ForwardST0Table); ASSERT_SORTED(ReverseST0Table); + ASSERT_SORTED(ForwardSTiTable); ASSERT_SORTED(ReverseSTiTable); + MachineInstr *MI = I; + + unsigned NumOperands = MI->getDesc().getNumOperands(); + assert(NumOperands == 2 && "Illegal FUCOM* instruction!"); + unsigned Op0 = getFPReg(MI->getOperand(NumOperands-2)); + unsigned Op1 = getFPReg(MI->getOperand(NumOperands-1)); + bool KillsOp0 = MI->killsRegister(X86::FP0+Op0); + bool KillsOp1 = MI->killsRegister(X86::FP0+Op1); + + // Make sure the first operand is on the top of stack, the other one can be + // anywhere. + moveToTop(Op0, I); + + // Change from the pseudo instruction to the concrete instruction. + MI->getOperand(0).setReg(getSTReg(Op1)); + MI->RemoveOperand(1); + MI->setDesc(TII->get(getConcreteOpcode(MI->getOpcode()))); + + // If any of the operands are killed by this instruction, free them. + if (KillsOp0) freeStackSlotAfter(I, Op0); + if (KillsOp1 && Op0 != Op1) freeStackSlotAfter(I, Op1); +} + +/// handleCondMovFP - Handle two address conditional move instructions. These +/// instructions move a st(i) register to st(0) iff a condition is true. These +/// instructions require that the first operand is at the top of the stack, but +/// otherwise don't modify the stack at all. +void FPS::handleCondMovFP(MachineBasicBlock::iterator &I) { + MachineInstr *MI = I; + + unsigned Op0 = getFPReg(MI->getOperand(0)); + unsigned Op1 = getFPReg(MI->getOperand(2)); + bool KillsOp1 = MI->killsRegister(X86::FP0+Op1); + + // The first operand *must* be on the top of the stack. + moveToTop(Op0, I); + + // Change the second operand to the stack register that the operand is in. + // Change from the pseudo instruction to the concrete instruction. + MI->RemoveOperand(0); + MI->RemoveOperand(1); + MI->getOperand(0).setReg(getSTReg(Op1)); + MI->setDesc(TII->get(getConcreteOpcode(MI->getOpcode()))); + + // If we kill the second operand, make sure to pop it from the stack. + if (Op0 != Op1 && KillsOp1) { + // Get this value off of the register stack. + freeStackSlotAfter(I, Op1); + } +} + + +/// handleSpecialFP - Handle special instructions which behave unlike other +/// floating point instructions. This is primarily intended for use by pseudo +/// instructions. 
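+/// The cases handled below are calls, COPY, IMPLICIT_DEF, INLINEASM and the
+/// RET/RETI pseudo forms.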
+/// +void FPS::handleSpecialFP(MachineBasicBlock::iterator &Inst) { + MachineInstr *MI = Inst; + + if (MI->isCall()) { + handleCall(Inst); + return; + } + + switch (MI->getOpcode()) { + default: llvm_unreachable("Unknown SpecialFP instruction!"); + case TargetOpcode::COPY: { + // We handle three kinds of copies: FP <- FP, FP <- ST, and ST <- FP. + const MachineOperand &MO1 = MI->getOperand(1); + const MachineOperand &MO0 = MI->getOperand(0); + bool KillsSrc = MI->killsRegister(MO1.getReg()); + + // FP <- FP copy. + unsigned DstFP = getFPReg(MO0); + unsigned SrcFP = getFPReg(MO1); + assert(isLive(SrcFP) && "Cannot copy dead register"); + if (KillsSrc) { + // If the input operand is killed, we can just change the owner of the + // incoming stack slot into the result. + unsigned Slot = getSlot(SrcFP); + Stack[Slot] = DstFP; + RegMap[DstFP] = Slot; + } else { + // For COPY we just duplicate the specified value to a new stack slot. + // This could be made better, but would require substantial changes. + duplicateToTop(SrcFP, DstFP, Inst); + } + break; + } + + case TargetOpcode::IMPLICIT_DEF: { + // All FP registers must be explicitly defined, so load a 0 instead. + unsigned Reg = MI->getOperand(0).getReg() - X86::FP0; + DEBUG(dbgs() << "Emitting LD_F0 for implicit FP" << Reg << '\n'); + BuildMI(*MBB, Inst, MI->getDebugLoc(), TII->get(X86::LD_F0)); + pushReg(Reg); + break; + } + + case TargetOpcode::INLINEASM: { + // The inline asm MachineInstr currently only *uses* FP registers for the + // 'f' constraint. These should be turned into the current ST(x) register + // in the machine instr. + // + // There are special rules for x87 inline assembly. The compiler must know + // exactly how many registers are popped and pushed implicitly by the asm. + // Otherwise it is not possible to restore the stack state after the inline + // asm. + // + // There are 3 kinds of input operands: + // + // 1. Popped inputs. These must appear at the stack top in ST0-STn. A + // popped input operand must be in a fixed stack slot, and it is either + // tied to an output operand, or in the clobber list. The MI has ST use + // and def operands for these inputs. + // + // 2. Fixed inputs. These inputs appear in fixed stack slots, but are + // preserved by the inline asm. The fixed stack slots must be STn-STm + // following the popped inputs. A fixed input operand cannot be tied to + // an output or appear in the clobber list. The MI has ST use operands + // and no defs for these inputs. + // + // 3. Preserved inputs. These inputs use the "f" constraint which is + // represented as an FP register. The inline asm won't change these + // stack slots. + // + // Outputs must be in ST registers, FP outputs are not allowed. Clobbered + // registers do not count as output operands. The inline asm changes the + // stack as if it popped all the popped inputs and then pushed all the + // output operands. + + // Scan the assembly for ST registers used, defined and clobbered. We can + // only tell clobbers from defs by looking at the asm descriptor. 
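+    // A minimal sketch of case 1 (hypothetical user code, not from this
+    // pass):
+    //   double r, x = 2.0;
+    //   asm ("fsqrt" : "=t" (r) : "0" (x));
+    // The "0"-tied input is a popped input living in ST(0), and the "=t"
+    // output is pushed back into ST(0), so the net stack depth is unchanged.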
+    unsigned STUses = 0, STDefs = 0, STClobbers = 0, STDeadDefs = 0;
+    unsigned NumOps = 0;
+    SmallSet<unsigned, 1> FRegIdx;
+    unsigned RCID;
+
+    for (unsigned i = InlineAsm::MIOp_FirstOperand, e = MI->getNumOperands();
+         i != e && MI->getOperand(i).isImm(); i += 1 + NumOps) {
+      unsigned Flags = MI->getOperand(i).getImm();
+
+      NumOps = InlineAsm::getNumOperandRegisters(Flags);
+      if (NumOps != 1)
+        continue;
+      const MachineOperand &MO = MI->getOperand(i + 1);
+      if (!MO.isReg())
+        continue;
+      unsigned STReg = MO.getReg() - X86::FP0;
+      if (STReg >= 8)
+        continue;
+
+      // If the flag has a register class constraint, this must be an operand
+      // with constraint "f". Record its index and continue.
+      if (InlineAsm::hasRegClassConstraint(Flags, RCID)) {
+        FRegIdx.insert(i + 1);
+        continue;
+      }
+
+      switch (InlineAsm::getKind(Flags)) {
+      case InlineAsm::Kind_RegUse:
+        STUses |= (1u << STReg);
+        break;
+      case InlineAsm::Kind_RegDef:
+      case InlineAsm::Kind_RegDefEarlyClobber:
+        STDefs |= (1u << STReg);
+        if (MO.isDead())
+          STDeadDefs |= (1u << STReg);
+        break;
+      case InlineAsm::Kind_Clobber:
+        STClobbers |= (1u << STReg);
+        break;
+      default:
+        break;
+      }
+    }
+
+    if (STUses && !isMask_32(STUses))
+      MI->emitError("fixed input regs must be last on the x87 stack");
+    unsigned NumSTUses = countTrailingOnes(STUses);
+
+    // Defs must be contiguous from the stack top. ST0-STn.
+    if (STDefs && !isMask_32(STDefs)) {
+      MI->emitError("output regs must be last on the x87 stack");
+      STDefs = NextPowerOf2(STDefs) - 1;
+    }
+    unsigned NumSTDefs = countTrailingOnes(STDefs);
+
+    // So must the clobbered stack slots. ST0-STm, m >= n.
+    if (STClobbers && !isMask_32(STDefs | STClobbers))
+      MI->emitError("clobbers must be last on the x87 stack");
+
+    // Popped inputs are the ones that are also clobbered or defined.
+    unsigned STPopped = STUses & (STDefs | STClobbers);
+    if (STPopped && !isMask_32(STPopped))
+      MI->emitError("implicitly popped regs must be last on the x87 stack");
+    unsigned NumSTPopped = countTrailingOnes(STPopped);
+
+    DEBUG(dbgs() << "Asm uses " << NumSTUses << " fixed regs, pops "
+                 << NumSTPopped << ", and defines " << NumSTDefs << " regs.\n");
+
+#ifndef NDEBUG
+    // If any input operand uses constraint "f", all output register
+    // constraints must be early-clobber defs.
+    for (unsigned I = 0, E = MI->getNumOperands(); I < E; ++I)
+      if (FRegIdx.count(I)) {
+        assert((1 << getFPReg(MI->getOperand(I)) & STDefs) == 0 &&
+               "Operands with constraint \"f\" cannot overlap with defs");
+      }
+#endif
+
+    // Collect all FP registers (register operands with constraints "t", "u",
+    // and "f") to kill after the instruction.
+    unsigned FPKills = ((1u << NumFPRegs) - 1) & ~0xff;
+    for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+      MachineOperand &Op = MI->getOperand(i);
+      if (!Op.isReg() || Op.getReg() < X86::FP0 || Op.getReg() > X86::FP6)
+        continue;
+      unsigned FPReg = getFPReg(Op);
+
+      // If we kill this operand, make sure to pop it from the stack after the
+      // asm.  We just remember it for now, and pop them all off at the end in
+      // a batch.
+      if (Op.isUse() && Op.isKill())
+        FPKills |= 1U << FPReg;
+    }
+
+    // Do not include registers that are implicitly popped by defs/clobbers.
+    FPKills &= ~(STDefs | STClobbers);
+
+    // Now we can rearrange the live registers to match what was requested.
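+    // The asm expects FP register i of its fixed and popped inputs to sit in
+    // ST(i), so request the identity layout FP0..FP(NumSTUses-1) at the top
+    // of the stack.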
+ unsigned char STUsesArray[8]; + + for (unsigned I = 0; I < NumSTUses; ++I) + STUsesArray[I] = I; + + shuffleStackTop(STUsesArray, NumSTUses, Inst); + DEBUG({dbgs() << "Before asm: "; dumpStack();}); + + // With the stack layout fixed, rewrite the FP registers. + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + MachineOperand &Op = MI->getOperand(i); + if (!Op.isReg() || Op.getReg() < X86::FP0 || Op.getReg() > X86::FP6) + continue; + + unsigned FPReg = getFPReg(Op); + + if (FRegIdx.count(i)) + // Operand with constraint "f". + Op.setReg(getSTReg(FPReg)); + else + // Operand with a single register class constraint ("t" or "u"). + Op.setReg(X86::ST0 + FPReg); + } + + // Simulate the inline asm popping its inputs and pushing its outputs. + StackTop -= NumSTPopped; + + for (unsigned i = 0; i < NumSTDefs; ++i) + pushReg(NumSTDefs - i - 1); + + // If this asm kills any FP registers (is the last use of them) we must + // explicitly emit pop instructions for them. Do this now after the asm has + // executed so that the ST(x) numbers are not off (which would happen if we + // did this inline with operand rewriting). + // + // Note: this might be a non-optimal pop sequence. We might be able to do + // better by trying to pop in stack order or something. + while (FPKills) { + unsigned FPReg = countTrailingZeros(FPKills); + if (isLive(FPReg)) + freeStackSlotAfter(Inst, FPReg); + FPKills &= ~(1U << FPReg); + } + + // Don't delete the inline asm! + return; + } + + case X86::RETQ: + case X86::RETL: + case X86::RETIL: + case X86::RETIQ: + // If RET has an FP register use operand, pass the first one in ST(0) and + // the second one in ST(1). + + // Find the register operands. + unsigned FirstFPRegOp = ~0U, SecondFPRegOp = ~0U; + unsigned LiveMask = 0; + + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + MachineOperand &Op = MI->getOperand(i); + if (!Op.isReg() || Op.getReg() < X86::FP0 || Op.getReg() > X86::FP6) + continue; + // FP Register uses must be kills unless there are two uses of the same + // register, in which case only one will be a kill. + assert(Op.isUse() && + (Op.isKill() || // Marked kill. + getFPReg(Op) == FirstFPRegOp || // Second instance. + MI->killsRegister(Op.getReg())) && // Later use is marked kill. + "Ret only defs operands, and values aren't live beyond it"); + + if (FirstFPRegOp == ~0U) + FirstFPRegOp = getFPReg(Op); + else { + assert(SecondFPRegOp == ~0U && "More than two fp operands!"); + SecondFPRegOp = getFPReg(Op); + } + LiveMask |= (1 << getFPReg(Op)); + + // Remove the operand so that later passes don't see it. + MI->RemoveOperand(i); + --i, --e; + } + + // We may have been carrying spurious live-ins, so make sure only the returned + // registers are left live. + adjustLiveRegs(LiveMask, MI); + if (!LiveMask) return; // Quick check to see if any are possible. + + // There are only four possibilities here: + // 1) we are returning a single FP value. In this case, it has to be in + // ST(0) already, so just declare success by removing the value from the + // FP Stack. + if (SecondFPRegOp == ~0U) { + // Assert that the top of stack contains the right FP register. + assert(StackTop == 1 && FirstFPRegOp == getStackEntry(0) && + "Top of stack not the right register for RET!"); + + // Ok, everything is good, mark the value as not being on the stack + // anymore so that our assertion about the stack being empty at end of + // block doesn't fire. 
+ StackTop = 0; + return; + } + + // Otherwise, we are returning two values: + // 2) If returning the same value for both, we only have one thing in the FP + // stack. Consider: RET FP1, FP1 + if (StackTop == 1) { + assert(FirstFPRegOp == SecondFPRegOp && FirstFPRegOp == getStackEntry(0)&& + "Stack misconfiguration for RET!"); + + // Duplicate the TOS so that we return it twice. Just pick some other FPx + // register to hold it. + unsigned NewReg = ScratchFPReg; + duplicateToTop(FirstFPRegOp, NewReg, MI); + FirstFPRegOp = NewReg; + } + + /// Okay we know we have two different FPx operands now: + assert(StackTop == 2 && "Must have two values live!"); + + /// 3) If SecondFPRegOp is currently in ST(0) and FirstFPRegOp is currently + /// in ST(1). In this case, emit an fxch. + if (getStackEntry(0) == SecondFPRegOp) { + assert(getStackEntry(1) == FirstFPRegOp && "Unknown regs live"); + moveToTop(FirstFPRegOp, MI); + } + + /// 4) Finally, FirstFPRegOp must be in ST(0) and SecondFPRegOp must be in + /// ST(1). Just remove both from our understanding of the stack and return. + assert(getStackEntry(0) == FirstFPRegOp && "Unknown regs live"); + assert(getStackEntry(1) == SecondFPRegOp && "Unknown regs live"); + StackTop = 0; + return; + } + + Inst = MBB->erase(Inst); // Remove the pseudo instruction + + // We want to leave I pointing to the previous instruction, but what if we + // just erased the first instruction? + if (Inst == MBB->begin()) { + DEBUG(dbgs() << "Inserting dummy KILL\n"); + Inst = BuildMI(*MBB, Inst, DebugLoc(), TII->get(TargetOpcode::KILL)); + } else + --Inst; +} + +void FPS::setKillFlags(MachineBasicBlock &MBB) const { + const TargetRegisterInfo *TRI = + MBB.getParent()->getSubtarget().getRegisterInfo(); + LivePhysRegs LPR(TRI); + + LPR.addLiveOuts(&MBB); + + for (MachineBasicBlock::reverse_iterator I = MBB.rbegin(), E = MBB.rend(); + I != E; ++I) { + if (I->isDebugValue()) + continue; + + std::bitset<8> Defs; + SmallVector<MachineOperand *, 2> Uses; + MachineInstr &MI = *I; + + for (auto &MO : I->operands()) { + if (!MO.isReg()) + continue; + + unsigned Reg = MO.getReg() - X86::FP0; + + if (Reg >= 8) + continue; + + if (MO.isDef()) { + Defs.set(Reg); + if (!LPR.contains(MO.getReg())) + MO.setIsDead(); + } else + Uses.push_back(&MO); + } + + for (auto *MO : Uses) + if (Defs.test(getFPReg(*MO)) || !LPR.contains(MO->getReg())) + MO->setIsKill(); + + LPR.stepBackward(MI); + } +} diff --git a/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp b/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp new file mode 100644 index 0000000..8b5fd27 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp @@ -0,0 +1,2698 @@ +//===-- X86FrameLowering.cpp - X86 Frame Information ----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the X86 implementation of TargetFrameLowering class. 
+// +//===----------------------------------------------------------------------===// + +#include "X86FrameLowering.h" +#include "X86InstrBuilder.h" +#include "X86InstrInfo.h" +#include "X86MachineFunctionInfo.h" +#include "X86Subtarget.h" +#include "X86TargetMachine.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/Analysis/EHPersonalities.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/WinEHFuncInfo.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Function.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/Target/TargetOptions.h" +#include "llvm/Support/Debug.h" +#include <cstdlib> + +using namespace llvm; + +X86FrameLowering::X86FrameLowering(const X86Subtarget &STI, + unsigned StackAlignOverride) + : TargetFrameLowering(StackGrowsDown, StackAlignOverride, + STI.is64Bit() ? -8 : -4), + STI(STI), TII(*STI.getInstrInfo()), TRI(STI.getRegisterInfo()) { + // Cache a bunch of frame-related predicates for this subtarget. + SlotSize = TRI->getSlotSize(); + Is64Bit = STI.is64Bit(); + IsLP64 = STI.isTarget64BitLP64(); + // standard x86_64 and NaCl use 64-bit frame/stack pointers, x32 - 32-bit. + Uses64BitFramePtr = STI.isTarget64BitLP64() || STI.isTargetNaCl64(); + StackPtr = TRI->getStackRegister(); +} + +bool X86FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const { + return !MF.getFrameInfo()->hasVarSizedObjects() && + !MF.getInfo<X86MachineFunctionInfo>()->getHasPushSequences(); +} + +/// canSimplifyCallFramePseudos - If there is a reserved call frame, the +/// call frame pseudos can be simplified. Having a FP, as in the default +/// implementation, is not sufficient here since we can't always use it. +/// Use a more nuanced condition. +bool +X86FrameLowering::canSimplifyCallFramePseudos(const MachineFunction &MF) const { + return hasReservedCallFrame(MF) || + (hasFP(MF) && !TRI->needsStackRealignment(MF)) || + TRI->hasBasePointer(MF); +} + +// needsFrameIndexResolution - Do we need to perform FI resolution for +// this function. Normally, this is required only when the function +// has any stack objects. However, FI resolution actually has another job, +// not apparent from the title - it resolves callframesetup/destroy +// that were not simplified earlier. +// So, this is required for x86 functions that have push sequences even +// when there are no stack objects. +bool +X86FrameLowering::needsFrameIndexResolution(const MachineFunction &MF) const { + return MF.getFrameInfo()->hasStackObjects() || + MF.getInfo<X86MachineFunctionInfo>()->getHasPushSequences(); +} + +/// hasFP - Return true if the specified function should have a dedicated frame +/// pointer register. This is true if the function has variable sized allocas +/// or if frame pointer elimination is disabled. 
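+/// In practice the implementation also forces a frame pointer for stack
+/// realignment, a taken frame address, opaque SP adjustments, EH constructs
+/// and stackmap/patchpoint records (see the checks below).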
+bool X86FrameLowering::hasFP(const MachineFunction &MF) const { + const MachineFrameInfo *MFI = MF.getFrameInfo(); + const MachineModuleInfo &MMI = MF.getMMI(); + + return (MF.getTarget().Options.DisableFramePointerElim(MF) || + TRI->needsStackRealignment(MF) || + MFI->hasVarSizedObjects() || + MFI->isFrameAddressTaken() || MFI->hasOpaqueSPAdjustment() || + MF.getInfo<X86MachineFunctionInfo>()->getForceFramePointer() || + MMI.callsUnwindInit() || MMI.hasEHFunclets() || MMI.callsEHReturn() || + MFI->hasStackMap() || MFI->hasPatchPoint()); +} + +static unsigned getSUBriOpcode(unsigned IsLP64, int64_t Imm) { + if (IsLP64) { + if (isInt<8>(Imm)) + return X86::SUB64ri8; + return X86::SUB64ri32; + } else { + if (isInt<8>(Imm)) + return X86::SUB32ri8; + return X86::SUB32ri; + } +} + +static unsigned getADDriOpcode(unsigned IsLP64, int64_t Imm) { + if (IsLP64) { + if (isInt<8>(Imm)) + return X86::ADD64ri8; + return X86::ADD64ri32; + } else { + if (isInt<8>(Imm)) + return X86::ADD32ri8; + return X86::ADD32ri; + } +} + +static unsigned getSUBrrOpcode(unsigned isLP64) { + return isLP64 ? X86::SUB64rr : X86::SUB32rr; +} + +static unsigned getADDrrOpcode(unsigned isLP64) { + return isLP64 ? X86::ADD64rr : X86::ADD32rr; +} + +static unsigned getANDriOpcode(bool IsLP64, int64_t Imm) { + if (IsLP64) { + if (isInt<8>(Imm)) + return X86::AND64ri8; + return X86::AND64ri32; + } + if (isInt<8>(Imm)) + return X86::AND32ri8; + return X86::AND32ri; +} + +static unsigned getLEArOpcode(unsigned IsLP64) { + return IsLP64 ? X86::LEA64r : X86::LEA32r; +} + +/// findDeadCallerSavedReg - Return a caller-saved register that isn't live +/// when it reaches the "return" instruction. We can then pop a stack object +/// to this register without worry about clobbering it. +static unsigned findDeadCallerSavedReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, + const X86RegisterInfo *TRI, + bool Is64Bit) { + const MachineFunction *MF = MBB.getParent(); + const Function *F = MF->getFunction(); + if (!F || MF->getMMI().callsEHReturn()) + return 0; + + const TargetRegisterClass &AvailableRegs = *TRI->getGPRsForTailCall(*MF); + + unsigned Opc = MBBI->getOpcode(); + switch (Opc) { + default: return 0; + case X86::RETL: + case X86::RETQ: + case X86::RETIL: + case X86::RETIQ: + case X86::TCRETURNdi: + case X86::TCRETURNri: + case X86::TCRETURNmi: + case X86::TCRETURNdi64: + case X86::TCRETURNri64: + case X86::TCRETURNmi64: + case X86::EH_RETURN: + case X86::EH_RETURN64: { + SmallSet<uint16_t, 8> Uses; + for (unsigned i = 0, e = MBBI->getNumOperands(); i != e; ++i) { + MachineOperand &MO = MBBI->getOperand(i); + if (!MO.isReg() || MO.isDef()) + continue; + unsigned Reg = MO.getReg(); + if (!Reg) + continue; + for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) + Uses.insert(*AI); + } + + for (auto CS : AvailableRegs) + if (!Uses.count(CS) && CS != X86::RIP) + return CS; + } + } + + return 0; +} + +static bool isEAXLiveIn(MachineFunction &MF) { + for (MachineRegisterInfo::livein_iterator II = MF.getRegInfo().livein_begin(), + EE = MF.getRegInfo().livein_end(); II != EE; ++II) { + unsigned Reg = II->first; + + if (Reg == X86::RAX || Reg == X86::EAX || Reg == X86::AX || + Reg == X86::AH || Reg == X86::AL) + return true; + } + + return false; +} + +/// Check if the flags need to be preserved before the terminators. +/// This would be the case, if the eflags is live-in of the region +/// composed by the terminators or live-out of that region, without +/// being defined by a terminator. 
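+/// A typical example is a conditional branch terminator whose EFLAGS input
+/// was produced by a compare emitted before the terminator sequence; an
+/// ADD/SUB inserted there to adjust SP would clobber that value.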
+static bool
+flagsNeedToBePreservedBeforeTheTerminators(const MachineBasicBlock &MBB) {
+  for (const MachineInstr &MI : MBB.terminators()) {
+    bool BreakNext = false;
+    for (const MachineOperand &MO : MI.operands()) {
+      if (!MO.isReg())
+        continue;
+      unsigned Reg = MO.getReg();
+      if (Reg != X86::EFLAGS)
+        continue;
+
+      // This terminator needs an eflags that is not defined
+      // by a previous terminator:
+      // EFLAGS is live-in of the region composed by the terminators.
+      if (!MO.isDef())
+        return true;
+      // This terminator defines the eflags, i.e., we don't need to preserve it.
+      // However, we still need to check that this specific terminator does not
+      // read a live-in value.
+      BreakNext = true;
+    }
+    // We found a definition of the eflags, no need to preserve them.
+    if (BreakNext)
+      return false;
+  }
+
+  // None of the terminators use or define the eflags.
+  // Check if they are live-out; that would imply we need to preserve them.
+  for (const MachineBasicBlock *Succ : MBB.successors())
+    if (Succ->isLiveIn(X86::EFLAGS))
+      return true;
+
+  return false;
+}
+
+/// emitSPUpdate - Emit a series of instructions to increment / decrement the
+/// stack pointer by a constant value.
+void X86FrameLowering::emitSPUpdate(MachineBasicBlock &MBB,
+                                    MachineBasicBlock::iterator &MBBI,
+                                    int64_t NumBytes, bool InEpilogue) const {
+  bool isSub = NumBytes < 0;
+  uint64_t Offset = isSub ? -NumBytes : NumBytes;
+
+  uint64_t Chunk = (1LL << 31) - 1;
+  DebugLoc DL = MBB.findDebugLoc(MBBI);
+
+  while (Offset) {
+    if (Offset > Chunk) {
+      // Rather than emit a long series of instructions for large offsets,
+      // load the offset into a register and do one sub/add
+      unsigned Reg = 0;
+
+      if (isSub && !isEAXLiveIn(*MBB.getParent()))
+        Reg = (unsigned)(Is64Bit ? X86::RAX : X86::EAX);
+      else
+        Reg = findDeadCallerSavedReg(MBB, MBBI, TRI, Is64Bit);
+
+      if (Reg) {
+        unsigned Opc = Is64Bit ? X86::MOV64ri : X86::MOV32ri;
+        BuildMI(MBB, MBBI, DL, TII.get(Opc), Reg)
+          .addImm(Offset);
+        Opc = isSub
+          ? getSUBrrOpcode(Is64Bit)
+          : getADDrrOpcode(Is64Bit);
+        MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr)
+                               .addReg(StackPtr)
+                               .addReg(Reg);
+        MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead.
+        Offset = 0;
+        continue;
+      }
+    }
+
+    uint64_t ThisVal = std::min(Offset, Chunk);
+    if (ThisVal == (Is64Bit ? 8 : 4)) {
+      // Use push / pop instead.
+      unsigned Reg = isSub
+        ? (unsigned)(Is64Bit ? X86::RAX : X86::EAX)
+        : findDeadCallerSavedReg(MBB, MBBI, TRI, Is64Bit);
+      if (Reg) {
+        unsigned Opc = isSub
+          ? (Is64Bit ? X86::PUSH64r : X86::PUSH32r)
+          : (Is64Bit ? X86::POP64r : X86::POP32r);
+        MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(Opc))
+          .addReg(Reg, getDefRegState(!isSub) | getUndefRegState(isSub));
+        if (isSub)
+          MI->setFlag(MachineInstr::FrameSetup);
+        else
+          MI->setFlag(MachineInstr::FrameDestroy);
+        Offset -= ThisVal;
+        continue;
+      }
+    }
+
+    MachineInstrBuilder MI = BuildStackAdjustment(
+        MBB, MBBI, DL, isSub ? -ThisVal : ThisVal, InEpilogue);
+    if (isSub)
+      MI.setMIFlag(MachineInstr::FrameSetup);
+    else
+      MI.setMIFlag(MachineInstr::FrameDestroy);
+
+    Offset -= ThisVal;
+  }
+}
+
+MachineInstrBuilder X86FrameLowering::BuildStackAdjustment(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, DebugLoc DL,
+    int64_t Offset, bool InEpilogue) const {
+  assert(Offset != 0 && "zero offset stack adjustment requested");
+
+  // On Atom, using LEA to adjust SP is preferred, but using it in the epilogue
+  // is tricky.
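+  // The trade-off: "lea -N(%rsp), %rsp" leaves EFLAGS untouched, whereas
+  // "sub $N, %rsp" / "add $N, %rsp" redefine EFLAGS and are unsafe whenever
+  // live flags cross the insertion point.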
+ bool UseLEA; + if (!InEpilogue) { + // Check if inserting the prologue at the beginning + // of MBB would require to use LEA operations. + // We need to use LEA operations if EFLAGS is live in, because + // it means an instruction will read it before it gets defined. + UseLEA = STI.useLeaForSP() || MBB.isLiveIn(X86::EFLAGS); + } else { + // If we can use LEA for SP but we shouldn't, check that none + // of the terminators uses the eflags. Otherwise we will insert + // a ADD that will redefine the eflags and break the condition. + // Alternatively, we could move the ADD, but this may not be possible + // and is an optimization anyway. + UseLEA = canUseLEAForSPInEpilogue(*MBB.getParent()); + if (UseLEA && !STI.useLeaForSP()) + UseLEA = flagsNeedToBePreservedBeforeTheTerminators(MBB); + // If that assert breaks, that means we do not do the right thing + // in canUseAsEpilogue. + assert((UseLEA || !flagsNeedToBePreservedBeforeTheTerminators(MBB)) && + "We shouldn't have allowed this insertion point"); + } + + MachineInstrBuilder MI; + if (UseLEA) { + MI = addRegOffset(BuildMI(MBB, MBBI, DL, + TII.get(getLEArOpcode(Uses64BitFramePtr)), + StackPtr), + StackPtr, false, Offset); + } else { + bool IsSub = Offset < 0; + uint64_t AbsOffset = IsSub ? -Offset : Offset; + unsigned Opc = IsSub ? getSUBriOpcode(Uses64BitFramePtr, AbsOffset) + : getADDriOpcode(Uses64BitFramePtr, AbsOffset); + MI = BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr) + .addReg(StackPtr) + .addImm(AbsOffset); + MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead. + } + return MI; +} + +int X86FrameLowering::mergeSPUpdates(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, + bool doMergeWithPrevious) const { + if ((doMergeWithPrevious && MBBI == MBB.begin()) || + (!doMergeWithPrevious && MBBI == MBB.end())) + return 0; + + MachineBasicBlock::iterator PI = doMergeWithPrevious ? std::prev(MBBI) : MBBI; + MachineBasicBlock::iterator NI = doMergeWithPrevious ? nullptr + : std::next(MBBI); + unsigned Opc = PI->getOpcode(); + int Offset = 0; + + if ((Opc == X86::ADD64ri32 || Opc == X86::ADD64ri8 || + Opc == X86::ADD32ri || Opc == X86::ADD32ri8 || + Opc == X86::LEA32r || Opc == X86::LEA64_32r) && + PI->getOperand(0).getReg() == StackPtr){ + Offset += PI->getOperand(2).getImm(); + MBB.erase(PI); + if (!doMergeWithPrevious) MBBI = NI; + } else if ((Opc == X86::SUB64ri32 || Opc == X86::SUB64ri8 || + Opc == X86::SUB32ri || Opc == X86::SUB32ri8) && + PI->getOperand(0).getReg() == StackPtr) { + Offset -= PI->getOperand(2).getImm(); + MBB.erase(PI); + if (!doMergeWithPrevious) MBBI = NI; + } + + return Offset; +} + +void X86FrameLowering::BuildCFI(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, DebugLoc DL, + MCCFIInstruction CFIInst) const { + MachineFunction &MF = *MBB.getParent(); + unsigned CFIIndex = MF.getMMI().addFrameInst(CFIInst); + BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); +} + +void +X86FrameLowering::emitCalleeSavedFrameMoves(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + DebugLoc DL) const { + MachineFunction &MF = *MBB.getParent(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + MachineModuleInfo &MMI = MF.getMMI(); + const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo(); + + // Add callee saved registers to move list. + const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo(); + if (CSI.empty()) return; + + // Calculate offsets. 
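+  // Each callee-saved register gets a .cfi_offset describing where it was
+  // spilled relative to the CFA.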
+ for (std::vector<CalleeSavedInfo>::const_iterator + I = CSI.begin(), E = CSI.end(); I != E; ++I) { + int64_t Offset = MFI->getObjectOffset(I->getFrameIdx()); + unsigned Reg = I->getReg(); + + unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true); + BuildCFI(MBB, MBBI, DL, + MCCFIInstruction::createOffset(nullptr, DwarfReg, Offset)); + } +} + +MachineInstr *X86FrameLowering::emitStackProbe(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + DebugLoc DL, + bool InProlog) const { + const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>(); + if (STI.isTargetWindowsCoreCLR()) { + if (InProlog) { + return emitStackProbeInlineStub(MF, MBB, MBBI, DL, true); + } else { + return emitStackProbeInline(MF, MBB, MBBI, DL, false); + } + } else { + return emitStackProbeCall(MF, MBB, MBBI, DL, InProlog); + } +} + +void X86FrameLowering::inlineStackProbe(MachineFunction &MF, + MachineBasicBlock &PrologMBB) const { + const StringRef ChkStkStubSymbol = "__chkstk_stub"; + MachineInstr *ChkStkStub = nullptr; + + for (MachineInstr &MI : PrologMBB) { + if (MI.isCall() && MI.getOperand(0).isSymbol() && + ChkStkStubSymbol == MI.getOperand(0).getSymbolName()) { + ChkStkStub = &MI; + break; + } + } + + if (ChkStkStub != nullptr) { + MachineBasicBlock::iterator MBBI = std::next(ChkStkStub->getIterator()); + assert(std::prev(MBBI).operator==(ChkStkStub) && + "MBBI expected after __chkstk_stub."); + DebugLoc DL = PrologMBB.findDebugLoc(MBBI); + emitStackProbeInline(MF, PrologMBB, MBBI, DL, true); + ChkStkStub->eraseFromParent(); + } +} + +MachineInstr *X86FrameLowering::emitStackProbeInline( + MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, DebugLoc DL, bool InProlog) const { + const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>(); + assert(STI.is64Bit() && "different expansion needed for 32 bit"); + assert(STI.isTargetWindowsCoreCLR() && "custom expansion expects CoreCLR"); + const TargetInstrInfo &TII = *STI.getInstrInfo(); + const BasicBlock *LLVM_BB = MBB.getBasicBlock(); + + // RAX contains the number of bytes of desired stack adjustment. + // The handling here assumes this value has already been updated so as to + // maintain stack alignment. + // + // We need to exit with RSP modified by this amount and execute suitable + // page touches to notify the OS that we're growing the stack responsibly. + // All stack probing must be done without modifying RSP. + // + // MBB: + // SizeReg = RAX; + // ZeroReg = 0 + // CopyReg = RSP + // Flags, TestReg = CopyReg - SizeReg + // FinalReg = !Flags.Ovf ? TestReg : ZeroReg + // LimitReg = gs magic thread env access + // if FinalReg >= LimitReg goto ContinueMBB + // RoundBB: + // RoundReg = page address of FinalReg + // LoopMBB: + // LoopReg = PHI(LimitReg,ProbeReg) + // ProbeReg = LoopReg - PageSize + // [ProbeReg] = 0 + // if (ProbeReg > RoundReg) goto LoopMBB + // ContinueMBB: + // RSP = RSP - RAX + // [rest of original MBB] + + // Set up the new basic blocks + MachineBasicBlock *RoundMBB = MF.CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *LoopMBB = MF.CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *ContinueMBB = MF.CreateMachineBasicBlock(LLVM_BB); + + MachineFunction::iterator MBBIter = std::next(MBB.getIterator()); + MF.insert(MBBIter, RoundMBB); + MF.insert(MBBIter, LoopMBB); + MF.insert(MBBIter, ContinueMBB); + + // Split MBB and move the tail portion down to ContinueMBB. 
+  MachineBasicBlock::iterator BeforeMBBI = std::prev(MBBI);
+  ContinueMBB->splice(ContinueMBB->begin(), &MBB, MBBI, MBB.end());
+  ContinueMBB->transferSuccessorsAndUpdatePHIs(&MBB);
+
+  // Some useful constants
+  const int64_t ThreadEnvironmentStackLimit = 0x10;
+  const int64_t PageSize = 0x1000;
+  const int64_t PageMask = ~(PageSize - 1);
+
+  // Registers we need. For the normal case we use virtual
+  // registers. For the prolog expansion we use RAX, RCX and RDX.
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  const TargetRegisterClass *RegClass = &X86::GR64RegClass;
+  const unsigned SizeReg = InProlog ? (unsigned)X86::RAX
+                                    : MRI.createVirtualRegister(RegClass),
+                 ZeroReg = InProlog ? (unsigned)X86::RCX
+                                    : MRI.createVirtualRegister(RegClass),
+                 CopyReg = InProlog ? (unsigned)X86::RDX
+                                    : MRI.createVirtualRegister(RegClass),
+                 TestReg = InProlog ? (unsigned)X86::RDX
+                                    : MRI.createVirtualRegister(RegClass),
+                 FinalReg = InProlog ? (unsigned)X86::RDX
+                                     : MRI.createVirtualRegister(RegClass),
+                 RoundedReg = InProlog ? (unsigned)X86::RDX
+                                       : MRI.createVirtualRegister(RegClass),
+                 LimitReg = InProlog ? (unsigned)X86::RCX
+                                     : MRI.createVirtualRegister(RegClass),
+                 JoinReg = InProlog ? (unsigned)X86::RCX
+                                    : MRI.createVirtualRegister(RegClass),
+                 ProbeReg = InProlog ? (unsigned)X86::RCX
+                                     : MRI.createVirtualRegister(RegClass);
+
+  // SP-relative offsets where we can save RCX and RDX.
+  int64_t RCXShadowSlot = 0;
+  int64_t RDXShadowSlot = 0;
+
+  // If inlining in the prolog, save RCX and RDX.
+  // Future optimization: don't save or restore if not live in.
+  if (InProlog) {
+    // Compute the offsets. We need to account for things already
+    // pushed onto the stack at this point: return address, frame
+    // pointer (if used), and callee saves.
+    X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+    const int64_t CalleeSaveSize = X86FI->getCalleeSavedFrameSize();
+    const bool HasFP = hasFP(MF);
+    RCXShadowSlot = 8 + CalleeSaveSize + (HasFP ? 8 : 0);
+    RDXShadowSlot = RCXShadowSlot + 8;
+    // Emit the saves.
+    addRegOffset(BuildMI(&MBB, DL, TII.get(X86::MOV64mr)), X86::RSP, false,
+                 RCXShadowSlot)
+        .addReg(X86::RCX);
+    addRegOffset(BuildMI(&MBB, DL, TII.get(X86::MOV64mr)), X86::RSP, false,
+                 RDXShadowSlot)
+        .addReg(X86::RDX);
+  } else {
+    // Not in the prolog. Copy RAX to a virtual reg.
+    BuildMI(&MBB, DL, TII.get(X86::MOV64rr), SizeReg).addReg(X86::RAX);
+  }
+
+  // Add code to MBB to check for overflow and set the new target stack pointer
+  // to zero if so.
+  BuildMI(&MBB, DL, TII.get(X86::XOR64rr), ZeroReg)
+      .addReg(ZeroReg, RegState::Undef)
+      .addReg(ZeroReg, RegState::Undef);
+  BuildMI(&MBB, DL, TII.get(X86::MOV64rr), CopyReg).addReg(X86::RSP);
+  BuildMI(&MBB, DL, TII.get(X86::SUB64rr), TestReg)
+      .addReg(CopyReg)
+      .addReg(SizeReg);
+  BuildMI(&MBB, DL, TII.get(X86::CMOVB64rr), FinalReg)
+      .addReg(TestReg)
+      .addReg(ZeroReg);
+
+  // FinalReg now holds final stack pointer value, or zero if
+  // allocation would overflow. Compare against the current stack
+  // limit from the thread environment block. Note this limit is the
+  // lowest touched page on the stack, not the point at which the OS
+  // will cause an overflow exception, so this is just an optimization
+  // to avoid unnecessarily touching pages that are below the current
+  // SP but already committed to the stack by the OS.
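+  // On Win64 the GS segment base points at the TEB; the NT_TIB StackLimit
+  // field at offset 0x10 is what the load below reads.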
+ BuildMI(&MBB, DL, TII.get(X86::MOV64rm), LimitReg) + .addReg(0) + .addImm(1) + .addReg(0) + .addImm(ThreadEnvironmentStackLimit) + .addReg(X86::GS); + BuildMI(&MBB, DL, TII.get(X86::CMP64rr)).addReg(FinalReg).addReg(LimitReg); + // Jump if the desired stack pointer is at or above the stack limit. + BuildMI(&MBB, DL, TII.get(X86::JAE_1)).addMBB(ContinueMBB); + + // Add code to roundMBB to round the final stack pointer to a page boundary. + BuildMI(RoundMBB, DL, TII.get(X86::AND64ri32), RoundedReg) + .addReg(FinalReg) + .addImm(PageMask); + BuildMI(RoundMBB, DL, TII.get(X86::JMP_1)).addMBB(LoopMBB); + + // LimitReg now holds the current stack limit, RoundedReg page-rounded + // final RSP value. Add code to loopMBB to decrement LimitReg page-by-page + // and probe until we reach RoundedReg. + if (!InProlog) { + BuildMI(LoopMBB, DL, TII.get(X86::PHI), JoinReg) + .addReg(LimitReg) + .addMBB(RoundMBB) + .addReg(ProbeReg) + .addMBB(LoopMBB); + } + + addRegOffset(BuildMI(LoopMBB, DL, TII.get(X86::LEA64r), ProbeReg), JoinReg, + false, -PageSize); + + // Probe by storing a byte onto the stack. + BuildMI(LoopMBB, DL, TII.get(X86::MOV8mi)) + .addReg(ProbeReg) + .addImm(1) + .addReg(0) + .addImm(0) + .addReg(0) + .addImm(0); + BuildMI(LoopMBB, DL, TII.get(X86::CMP64rr)) + .addReg(RoundedReg) + .addReg(ProbeReg); + BuildMI(LoopMBB, DL, TII.get(X86::JNE_1)).addMBB(LoopMBB); + + MachineBasicBlock::iterator ContinueMBBI = ContinueMBB->getFirstNonPHI(); + + // If in prolog, restore RDX and RCX. + if (InProlog) { + addRegOffset(BuildMI(*ContinueMBB, ContinueMBBI, DL, TII.get(X86::MOV64rm), + X86::RCX), + X86::RSP, false, RCXShadowSlot); + addRegOffset(BuildMI(*ContinueMBB, ContinueMBBI, DL, TII.get(X86::MOV64rm), + X86::RDX), + X86::RSP, false, RDXShadowSlot); + } + + // Now that the probing is done, add code to continueMBB to update + // the stack pointer for real. + BuildMI(*ContinueMBB, ContinueMBBI, DL, TII.get(X86::SUB64rr), X86::RSP) + .addReg(X86::RSP) + .addReg(SizeReg); + + // Add the control flow edges we need. + MBB.addSuccessor(ContinueMBB); + MBB.addSuccessor(RoundMBB); + RoundMBB->addSuccessor(LoopMBB); + LoopMBB->addSuccessor(ContinueMBB); + LoopMBB->addSuccessor(LoopMBB); + + // Mark all the instructions added to the prolog as frame setup. + if (InProlog) { + for (++BeforeMBBI; BeforeMBBI != MBB.end(); ++BeforeMBBI) { + BeforeMBBI->setFlag(MachineInstr::FrameSetup); + } + for (MachineInstr &MI : *RoundMBB) { + MI.setFlag(MachineInstr::FrameSetup); + } + for (MachineInstr &MI : *LoopMBB) { + MI.setFlag(MachineInstr::FrameSetup); + } + for (MachineBasicBlock::iterator CMBBI = ContinueMBB->begin(); + CMBBI != ContinueMBBI; ++CMBBI) { + CMBBI->setFlag(MachineInstr::FrameSetup); + } + } + + // Possible TODO: physreg liveness for InProlog case. + + return ContinueMBBI; +} + +MachineInstr *X86FrameLowering::emitStackProbeCall( + MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, DebugLoc DL, bool InProlog) const { + bool IsLargeCodeModel = MF.getTarget().getCodeModel() == CodeModel::Large; + + unsigned CallOp; + if (Is64Bit) + CallOp = IsLargeCodeModel ? 
X86::CALL64r : X86::CALL64pcrel32; + else + CallOp = X86::CALLpcrel32; + + const char *Symbol; + if (Is64Bit) { + if (STI.isTargetCygMing()) { + Symbol = "___chkstk_ms"; + } else { + Symbol = "__chkstk"; + } + } else if (STI.isTargetCygMing()) + Symbol = "_alloca"; + else + Symbol = "_chkstk"; + + MachineInstrBuilder CI; + MachineBasicBlock::iterator ExpansionMBBI = std::prev(MBBI); + + // All current stack probes take AX and SP as input, clobber flags, and + // preserve all registers. x86_64 probes leave RSP unmodified. + if (Is64Bit && MF.getTarget().getCodeModel() == CodeModel::Large) { + // For the large code model, we have to call through a register. Use R11, + // as it is scratch in all supported calling conventions. + BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64ri), X86::R11) + .addExternalSymbol(Symbol); + CI = BuildMI(MBB, MBBI, DL, TII.get(CallOp)).addReg(X86::R11); + } else { + CI = BuildMI(MBB, MBBI, DL, TII.get(CallOp)).addExternalSymbol(Symbol); + } + + unsigned AX = Is64Bit ? X86::RAX : X86::EAX; + unsigned SP = Is64Bit ? X86::RSP : X86::ESP; + CI.addReg(AX, RegState::Implicit) + .addReg(SP, RegState::Implicit) + .addReg(AX, RegState::Define | RegState::Implicit) + .addReg(SP, RegState::Define | RegState::Implicit) + .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit); + + if (Is64Bit) { + // MSVC x64's __chkstk and cygwin/mingw's ___chkstk_ms do not adjust %rsp + // themselves. It also does not clobber %rax so we can reuse it when + // adjusting %rsp. + BuildMI(MBB, MBBI, DL, TII.get(X86::SUB64rr), X86::RSP) + .addReg(X86::RSP) + .addReg(X86::RAX); + } + + if (InProlog) { + // Apply the frame setup flag to all inserted instrs. + for (++ExpansionMBBI; ExpansionMBBI != MBBI; ++ExpansionMBBI) + ExpansionMBBI->setFlag(MachineInstr::FrameSetup); + } + + return MBBI; +} + +MachineInstr *X86FrameLowering::emitStackProbeInlineStub( + MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, DebugLoc DL, bool InProlog) const { + + assert(InProlog && "ChkStkStub called outside prolog!"); + + BuildMI(MBB, MBBI, DL, TII.get(X86::CALLpcrel32)) + .addExternalSymbol("__chkstk_stub"); + + return MBBI; +} + +static unsigned calculateSetFPREG(uint64_t SPAdjust) { + // Win64 ABI has a less restrictive limitation of 240; 128 works equally well + // and might require smaller successive adjustments. + const uint64_t Win64MaxSEHOffset = 128; + uint64_t SEHFrameOffset = std::min(SPAdjust, Win64MaxSEHOffset); + // Win64 ABI requires 16-byte alignment for the UWOP_SET_FPREG opcode. + return SEHFrameOffset & -16; +} + +// If we're forcing a stack realignment we can't rely on just the frame +// info, we need to know the ABI stack alignment as well in case we +// have a call out. Otherwise just make sure we have some alignment - we'll +// go with the minimum SlotSize. +uint64_t X86FrameLowering::calculateMaxStackAlign(const MachineFunction &MF) const { + const MachineFrameInfo *MFI = MF.getFrameInfo(); + uint64_t MaxAlign = MFI->getMaxAlignment(); // Desired stack alignment. + unsigned StackAlign = getStackAlignment(); + if (MF.getFunction()->hasFnAttribute("stackrealign")) { + if (MFI->hasCalls()) + MaxAlign = (StackAlign > MaxAlign) ? 
StackAlign : MaxAlign; + else if (MaxAlign < SlotSize) + MaxAlign = SlotSize; + } + return MaxAlign; +} + +void X86FrameLowering::BuildStackAlignAND(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + DebugLoc DL, unsigned Reg, + uint64_t MaxAlign) const { + uint64_t Val = -MaxAlign; + unsigned AndOp = getANDriOpcode(Uses64BitFramePtr, Val); + MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(AndOp), Reg) + .addReg(Reg) + .addImm(Val) + .setMIFlag(MachineInstr::FrameSetup); + + // The EFLAGS implicit def is dead. + MI->getOperand(3).setIsDead(); +} + +/// emitPrologue - Push callee-saved registers onto the stack, which +/// automatically adjust the stack pointer. Adjust the stack pointer to allocate +/// space for local variables. Also emit labels used by the exception handler to +/// generate the exception handling frames. + +/* + Here's a gist of what gets emitted: + + ; Establish frame pointer, if needed + [if needs FP] + push %rbp + .cfi_def_cfa_offset 16 + .cfi_offset %rbp, -16 + .seh_pushreg %rpb + mov %rsp, %rbp + .cfi_def_cfa_register %rbp + + ; Spill general-purpose registers + [for all callee-saved GPRs] + pushq %<reg> + [if not needs FP] + .cfi_def_cfa_offset (offset from RETADDR) + .seh_pushreg %<reg> + + ; If the required stack alignment > default stack alignment + ; rsp needs to be re-aligned. This creates a "re-alignment gap" + ; of unknown size in the stack frame. + [if stack needs re-alignment] + and $MASK, %rsp + + ; Allocate space for locals + [if target is Windows and allocated space > 4096 bytes] + ; Windows needs special care for allocations larger + ; than one page. + mov $NNN, %rax + call ___chkstk_ms/___chkstk + sub %rax, %rsp + [else] + sub $NNN, %rsp + + [if needs FP] + .seh_stackalloc (size of XMM spill slots) + .seh_setframe %rbp, SEHFrameOffset ; = size of all spill slots + [else] + .seh_stackalloc NNN + + ; Spill XMMs + ; Note, that while only Windows 64 ABI specifies XMMs as callee-preserved, + ; they may get spilled on any platform, if the current function + ; calls @llvm.eh.unwind.init + [if needs FP] + [for all callee-saved XMM registers] + movaps %<xmm reg>, -MMM(%rbp) + [for all callee-saved XMM registers] + .seh_savexmm %<xmm reg>, (-MMM + SEHFrameOffset) + ; i.e. the offset relative to (%rbp - SEHFrameOffset) + [else] + [for all callee-saved XMM registers] + movaps %<xmm reg>, KKK(%rsp) + [for all callee-saved XMM registers] + .seh_savexmm %<xmm reg>, KKK + + .seh_endprologue + + [if needs base pointer] + mov %rsp, %rbx + [if needs to restore base pointer] + mov %rsp, -MMM(%rbp) + + ; Emit CFI info + [if needs FP] + [for all callee-saved registers] + .cfi_offset %<reg>, (offset from %rbp) + [else] + .cfi_def_cfa_offset (offset from RETADDR) + [for all callee-saved registers] + .cfi_offset %<reg>, (offset from %rsp) + + Notes: + - .seh directives are emitted only for Windows 64 ABI + - .cfi directives are emitted for all other ABIs + - for 32-bit code, substitute %e?? registers for %r?? +*/ + +void X86FrameLowering::emitPrologue(MachineFunction &MF, + MachineBasicBlock &MBB) const { + assert(&STI == &MF.getSubtarget<X86Subtarget>() && + "MF used frame lowering for wrong subtarget"); + MachineBasicBlock::iterator MBBI = MBB.begin(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + const Function *Fn = MF.getFunction(); + MachineModuleInfo &MMI = MF.getMMI(); + X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); + uint64_t MaxAlign = calculateMaxStackAlign(MF); // Desired stack alignment. 
+ uint64_t StackSize = MFI->getStackSize(); // Number of bytes to allocate. + bool IsFunclet = MBB.isEHFuncletEntry(); + EHPersonality Personality = EHPersonality::Unknown; + if (Fn->hasPersonalityFn()) + Personality = classifyEHPersonality(Fn->getPersonalityFn()); + bool FnHasClrFunclet = + MMI.hasEHFunclets() && Personality == EHPersonality::CoreCLR; + bool IsClrFunclet = IsFunclet && FnHasClrFunclet; + bool HasFP = hasFP(MF); + bool IsWin64CC = STI.isCallingConvWin64(Fn->getCallingConv()); + bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); + bool NeedsWinCFI = IsWin64Prologue && Fn->needsUnwindTableEntry(); + bool NeedsDwarfCFI = + !IsWin64Prologue && (MMI.hasDebugInfo() || Fn->needsUnwindTableEntry()); + unsigned FramePtr = TRI->getFrameRegister(MF); + const unsigned MachineFramePtr = + STI.isTarget64BitILP32() + ? getX86SubSuperRegister(FramePtr, 64) : FramePtr; + unsigned BasePtr = TRI->getBaseRegister(); + + // Debug location must be unknown since the first debug location is used + // to determine the end of the prologue. + DebugLoc DL; + + // Add RETADDR move area to callee saved frame size. + int TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta(); + if (TailCallReturnAddrDelta && IsWin64Prologue) + report_fatal_error("Can't handle guaranteed tail call under win64 yet"); + + if (TailCallReturnAddrDelta < 0) + X86FI->setCalleeSavedFrameSize( + X86FI->getCalleeSavedFrameSize() - TailCallReturnAddrDelta); + + bool UseStackProbe = (STI.isOSWindows() && !STI.isTargetMachO()); + + // The default stack probe size is 4096 if the function has no stackprobesize + // attribute. + unsigned StackProbeSize = 4096; + if (Fn->hasFnAttribute("stack-probe-size")) + Fn->getFnAttribute("stack-probe-size") + .getValueAsString() + .getAsInteger(0, StackProbeSize); + + // If this is x86-64 and the Red Zone is not disabled, if we are a leaf + // function, and use up to 128 bytes of stack space, don't have a frame + // pointer, calls, or dynamic alloca then we do not need to adjust the + // stack pointer (we fit in the Red Zone). We also check that we don't + // push and pop from the stack. + if (Is64Bit && !Fn->hasFnAttribute(Attribute::NoRedZone) && + !TRI->needsStackRealignment(MF) && + !MFI->hasVarSizedObjects() && // No dynamic alloca. + !MFI->adjustsStack() && // No calls. + !IsWin64CC && // Win64 has no Red Zone + !MFI->hasOpaqueSPAdjustment() && // Don't push and pop. + !MF.shouldSplitStack()) { // Regular stack + uint64_t MinSize = X86FI->getCalleeSavedFrameSize(); + if (HasFP) MinSize += SlotSize; + StackSize = std::max(MinSize, StackSize > 128 ? StackSize - 128 : 0); + MFI->setStackSize(StackSize); + } + + // Insert stack pointer adjustment for later moving of return addr. Only + // applies to tail call optimized functions where the callee argument stack + // size is bigger than the callers. 
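+  // TailCallReturnAddrDelta is negative in that case, so e.g. a delta of -8
+  // makes BuildStackAdjustment below move RSP down by 8 bytes, roughly
+  // reserving the room the return address is later moved into for the tail
+  // call.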
+ if (TailCallReturnAddrDelta < 0) { + BuildStackAdjustment(MBB, MBBI, DL, TailCallReturnAddrDelta, + /*InEpilogue=*/false) + .setMIFlag(MachineInstr::FrameSetup); + } + + // Mapping for machine moves: + // + // DST: VirtualFP AND + // SRC: VirtualFP => DW_CFA_def_cfa_offset + // ELSE => DW_CFA_def_cfa + // + // SRC: VirtualFP AND + // DST: Register => DW_CFA_def_cfa_register + // + // ELSE + // OFFSET < 0 => DW_CFA_offset_extended_sf + // REG < 64 => DW_CFA_offset + Reg + // ELSE => DW_CFA_offset_extended + + uint64_t NumBytes = 0; + int stackGrowth = -SlotSize; + + // Find the funclet establisher parameter + unsigned Establisher = X86::NoRegister; + if (IsClrFunclet) + Establisher = Uses64BitFramePtr ? X86::RCX : X86::ECX; + else if (IsFunclet) + Establisher = Uses64BitFramePtr ? X86::RDX : X86::EDX; + + if (IsWin64Prologue && IsFunclet && !IsClrFunclet) { + // Immediately spill establisher into the home slot. + // The runtime cares about this. + // MOV64mr %rdx, 16(%rsp) + unsigned MOVmr = Uses64BitFramePtr ? X86::MOV64mr : X86::MOV32mr; + addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(MOVmr)), StackPtr, true, 16) + .addReg(Establisher) + .setMIFlag(MachineInstr::FrameSetup); + MBB.addLiveIn(Establisher); + } + + if (HasFP) { + // Calculate required stack adjustment. + uint64_t FrameSize = StackSize - SlotSize; + // If required, include space for extra hidden slot for stashing base pointer. + if (X86FI->getRestoreBasePointer()) + FrameSize += SlotSize; + + NumBytes = FrameSize - X86FI->getCalleeSavedFrameSize(); + + // Callee-saved registers are pushed on stack before the stack is realigned. + if (TRI->needsStackRealignment(MF) && !IsWin64Prologue) + NumBytes = RoundUpToAlignment(NumBytes, MaxAlign); + + // Get the offset of the stack slot for the EBP register, which is + // guaranteed to be the last slot by processFunctionBeforeFrameFinalized. + // Update the frame offset adjustment. + if (!IsFunclet) + MFI->setOffsetAdjustment(-NumBytes); + else + assert(MFI->getOffsetAdjustment() == -(int)NumBytes && + "should calculate same local variable offset for funclets"); + + // Save EBP/RBP into the appropriate stack slot. + BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::PUSH64r : X86::PUSH32r)) + .addReg(MachineFramePtr, RegState::Kill) + .setMIFlag(MachineInstr::FrameSetup); + + if (NeedsDwarfCFI) { + // Mark the place where EBP/RBP was saved. + // Define the current CFA rule to use the provided offset. + assert(StackSize); + BuildCFI(MBB, MBBI, DL, + MCCFIInstruction::createDefCfaOffset(nullptr, 2 * stackGrowth)); + + // Change the rule for the FramePtr to be an "offset" rule. + unsigned DwarfFramePtr = TRI->getDwarfRegNum(MachineFramePtr, true); + BuildCFI(MBB, MBBI, DL, MCCFIInstruction::createOffset( + nullptr, DwarfFramePtr, 2 * stackGrowth)); + } + + if (NeedsWinCFI) { + BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_PushReg)) + .addImm(FramePtr) + .setMIFlag(MachineInstr::FrameSetup); + } + + if (!IsWin64Prologue && !IsFunclet) { + // Update EBP with the new base value. + BuildMI(MBB, MBBI, DL, + TII.get(Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr), + FramePtr) + .addReg(StackPtr) + .setMIFlag(MachineInstr::FrameSetup); + + if (NeedsDwarfCFI) { + // Mark effective beginning of when frame pointer becomes valid. + // Define the current CFA to use the EBP/RBP register. + unsigned DwarfFramePtr = TRI->getDwarfRegNum(MachineFramePtr, true); + BuildCFI(MBB, MBBI, DL, MCCFIInstruction::createDefCfaRegister( + nullptr, DwarfFramePtr)); + } + } + + // Mark the FramePtr as live-in in every block. 
Don't do this again for + // funclet prologues. + if (!IsFunclet) { + for (MachineBasicBlock &EveryMBB : MF) + EveryMBB.addLiveIn(MachineFramePtr); + } + } else { + assert(!IsFunclet && "funclets without FPs not yet implemented"); + NumBytes = StackSize - X86FI->getCalleeSavedFrameSize(); + } + + // For EH funclets, only allocate enough space for outgoing calls. Save the + // NumBytes value that we would've used for the parent frame. + unsigned ParentFrameNumBytes = NumBytes; + if (IsFunclet) + NumBytes = getWinEHFuncletFrameSize(MF); + + // Skip the callee-saved push instructions. + bool PushedRegs = false; + int StackOffset = 2 * stackGrowth; + + while (MBBI != MBB.end() && + MBBI->getFlag(MachineInstr::FrameSetup) && + (MBBI->getOpcode() == X86::PUSH32r || + MBBI->getOpcode() == X86::PUSH64r)) { + PushedRegs = true; + unsigned Reg = MBBI->getOperand(0).getReg(); + ++MBBI; + + if (!HasFP && NeedsDwarfCFI) { + // Mark callee-saved push instruction. + // Define the current CFA rule to use the provided offset. + assert(StackSize); + BuildCFI(MBB, MBBI, DL, + MCCFIInstruction::createDefCfaOffset(nullptr, StackOffset)); + StackOffset += stackGrowth; + } + + if (NeedsWinCFI) { + BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_PushReg)).addImm(Reg).setMIFlag( + MachineInstr::FrameSetup); + } + } + + // Realign stack after we pushed callee-saved registers (so that we'll be + // able to calculate their offsets from the frame pointer). + // Don't do this for Win64, it needs to realign the stack after the prologue. + if (!IsWin64Prologue && !IsFunclet && TRI->needsStackRealignment(MF)) { + assert(HasFP && "There should be a frame pointer if stack is realigned."); + BuildStackAlignAND(MBB, MBBI, DL, StackPtr, MaxAlign); + } + + // If there is an SUB32ri of ESP immediately before this instruction, merge + // the two. This can be the case when tail call elimination is enabled and + // the callee has more arguments then the caller. + NumBytes -= mergeSPUpdates(MBB, MBBI, true); + + // Adjust stack pointer: ESP -= numbytes. + + // Windows and cygwin/mingw require a prologue helper routine when allocating + // more than 4K bytes on the stack. Windows uses __chkstk and cygwin/mingw + // uses __alloca. __alloca and the 32-bit version of __chkstk will probe the + // stack and adjust the stack pointer in one go. The 64-bit version of + // __chkstk is only responsible for probing the stack. The 64-bit prologue is + // responsible for adjusting the stack pointer. Touching the stack at 4K + // increments is necessary to ensure that the guard pages used by the OS + // virtual memory manager are allocated in correct sequence. + uint64_t AlignedNumBytes = NumBytes; + if (IsWin64Prologue && !IsFunclet && TRI->needsStackRealignment(MF)) + AlignedNumBytes = RoundUpToAlignment(AlignedNumBytes, MaxAlign); + if (AlignedNumBytes >= StackProbeSize && UseStackProbe) { + // Check whether EAX is livein for this function. + bool isEAXAlive = isEAXLiveIn(MF); + + if (isEAXAlive) { + // Sanity check that EAX is not livein for this function. + // It should not be, so throw an assert. + assert(!Is64Bit && "EAX is livein in x64 case!"); + + // Save EAX + BuildMI(MBB, MBBI, DL, TII.get(X86::PUSH32r)) + .addReg(X86::EAX, RegState::Kill) + .setMIFlag(MachineInstr::FrameSetup); + } + + if (Is64Bit) { + // Handle the 64-bit Windows ABI case where we need to call __chkstk. + // Function prologue is responsible for adjusting the stack pointer. 
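+      // Pick the cheapest way to materialize NumBytes in RAX (via EAX for
+      // small values): MOV32ri zero-extends a value that fits in 32 unsigned
+      // bits, MOV64ri32 sign-extends a 32-bit immediate, and MOV64ri is the
+      // full 10-byte 64-bit immediate form, used only as a last resort.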
+ if (isUInt<32>(NumBytes)) { + BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX) + .addImm(NumBytes) + .setMIFlag(MachineInstr::FrameSetup); + } else if (isInt<32>(NumBytes)) { + BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64ri32), X86::RAX) + .addImm(NumBytes) + .setMIFlag(MachineInstr::FrameSetup); + } else { + BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64ri), X86::RAX) + .addImm(NumBytes) + .setMIFlag(MachineInstr::FrameSetup); + } + } else { + // Allocate NumBytes-4 bytes on stack in case of isEAXAlive. + // We'll also use 4 already allocated bytes for EAX. + BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX) + .addImm(isEAXAlive ? NumBytes - 4 : NumBytes) + .setMIFlag(MachineInstr::FrameSetup); + } + + // Call __chkstk, __chkstk_ms, or __alloca. + emitStackProbe(MF, MBB, MBBI, DL, true); + + if (isEAXAlive) { + // Restore EAX + MachineInstr *MI = + addRegOffset(BuildMI(MF, DL, TII.get(X86::MOV32rm), X86::EAX), + StackPtr, false, NumBytes - 4); + MI->setFlag(MachineInstr::FrameSetup); + MBB.insert(MBBI, MI); + } + } else if (NumBytes) { + emitSPUpdate(MBB, MBBI, -(int64_t)NumBytes, /*InEpilogue=*/false); + } + + if (NeedsWinCFI && NumBytes) + BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_StackAlloc)) + .addImm(NumBytes) + .setMIFlag(MachineInstr::FrameSetup); + + int SEHFrameOffset = 0; + unsigned SPOrEstablisher; + if (IsFunclet) { + if (IsClrFunclet) { + // The establisher parameter passed to a CLR funclet is actually a pointer + // to the (mostly empty) frame of its nearest enclosing funclet; we have + // to find the root function establisher frame by loading the PSPSym from + // the intermediate frame. + unsigned PSPSlotOffset = getPSPSlotOffsetFromSP(MF); + MachinePointerInfo NoInfo; + MBB.addLiveIn(Establisher); + addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64rm), Establisher), + Establisher, false, PSPSlotOffset) + .addMemOperand(MF.getMachineMemOperand( + NoInfo, MachineMemOperand::MOLoad, SlotSize, SlotSize)); + ; + // Save the root establisher back into the current funclet's (mostly + // empty) frame, in case a sub-funclet or the GC needs it. + addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64mr)), StackPtr, + false, PSPSlotOffset) + .addReg(Establisher) + .addMemOperand( + MF.getMachineMemOperand(NoInfo, MachineMemOperand::MOStore | + MachineMemOperand::MOVolatile, + SlotSize, SlotSize)); + } + SPOrEstablisher = Establisher; + } else { + SPOrEstablisher = StackPtr; + } + + if (IsWin64Prologue && HasFP) { + // Set RBP to a small fixed offset from RSP. In the funclet case, we base + // this calculation on the incoming establisher, which holds the value of + // RSP from the parent frame at the end of the prologue. + SEHFrameOffset = calculateSetFPREG(ParentFrameNumBytes); + if (SEHFrameOffset) + addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::LEA64r), FramePtr), + SPOrEstablisher, false, SEHFrameOffset); + else + BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64rr), FramePtr) + .addReg(SPOrEstablisher); + + // If this is not a funclet, emit the CFI describing our frame pointer. + if (NeedsWinCFI && !IsFunclet) { + BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SetFrame)) + .addImm(FramePtr) + .addImm(SEHFrameOffset) + .setMIFlag(MachineInstr::FrameSetup); + if (isAsynchronousEHPersonality(Personality)) + MF.getWinEHFuncInfo()->SEHSetFrameOffset = SEHFrameOffset; + } + } else if (IsFunclet && STI.is32Bit()) { + // Reset EBP / ESI to something good for funclets. + MBBI = restoreWin32EHStackPointers(MBB, MBBI, DL); + // If we're a catch funclet, we can be returned to via catchret. 
Save ESP + // into the registration node so that the runtime will restore it for us. + if (!MBB.isCleanupFuncletEntry()) { + assert(Personality == EHPersonality::MSVC_CXX); + unsigned FrameReg; + int FI = MF.getWinEHFuncInfo()->EHRegNodeFrameIndex; + int64_t EHRegOffset = getFrameIndexReference(MF, FI, FrameReg); + // ESP is the first field, so no extra displacement is needed. + addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32mr)), FrameReg, + false, EHRegOffset) + .addReg(X86::ESP); + } + } + + while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup)) { + const MachineInstr *FrameInstr = &*MBBI; + ++MBBI; + + if (NeedsWinCFI) { + int FI; + if (unsigned Reg = TII.isStoreToStackSlot(FrameInstr, FI)) { + if (X86::FR64RegClass.contains(Reg)) { + unsigned IgnoredFrameReg; + int Offset = getFrameIndexReference(MF, FI, IgnoredFrameReg); + Offset += SEHFrameOffset; + + BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SaveXMM)) + .addImm(Reg) + .addImm(Offset) + .setMIFlag(MachineInstr::FrameSetup); + } + } + } + } + + if (NeedsWinCFI) + BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_EndPrologue)) + .setMIFlag(MachineInstr::FrameSetup); + + if (FnHasClrFunclet && !IsFunclet) { + // Save the so-called Initial-SP (i.e. the value of the stack pointer + // immediately after the prolog) into the PSPSlot so that funclets + // and the GC can recover it. + unsigned PSPSlotOffset = getPSPSlotOffsetFromSP(MF); + auto PSPInfo = MachinePointerInfo::getFixedStack( + MF, MF.getWinEHFuncInfo()->PSPSymFrameIdx); + addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64mr)), StackPtr, false, + PSPSlotOffset) + .addReg(StackPtr) + .addMemOperand(MF.getMachineMemOperand( + PSPInfo, MachineMemOperand::MOStore | MachineMemOperand::MOVolatile, + SlotSize, SlotSize)); + } + + // Realign stack after we spilled callee-saved registers (so that we'll be + // able to calculate their offsets from the frame pointer). + // Win64 requires aligning the stack after the prologue. + if (IsWin64Prologue && TRI->needsStackRealignment(MF)) { + assert(HasFP && "There should be a frame pointer if stack is realigned."); + BuildStackAlignAND(MBB, MBBI, DL, SPOrEstablisher, MaxAlign); + } + + // We already dealt with stack realignment and funclets above. + if (IsFunclet && STI.is32Bit()) + return; + + // If we need a base pointer, set it up here. It's whatever the value + // of the stack pointer is at this point. Any variable size objects + // will be allocated after this, so we can still use the base pointer + // to reference locals. + if (TRI->hasBasePointer(MF)) { + // Update the base pointer with the current stack pointer. + unsigned Opc = Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr; + BuildMI(MBB, MBBI, DL, TII.get(Opc), BasePtr) + .addReg(SPOrEstablisher) + .setMIFlag(MachineInstr::FrameSetup); + if (X86FI->getRestoreBasePointer()) { + // Stash value of base pointer. Saving RSP instead of EBP shortens + // dependence chain. Used by SjLj EH. + unsigned Opm = Uses64BitFramePtr ? X86::MOV64mr : X86::MOV32mr; + addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opm)), + FramePtr, true, X86FI->getRestoreBasePointerOffset()) + .addReg(SPOrEstablisher) + .setMIFlag(MachineInstr::FrameSetup); + } + + if (X86FI->getHasSEHFramePtrSave() && !IsFunclet) { + // Stash the value of the frame pointer relative to the base pointer for + // Win32 EH. This supports Win32 EH, which does the inverse of the above: + // it recovers the frame pointer from the base pointer rather than the + // other way around. + unsigned Opm = Uses64BitFramePtr ? 
X86::MOV64mr : X86::MOV32mr; + unsigned UsedReg; + int Offset = + getFrameIndexReference(MF, X86FI->getSEHFramePtrSaveIndex(), UsedReg); + assert(UsedReg == BasePtr); + addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opm)), UsedReg, true, Offset) + .addReg(FramePtr) + .setMIFlag(MachineInstr::FrameSetup); + } + } + + if (((!HasFP && NumBytes) || PushedRegs) && NeedsDwarfCFI) { + // Mark end of stack pointer adjustment. + if (!HasFP && NumBytes) { + // Define the current CFA rule to use the provided offset. + assert(StackSize); + BuildCFI(MBB, MBBI, DL, MCCFIInstruction::createDefCfaOffset( + nullptr, -StackSize + stackGrowth)); + } + + // Emit DWARF info specifying the offsets of the callee-saved registers. + if (PushedRegs) + emitCalleeSavedFrameMoves(MBB, MBBI, DL); + } +} + +bool X86FrameLowering::canUseLEAForSPInEpilogue( + const MachineFunction &MF) const { + // We can't use LEA instructions for adjusting the stack pointer if this is a + // leaf function in the Win64 ABI. Only ADD instructions may be used to + // deallocate the stack. + // This means that we can use LEA for SP in two situations: + // 1. We *aren't* using the Win64 ABI which means we are free to use LEA. + // 2. We *have* a frame pointer which means we are permitted to use LEA. + return !MF.getTarget().getMCAsmInfo()->usesWindowsCFI() || hasFP(MF); +} + +static bool isFuncletReturnInstr(MachineInstr *MI) { + switch (MI->getOpcode()) { + case X86::CATCHRET: + case X86::CLEANUPRET: + return true; + default: + return false; + } + llvm_unreachable("impossible"); +} + +// CLR funclets use a special "Previous Stack Pointer Symbol" slot on the +// stack. It holds a pointer to the bottom of the root function frame. The +// establisher frame pointer passed to a nested funclet may point to the +// (mostly empty) frame of its parent funclet, but it will need to find +// the frame of the root function to access locals. To facilitate this, +// every funclet copies the pointer to the bottom of the root function +// frame into a PSPSym slot in its own (mostly empty) stack frame. Using the +// same offset for the PSPSym in the root function frame that's used in the +// funclets' frames allows each funclet to dynamically accept any ancestor +// frame as its establisher argument (the runtime doesn't guarantee the +// immediate parent for some reason lost to history), and also allows the GC, +// which uses the PSPSym for some bookkeeping, to find it in any funclet's +// frame with only a single offset reported for the entire method. +unsigned +X86FrameLowering::getPSPSlotOffsetFromSP(const MachineFunction &MF) const { + const WinEHFuncInfo &Info = *MF.getWinEHFuncInfo(); + // getFrameIndexReferenceFromSP has an out ref parameter for the stack + // pointer register; pass a dummy that we ignore + unsigned SPReg; + int Offset = getFrameIndexReferenceFromSP(MF, Info.PSPSymFrameIdx, SPReg); + assert(Offset >= 0); + return static_cast<unsigned>(Offset); +} + +unsigned +X86FrameLowering::getWinEHFuncletFrameSize(const MachineFunction &MF) const { + // This is the size of the pushed CSRs. + unsigned CSSize = + MF.getInfo<X86MachineFunctionInfo>()->getCalleeSavedFrameSize(); + // This is the amount of stack a funclet needs to allocate. 
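+  // For illustration, with five pushed CSRs (CSSize = 40), UsedSize = 32 and
+  // the usual 16-byte stack alignment, the round-up below gives 80, so each
+  // funclet allocates 80 - 40 = 40 bytes on top of its pushes.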
+ unsigned UsedSize; + EHPersonality Personality = + classifyEHPersonality(MF.getFunction()->getPersonalityFn()); + if (Personality == EHPersonality::CoreCLR) { + // CLR funclets need to hold enough space to include the PSPSym, at the + // same offset from the stack pointer (immediately after the prolog) as it + // resides at in the main function. + UsedSize = getPSPSlotOffsetFromSP(MF) + SlotSize; + } else { + // Other funclets just need enough stack for outgoing call arguments. + UsedSize = MF.getFrameInfo()->getMaxCallFrameSize(); + } + // RBP is not included in the callee saved register block. After pushing RBP, + // everything is 16 byte aligned. Everything we allocate before an outgoing + // call must also be 16 byte aligned. + unsigned FrameSizeMinusRBP = + RoundUpToAlignment(CSSize + UsedSize, getStackAlignment()); + // Subtract out the size of the callee saved registers. This is how much stack + // each funclet will allocate. + return FrameSizeMinusRBP - CSSize; +} + +void X86FrameLowering::emitEpilogue(MachineFunction &MF, + MachineBasicBlock &MBB) const { + const MachineFrameInfo *MFI = MF.getFrameInfo(); + X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); + MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator(); + DebugLoc DL; + if (MBBI != MBB.end()) + DL = MBBI->getDebugLoc(); + // standard x86_64 and NaCl use 64-bit frame/stack pointers, x32 - 32-bit. + const bool Is64BitILP32 = STI.isTarget64BitILP32(); + unsigned FramePtr = TRI->getFrameRegister(MF); + unsigned MachineFramePtr = + Is64BitILP32 ? getX86SubSuperRegister(FramePtr, 64) : FramePtr; + + bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); + bool NeedsWinCFI = + IsWin64Prologue && MF.getFunction()->needsUnwindTableEntry(); + bool IsFunclet = isFuncletReturnInstr(MBBI); + MachineBasicBlock *TargetMBB = nullptr; + + // Get the number of bytes to allocate from the FrameInfo. + uint64_t StackSize = MFI->getStackSize(); + uint64_t MaxAlign = calculateMaxStackAlign(MF); + unsigned CSSize = X86FI->getCalleeSavedFrameSize(); + uint64_t NumBytes = 0; + + if (MBBI->getOpcode() == X86::CATCHRET) { + // SEH shouldn't use catchret. + assert(!isAsynchronousEHPersonality( + classifyEHPersonality(MF.getFunction()->getPersonalityFn())) && + "SEH should not use CATCHRET"); + + NumBytes = getWinEHFuncletFrameSize(MF); + assert(hasFP(MF) && "EH funclets without FP not yet implemented"); + TargetMBB = MBBI->getOperand(0).getMBB(); + + // Pop EBP. + BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::POP64r : X86::POP32r), + MachineFramePtr) + .setMIFlag(MachineInstr::FrameDestroy); + } else if (MBBI->getOpcode() == X86::CLEANUPRET) { + NumBytes = getWinEHFuncletFrameSize(MF); + assert(hasFP(MF) && "EH funclets without FP not yet implemented"); + BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::POP64r : X86::POP32r), + MachineFramePtr) + .setMIFlag(MachineInstr::FrameDestroy); + } else if (hasFP(MF)) { + // Calculate required stack adjustment. + uint64_t FrameSize = StackSize - SlotSize; + NumBytes = FrameSize - CSSize; + + // Callee-saved registers were pushed on stack before the stack was + // realigned. + if (TRI->needsStackRealignment(MF) && !IsWin64Prologue) + NumBytes = RoundUpToAlignment(FrameSize, MaxAlign); + + // Pop EBP. + BuildMI(MBB, MBBI, DL, + TII.get(Is64Bit ? X86::POP64r : X86::POP32r), MachineFramePtr) + .setMIFlag(MachineInstr::FrameDestroy); + } else { + NumBytes = StackSize - CSSize; + } + uint64_t SEHStackAllocAmt = NumBytes; + + // Skip the callee-saved pop instructions. 
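+  // Walk backwards from the terminator over POPs tagged FrameDestroy (and any
+  // debug values), so the stack-pointer adjustment emitted further down lands
+  // before the first callee-saved pop; FirstCSPop remembers that point.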
+ while (MBBI != MBB.begin()) { + MachineBasicBlock::iterator PI = std::prev(MBBI); + unsigned Opc = PI->getOpcode(); + + if ((Opc != X86::POP32r || !PI->getFlag(MachineInstr::FrameDestroy)) && + (Opc != X86::POP64r || !PI->getFlag(MachineInstr::FrameDestroy)) && + Opc != X86::DBG_VALUE && !PI->isTerminator()) + break; + + --MBBI; + } + MachineBasicBlock::iterator FirstCSPop = MBBI; + + if (TargetMBB) { + // Fill EAX/RAX with the address of the target block. + unsigned ReturnReg = STI.is64Bit() ? X86::RAX : X86::EAX; + if (STI.is64Bit()) { + // LEA64r TargetMBB(%rip), %rax + BuildMI(MBB, FirstCSPop, DL, TII.get(X86::LEA64r), ReturnReg) + .addReg(X86::RIP) + .addImm(0) + .addReg(0) + .addMBB(TargetMBB) + .addReg(0); + } else { + // MOV32ri $TargetMBB, %eax + BuildMI(MBB, FirstCSPop, DL, TII.get(X86::MOV32ri), ReturnReg) + .addMBB(TargetMBB); + } + // Record that we've taken the address of TargetMBB and no longer just + // reference it in a terminator. + TargetMBB->setHasAddressTaken(); + } + + if (MBBI != MBB.end()) + DL = MBBI->getDebugLoc(); + + // If there is an ADD32ri or SUB32ri of ESP immediately before this + // instruction, merge the two instructions. + if (NumBytes || MFI->hasVarSizedObjects()) + NumBytes += mergeSPUpdates(MBB, MBBI, true); + + // If dynamic alloca is used, then reset esp to point to the last callee-saved + // slot before popping them off! Same applies for the case, when stack was + // realigned. Don't do this if this was a funclet epilogue, since the funclets + // will not do realignment or dynamic stack allocation. + if ((TRI->needsStackRealignment(MF) || MFI->hasVarSizedObjects()) && + !IsFunclet) { + if (TRI->needsStackRealignment(MF)) + MBBI = FirstCSPop; + unsigned SEHFrameOffset = calculateSetFPREG(SEHStackAllocAmt); + uint64_t LEAAmount = + IsWin64Prologue ? SEHStackAllocAmt - SEHFrameOffset : -CSSize; + + // There are only two legal forms of epilogue: + // - add SEHAllocationSize, %rsp + // - lea SEHAllocationSize(%FramePtr), %rsp + // + // 'mov %FramePtr, %rsp' will not be recognized as an epilogue sequence. + // However, we may use this sequence if we have a frame pointer because the + // effects of the prologue can safely be undone. + if (LEAAmount != 0) { + unsigned Opc = getLEArOpcode(Uses64BitFramePtr); + addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr), + FramePtr, false, LEAAmount); + --MBBI; + } else { + unsigned Opc = (Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr); + BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr) + .addReg(FramePtr); + --MBBI; + } + } else if (NumBytes) { + // Adjust stack pointer back: ESP += numbytes. + emitSPUpdate(MBB, MBBI, NumBytes, /*InEpilogue=*/true); + --MBBI; + } + + // Windows unwinder will not invoke function's exception handler if IP is + // either in prologue or in epilogue. This behavior causes a problem when a + // call immediately precedes an epilogue, because the return address points + // into the epilogue. To cope with that, we insert an epilogue marker here, + // then replace it with a 'nop' if it ends up immediately after a CALL in the + // final emitted code. + if (NeedsWinCFI) + BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_Epilogue)); + + // Add the return addr area delta back since we are not tail calling. + int Offset = -1 * X86FI->getTCReturnAddrDelta(); + assert(Offset >= 0 && "TCDelta should never be positive"); + if (Offset) { + MBBI = MBB.getFirstTerminator(); + + // Check for possible merge with preceding ADD instruction. 
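+    // e.g. an `add $16, %rsp` emitted just above gets folded into Offset so
+    // that only a single SP update is issued here.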
+ Offset += mergeSPUpdates(MBB, MBBI, true); + emitSPUpdate(MBB, MBBI, Offset, /*InEpilogue=*/true); + } +} + +// NOTE: this only has a subset of the full frame index logic. In +// particular, the FI < 0 and AfterFPPop logic is handled in +// X86RegisterInfo::eliminateFrameIndex, but not here. Possibly +// (probably?) it should be moved into here. +int X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, + unsigned &FrameReg) const { + const MachineFrameInfo *MFI = MF.getFrameInfo(); + + // We can't calculate offset from frame pointer if the stack is realigned, + // so enforce usage of stack/base pointer. The base pointer is used when we + // have dynamic allocas in addition to dynamic realignment. + if (TRI->hasBasePointer(MF)) + FrameReg = TRI->getBaseRegister(); + else if (TRI->needsStackRealignment(MF)) + FrameReg = TRI->getStackRegister(); + else + FrameReg = TRI->getFrameRegister(MF); + + // Offset will hold the offset from the stack pointer at function entry to the + // object. + // We need to factor in additional offsets applied during the prologue to the + // frame, base, and stack pointer depending on which is used. + int Offset = MFI->getObjectOffset(FI) - getOffsetOfLocalArea(); + const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); + unsigned CSSize = X86FI->getCalleeSavedFrameSize(); + uint64_t StackSize = MFI->getStackSize(); + bool HasFP = hasFP(MF); + bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); + int64_t FPDelta = 0; + + if (IsWin64Prologue) { + assert(!MFI->hasCalls() || (StackSize % 16) == 8); + + // Calculate required stack adjustment. + uint64_t FrameSize = StackSize - SlotSize; + // If required, include space for extra hidden slot for stashing base pointer. + if (X86FI->getRestoreBasePointer()) + FrameSize += SlotSize; + uint64_t NumBytes = FrameSize - CSSize; + + uint64_t SEHFrameOffset = calculateSetFPREG(NumBytes); + if (FI && FI == X86FI->getFAIndex()) + return -SEHFrameOffset; + + // FPDelta is the offset from the "traditional" FP location of the old base + // pointer followed by return address and the location required by the + // restricted Win64 prologue. + // Add FPDelta to all offsets below that go through the frame pointer. + FPDelta = FrameSize - SEHFrameOffset; + assert((!MFI->hasCalls() || (FPDelta % 16) == 0) && + "FPDelta isn't aligned per the Win64 ABI!"); + } + + + if (TRI->hasBasePointer(MF)) { + assert(HasFP && "VLAs and dynamic stack realign, but no FP?!"); + if (FI < 0) { + // Skip the saved EBP. + return Offset + SlotSize + FPDelta; + } else { + assert((-(Offset + StackSize)) % MFI->getObjectAlignment(FI) == 0); + return Offset + StackSize; + } + } else if (TRI->needsStackRealignment(MF)) { + if (FI < 0) { + // Skip the saved EBP. + return Offset + SlotSize + FPDelta; + } else { + assert((-(Offset + StackSize)) % MFI->getObjectAlignment(FI) == 0); + return Offset + StackSize; + } + // FIXME: Support tail calls + } else { + if (!HasFP) + return Offset + StackSize; + + // Skip the saved EBP. 
+ Offset += SlotSize; + + // Skip the RETADDR move area + int TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta(); + if (TailCallReturnAddrDelta < 0) + Offset -= TailCallReturnAddrDelta; + } + + return Offset + FPDelta; +} + +// Simplified from getFrameIndexReference keeping only StackPointer cases +int X86FrameLowering::getFrameIndexReferenceFromSP(const MachineFunction &MF, + int FI, + unsigned &FrameReg) const { + const MachineFrameInfo *MFI = MF.getFrameInfo(); + // Does not include any dynamic realign. + const uint64_t StackSize = MFI->getStackSize(); + { +#ifndef NDEBUG + // LLVM arranges the stack as follows: + // ... + // ARG2 + // ARG1 + // RETADDR + // PUSH RBP <-- RBP points here + // PUSH CSRs + // ~~~~~~~ <-- possible stack realignment (non-win64) + // ... + // STACK OBJECTS + // ... <-- RSP after prologue points here + // ~~~~~~~ <-- possible stack realignment (win64) + // + // if (hasVarSizedObjects()): + // ... <-- "base pointer" (ESI/RBX) points here + // DYNAMIC ALLOCAS + // ... <-- RSP points here + // + // Case 1: In the simple case of no stack realignment and no dynamic + // allocas, both "fixed" stack objects (arguments and CSRs) are addressable + // with fixed offsets from RSP. + // + // Case 2: In the case of stack realignment with no dynamic allocas, fixed + // stack objects are addressed with RBP and regular stack objects with RSP. + // + // Case 3: In the case of dynamic allocas and stack realignment, RSP is used + // to address stack arguments for outgoing calls and nothing else. The "base + // pointer" points to local variables, and RBP points to fixed objects. + // + // In cases 2 and 3, we can only answer for non-fixed stack objects, and the + // answer we give is relative to the SP after the prologue, and not the + // SP in the middle of the function. + + assert((!MFI->isFixedObjectIndex(FI) || !TRI->needsStackRealignment(MF) || + STI.isTargetWin64()) && + "offset from fixed object to SP is not static"); + + // We don't handle tail calls, and shouldn't be seeing them either. + int TailCallReturnAddrDelta = + MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta(); + assert(!(TailCallReturnAddrDelta < 0) && "we don't handle this case!"); +#endif + } + + // Fill in FrameReg output argument. + FrameReg = TRI->getStackRegister(); + + // This is how the math works out: + // + // %rsp grows (i.e. gets lower) left to right. Each box below is + // one word (eight bytes). Obj0 is the stack slot we're trying to + // get to. + // + // ---------------------------------- + // | BP | Obj0 | Obj1 | ... | ObjN | + // ---------------------------------- + // ^ ^ ^ ^ + // A B C E + // + // A is the incoming stack pointer. + // (B - A) is the local area offset (-8 for x86-64) [1] + // (C - A) is the Offset returned by MFI->getObjectOffset for Obj0 [2] + // + // |(E - B)| is the StackSize (absolute value, positive). For a + // stack that grown down, this works out to be (B - E). [3] + // + // E is also the value of %rsp after stack has been set up, and we + // want (C - E) -- the value we can add to %rsp to get to Obj0. 
Now + // (C - E) == (C - A) - (B - A) + (B - E) + // { Using [1], [2] and [3] above } + // == getObjectOffset - LocalAreaOffset + StackSize + // + + // Get the Offset from the StackPointer + int Offset = MFI->getObjectOffset(FI) - getOffsetOfLocalArea(); + + return Offset + StackSize; +} + +bool X86FrameLowering::assignCalleeSavedSpillSlots( + MachineFunction &MF, const TargetRegisterInfo *TRI, + std::vector<CalleeSavedInfo> &CSI) const { + MachineFrameInfo *MFI = MF.getFrameInfo(); + X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); + + unsigned CalleeSavedFrameSize = 0; + int SpillSlotOffset = getOffsetOfLocalArea() + X86FI->getTCReturnAddrDelta(); + + if (hasFP(MF)) { + // emitPrologue always spills frame register the first thing. + SpillSlotOffset -= SlotSize; + MFI->CreateFixedSpillStackObject(SlotSize, SpillSlotOffset); + + // Since emitPrologue and emitEpilogue will handle spilling and restoring of + // the frame register, we can delete it from CSI list and not have to worry + // about avoiding it later. + unsigned FPReg = TRI->getFrameRegister(MF); + for (unsigned i = 0; i < CSI.size(); ++i) { + if (TRI->regsOverlap(CSI[i].getReg(),FPReg)) { + CSI.erase(CSI.begin() + i); + break; + } + } + } + + // Assign slots for GPRs. It increases frame size. + for (unsigned i = CSI.size(); i != 0; --i) { + unsigned Reg = CSI[i - 1].getReg(); + + if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg)) + continue; + + SpillSlotOffset -= SlotSize; + CalleeSavedFrameSize += SlotSize; + + int SlotIndex = MFI->CreateFixedSpillStackObject(SlotSize, SpillSlotOffset); + CSI[i - 1].setFrameIdx(SlotIndex); + } + + X86FI->setCalleeSavedFrameSize(CalleeSavedFrameSize); + + // Assign slots for XMMs. + for (unsigned i = CSI.size(); i != 0; --i) { + unsigned Reg = CSI[i - 1].getReg(); + if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg)) + continue; + + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); + // ensure alignment + SpillSlotOffset -= std::abs(SpillSlotOffset) % RC->getAlignment(); + // spill into slot + SpillSlotOffset -= RC->getSize(); + int SlotIndex = + MFI->CreateFixedSpillStackObject(RC->getSize(), SpillSlotOffset); + CSI[i - 1].setFrameIdx(SlotIndex); + MFI->ensureMaxAlignment(RC->getAlignment()); + } + + return true; +} + +bool X86FrameLowering::spillCalleeSavedRegisters( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, + const std::vector<CalleeSavedInfo> &CSI, + const TargetRegisterInfo *TRI) const { + DebugLoc DL = MBB.findDebugLoc(MI); + + // Don't save CSRs in 32-bit EH funclets. The caller saves EBX, EBP, ESI, EDI + // for us, and there are no XMM CSRs on Win32. + if (MBB.isEHFuncletEntry() && STI.is32Bit() && STI.isOSWindows()) + return true; + + // Push GPRs. It increases frame size. + unsigned Opc = STI.is64Bit() ? X86::PUSH64r : X86::PUSH32r; + for (unsigned i = CSI.size(); i != 0; --i) { + unsigned Reg = CSI[i - 1].getReg(); + + if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg)) + continue; + // Add the callee-saved register as live-in. It's killed at the spill. + MBB.addLiveIn(Reg); + + BuildMI(MBB, MI, DL, TII.get(Opc)).addReg(Reg, RegState::Kill) + .setMIFlag(MachineInstr::FrameSetup); + } + + // Make XMM regs spilled. X86 does not have ability of push/pop XMM. + // It can be done by spilling XMMs to stack frame. 
+ for (unsigned i = CSI.size(); i != 0; --i) { + unsigned Reg = CSI[i-1].getReg(); + if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg)) + continue; + // Add the callee-saved register as live-in. It's killed at the spill. + MBB.addLiveIn(Reg); + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); + + TII.storeRegToStackSlot(MBB, MI, Reg, true, CSI[i - 1].getFrameIdx(), RC, + TRI); + --MI; + MI->setFlag(MachineInstr::FrameSetup); + ++MI; + } + + return true; +} + +bool X86FrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + const std::vector<CalleeSavedInfo> &CSI, + const TargetRegisterInfo *TRI) const { + if (CSI.empty()) + return false; + + if (isFuncletReturnInstr(MI) && STI.isOSWindows()) { + // Don't restore CSRs in 32-bit EH funclets. Matches + // spillCalleeSavedRegisters. + if (STI.is32Bit()) + return true; + // Don't restore CSRs before an SEH catchret. SEH except blocks do not form + // funclets. emitEpilogue transforms these to normal jumps. + if (MI->getOpcode() == X86::CATCHRET) { + const Function *Func = MBB.getParent()->getFunction(); + bool IsSEH = isAsynchronousEHPersonality( + classifyEHPersonality(Func->getPersonalityFn())); + if (IsSEH) + return true; + } + } + + DebugLoc DL = MBB.findDebugLoc(MI); + + // Reload XMMs from stack frame. + for (unsigned i = 0, e = CSI.size(); i != e; ++i) { + unsigned Reg = CSI[i].getReg(); + if (X86::GR64RegClass.contains(Reg) || + X86::GR32RegClass.contains(Reg)) + continue; + + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); + TII.loadRegFromStackSlot(MBB, MI, Reg, CSI[i].getFrameIdx(), RC, TRI); + } + + // POP GPRs. + unsigned Opc = STI.is64Bit() ? X86::POP64r : X86::POP32r; + for (unsigned i = 0, e = CSI.size(); i != e; ++i) { + unsigned Reg = CSI[i].getReg(); + if (!X86::GR64RegClass.contains(Reg) && + !X86::GR32RegClass.contains(Reg)) + continue; + + BuildMI(MBB, MI, DL, TII.get(Opc), Reg) + .setMIFlag(MachineInstr::FrameDestroy); + } + return true; +} + +void X86FrameLowering::determineCalleeSaves(MachineFunction &MF, + BitVector &SavedRegs, + RegScavenger *RS) const { + TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS); + + MachineFrameInfo *MFI = MF.getFrameInfo(); + + X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); + int64_t TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta(); + + if (TailCallReturnAddrDelta < 0) { + // create RETURNADDR area + // arg + // arg + // RETADDR + // { ... + // RETADDR area + // ... + // } + // [EBP] + MFI->CreateFixedObject(-TailCallReturnAddrDelta, + TailCallReturnAddrDelta - SlotSize, true); + } + + // Spill the BasePtr if it's used. + if (TRI->hasBasePointer(MF)) { + SavedRegs.set(TRI->getBaseRegister()); + + // Allocate a spill slot for EBP if we have a base pointer and EH funclets. + if (MF.getMMI().hasEHFunclets()) { + int FI = MFI->CreateSpillStackObject(SlotSize, SlotSize); + X86FI->setHasSEHFramePtrSave(true); + X86FI->setSEHFramePtrSaveIndex(FI); + } + } +} + +static bool +HasNestArgument(const MachineFunction *MF) { + const Function *F = MF->getFunction(); + for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end(); + I != E; I++) { + if (I->hasNestAttr()) + return true; + } + return false; +} + +/// GetScratchRegister - Get a temp register for performing work in the +/// segmented stack and the Erlang/HiPE stack prologue. Depending on platform +/// and the properties of the function either one or two registers will be +/// needed. 
Set primary to true for the first register, false for the second. +static unsigned +GetScratchRegister(bool Is64Bit, bool IsLP64, const MachineFunction &MF, bool Primary) { + CallingConv::ID CallingConvention = MF.getFunction()->getCallingConv(); + + // Erlang stuff. + if (CallingConvention == CallingConv::HiPE) { + if (Is64Bit) + return Primary ? X86::R14 : X86::R13; + else + return Primary ? X86::EBX : X86::EDI; + } + + if (Is64Bit) { + if (IsLP64) + return Primary ? X86::R11 : X86::R12; + else + return Primary ? X86::R11D : X86::R12D; + } + + bool IsNested = HasNestArgument(&MF); + + if (CallingConvention == CallingConv::X86_FastCall || + CallingConvention == CallingConv::Fast) { + if (IsNested) + report_fatal_error("Segmented stacks does not support fastcall with " + "nested function."); + return Primary ? X86::EAX : X86::ECX; + } + if (IsNested) + return Primary ? X86::EDX : X86::EAX; + return Primary ? X86::ECX : X86::EAX; +} + +// The stack limit in the TCB is set to this many bytes above the actual stack +// limit. +static const uint64_t kSplitStackAvailable = 256; + +void X86FrameLowering::adjustForSegmentedStacks( + MachineFunction &MF, MachineBasicBlock &PrologueMBB) const { + MachineFrameInfo *MFI = MF.getFrameInfo(); + uint64_t StackSize; + unsigned TlsReg, TlsOffset; + DebugLoc DL; + + unsigned ScratchReg = GetScratchRegister(Is64Bit, IsLP64, MF, true); + assert(!MF.getRegInfo().isLiveIn(ScratchReg) && + "Scratch register is live-in"); + + if (MF.getFunction()->isVarArg()) + report_fatal_error("Segmented stacks do not support vararg functions."); + if (!STI.isTargetLinux() && !STI.isTargetDarwin() && !STI.isTargetWin32() && + !STI.isTargetWin64() && !STI.isTargetFreeBSD() && + !STI.isTargetDragonFly()) + report_fatal_error("Segmented stacks not supported on this platform."); + + // Eventually StackSize will be calculated by a link-time pass; which will + // also decide whether checking code needs to be injected into this particular + // prologue. + StackSize = MFI->getStackSize(); + + // Do not generate a prologue for functions with a stack of size zero + if (StackSize == 0) + return; + + MachineBasicBlock *allocMBB = MF.CreateMachineBasicBlock(); + MachineBasicBlock *checkMBB = MF.CreateMachineBasicBlock(); + X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); + bool IsNested = false; + + // We need to know if the function has a nest argument only in 64 bit mode. + if (Is64Bit) + IsNested = HasNestArgument(&MF); + + // The MOV R10, RAX needs to be in a different block, since the RET we emit in + // allocMBB needs to be last (terminating) instruction. + + for (const auto &LI : PrologueMBB.liveins()) { + allocMBB->addLiveIn(LI); + checkMBB->addLiveIn(LI); + } + + if (IsNested) + allocMBB->addLiveIn(IsLP64 ? X86::R10 : X86::R10D); + + MF.push_front(allocMBB); + MF.push_front(checkMBB); + + // When the frame size is less than 256 we just compare the stack + // boundary directly to the value of the stack pointer, per gcc. + bool CompareStackPointer = StackSize < kSplitStackAvailable; + + // Read the limit off the current stacklet off the stack_guard location. + if (Is64Bit) { + if (STI.isTargetLinux()) { + TlsReg = X86::FS; + TlsOffset = IsLP64 ? 0x70 : 0x40; + } else if (STI.isTargetDarwin()) { + TlsReg = X86::GS; + TlsOffset = 0x60 + 90*8; // See pthread_machdep.h. Steal TLS slot 90. 
+ } else if (STI.isTargetWin64()) { + TlsReg = X86::GS; + TlsOffset = 0x28; // pvArbitrary, reserved for application use + } else if (STI.isTargetFreeBSD()) { + TlsReg = X86::FS; + TlsOffset = 0x18; + } else if (STI.isTargetDragonFly()) { + TlsReg = X86::FS; + TlsOffset = 0x20; // use tls_tcb.tcb_segstack + } else { + report_fatal_error("Segmented stacks not supported on this platform."); + } + + if (CompareStackPointer) + ScratchReg = IsLP64 ? X86::RSP : X86::ESP; + else + BuildMI(checkMBB, DL, TII.get(IsLP64 ? X86::LEA64r : X86::LEA64_32r), ScratchReg).addReg(X86::RSP) + .addImm(1).addReg(0).addImm(-StackSize).addReg(0); + + BuildMI(checkMBB, DL, TII.get(IsLP64 ? X86::CMP64rm : X86::CMP32rm)).addReg(ScratchReg) + .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg); + } else { + if (STI.isTargetLinux()) { + TlsReg = X86::GS; + TlsOffset = 0x30; + } else if (STI.isTargetDarwin()) { + TlsReg = X86::GS; + TlsOffset = 0x48 + 90*4; + } else if (STI.isTargetWin32()) { + TlsReg = X86::FS; + TlsOffset = 0x14; // pvArbitrary, reserved for application use + } else if (STI.isTargetDragonFly()) { + TlsReg = X86::FS; + TlsOffset = 0x10; // use tls_tcb.tcb_segstack + } else if (STI.isTargetFreeBSD()) { + report_fatal_error("Segmented stacks not supported on FreeBSD i386."); + } else { + report_fatal_error("Segmented stacks not supported on this platform."); + } + + if (CompareStackPointer) + ScratchReg = X86::ESP; + else + BuildMI(checkMBB, DL, TII.get(X86::LEA32r), ScratchReg).addReg(X86::ESP) + .addImm(1).addReg(0).addImm(-StackSize).addReg(0); + + if (STI.isTargetLinux() || STI.isTargetWin32() || STI.isTargetWin64() || + STI.isTargetDragonFly()) { + BuildMI(checkMBB, DL, TII.get(X86::CMP32rm)).addReg(ScratchReg) + .addReg(0).addImm(0).addReg(0).addImm(TlsOffset).addReg(TlsReg); + } else if (STI.isTargetDarwin()) { + + // TlsOffset doesn't fit into a mod r/m byte so we need an extra register. + unsigned ScratchReg2; + bool SaveScratch2; + if (CompareStackPointer) { + // The primary scratch register is available for holding the TLS offset. + ScratchReg2 = GetScratchRegister(Is64Bit, IsLP64, MF, true); + SaveScratch2 = false; + } else { + // Need to use a second register to hold the TLS offset + ScratchReg2 = GetScratchRegister(Is64Bit, IsLP64, MF, false); + + // Unfortunately, with fastcc the second scratch register may hold an + // argument. + SaveScratch2 = MF.getRegInfo().isLiveIn(ScratchReg2); + } + + // If Scratch2 is live-in then it needs to be saved. + assert((!MF.getRegInfo().isLiveIn(ScratchReg2) || SaveScratch2) && + "Scratch register is live-in and not saved"); + + if (SaveScratch2) + BuildMI(checkMBB, DL, TII.get(X86::PUSH32r)) + .addReg(ScratchReg2, RegState::Kill); + + BuildMI(checkMBB, DL, TII.get(X86::MOV32ri), ScratchReg2) + .addImm(TlsOffset); + BuildMI(checkMBB, DL, TII.get(X86::CMP32rm)) + .addReg(ScratchReg) + .addReg(ScratchReg2).addImm(1).addReg(0) + .addImm(0) + .addReg(TlsReg); + + if (SaveScratch2) + BuildMI(checkMBB, DL, TII.get(X86::POP32r), ScratchReg2); + } + } + + // This jump is taken if SP >= (Stacklet Limit + Stack Space required). + // It jumps to normal execution of the function body. + BuildMI(checkMBB, DL, TII.get(X86::JA_1)).addMBB(&PrologueMBB); + + // On 32 bit we first push the arguments size and then the frame size. On 64 + // bit, we pass the stack frame size in r10 and the argument size in r11. 
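+  // Roughly, the sequence emitted below for the 64-bit case is:
+  //   mov %r10, %rax           # only if the function has a nest argument
+  //   mov $StackSize, %r10
+  //   mov $ArgSize, %r11
+  //   callq __morestack
+  //   ret                      # MORESTACK_RET / MORESTACK_RET_RESTORE_R10
+  // and for the 32-bit case:
+  //   push $ArgSize
+  //   push $StackSize
+  //   calll __morestack
+  //   ret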
+ if (Is64Bit) { + // Functions with nested arguments use R10, so it needs to be saved across + // the call to _morestack + + const unsigned RegAX = IsLP64 ? X86::RAX : X86::EAX; + const unsigned Reg10 = IsLP64 ? X86::R10 : X86::R10D; + const unsigned Reg11 = IsLP64 ? X86::R11 : X86::R11D; + const unsigned MOVrr = IsLP64 ? X86::MOV64rr : X86::MOV32rr; + const unsigned MOVri = IsLP64 ? X86::MOV64ri : X86::MOV32ri; + + if (IsNested) + BuildMI(allocMBB, DL, TII.get(MOVrr), RegAX).addReg(Reg10); + + BuildMI(allocMBB, DL, TII.get(MOVri), Reg10) + .addImm(StackSize); + BuildMI(allocMBB, DL, TII.get(MOVri), Reg11) + .addImm(X86FI->getArgumentStackSize()); + } else { + BuildMI(allocMBB, DL, TII.get(X86::PUSHi32)) + .addImm(X86FI->getArgumentStackSize()); + BuildMI(allocMBB, DL, TII.get(X86::PUSHi32)) + .addImm(StackSize); + } + + // __morestack is in libgcc + if (Is64Bit && MF.getTarget().getCodeModel() == CodeModel::Large) { + // Under the large code model, we cannot assume that __morestack lives + // within 2^31 bytes of the call site, so we cannot use pc-relative + // addressing. We cannot perform the call via a temporary register, + // as the rax register may be used to store the static chain, and all + // other suitable registers may be either callee-save or used for + // parameter passing. We cannot use the stack at this point either + // because __morestack manipulates the stack directly. + // + // To avoid these issues, perform an indirect call via a read-only memory + // location containing the address. + // + // This solution is not perfect, as it assumes that the .rodata section + // is laid out within 2^31 bytes of each function body, but this seems + // to be sufficient for JIT. + BuildMI(allocMBB, DL, TII.get(X86::CALL64m)) + .addReg(X86::RIP) + .addImm(0) + .addReg(0) + .addExternalSymbol("__morestack_addr") + .addReg(0); + MF.getMMI().setUsesMorestackAddr(true); + } else { + if (Is64Bit) + BuildMI(allocMBB, DL, TII.get(X86::CALL64pcrel32)) + .addExternalSymbol("__morestack"); + else + BuildMI(allocMBB, DL, TII.get(X86::CALLpcrel32)) + .addExternalSymbol("__morestack"); + } + + if (IsNested) + BuildMI(allocMBB, DL, TII.get(X86::MORESTACK_RET_RESTORE_R10)); + else + BuildMI(allocMBB, DL, TII.get(X86::MORESTACK_RET)); + + allocMBB->addSuccessor(&PrologueMBB); + + checkMBB->addSuccessor(allocMBB); + checkMBB->addSuccessor(&PrologueMBB); + +#ifdef XDEBUG + MF.verify(); +#endif +} + +/// Erlang programs may need a special prologue to handle the stack size they +/// might need at runtime. That is because Erlang/OTP does not implement a C +/// stack but uses a custom implementation of hybrid stack/heap architecture. +/// (for more information see Eric Stenman's Ph.D. thesis: +/// http://publications.uu.se/uu/fulltext/nbn_se_uu_diva-2688.pdf) +/// +/// CheckStack: +/// temp0 = sp - MaxStack +/// if( temp0 < SP_LIMIT(P) ) goto IncStack else goto OldStart +/// OldStart: +/// ... +/// IncStack: +/// call inc_stack # doubles the stack space +/// temp0 = sp - MaxStack +/// if( temp0 < SP_LIMIT(P) ) goto IncStack else goto OldStart +void X86FrameLowering::adjustForHiPEPrologue( + MachineFunction &MF, MachineBasicBlock &PrologueMBB) const { + MachineFrameInfo *MFI = MF.getFrameInfo(); + DebugLoc DL; + // HiPE-specific values + const unsigned HipeLeafWords = 24; + const unsigned CCRegisteredArgs = Is64Bit ? 6 : 5; + const unsigned Guaranteed = HipeLeafWords * SlotSize; + unsigned CallerStkArity = MF.getFunction()->arg_size() > CCRegisteredArgs ? 
+ MF.getFunction()->arg_size() - CCRegisteredArgs : 0; + unsigned MaxStack = MFI->getStackSize() + CallerStkArity*SlotSize + SlotSize; + + assert(STI.isTargetLinux() && + "HiPE prologue is only supported on Linux operating systems."); + + // Compute the largest caller's frame that is needed to fit the callees' + // frames. This 'MaxStack' is computed from: + // + // a) the fixed frame size, which is the space needed for all spilled temps, + // b) outgoing on-stack parameter areas, and + // c) the minimum stack space this function needs to make available for the + // functions it calls (a tunable ABI property). + if (MFI->hasCalls()) { + unsigned MoreStackForCalls = 0; + + for (MachineFunction::iterator MBBI = MF.begin(), MBBE = MF.end(); + MBBI != MBBE; ++MBBI) + for (MachineBasicBlock::iterator MI = MBBI->begin(), ME = MBBI->end(); + MI != ME; ++MI) { + if (!MI->isCall()) + continue; + + // Get callee operand. + const MachineOperand &MO = MI->getOperand(0); + + // Only take account of global function calls (no closures etc.). + if (!MO.isGlobal()) + continue; + + const Function *F = dyn_cast<Function>(MO.getGlobal()); + if (!F) + continue; + + // Do not update 'MaxStack' for primitive and built-in functions + // (encoded with names either starting with "erlang."/"bif_" or not + // having a ".", such as a simple <Module>.<Function>.<Arity>, or an + // "_", such as the BIF "suspend_0") as they are executed on another + // stack. + if (F->getName().find("erlang.") != StringRef::npos || + F->getName().find("bif_") != StringRef::npos || + F->getName().find_first_of("._") == StringRef::npos) + continue; + + unsigned CalleeStkArity = + F->arg_size() > CCRegisteredArgs ? F->arg_size()-CCRegisteredArgs : 0; + if (HipeLeafWords - 1 > CalleeStkArity) + MoreStackForCalls = std::max(MoreStackForCalls, + (HipeLeafWords - 1 - CalleeStkArity) * SlotSize); + } + MaxStack += MoreStackForCalls; + } + + // If the stack frame needed is larger than the guaranteed then runtime checks + // and calls to "inc_stack_0" BIF should be inserted in the assembly prologue. + if (MaxStack > Guaranteed) { + MachineBasicBlock *stackCheckMBB = MF.CreateMachineBasicBlock(); + MachineBasicBlock *incStackMBB = MF.CreateMachineBasicBlock(); + + for (const auto &LI : PrologueMBB.liveins()) { + stackCheckMBB->addLiveIn(LI); + incStackMBB->addLiveIn(LI); + } + + MF.push_front(incStackMBB); + MF.push_front(stackCheckMBB); + + unsigned ScratchReg, SPReg, PReg, SPLimitOffset; + unsigned LEAop, CMPop, CALLop; + if (Is64Bit) { + SPReg = X86::RSP; + PReg = X86::RBP; + LEAop = X86::LEA64r; + CMPop = X86::CMP64rm; + CALLop = X86::CALL64pcrel32; + SPLimitOffset = 0x90; + } else { + SPReg = X86::ESP; + PReg = X86::EBP; + LEAop = X86::LEA32r; + CMPop = X86::CMP32rm; + CALLop = X86::CALLpcrel32; + SPLimitOffset = 0x4c; + } + + ScratchReg = GetScratchRegister(Is64Bit, IsLP64, MF, true); + assert(!MF.getRegInfo().isLiveIn(ScratchReg) && + "HiPE prologue scratch register is live-in"); + + // Create new MBB for StackCheck: + addRegOffset(BuildMI(stackCheckMBB, DL, TII.get(LEAop), ScratchReg), + SPReg, false, -MaxStack); + // SPLimitOffset is in a fixed heap location (pointed by BP). + addRegOffset(BuildMI(stackCheckMBB, DL, TII.get(CMPop)) + .addReg(ScratchReg), PReg, false, SPLimitOffset); + BuildMI(stackCheckMBB, DL, TII.get(X86::JAE_1)).addMBB(&PrologueMBB); + + // Create new MBB for IncStack: + BuildMI(incStackMBB, DL, TII.get(CALLop)). 
+ addExternalSymbol("inc_stack_0"); + addRegOffset(BuildMI(incStackMBB, DL, TII.get(LEAop), ScratchReg), + SPReg, false, -MaxStack); + addRegOffset(BuildMI(incStackMBB, DL, TII.get(CMPop)) + .addReg(ScratchReg), PReg, false, SPLimitOffset); + BuildMI(incStackMBB, DL, TII.get(X86::JLE_1)).addMBB(incStackMBB); + + stackCheckMBB->addSuccessor(&PrologueMBB, {99, 100}); + stackCheckMBB->addSuccessor(incStackMBB, {1, 100}); + incStackMBB->addSuccessor(&PrologueMBB, {99, 100}); + incStackMBB->addSuccessor(incStackMBB, {1, 100}); + } +#ifdef XDEBUG + MF.verify(); +#endif +} + +bool X86FrameLowering::adjustStackWithPops(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, DebugLoc DL, int Offset) const { + + if (Offset <= 0) + return false; + + if (Offset % SlotSize) + return false; + + int NumPops = Offset / SlotSize; + // This is only worth it if we have at most 2 pops. + if (NumPops != 1 && NumPops != 2) + return false; + + // Handle only the trivial case where the adjustment directly follows + // a call. This is the most common one, anyway. + if (MBBI == MBB.begin()) + return false; + MachineBasicBlock::iterator Prev = std::prev(MBBI); + if (!Prev->isCall() || !Prev->getOperand(1).isRegMask()) + return false; + + unsigned Regs[2]; + unsigned FoundRegs = 0; + + auto RegMask = Prev->getOperand(1); + + auto &RegClass = + Is64Bit ? X86::GR64_NOREX_NOSPRegClass : X86::GR32_NOREX_NOSPRegClass; + // Try to find up to NumPops free registers. + for (auto Candidate : RegClass) { + + // Poor man's liveness: + // Since we're immediately after a call, any register that is clobbered + // by the call and not defined by it can be considered dead. + if (!RegMask.clobbersPhysReg(Candidate)) + continue; + + bool IsDef = false; + for (const MachineOperand &MO : Prev->implicit_operands()) { + if (MO.isReg() && MO.isDef() && MO.getReg() == Candidate) { + IsDef = true; + break; + } + } + + if (IsDef) + continue; + + Regs[FoundRegs++] = Candidate; + if (FoundRegs == (unsigned)NumPops) + break; + } + + if (FoundRegs == 0) + return false; + + // If we found only one free register, but need two, reuse the same one twice. + while (FoundRegs < (unsigned)NumPops) + Regs[FoundRegs++] = Regs[0]; + + for (int i = 0; i < NumPops; ++i) + BuildMI(MBB, MBBI, DL, + TII.get(STI.is64Bit() ? X86::POP64r : X86::POP32r), Regs[i]); + + return true; +} + +void X86FrameLowering:: +eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const { + bool reserveCallFrame = hasReservedCallFrame(MF); + unsigned Opcode = I->getOpcode(); + bool isDestroy = Opcode == TII.getCallFrameDestroyOpcode(); + DebugLoc DL = I->getDebugLoc(); + uint64_t Amount = !reserveCallFrame ? I->getOperand(0).getImm() : 0; + uint64_t InternalAmt = (isDestroy || Amount) ? I->getOperand(1).getImm() : 0; + I = MBB.erase(I); + + if (!reserveCallFrame) { + // If the stack pointer can be changed after prologue, turn the + // adjcallstackup instruction into a 'sub ESP, <amt>' and the + // adjcallstackdown instruction into 'add ESP, <amt>' + + // We need to keep the stack aligned properly. To do this, we round the + // amount of space needed for the outgoing arguments up to the next + // alignment boundary. 
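+    // For example, with a 16-byte stack alignment a 20-byte outgoing
+    // argument area is rounded up to 32 bytes, so the setup and the matching
+    // destroy adjust the stack by the same 32 bytes.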
+ unsigned StackAlign = getStackAlignment(); + Amount = RoundUpToAlignment(Amount, StackAlign); + + MachineModuleInfo &MMI = MF.getMMI(); + const Function *Fn = MF.getFunction(); + bool WindowsCFI = MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); + bool DwarfCFI = !WindowsCFI && + (MMI.hasDebugInfo() || Fn->needsUnwindTableEntry()); + + // If we have any exception handlers in this function, and we adjust + // the SP before calls, we may need to indicate this to the unwinder + // using GNU_ARGS_SIZE. Note that this may be necessary even when + // Amount == 0, because the preceding function may have set a non-0 + // GNU_ARGS_SIZE. + // TODO: We don't need to reset this between subsequent functions, + // if it didn't change. + bool HasDwarfEHHandlers = !WindowsCFI && + !MF.getMMI().getLandingPads().empty(); + + if (HasDwarfEHHandlers && !isDestroy && + MF.getInfo<X86MachineFunctionInfo>()->getHasPushSequences()) + BuildCFI(MBB, I, DL, + MCCFIInstruction::createGnuArgsSize(nullptr, Amount)); + + if (Amount == 0) + return; + + // Factor out the amount that gets handled inside the sequence + // (Pushes of argument for frame setup, callee pops for frame destroy) + Amount -= InternalAmt; + + // TODO: This is needed only if we require precise CFA. + // If this is a callee-pop calling convention, emit a CFA adjust for + // the amount the callee popped. + if (isDestroy && InternalAmt && DwarfCFI && !hasFP(MF)) + BuildCFI(MBB, I, DL, + MCCFIInstruction::createAdjustCfaOffset(nullptr, -InternalAmt)); + + if (Amount) { + // Add Amount to SP to destroy a frame, and subtract to setup. + int Offset = isDestroy ? Amount : -Amount; + + if (!(Fn->optForMinSize() && + adjustStackWithPops(MBB, I, DL, Offset))) + BuildStackAdjustment(MBB, I, DL, Offset, /*InEpilogue=*/false); + } + + if (DwarfCFI && !hasFP(MF)) { + // If we don't have FP, but need to generate unwind information, + // we need to set the correct CFA offset after the stack adjustment. + // How much we adjust the CFA offset depends on whether we're emitting + // CFI only for EH purposes or for debugging. EH only requires the CFA + // offset to be correct at each call site, while for debugging we want + // it to be more precise. + int CFAOffset = Amount; + // TODO: When not using precise CFA, we also need to adjust for the + // InternalAmt here. + + if (CFAOffset) { + CFAOffset = isDestroy ? -CFAOffset : CFAOffset; + BuildCFI(MBB, I, DL, + MCCFIInstruction::createAdjustCfaOffset(nullptr, CFAOffset)); + } + } + + return; + } + + if (isDestroy && InternalAmt) { + // If we are performing frame pointer elimination and if the callee pops + // something off the stack pointer, add it back. We do this until we have + // more advanced stack pointer tracking ability. + // We are not tracking the stack pointer adjustment by the callee, so make + // sure we restore the stack pointer immediately after the call, there may + // be spill code inserted between the CALL and ADJCALLSTACKUP instructions. + MachineBasicBlock::iterator B = MBB.begin(); + while (I != B && !std::prev(I)->isCall()) + --I; + BuildStackAdjustment(MBB, I, DL, -InternalAmt, /*InEpilogue=*/false); + } +} + +bool X86FrameLowering::canUseAsEpilogue(const MachineBasicBlock &MBB) const { + assert(MBB.getParent() && "Block is not attached to a function!"); + + // Win64 has strict requirements in terms of epilogue and we are + // not taking a chance at messing with them. + // I.e., unless this block is already an exit block, we can't use + // it as an epilogue. 
+ if (STI.isTargetWin64() && !MBB.succ_empty() && !MBB.isReturnBlock()) + return false; + + if (canUseLEAForSPInEpilogue(*MBB.getParent())) + return true; + + // If we cannot use LEA to adjust SP, we may need to use ADD, which + // clobbers the EFLAGS. Check that we do not need to preserve it, + // otherwise, conservatively assume this is not + // safe to insert the epilogue here. + return !flagsNeedToBePreservedBeforeTheTerminators(MBB); +} + +bool X86FrameLowering::enableShrinkWrapping(const MachineFunction &MF) const { + // If we may need to emit frameless compact unwind information, give + // up as this is currently broken: PR25614. + return MF.getFunction()->hasFnAttribute(Attribute::NoUnwind) || hasFP(MF); +} + +MachineBasicBlock::iterator X86FrameLowering::restoreWin32EHStackPointers( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + DebugLoc DL, bool RestoreSP) const { + assert(STI.isTargetWindowsMSVC() && "funclets only supported in MSVC env"); + assert(STI.isTargetWin32() && "EBP/ESI restoration only required on win32"); + assert(STI.is32Bit() && !Uses64BitFramePtr && + "restoring EBP/ESI on non-32-bit target"); + + MachineFunction &MF = *MBB.getParent(); + unsigned FramePtr = TRI->getFrameRegister(MF); + unsigned BasePtr = TRI->getBaseRegister(); + WinEHFuncInfo &FuncInfo = *MF.getWinEHFuncInfo(); + X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + + // FIXME: Don't set FrameSetup flag in catchret case. + + int FI = FuncInfo.EHRegNodeFrameIndex; + int EHRegSize = MFI->getObjectSize(FI); + + if (RestoreSP) { + // MOV32rm -EHRegSize(%ebp), %esp + addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32rm), X86::ESP), + X86::EBP, true, -EHRegSize) + .setMIFlag(MachineInstr::FrameSetup); + } + + unsigned UsedReg; + int EHRegOffset = getFrameIndexReference(MF, FI, UsedReg); + int EndOffset = -EHRegOffset - EHRegSize; + FuncInfo.EHRegNodeEndOffset = EndOffset; + + if (UsedReg == FramePtr) { + // ADD $offset, %ebp + unsigned ADDri = getADDriOpcode(false, EndOffset); + BuildMI(MBB, MBBI, DL, TII.get(ADDri), FramePtr) + .addReg(FramePtr) + .addImm(EndOffset) + .setMIFlag(MachineInstr::FrameSetup) + ->getOperand(3) + .setIsDead(); + assert(EndOffset >= 0 && + "end of registration object above normal EBP position!"); + } else if (UsedReg == BasePtr) { + // LEA offset(%ebp), %esi + addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::LEA32r), BasePtr), + FramePtr, false, EndOffset) + .setMIFlag(MachineInstr::FrameSetup); + // MOV32rm SavedEBPOffset(%esi), %ebp + assert(X86FI->getHasSEHFramePtrSave()); + int Offset = + getFrameIndexReference(MF, X86FI->getSEHFramePtrSaveIndex(), UsedReg); + assert(UsedReg == BasePtr); + addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32rm), FramePtr), + UsedReg, true, Offset) + .setMIFlag(MachineInstr::FrameSetup); + } else { + llvm_unreachable("32-bit frames with WinEH must use FramePtr or BasePtr"); + } + return MBBI; +} + +unsigned X86FrameLowering::getWinEHParentFrameOffset(const MachineFunction &MF) const { + // RDX, the parent frame pointer, is homed into 16(%rsp) in the prologue. + unsigned Offset = 16; + // RBP is immediately pushed. + Offset += SlotSize; + // All callee-saved registers are then pushed. + Offset += MF.getInfo<X86MachineFunctionInfo>()->getCalleeSavedFrameSize(); + // Every funclet allocates enough stack space for the largest outgoing call. 
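+  // Worked example with illustrative sizes: an 8-byte RBP slot, 16 bytes of
+  // other callee saves and a 32-byte funclet frame (added below) give
+  // 16 + 8 + 16 + 32 = 72.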
+ Offset += getWinEHFuncletFrameSize(MF); + return Offset; +} + +void X86FrameLowering::processFunctionBeforeFrameFinalized( + MachineFunction &MF, RegScavenger *RS) const { + // If this function isn't doing Win64-style C++ EH, we don't need to do + // anything. + const Function *Fn = MF.getFunction(); + if (!STI.is64Bit() || !MF.getMMI().hasEHFunclets() || + classifyEHPersonality(Fn->getPersonalityFn()) != EHPersonality::MSVC_CXX) + return; + + // Win64 C++ EH needs to allocate the UnwindHelp object at some fixed offset + // relative to RSP after the prologue. Find the offset of the last fixed + // object, so that we can allocate a slot immediately following it. If there + // were no fixed objects, use offset -SlotSize, which is immediately after the + // return address. Fixed objects have negative frame indices. + MachineFrameInfo *MFI = MF.getFrameInfo(); + int64_t MinFixedObjOffset = -SlotSize; + for (int I = MFI->getObjectIndexBegin(); I < 0; ++I) + MinFixedObjOffset = std::min(MinFixedObjOffset, MFI->getObjectOffset(I)); + + int64_t UnwindHelpOffset = MinFixedObjOffset - SlotSize; + int UnwindHelpFI = + MFI->CreateFixedObject(SlotSize, UnwindHelpOffset, /*Immutable=*/false); + MF.getWinEHFuncInfo()->UnwindHelpFrameIdx = UnwindHelpFI; + + // Store -2 into UnwindHelp on function entry. We have to scan forwards past + // other frame setup instructions. + MachineBasicBlock &MBB = MF.front(); + auto MBBI = MBB.begin(); + while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup)) + ++MBBI; + + DebugLoc DL = MBB.findDebugLoc(MBBI); + addFrameReference(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64mi32)), + UnwindHelpFI) + .addImm(-2); +} diff --git a/contrib/llvm/lib/Target/X86/X86FrameLowering.h b/contrib/llvm/lib/Target/X86/X86FrameLowering.h new file mode 100644 index 0000000..3ab41b4 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86FrameLowering.h @@ -0,0 +1,203 @@ +//===-- X86TargetFrameLowering.h - Define frame lowering for X86 -*- C++ -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This class implements X86-specific bits of TargetFrameLowering class. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_X86_X86FRAMELOWERING_H +#define LLVM_LIB_TARGET_X86_X86FRAMELOWERING_H + +#include "llvm/Target/TargetFrameLowering.h" + +namespace llvm { + +class MachineInstrBuilder; +class MCCFIInstruction; +class X86Subtarget; +class X86RegisterInfo; + +class X86FrameLowering : public TargetFrameLowering { +public: + X86FrameLowering(const X86Subtarget &STI, unsigned StackAlignOverride); + + // Cached subtarget predicates. + + const X86Subtarget &STI; + const TargetInstrInfo &TII; + const X86RegisterInfo *TRI; + + unsigned SlotSize; + + /// Is64Bit implies that x86_64 instructions are available. + bool Is64Bit; + + bool IsLP64; + + /// True if the 64-bit frame or stack pointer should be used. True for most + /// 64-bit targets with the exception of x32. If this is false, 32-bit + /// instruction operands should be used to manipulate StackPtr and FramePtr. + bool Uses64BitFramePtr; + + unsigned StackPtr; + + /// Emit target stack probe code. This is required for all + /// large stack allocations on Windows. The caller is required to materialize + /// the number of bytes to probe in RAX/EAX. 
Returns instruction just + /// after the expansion. + MachineInstr *emitStackProbe(MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, DebugLoc DL, + bool InProlog) const; + + /// Replace a StackProbe inline-stub with the actual probe code inline. + void inlineStackProbe(MachineFunction &MF, + MachineBasicBlock &PrologMBB) const override; + + void emitCalleeSavedFrameMoves(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + DebugLoc DL) const; + + /// emitProlog/emitEpilog - These methods insert prolog and epilog code into + /// the function. + void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override; + void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override; + + void adjustForSegmentedStacks(MachineFunction &MF, + MachineBasicBlock &PrologueMBB) const override; + + void adjustForHiPEPrologue(MachineFunction &MF, + MachineBasicBlock &PrologueMBB) const override; + + void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs, + RegScavenger *RS = nullptr) const override; + + bool + assignCalleeSavedSpillSlots(MachineFunction &MF, + const TargetRegisterInfo *TRI, + std::vector<CalleeSavedInfo> &CSI) const override; + + bool spillCalleeSavedRegisters(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + const std::vector<CalleeSavedInfo> &CSI, + const TargetRegisterInfo *TRI) const override; + + bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + const std::vector<CalleeSavedInfo> &CSI, + const TargetRegisterInfo *TRI) const override; + + bool hasFP(const MachineFunction &MF) const override; + bool hasReservedCallFrame(const MachineFunction &MF) const override; + bool canSimplifyCallFramePseudos(const MachineFunction &MF) const override; + bool needsFrameIndexResolution(const MachineFunction &MF) const override; + + int getFrameIndexReference(const MachineFunction &MF, int FI, + unsigned &FrameReg) const override; + + int getFrameIndexReferenceFromSP(const MachineFunction &MF, int FI, + unsigned &FrameReg) const override; + + void eliminateCallFramePseudoInstr(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI) const override; + + unsigned getWinEHParentFrameOffset(const MachineFunction &MF) const override; + + void processFunctionBeforeFrameFinalized(MachineFunction &MF, + RegScavenger *RS) const override; + + /// Check the instruction before/after the passed instruction. If + /// it is an ADD/SUB/LEA instruction it is deleted argument and the + /// stack adjustment is returned as a positive value for ADD/LEA and + /// a negative for SUB. + int mergeSPUpdates(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, + bool doMergeWithPrevious) const; + + /// Emit a series of instructions to increment / decrement the stack + /// pointer by a constant value. + void emitSPUpdate(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, + int64_t NumBytes, bool InEpilogue) const; + + /// Check that LEA can be used on SP in an epilogue sequence for \p MF. + bool canUseLEAForSPInEpilogue(const MachineFunction &MF) const; + + /// Check whether or not the given \p MBB can be used as a epilogue + /// for the target. + /// The epilogue will be inserted before the first terminator of that block. + /// This method is used by the shrink-wrapping pass to decide if + /// \p MBB will be correctly handled by the target. 
+ bool canUseAsEpilogue(const MachineBasicBlock &MBB) const override; + + /// Returns true if the target will correctly handle shrink wrapping. + bool enableShrinkWrapping(const MachineFunction &MF) const override; + + /// convertArgMovsToPushes - This method tries to convert a call sequence + /// that uses sub and mov instructions to put the argument onto the stack + /// into a series of pushes. + /// Returns true if the transformation succeeded, false if not. + bool convertArgMovsToPushes(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + uint64_t Amount) const; + + /// Wraps up getting a CFI index and building a MachineInstr for it. + void BuildCFI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + DebugLoc DL, MCCFIInstruction CFIInst) const; + + /// Sets up EBP and optionally ESI based on the incoming EBP value. Only + /// needed for 32-bit. Used in funclet prologues and at catchret destinations. + MachineBasicBlock::iterator + restoreWin32EHStackPointers(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, DebugLoc DL, + bool RestoreSP = false) const; + +private: + uint64_t calculateMaxStackAlign(const MachineFunction &MF) const; + + /// Emit target stack probe as a call to a helper function + MachineInstr *emitStackProbeCall(MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + DebugLoc DL, bool InProlog) const; + + /// Emit target stack probe as an inline sequence. + MachineInstr *emitStackProbeInline(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + DebugLoc DL, bool InProlog) const; + + /// Emit a stub to later inline the target stack probe. + MachineInstr *emitStackProbeInlineStub(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + DebugLoc DL, bool InProlog) const; + + /// Aligns the stack pointer by ANDing it with -MaxAlign. + void BuildStackAlignAND(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, DebugLoc DL, + unsigned Reg, uint64_t MaxAlign) const; + + /// Make small positive stack adjustments using POPs. + bool adjustStackWithPops(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, DebugLoc DL, + int Offset) const; + + /// Adjusts the stack pointer using LEA, SUB, or ADD. + MachineInstrBuilder BuildStackAdjustment(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + DebugLoc DL, int64_t Offset, + bool InEpilogue) const; + + unsigned getPSPSlotOffsetFromSP(const MachineFunction &MF) const; + + unsigned getWinEHFuncletFrameSize(const MachineFunction &MF) const; +}; + +} // End llvm namespace + +#endif diff --git a/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp new file mode 100644 index 0000000..868ae4e --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -0,0 +1,3012 @@ +//===- X86ISelDAGToDAG.cpp - A DAG pattern matching inst selector for X86 -===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines a DAG pattern matching instruction selector for X86, +// converting from a legalized dag to a X86 dag. 
+// +//===----------------------------------------------------------------------===// + +#include "X86.h" +#include "X86InstrBuilder.h" +#include "X86MachineFunctionInfo.h" +#include "X86RegisterInfo.h" +#include "X86Subtarget.h" +#include "X86TargetMachine.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/SelectionDAGISel.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/Type.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetOptions.h" +#include <stdint.h> +using namespace llvm; + +#define DEBUG_TYPE "x86-isel" + +STATISTIC(NumLoadMoved, "Number of loads moved below TokenFactor"); + +//===----------------------------------------------------------------------===// +// Pattern Matcher Implementation +//===----------------------------------------------------------------------===// + +namespace { + /// This corresponds to X86AddressMode, but uses SDValue's instead of register + /// numbers for the leaves of the matched tree. + struct X86ISelAddressMode { + enum { + RegBase, + FrameIndexBase + } BaseType; + + // This is really a union, discriminated by BaseType! + SDValue Base_Reg; + int Base_FrameIndex; + + unsigned Scale; + SDValue IndexReg; + int32_t Disp; + SDValue Segment; + const GlobalValue *GV; + const Constant *CP; + const BlockAddress *BlockAddr; + const char *ES; + MCSymbol *MCSym; + int JT; + unsigned Align; // CP alignment. + unsigned char SymbolFlags; // X86II::MO_* + + X86ISelAddressMode() + : BaseType(RegBase), Base_FrameIndex(0), Scale(1), IndexReg(), Disp(0), + Segment(), GV(nullptr), CP(nullptr), BlockAddr(nullptr), ES(nullptr), + MCSym(nullptr), JT(-1), Align(0), SymbolFlags(X86II::MO_NO_FLAG) {} + + bool hasSymbolicDisplacement() const { + return GV != nullptr || CP != nullptr || ES != nullptr || + MCSym != nullptr || JT != -1 || BlockAddr != nullptr; + } + + bool hasBaseOrIndexReg() const { + return BaseType == FrameIndexBase || + IndexReg.getNode() != nullptr || Base_Reg.getNode() != nullptr; + } + + /// Return true if this addressing mode is already RIP-relative. 
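+    /// Such a mode can carry at most a 32-bit displacement on top of %rip
+    /// and no base or index register, e.g. 'sym(%rip)' is representable but
+    /// 'sym(%rip,%rcx,4)' is not.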
+ bool isRIPRelative() const { + if (BaseType != RegBase) return false; + if (RegisterSDNode *RegNode = + dyn_cast_or_null<RegisterSDNode>(Base_Reg.getNode())) + return RegNode->getReg() == X86::RIP; + return false; + } + + void setBaseReg(SDValue Reg) { + BaseType = RegBase; + Base_Reg = Reg; + } + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + void dump() { + dbgs() << "X86ISelAddressMode " << this << '\n'; + dbgs() << "Base_Reg "; + if (Base_Reg.getNode()) + Base_Reg.getNode()->dump(); + else + dbgs() << "nul"; + dbgs() << " Base.FrameIndex " << Base_FrameIndex << '\n' + << " Scale" << Scale << '\n' + << "IndexReg "; + if (IndexReg.getNode()) + IndexReg.getNode()->dump(); + else + dbgs() << "nul"; + dbgs() << " Disp " << Disp << '\n' + << "GV "; + if (GV) + GV->dump(); + else + dbgs() << "nul"; + dbgs() << " CP "; + if (CP) + CP->dump(); + else + dbgs() << "nul"; + dbgs() << '\n' + << "ES "; + if (ES) + dbgs() << ES; + else + dbgs() << "nul"; + dbgs() << " MCSym "; + if (MCSym) + dbgs() << MCSym; + else + dbgs() << "nul"; + dbgs() << " JT" << JT << " Align" << Align << '\n'; + } +#endif + }; +} + +namespace { + //===--------------------------------------------------------------------===// + /// ISel - X86-specific code to select X86 machine instructions for + /// SelectionDAG operations. + /// + class X86DAGToDAGISel final : public SelectionDAGISel { + /// Keep a pointer to the X86Subtarget around so that we can + /// make the right decision when generating code for different targets. + const X86Subtarget *Subtarget; + + /// If true, selector should try to optimize for code size instead of + /// performance. + bool OptForSize; + + public: + explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOpt::Level OptLevel) + : SelectionDAGISel(tm, OptLevel), OptForSize(false) {} + + const char *getPassName() const override { + return "X86 DAG->DAG Instruction Selection"; + } + + bool runOnMachineFunction(MachineFunction &MF) override { + // Reset the subtarget each time through. + Subtarget = &MF.getSubtarget<X86Subtarget>(); + SelectionDAGISel::runOnMachineFunction(MF); + return true; + } + + void EmitFunctionEntryCode() override; + + bool IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const override; + + void PreprocessISelDAG() override; + + inline bool immSext8(SDNode *N) const { + return isInt<8>(cast<ConstantSDNode>(N)->getSExtValue()); + } + + // True if the 64-bit immediate fits in a 32-bit sign-extended field. + inline bool i64immSExt32(SDNode *N) const { + uint64_t v = cast<ConstantSDNode>(N)->getZExtValue(); + return (int64_t)v == (int32_t)v; + } + +// Include the pieces autogenerated from the target description. 
+#include "X86GenDAGISel.inc" + + private: + SDNode *Select(SDNode *N) override; + SDNode *selectGather(SDNode *N, unsigned Opc); + SDNode *selectAtomicLoadArith(SDNode *Node, MVT NVT); + + bool foldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM); + bool matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM); + bool matchWrapper(SDValue N, X86ISelAddressMode &AM); + bool matchAddress(SDValue N, X86ISelAddressMode &AM); + bool matchAdd(SDValue N, X86ISelAddressMode &AM, unsigned Depth); + bool matchAddressRecursively(SDValue N, X86ISelAddressMode &AM, + unsigned Depth); + bool matchAddressBase(SDValue N, X86ISelAddressMode &AM); + bool selectAddr(SDNode *Parent, SDValue N, SDValue &Base, + SDValue &Scale, SDValue &Index, SDValue &Disp, + SDValue &Segment); + bool selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base, + SDValue &Scale, SDValue &Index, SDValue &Disp, + SDValue &Segment); + bool selectMOV64Imm32(SDValue N, SDValue &Imm); + bool selectLEAAddr(SDValue N, SDValue &Base, + SDValue &Scale, SDValue &Index, SDValue &Disp, + SDValue &Segment); + bool selectLEA64_32Addr(SDValue N, SDValue &Base, + SDValue &Scale, SDValue &Index, SDValue &Disp, + SDValue &Segment); + bool selectTLSADDRAddr(SDValue N, SDValue &Base, + SDValue &Scale, SDValue &Index, SDValue &Disp, + SDValue &Segment); + bool selectScalarSSELoad(SDNode *Root, SDValue N, + SDValue &Base, SDValue &Scale, + SDValue &Index, SDValue &Disp, + SDValue &Segment, + SDValue &NodeWithChain); + + bool tryFoldLoad(SDNode *P, SDValue N, + SDValue &Base, SDValue &Scale, + SDValue &Index, SDValue &Disp, + SDValue &Segment); + + /// Implement addressing mode selection for inline asm expressions. + bool SelectInlineAsmMemoryOperand(const SDValue &Op, + unsigned ConstraintID, + std::vector<SDValue> &OutOps) override; + + void emitSpecialCodeForMain(); + + inline void getAddressOperands(X86ISelAddressMode &AM, SDLoc DL, + SDValue &Base, SDValue &Scale, + SDValue &Index, SDValue &Disp, + SDValue &Segment) { + Base = (AM.BaseType == X86ISelAddressMode::FrameIndexBase) + ? CurDAG->getTargetFrameIndex( + AM.Base_FrameIndex, + TLI->getPointerTy(CurDAG->getDataLayout())) + : AM.Base_Reg; + Scale = getI8Imm(AM.Scale, DL); + Index = AM.IndexReg; + // These are 32-bit even in 64-bit mode since RIP-relative offset + // is 32-bit. + if (AM.GV) + Disp = CurDAG->getTargetGlobalAddress(AM.GV, SDLoc(), + MVT::i32, AM.Disp, + AM.SymbolFlags); + else if (AM.CP) + Disp = CurDAG->getTargetConstantPool(AM.CP, MVT::i32, + AM.Align, AM.Disp, AM.SymbolFlags); + else if (AM.ES) { + assert(!AM.Disp && "Non-zero displacement is ignored with ES."); + Disp = CurDAG->getTargetExternalSymbol(AM.ES, MVT::i32, AM.SymbolFlags); + } else if (AM.MCSym) { + assert(!AM.Disp && "Non-zero displacement is ignored with MCSym."); + assert(AM.SymbolFlags == 0 && "oo"); + Disp = CurDAG->getMCSymbol(AM.MCSym, MVT::i32); + } else if (AM.JT != -1) { + assert(!AM.Disp && "Non-zero displacement is ignored with JT."); + Disp = CurDAG->getTargetJumpTable(AM.JT, MVT::i32, AM.SymbolFlags); + } else if (AM.BlockAddr) + Disp = CurDAG->getTargetBlockAddress(AM.BlockAddr, MVT::i32, AM.Disp, + AM.SymbolFlags); + else + Disp = CurDAG->getTargetConstant(AM.Disp, DL, MVT::i32); + + if (AM.Segment.getNode()) + Segment = AM.Segment; + else + Segment = CurDAG->getRegister(0, MVT::i32); + } + + // Utility function to determine whether we should avoid selecting + // immediate forms of instructions for better code size or not. 
+ // At a high level, we'd like to avoid such instructions when + // we have similar constants used within the same basic block + // that can be kept in a register. + // + bool shouldAvoidImmediateInstFormsForSize(SDNode *N) const { + uint32_t UseCount = 0; + + // Do not want to hoist if we're not optimizing for size. + // TODO: We'd like to remove this restriction. + // See the comment in X86InstrInfo.td for more info. + if (!OptForSize) + return false; + + // Walk all the users of the immediate. + for (SDNode::use_iterator UI = N->use_begin(), + UE = N->use_end(); (UI != UE) && (UseCount < 2); ++UI) { + + SDNode *User = *UI; + + // This user is already selected. Count it as a legitimate use and + // move on. + if (User->isMachineOpcode()) { + UseCount++; + continue; + } + + // We want to count stores of immediates as real uses. + if (User->getOpcode() == ISD::STORE && + User->getOperand(1).getNode() == N) { + UseCount++; + continue; + } + + // We don't currently match users that have > 2 operands (except + // for stores, which are handled above) + // Those instruction won't match in ISEL, for now, and would + // be counted incorrectly. + // This may change in the future as we add additional instruction + // types. + if (User->getNumOperands() != 2) + continue; + + // Immediates that are used for offsets as part of stack + // manipulation should be left alone. These are typically + // used to indicate SP offsets for argument passing and + // will get pulled into stores/pushes (implicitly). + if (User->getOpcode() == X86ISD::ADD || + User->getOpcode() == ISD::ADD || + User->getOpcode() == X86ISD::SUB || + User->getOpcode() == ISD::SUB) { + + // Find the other operand of the add/sub. + SDValue OtherOp = User->getOperand(0); + if (OtherOp.getNode() == N) + OtherOp = User->getOperand(1); + + // Don't count if the other operand is SP. + RegisterSDNode *RegNode; + if (OtherOp->getOpcode() == ISD::CopyFromReg && + (RegNode = dyn_cast_or_null<RegisterSDNode>( + OtherOp->getOperand(1).getNode()))) + if ((RegNode->getReg() == X86::ESP) || + (RegNode->getReg() == X86::RSP)) + continue; + } + + // ... otherwise, count this and move on. + UseCount++; + } + + // If we have more than 1 use, then recommend for hoisting. + return (UseCount > 1); + } + + /// Return a target constant with the specified value of type i8. + inline SDValue getI8Imm(unsigned Imm, SDLoc DL) { + return CurDAG->getTargetConstant(Imm, DL, MVT::i8); + } + + /// Return a target constant with the specified value, of type i32. + inline SDValue getI32Imm(unsigned Imm, SDLoc DL) { + return CurDAG->getTargetConstant(Imm, DL, MVT::i32); + } + + /// Return an SDNode that returns the value of the global base register. + /// Output instructions required to initialize the global base register, + /// if necessary. + SDNode *getGlobalBaseReg(); + + /// Return a reference to the TargetMachine, casted to the target-specific + /// type. + const X86TargetMachine &getTargetMachine() const { + return static_cast<const X86TargetMachine &>(TM); + } + + /// Return a reference to the TargetInstrInfo, casted to the target-specific + /// type. + const X86InstrInfo *getInstrInfo() const { + return Subtarget->getInstrInfo(); + } + + /// \brief Address-mode matching performs shift-of-and to and-of-shift + /// reassociation in order to expose more scaled addressing + /// opportunities. 
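+    /// For example, (x << 2) & 0xFC is reassociated to (x & 0x3F) << 2 so
+    /// that the shift can be folded into the address as a scale of 4.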
+ bool ComplexPatternFuncMutatesDAG() const override { + return true; + } + }; +} + + +bool +X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const { + if (OptLevel == CodeGenOpt::None) return false; + + if (!N.hasOneUse()) + return false; + + if (N.getOpcode() != ISD::LOAD) + return true; + + // If N is a load, do additional profitability checks. + if (U == Root) { + switch (U->getOpcode()) { + default: break; + case X86ISD::ADD: + case X86ISD::SUB: + case X86ISD::AND: + case X86ISD::XOR: + case X86ISD::OR: + case ISD::ADD: + case ISD::ADDC: + case ISD::ADDE: + case ISD::AND: + case ISD::OR: + case ISD::XOR: { + SDValue Op1 = U->getOperand(1); + + // If the other operand is a 8-bit immediate we should fold the immediate + // instead. This reduces code size. + // e.g. + // movl 4(%esp), %eax + // addl $4, %eax + // vs. + // movl $4, %eax + // addl 4(%esp), %eax + // The former is 2 bytes shorter. In case where the increment is 1, then + // the saving can be 4 bytes (by using incl %eax). + if (ConstantSDNode *Imm = dyn_cast<ConstantSDNode>(Op1)) + if (Imm->getAPIntValue().isSignedIntN(8)) + return false; + + // If the other operand is a TLS address, we should fold it instead. + // This produces + // movl %gs:0, %eax + // leal i@NTPOFF(%eax), %eax + // instead of + // movl $i@NTPOFF, %eax + // addl %gs:0, %eax + // if the block also has an access to a second TLS address this will save + // a load. + // FIXME: This is probably also true for non-TLS addresses. + if (Op1.getOpcode() == X86ISD::Wrapper) { + SDValue Val = Op1.getOperand(0); + if (Val.getOpcode() == ISD::TargetGlobalTLSAddress) + return false; + } + } + } + } + + return true; +} + +/// Replace the original chain operand of the call with +/// load's chain operand and move load below the call's chain operand. +static void moveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load, + SDValue Call, SDValue OrigChain) { + SmallVector<SDValue, 8> Ops; + SDValue Chain = OrigChain.getOperand(0); + if (Chain.getNode() == Load.getNode()) + Ops.push_back(Load.getOperand(0)); + else { + assert(Chain.getOpcode() == ISD::TokenFactor && + "Unexpected chain operand"); + for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) + if (Chain.getOperand(i).getNode() == Load.getNode()) + Ops.push_back(Load.getOperand(0)); + else + Ops.push_back(Chain.getOperand(i)); + SDValue NewChain = + CurDAG->getNode(ISD::TokenFactor, SDLoc(Load), MVT::Other, Ops); + Ops.clear(); + Ops.push_back(NewChain); + } + Ops.append(OrigChain->op_begin() + 1, OrigChain->op_end()); + CurDAG->UpdateNodeOperands(OrigChain.getNode(), Ops); + CurDAG->UpdateNodeOperands(Load.getNode(), Call.getOperand(0), + Load.getOperand(1), Load.getOperand(2)); + + Ops.clear(); + Ops.push_back(SDValue(Load.getNode(), 1)); + Ops.append(Call->op_begin() + 1, Call->op_end()); + CurDAG->UpdateNodeOperands(Call.getNode(), Ops); +} + +/// Return true if call address is a load and it can be +/// moved below CALLSEQ_START and the chains leading up to the call. +/// Return the CALLSEQ_START by reference as a second output. +/// In the case of a tail call, there isn't a callseq node between the call +/// chain and the load. +static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) { + // The transformation is somewhat dangerous if the call's chain was glued to + // the call. After MoveBelowOrigChain the load is moved between the call and + // the chain, this can create a cycle if the load is not folded. 
So it is + // *really* important that we are sure the load will be folded. + if (Callee.getNode() == Chain.getNode() || !Callee.hasOneUse()) + return false; + LoadSDNode *LD = dyn_cast<LoadSDNode>(Callee.getNode()); + if (!LD || + LD->isVolatile() || + LD->getAddressingMode() != ISD::UNINDEXED || + LD->getExtensionType() != ISD::NON_EXTLOAD) + return false; + + // Now let's find the callseq_start. + while (HasCallSeq && Chain.getOpcode() != ISD::CALLSEQ_START) { + if (!Chain.hasOneUse()) + return false; + Chain = Chain.getOperand(0); + } + + if (!Chain.getNumOperands()) + return false; + // Since we are not checking for AA here, conservatively abort if the chain + // writes to memory. It's not safe to move the callee (a load) across a store. + if (isa<MemSDNode>(Chain.getNode()) && + cast<MemSDNode>(Chain.getNode())->writeMem()) + return false; + if (Chain.getOperand(0).getNode() == Callee.getNode()) + return true; + if (Chain.getOperand(0).getOpcode() == ISD::TokenFactor && + Callee.getValue(1).isOperandOf(Chain.getOperand(0).getNode()) && + Callee.getValue(1).hasOneUse()) + return true; + return false; +} + +void X86DAGToDAGISel::PreprocessISelDAG() { + // OptForSize is used in pattern predicates that isel is matching. + OptForSize = MF->getFunction()->optForSize(); + + for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(), + E = CurDAG->allnodes_end(); I != E; ) { + SDNode *N = &*I++; // Preincrement iterator to avoid invalidation issues. + + if (OptLevel != CodeGenOpt::None && + // Only does this when target favors doesn't favor register indirect + // call. + ((N->getOpcode() == X86ISD::CALL && !Subtarget->callRegIndirect()) || + (N->getOpcode() == X86ISD::TC_RETURN && + // Only does this if load can be folded into TC_RETURN. + (Subtarget->is64Bit() || + getTargetMachine().getRelocationModel() != Reloc::PIC_)))) { + /// Also try moving call address load from outside callseq_start to just + /// before the call to allow it to be folded. + /// + /// [Load chain] + /// ^ + /// | + /// [Load] + /// ^ ^ + /// | | + /// / \-- + /// / | + ///[CALLSEQ_START] | + /// ^ | + /// | | + /// [LOAD/C2Reg] | + /// | | + /// \ / + /// \ / + /// [CALL] + bool HasCallSeq = N->getOpcode() == X86ISD::CALL; + SDValue Chain = N->getOperand(0); + SDValue Load = N->getOperand(1); + if (!isCalleeLoad(Load, Chain, HasCallSeq)) + continue; + moveBelowOrigChain(CurDAG, Load, SDValue(N, 0), Chain); + ++NumLoadMoved; + continue; + } + + // Lower fpround and fpextend nodes that target the FP stack to be store and + // load to the stack. This is a gross hack. We would like to simply mark + // these as being illegal, but when we do that, legalize produces these when + // it expands calls, then expands these in the same legalize pass. We would + // like dag combine to be able to hack on these between the call expansion + // and the node legalization. As such this pass basically does "really + // late" legalization of these inline with the X86 isel pass. + // FIXME: This should only happen when not compiled with -O0. + if (N->getOpcode() != ISD::FP_ROUND && N->getOpcode() != ISD::FP_EXTEND) + continue; + + MVT SrcVT = N->getOperand(0).getSimpleValueType(); + MVT DstVT = N->getSimpleValueType(0); + + // If any of the sources are vectors, no fp stack involved. + if (SrcVT.isVector() || DstVT.isVector()) + continue; + + // If the source and destination are SSE registers, then this is a legal + // conversion that should not be lowered. 
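+    // Otherwise the conversion is routed through memory below: for example,
+    // an f80 -> f32 FP_ROUND whose result is wanted in an SSE register
+    // becomes a truncating store of the x87 value to a stack temporary
+    // followed by an ordinary f32 load of that slot.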
+ const X86TargetLowering *X86Lowering = + static_cast<const X86TargetLowering *>(TLI); + bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT); + bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT); + if (SrcIsSSE && DstIsSSE) + continue; + + if (!SrcIsSSE && !DstIsSSE) { + // If this is an FPStack extension, it is a noop. + if (N->getOpcode() == ISD::FP_EXTEND) + continue; + // If this is a value-preserving FPStack truncation, it is a noop. + if (N->getConstantOperandVal(1)) + continue; + } + + // Here we could have an FP stack truncation or an FPStack <-> SSE convert. + // FPStack has extload and truncstore. SSE can fold direct loads into other + // operations. Based on this, decide what we want to do. + MVT MemVT; + if (N->getOpcode() == ISD::FP_ROUND) + MemVT = DstVT; // FP_ROUND must use DstVT, we can't do a 'trunc load'. + else + MemVT = SrcIsSSE ? SrcVT : DstVT; + + SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT); + SDLoc dl(N); + + // FIXME: optimize the case where the src/dest is a load or store? + SDValue Store = CurDAG->getTruncStore(CurDAG->getEntryNode(), dl, + N->getOperand(0), + MemTmp, MachinePointerInfo(), MemVT, + false, false, 0); + SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store, MemTmp, + MachinePointerInfo(), + MemVT, false, false, false, 0); + + // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the + // extload we created. This will cause general havok on the dag because + // anything below the conversion could be folded into other existing nodes. + // To avoid invalidating 'I', back it up to the convert node. + --I; + CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Result); + + // Now that we did that, the node is dead. Increment the iterator to the + // next node to process, then delete N. + ++I; + CurDAG->DeleteNode(N); + } +} + + +/// Emit any code that needs to be executed only in the main function. +void X86DAGToDAGISel::emitSpecialCodeForMain() { + if (Subtarget->isTargetCygMing()) { + TargetLowering::ArgListTy Args; + auto &DL = CurDAG->getDataLayout(); + + TargetLowering::CallLoweringInfo CLI(*CurDAG); + CLI.setChain(CurDAG->getRoot()) + .setCallee(CallingConv::C, Type::getVoidTy(*CurDAG->getContext()), + CurDAG->getExternalSymbol("__main", TLI->getPointerTy(DL)), + std::move(Args), 0); + const TargetLowering &TLI = CurDAG->getTargetLoweringInfo(); + std::pair<SDValue, SDValue> Result = TLI.LowerCallTo(CLI); + CurDAG->setRoot(Result.second); + } +} + +void X86DAGToDAGISel::EmitFunctionEntryCode() { + // If this is main, emit special code for main. + if (const Function *Fn = MF->getFunction()) + if (Fn->hasExternalLinkage() && Fn->getName() == "main") + emitSpecialCodeForMain(); +} + +static bool isDispSafeForFrameIndex(int64_t Val) { + // On 64-bit platforms, we can run into an issue where a frame index + // includes a displacement that, when added to the explicit displacement, + // will overflow the displacement field. Assuming that the frame index + // displacement fits into a 31-bit integer (which is only slightly more + // aggressive than the current fundamental assumption that it fits into + // a 32-bit integer), a 31-bit disp should always be safe. + return isInt<31>(Val); +} + +bool X86DAGToDAGISel::foldOffsetIntoAddress(uint64_t Offset, + X86ISelAddressMode &AM) { + // Cannot combine ExternalSymbol displacements with integer offsets. 
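+  // (getAddressOperands asserts that ES/MCSym operands carry a zero
+  //  displacement, so e.g. 'extsym + 16' cannot be folded into one operand.)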
+ if (Offset != 0 && (AM.ES || AM.MCSym)) + return true; + int64_t Val = AM.Disp + Offset; + CodeModel::Model M = TM.getCodeModel(); + if (Subtarget->is64Bit()) { + if (!X86::isOffsetSuitableForCodeModel(Val, M, + AM.hasSymbolicDisplacement())) + return true; + // In addition to the checks required for a register base, check that + // we do not try to use an unsafe Disp with a frame index. + if (AM.BaseType == X86ISelAddressMode::FrameIndexBase && + !isDispSafeForFrameIndex(Val)) + return true; + } + AM.Disp = Val; + return false; + +} + +bool X86DAGToDAGISel::matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM){ + SDValue Address = N->getOperand(1); + + // load gs:0 -> GS segment register. + // load fs:0 -> FS segment register. + // + // This optimization is valid because the GNU TLS model defines that + // gs:0 (or fs:0 on X86-64) contains its own address. + // For more information see http://people.redhat.com/drepper/tls.pdf + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Address)) + if (C->getSExtValue() == 0 && AM.Segment.getNode() == nullptr && + Subtarget->isTargetLinux()) + switch (N->getPointerInfo().getAddrSpace()) { + case 256: + AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16); + return false; + case 257: + AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16); + return false; + } + + return true; +} + +/// Try to match X86ISD::Wrapper and X86ISD::WrapperRIP nodes into an addressing +/// mode. These wrap things that will resolve down into a symbol reference. +/// If no match is possible, this returns true, otherwise it returns false. +bool X86DAGToDAGISel::matchWrapper(SDValue N, X86ISelAddressMode &AM) { + // If the addressing mode already has a symbol as the displacement, we can + // never match another symbol. + if (AM.hasSymbolicDisplacement()) + return true; + + SDValue N0 = N.getOperand(0); + CodeModel::Model M = TM.getCodeModel(); + + // Handle X86-64 rip-relative addresses. We check this before checking direct + // folding because RIP is preferable to non-RIP accesses. + if (Subtarget->is64Bit() && N.getOpcode() == X86ISD::WrapperRIP && + // Under X86-64 non-small code model, GV (and friends) are 64-bits, so + // they cannot be folded into immediate fields. + // FIXME: This can be improved for kernel and other models? + (M == CodeModel::Small || M == CodeModel::Kernel)) { + // Base and index reg must be 0 in order to use %rip as base. 
+ if (AM.hasBaseOrIndexReg()) + return true; + if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(N0)) { + X86ISelAddressMode Backup = AM; + AM.GV = G->getGlobal(); + AM.SymbolFlags = G->getTargetFlags(); + if (foldOffsetIntoAddress(G->getOffset(), AM)) { + AM = Backup; + return true; + } + } else if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(N0)) { + X86ISelAddressMode Backup = AM; + AM.CP = CP->getConstVal(); + AM.Align = CP->getAlignment(); + AM.SymbolFlags = CP->getTargetFlags(); + if (foldOffsetIntoAddress(CP->getOffset(), AM)) { + AM = Backup; + return true; + } + } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(N0)) { + AM.ES = S->getSymbol(); + AM.SymbolFlags = S->getTargetFlags(); + } else if (auto *S = dyn_cast<MCSymbolSDNode>(N0)) { + AM.MCSym = S->getMCSymbol(); + } else if (JumpTableSDNode *J = dyn_cast<JumpTableSDNode>(N0)) { + AM.JT = J->getIndex(); + AM.SymbolFlags = J->getTargetFlags(); + } else if (BlockAddressSDNode *BA = dyn_cast<BlockAddressSDNode>(N0)) { + X86ISelAddressMode Backup = AM; + AM.BlockAddr = BA->getBlockAddress(); + AM.SymbolFlags = BA->getTargetFlags(); + if (foldOffsetIntoAddress(BA->getOffset(), AM)) { + AM = Backup; + return true; + } + } else + llvm_unreachable("Unhandled symbol reference node."); + + if (N.getOpcode() == X86ISD::WrapperRIP) + AM.setBaseReg(CurDAG->getRegister(X86::RIP, MVT::i64)); + return false; + } + + // Handle the case when globals fit in our immediate field: This is true for + // X86-32 always and X86-64 when in -mcmodel=small mode. In 64-bit + // mode, this only applies to a non-RIP-relative computation. + if (!Subtarget->is64Bit() || + M == CodeModel::Small || M == CodeModel::Kernel) { + assert(N.getOpcode() != X86ISD::WrapperRIP && + "RIP-relative addressing already handled"); + if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(N0)) { + AM.GV = G->getGlobal(); + AM.Disp += G->getOffset(); + AM.SymbolFlags = G->getTargetFlags(); + } else if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(N0)) { + AM.CP = CP->getConstVal(); + AM.Align = CP->getAlignment(); + AM.Disp += CP->getOffset(); + AM.SymbolFlags = CP->getTargetFlags(); + } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(N0)) { + AM.ES = S->getSymbol(); + AM.SymbolFlags = S->getTargetFlags(); + } else if (auto *S = dyn_cast<MCSymbolSDNode>(N0)) { + AM.MCSym = S->getMCSymbol(); + } else if (JumpTableSDNode *J = dyn_cast<JumpTableSDNode>(N0)) { + AM.JT = J->getIndex(); + AM.SymbolFlags = J->getTargetFlags(); + } else if (BlockAddressSDNode *BA = dyn_cast<BlockAddressSDNode>(N0)) { + AM.BlockAddr = BA->getBlockAddress(); + AM.Disp += BA->getOffset(); + AM.SymbolFlags = BA->getTargetFlags(); + } else + llvm_unreachable("Unhandled symbol reference node."); + return false; + } + + return true; +} + +/// Add the specified node to the specified addressing mode, returning true if +/// it cannot be done. This just pattern matches for the addressing mode. +bool X86DAGToDAGISel::matchAddress(SDValue N, X86ISelAddressMode &AM) { + if (matchAddressRecursively(N, AM, 0)) + return true; + + // Post-processing: Convert lea(,%reg,2) to lea(%reg,%reg), which has + // a smaller encoding and avoids a scaled-index. + if (AM.Scale == 2 && + AM.BaseType == X86ISelAddressMode::RegBase && + AM.Base_Reg.getNode() == nullptr) { + AM.Base_Reg = AM.IndexReg; + AM.Scale = 1; + } + + // Post-processing: Convert foo to foo(%rip), even in non-PIC mode, + // because it has a smaller encoding. 
+ // TODO: Which other code models can use this? + if (TM.getCodeModel() == CodeModel::Small && + Subtarget->is64Bit() && + AM.Scale == 1 && + AM.BaseType == X86ISelAddressMode::RegBase && + AM.Base_Reg.getNode() == nullptr && + AM.IndexReg.getNode() == nullptr && + AM.SymbolFlags == X86II::MO_NO_FLAG && + AM.hasSymbolicDisplacement()) + AM.Base_Reg = CurDAG->getRegister(X86::RIP, MVT::i64); + + return false; +} + +bool X86DAGToDAGISel::matchAdd(SDValue N, X86ISelAddressMode &AM, + unsigned Depth) { + // Add an artificial use to this node so that we can keep track of + // it if it gets CSE'd with a different node. + HandleSDNode Handle(N); + + X86ISelAddressMode Backup = AM; + if (!matchAddressRecursively(N.getOperand(0), AM, Depth+1) && + !matchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1)) + return false; + AM = Backup; + + // Try again after commuting the operands. + if (!matchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1) && + !matchAddressRecursively(Handle.getValue().getOperand(0), AM, Depth+1)) + return false; + AM = Backup; + + // If we couldn't fold both operands into the address at the same time, + // see if we can just put each operand into a register and fold at least + // the add. + if (AM.BaseType == X86ISelAddressMode::RegBase && + !AM.Base_Reg.getNode() && + !AM.IndexReg.getNode()) { + N = Handle.getValue(); + AM.Base_Reg = N.getOperand(0); + AM.IndexReg = N.getOperand(1); + AM.Scale = 1; + return false; + } + N = Handle.getValue(); + return true; +} + +// Insert a node into the DAG at least before the Pos node's position. This +// will reposition the node as needed, and will assign it a node ID that is <= +// the Pos node's ID. Note that this does *not* preserve the uniqueness of node +// IDs! The selection DAG must no longer depend on their uniqueness when this +// is used. +static void insertDAGNode(SelectionDAG &DAG, SDValue Pos, SDValue N) { + if (N.getNode()->getNodeId() == -1 || + N.getNode()->getNodeId() > Pos.getNode()->getNodeId()) { + DAG.RepositionNode(Pos.getNode()->getIterator(), N.getNode()); + N.getNode()->setNodeId(Pos.getNode()->getNodeId()); + } +} + +// Transform "(X >> (8-C1)) & (0xff << C1)" to "((X >> 8) & 0xff) << C1" if +// safe. This allows us to convert the shift and and into an h-register +// extract and a scaled index. Returns false if the simplification is +// performed. +static bool foldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N, + uint64_t Mask, + SDValue Shift, SDValue X, + X86ISelAddressMode &AM) { + if (Shift.getOpcode() != ISD::SRL || + !isa<ConstantSDNode>(Shift.getOperand(1)) || + !Shift.hasOneUse()) + return true; + + int ScaleLog = 8 - Shift.getConstantOperandVal(1); + if (ScaleLog <= 0 || ScaleLog >= 4 || + Mask != (0xffu << ScaleLog)) + return true; + + MVT VT = N.getSimpleValueType(); + SDLoc DL(N); + SDValue Eight = DAG.getConstant(8, DL, MVT::i8); + SDValue NewMask = DAG.getConstant(0xff, DL, VT); + SDValue Srl = DAG.getNode(ISD::SRL, DL, VT, X, Eight); + SDValue And = DAG.getNode(ISD::AND, DL, VT, Srl, NewMask); + SDValue ShlCount = DAG.getConstant(ScaleLog, DL, MVT::i8); + SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And, ShlCount); + + // Insert the new nodes into the topological ordering. We must do this in + // a valid topological ordering as nothing is going to go back and re-sort + // these nodes. We continually insert before 'N' in sequence as this is + // essentially a pre-flattened and pre-sorted sequence of nodes. There is no + // hierarchy left to express. 
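+  // For instance, with ScaleLog == 2 the incoming (x >> 6) & 0x3fc has been
+  // rebuilt above as ((x >> 8) & 0xff) << 2, which exposes an h-register
+  // style byte extract plus a scale-4 index.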
+ insertDAGNode(DAG, N, Eight); + insertDAGNode(DAG, N, Srl); + insertDAGNode(DAG, N, NewMask); + insertDAGNode(DAG, N, And); + insertDAGNode(DAG, N, ShlCount); + insertDAGNode(DAG, N, Shl); + DAG.ReplaceAllUsesWith(N, Shl); + AM.IndexReg = And; + AM.Scale = (1 << ScaleLog); + return false; +} + +// Transforms "(X << C1) & C2" to "(X & (C2>>C1)) << C1" if safe and if this +// allows us to fold the shift into this addressing mode. Returns false if the +// transform succeeded. +static bool foldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N, + uint64_t Mask, + SDValue Shift, SDValue X, + X86ISelAddressMode &AM) { + if (Shift.getOpcode() != ISD::SHL || + !isa<ConstantSDNode>(Shift.getOperand(1))) + return true; + + // Not likely to be profitable if either the AND or SHIFT node has more + // than one use (unless all uses are for address computation). Besides, + // isel mechanism requires their node ids to be reused. + if (!N.hasOneUse() || !Shift.hasOneUse()) + return true; + + // Verify that the shift amount is something we can fold. + unsigned ShiftAmt = Shift.getConstantOperandVal(1); + if (ShiftAmt != 1 && ShiftAmt != 2 && ShiftAmt != 3) + return true; + + MVT VT = N.getSimpleValueType(); + SDLoc DL(N); + SDValue NewMask = DAG.getConstant(Mask >> ShiftAmt, DL, VT); + SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, X, NewMask); + SDValue NewShift = DAG.getNode(ISD::SHL, DL, VT, NewAnd, Shift.getOperand(1)); + + // Insert the new nodes into the topological ordering. We must do this in + // a valid topological ordering as nothing is going to go back and re-sort + // these nodes. We continually insert before 'N' in sequence as this is + // essentially a pre-flattened and pre-sorted sequence of nodes. There is no + // hierarchy left to express. + insertDAGNode(DAG, N, NewMask); + insertDAGNode(DAG, N, NewAnd); + insertDAGNode(DAG, N, NewShift); + DAG.ReplaceAllUsesWith(N, NewShift); + + AM.Scale = 1 << ShiftAmt; + AM.IndexReg = NewAnd; + return false; +} + +// Implement some heroics to detect shifts of masked values where the mask can +// be replaced by extending the shift and undoing that in the addressing mode +// scale. Patterns such as (shl (srl x, c1), c2) are canonicalized into (and +// (srl x, SHIFT), MASK) by DAGCombines that don't know the shl can be done in +// the addressing mode. This results in code such as: +// +// int f(short *y, int *lookup_table) { +// ... +// return *y + lookup_table[*y >> 11]; +// } +// +// Turning into: +// movzwl (%rdi), %eax +// movl %eax, %ecx +// shrl $11, %ecx +// addl (%rsi,%rcx,4), %eax +// +// Instead of: +// movzwl (%rdi), %eax +// movl %eax, %ecx +// shrl $9, %ecx +// andl $124, %rcx +// addl (%rsi,%rcx), %eax +// +// Note that this function assumes the mask is provided as a mask *after* the +// value is shifted. The input chain may or may not match that, but computing +// such a mask is trivial. +static bool foldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N, + uint64_t Mask, + SDValue Shift, SDValue X, + X86ISelAddressMode &AM) { + if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse() || + !isa<ConstantSDNode>(Shift.getOperand(1))) + return true; + + unsigned ShiftAmt = Shift.getConstantOperandVal(1); + unsigned MaskLZ = countLeadingZeros(Mask); + unsigned MaskTZ = countTrailingZeros(Mask); + + // The amount of shift we're trying to fit into the addressing mode is taken + // from the trailing zeros of the mask. + unsigned AMShiftAmt = MaskTZ; + + // There is nothing we can do here unless the mask is removing some bits. 
+ // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits. + if (AMShiftAmt <= 0 || AMShiftAmt > 3) return true; + + // We also need to ensure that mask is a continuous run of bits. + if (countTrailingOnes(Mask >> MaskTZ) + MaskTZ + MaskLZ != 64) return true; + + // Scale the leading zero count down based on the actual size of the value. + // Also scale it down based on the size of the shift. + MaskLZ -= (64 - X.getSimpleValueType().getSizeInBits()) + ShiftAmt; + + // The final check is to ensure that any masked out high bits of X are + // already known to be zero. Otherwise, the mask has a semantic impact + // other than masking out a couple of low bits. Unfortunately, because of + // the mask, zero extensions will be removed from operands in some cases. + // This code works extra hard to look through extensions because we can + // replace them with zero extensions cheaply if necessary. + bool ReplacingAnyExtend = false; + if (X.getOpcode() == ISD::ANY_EXTEND) { + unsigned ExtendBits = X.getSimpleValueType().getSizeInBits() - + X.getOperand(0).getSimpleValueType().getSizeInBits(); + // Assume that we'll replace the any-extend with a zero-extend, and + // narrow the search to the extended value. + X = X.getOperand(0); + MaskLZ = ExtendBits > MaskLZ ? 0 : MaskLZ - ExtendBits; + ReplacingAnyExtend = true; + } + APInt MaskedHighBits = + APInt::getHighBitsSet(X.getSimpleValueType().getSizeInBits(), MaskLZ); + APInt KnownZero, KnownOne; + DAG.computeKnownBits(X, KnownZero, KnownOne); + if (MaskedHighBits != KnownZero) return true; + + // We've identified a pattern that can be transformed into a single shift + // and an addressing mode. Make it so. + MVT VT = N.getSimpleValueType(); + if (ReplacingAnyExtend) { + assert(X.getValueType() != VT); + // We looked through an ANY_EXTEND node, insert a ZERO_EXTEND. + SDValue NewX = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(X), VT, X); + insertDAGNode(DAG, N, NewX); + X = NewX; + } + SDLoc DL(N); + SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8); + SDValue NewSRL = DAG.getNode(ISD::SRL, DL, VT, X, NewSRLAmt); + SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8); + SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewSRL, NewSHLAmt); + + // Insert the new nodes into the topological ordering. We must do this in + // a valid topological ordering as nothing is going to go back and re-sort + // these nodes. We continually insert before 'N' in sequence as this is + // essentially a pre-flattened and pre-sorted sequence of nodes. There is no + // hierarchy left to express. + insertDAGNode(DAG, N, NewSRLAmt); + insertDAGNode(DAG, N, NewSRL); + insertDAGNode(DAG, N, NewSHLAmt); + insertDAGNode(DAG, N, NewSHL); + DAG.ReplaceAllUsesWith(N, NewSHL); + + AM.Scale = 1 << AMShiftAmt; + AM.IndexReg = NewSRL; + return false; +} + +bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM, + unsigned Depth) { + SDLoc dl(N); + DEBUG({ + dbgs() << "MatchAddress: "; + AM.dump(); + }); + // Limit recursion. + if (Depth > 5) + return matchAddressBase(N, AM); + + // If this is already a %rip relative address, we can only merge immediates + // into it. Instead of handling this in every case, we handle it here. + // RIP relative addressing: %rip + 32-bit displacement! + if (AM.isRIPRelative()) { + // FIXME: JumpTable and ExternalSymbol address currently don't like + // displacements. It isn't very important, but this should be fixed for + // consistency. 
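+    // In a RIP-relative address only the 32-bit displacement is still free,
+    // so reject jump-table addresses here and fold at most a plain constant
+    // offset below.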
+ if (!(AM.ES || AM.MCSym) && AM.JT != -1) + return true; + + if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N)) + if (!foldOffsetIntoAddress(Cst->getSExtValue(), AM)) + return false; + return true; + } + + switch (N.getOpcode()) { + default: break; + case ISD::LOCAL_RECOVER: { + if (!AM.hasSymbolicDisplacement() && AM.Disp == 0) + if (const auto *ESNode = dyn_cast<MCSymbolSDNode>(N.getOperand(0))) { + // Use the symbol and don't prefix it. + AM.MCSym = ESNode->getMCSymbol(); + return false; + } + break; + } + case ISD::Constant: { + uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue(); + if (!foldOffsetIntoAddress(Val, AM)) + return false; + break; + } + + case X86ISD::Wrapper: + case X86ISD::WrapperRIP: + if (!matchWrapper(N, AM)) + return false; + break; + + case ISD::LOAD: + if (!matchLoadInAddress(cast<LoadSDNode>(N), AM)) + return false; + break; + + case ISD::FrameIndex: + if (AM.BaseType == X86ISelAddressMode::RegBase && + AM.Base_Reg.getNode() == nullptr && + (!Subtarget->is64Bit() || isDispSafeForFrameIndex(AM.Disp))) { + AM.BaseType = X86ISelAddressMode::FrameIndexBase; + AM.Base_FrameIndex = cast<FrameIndexSDNode>(N)->getIndex(); + return false; + } + break; + + case ISD::SHL: + if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) + break; + + if (ConstantSDNode + *CN = dyn_cast<ConstantSDNode>(N.getNode()->getOperand(1))) { + unsigned Val = CN->getZExtValue(); + // Note that we handle x<<1 as (,x,2) rather than (x,x) here so + // that the base operand remains free for further matching. If + // the base doesn't end up getting used, a post-processing step + // in MatchAddress turns (,x,2) into (x,x), which is cheaper. + if (Val == 1 || Val == 2 || Val == 3) { + AM.Scale = 1 << Val; + SDValue ShVal = N.getNode()->getOperand(0); + + // Okay, we know that we have a scale by now. However, if the scaled + // value is an add of something and a constant, we can fold the + // constant into the disp field here. + if (CurDAG->isBaseWithConstantOffset(ShVal)) { + AM.IndexReg = ShVal.getNode()->getOperand(0); + ConstantSDNode *AddVal = + cast<ConstantSDNode>(ShVal.getNode()->getOperand(1)); + uint64_t Disp = (uint64_t)AddVal->getSExtValue() << Val; + if (!foldOffsetIntoAddress(Disp, AM)) + return false; + } + + AM.IndexReg = ShVal; + return false; + } + } + break; + + case ISD::SRL: { + // Scale must not be used already. + if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break; + + SDValue And = N.getOperand(0); + if (And.getOpcode() != ISD::AND) break; + SDValue X = And.getOperand(0); + + // We only handle up to 64-bit values here as those are what matter for + // addressing mode optimizations. + if (X.getSimpleValueType().getSizeInBits() > 64) break; + + // The mask used for the transform is expected to be post-shift, but we + // found the shift first so just apply the shift to the mask before passing + // it down. + if (!isa<ConstantSDNode>(N.getOperand(1)) || + !isa<ConstantSDNode>(And.getOperand(1))) + break; + uint64_t Mask = And.getConstantOperandVal(1) >> N.getConstantOperandVal(1); + + // Try to fold the mask and shift into the scale, and return false if we + // succeed. + if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, N, X, AM)) + return false; + break; + } + + case ISD::SMUL_LOHI: + case ISD::UMUL_LOHI: + // A mul_lohi where we need the low part can be folded as a plain multiply. 
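+    // Result 0 of [SU]MUL_LOHI is the low half, which is all an address
+    // computation needs, so it can share the MUL handling below, where e.g.
+    // x*5 becomes base + index*4 (as in leal (%reg,%reg,4), %reg).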
+ if (N.getResNo() != 0) break; + // FALL THROUGH + case ISD::MUL: + case X86ISD::MUL_IMM: + // X*[3,5,9] -> X+X*[2,4,8] + if (AM.BaseType == X86ISelAddressMode::RegBase && + AM.Base_Reg.getNode() == nullptr && + AM.IndexReg.getNode() == nullptr) { + if (ConstantSDNode + *CN = dyn_cast<ConstantSDNode>(N.getNode()->getOperand(1))) + if (CN->getZExtValue() == 3 || CN->getZExtValue() == 5 || + CN->getZExtValue() == 9) { + AM.Scale = unsigned(CN->getZExtValue())-1; + + SDValue MulVal = N.getNode()->getOperand(0); + SDValue Reg; + + // Okay, we know that we have a scale by now. However, if the scaled + // value is an add of something and a constant, we can fold the + // constant into the disp field here. + if (MulVal.getNode()->getOpcode() == ISD::ADD && MulVal.hasOneUse() && + isa<ConstantSDNode>(MulVal.getNode()->getOperand(1))) { + Reg = MulVal.getNode()->getOperand(0); + ConstantSDNode *AddVal = + cast<ConstantSDNode>(MulVal.getNode()->getOperand(1)); + uint64_t Disp = AddVal->getSExtValue() * CN->getZExtValue(); + if (foldOffsetIntoAddress(Disp, AM)) + Reg = N.getNode()->getOperand(0); + } else { + Reg = N.getNode()->getOperand(0); + } + + AM.IndexReg = AM.Base_Reg = Reg; + return false; + } + } + break; + + case ISD::SUB: { + // Given A-B, if A can be completely folded into the address and + // the index field with the index field unused, use -B as the index. + // This is a win if a has multiple parts that can be folded into + // the address. Also, this saves a mov if the base register has + // other uses, since it avoids a two-address sub instruction, however + // it costs an additional mov if the index register has other uses. + + // Add an artificial use to this node so that we can keep track of + // it if it gets CSE'd with a different node. + HandleSDNode Handle(N); + + // Test if the LHS of the sub can be folded. + X86ISelAddressMode Backup = AM; + if (matchAddressRecursively(N.getNode()->getOperand(0), AM, Depth+1)) { + AM = Backup; + break; + } + // Test if the index field is free for use. + if (AM.IndexReg.getNode() || AM.isRIPRelative()) { + AM = Backup; + break; + } + + int Cost = 0; + SDValue RHS = Handle.getValue().getNode()->getOperand(1); + // If the RHS involves a register with multiple uses, this + // transformation incurs an extra mov, due to the neg instruction + // clobbering its operand. + if (!RHS.getNode()->hasOneUse() || + RHS.getNode()->getOpcode() == ISD::CopyFromReg || + RHS.getNode()->getOpcode() == ISD::TRUNCATE || + RHS.getNode()->getOpcode() == ISD::ANY_EXTEND || + (RHS.getNode()->getOpcode() == ISD::ZERO_EXTEND && + RHS.getNode()->getOperand(0).getValueType() == MVT::i32)) + ++Cost; + // If the base is a register with multiple uses, this + // transformation may save a mov. + if ((AM.BaseType == X86ISelAddressMode::RegBase && + AM.Base_Reg.getNode() && + !AM.Base_Reg.getNode()->hasOneUse()) || + AM.BaseType == X86ISelAddressMode::FrameIndexBase) + --Cost; + // If the folded LHS was interesting, this transformation saves + // address arithmetic. + if ((AM.hasSymbolicDisplacement() && !Backup.hasSymbolicDisplacement()) + + ((AM.Disp != 0) && (Backup.Disp == 0)) + + (AM.Segment.getNode() && !Backup.Segment.getNode()) >= 2) + --Cost; + // If it doesn't look like it may be an overall win, don't do it. + if (Cost >= 0) { + AM = Backup; + break; + } + + // Ok, the transformation is legal and appears profitable. Go for it. 
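+    // Materialize -B as (sub 0, B) and use it as the index with scale 1, so
+    // A-B is matched as base + 1*(-B).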
+ SDValue Zero = CurDAG->getConstant(0, dl, N.getValueType()); + SDValue Neg = CurDAG->getNode(ISD::SUB, dl, N.getValueType(), Zero, RHS); + AM.IndexReg = Neg; + AM.Scale = 1; + + // Insert the new nodes into the topological ordering. + insertDAGNode(*CurDAG, N, Zero); + insertDAGNode(*CurDAG, N, Neg); + return false; + } + + case ISD::ADD: + if (!matchAdd(N, AM, Depth)) + return false; + break; + + case ISD::OR: + // We want to look through a transform in InstCombine and DAGCombiner that + // turns 'add' into 'or', so we can treat this 'or' exactly like an 'add'. + // Example: (or (and x, 1), (shl y, 3)) --> (add (and x, 1), (shl y, 3)) + // An 'lea' can then be used to match the shift (multiply) and add: + // and $1, %esi + // lea (%rsi, %rdi, 8), %rax + if (CurDAG->haveNoCommonBitsSet(N.getOperand(0), N.getOperand(1)) && + !matchAdd(N, AM, Depth)) + return false; + break; + + case ISD::AND: { + // Perform some heroic transforms on an and of a constant-count shift + // with a constant to enable use of the scaled offset field. + + // Scale must not be used already. + if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break; + + SDValue Shift = N.getOperand(0); + if (Shift.getOpcode() != ISD::SRL && Shift.getOpcode() != ISD::SHL) break; + SDValue X = Shift.getOperand(0); + + // We only handle up to 64-bit values here as those are what matter for + // addressing mode optimizations. + if (X.getSimpleValueType().getSizeInBits() > 64) break; + + if (!isa<ConstantSDNode>(N.getOperand(1))) + break; + uint64_t Mask = N.getConstantOperandVal(1); + + // Try to fold the mask and shift into an extract and scale. + if (!foldMaskAndShiftToExtract(*CurDAG, N, Mask, Shift, X, AM)) + return false; + + // Try to fold the mask and shift directly into the scale. + if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, Shift, X, AM)) + return false; + + // Try to swap the mask and shift to place shifts which can be done as + // a scale on the outside of the mask. + if (!foldMaskedShiftToScaledMask(*CurDAG, N, Mask, Shift, X, AM)) + return false; + break; + } + } + + return matchAddressBase(N, AM); +} + +/// Helper for MatchAddress. Add the specified node to the +/// specified addressing mode without any further recursion. +bool X86DAGToDAGISel::matchAddressBase(SDValue N, X86ISelAddressMode &AM) { + // Is the base register already occupied? + if (AM.BaseType != X86ISelAddressMode::RegBase || AM.Base_Reg.getNode()) { + // If so, check to see if the scale index register is set. + if (!AM.IndexReg.getNode()) { + AM.IndexReg = N; + AM.Scale = 1; + return false; + } + + // Otherwise, we cannot select it. + return true; + } + + // Default, generate it as a register. + AM.BaseType = X86ISelAddressMode::RegBase; + AM.Base_Reg = N; + return false; +} + +bool X86DAGToDAGISel::selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base, + SDValue &Scale, SDValue &Index, + SDValue &Disp, SDValue &Segment) { + + MaskedGatherScatterSDNode *Mgs = dyn_cast<MaskedGatherScatterSDNode>(Parent); + if (!Mgs) + return false; + X86ISelAddressMode AM; + unsigned AddrSpace = Mgs->getPointerInfo().getAddrSpace(); + // AddrSpace 256 -> GS, 257 -> FS. 
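+  // Address spaces 256 and 257 are the x86 convention for %gs- and
+  // %fs-relative memory, so the matching segment register becomes a segment
+  // override on the emitted gather/scatter.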
+ if (AddrSpace == 256) + AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16); + if (AddrSpace == 257) + AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16); + + SDLoc DL(N); + Base = Mgs->getBasePtr(); + Index = Mgs->getIndex(); + unsigned ScalarSize = Mgs->getValue().getValueType().getScalarSizeInBits(); + Scale = getI8Imm(ScalarSize/8, DL); + + // If Base is 0, the whole address is in index and the Scale is 1 + if (isa<ConstantSDNode>(Base)) { + assert(cast<ConstantSDNode>(Base)->isNullValue() && + "Unexpected base in gather/scatter"); + Scale = getI8Imm(1, DL); + Base = CurDAG->getRegister(0, MVT::i32); + } + if (AM.Segment.getNode()) + Segment = AM.Segment; + else + Segment = CurDAG->getRegister(0, MVT::i32); + Disp = CurDAG->getTargetConstant(0, DL, MVT::i32); + return true; +} + +/// Returns true if it is able to pattern match an addressing mode. +/// It returns the operands which make up the maximal addressing mode it can +/// match by reference. +/// +/// Parent is the parent node of the addr operand that is being matched. It +/// is always a load, store, atomic node, or null. It is only null when +/// checking memory operands for inline asm nodes. +bool X86DAGToDAGISel::selectAddr(SDNode *Parent, SDValue N, SDValue &Base, + SDValue &Scale, SDValue &Index, + SDValue &Disp, SDValue &Segment) { + X86ISelAddressMode AM; + + if (Parent && + // This list of opcodes are all the nodes that have an "addr:$ptr" operand + // that are not a MemSDNode, and thus don't have proper addrspace info. + Parent->getOpcode() != ISD::INTRINSIC_W_CHAIN && // unaligned loads, fixme + Parent->getOpcode() != ISD::INTRINSIC_VOID && // nontemporal stores + Parent->getOpcode() != X86ISD::TLSCALL && // Fixme + Parent->getOpcode() != X86ISD::EH_SJLJ_SETJMP && // setjmp + Parent->getOpcode() != X86ISD::EH_SJLJ_LONGJMP) { // longjmp + unsigned AddrSpace = + cast<MemSDNode>(Parent)->getPointerInfo().getAddrSpace(); + // AddrSpace 256 -> GS, 257 -> FS. + if (AddrSpace == 256) + AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16); + if (AddrSpace == 257) + AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16); + } + + if (matchAddress(N, AM)) + return false; + + MVT VT = N.getSimpleValueType(); + if (AM.BaseType == X86ISelAddressMode::RegBase) { + if (!AM.Base_Reg.getNode()) + AM.Base_Reg = CurDAG->getRegister(0, VT); + } + + if (!AM.IndexReg.getNode()) + AM.IndexReg = CurDAG->getRegister(0, VT); + + getAddressOperands(AM, SDLoc(N), Base, Scale, Index, Disp, Segment); + return true; +} + +/// Match a scalar SSE load. In particular, we want to match a load whose top +/// elements are either undef or zeros. The load flavor is derived from the +/// type of N, which is either v4f32 or v2f64. +/// +/// We also return: +/// PatternChainNode: this is the matched node that has a chain input and +/// output. 
+bool X86DAGToDAGISel::selectScalarSSELoad(SDNode *Root, + SDValue N, SDValue &Base, + SDValue &Scale, SDValue &Index, + SDValue &Disp, SDValue &Segment, + SDValue &PatternNodeWithChain) { + if (N.getOpcode() == ISD::SCALAR_TO_VECTOR) { + PatternNodeWithChain = N.getOperand(0); + if (ISD::isNON_EXTLoad(PatternNodeWithChain.getNode()) && + PatternNodeWithChain.hasOneUse() && + IsProfitableToFold(N.getOperand(0), N.getNode(), Root) && + IsLegalToFold(N.getOperand(0), N.getNode(), Root, OptLevel)) { + LoadSDNode *LD = cast<LoadSDNode>(PatternNodeWithChain); + if (!selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp, Segment)) + return false; + return true; + } + } + + // Also handle the case where we explicitly require zeros in the top + // elements. This is a vector shuffle from the zero vector. + if (N.getOpcode() == X86ISD::VZEXT_MOVL && N.getNode()->hasOneUse() && + // Check to see if the top elements are all zeros (or bitcast of zeros). + N.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR && + N.getOperand(0).getNode()->hasOneUse() && + ISD::isNON_EXTLoad(N.getOperand(0).getOperand(0).getNode()) && + N.getOperand(0).getOperand(0).hasOneUse() && + IsProfitableToFold(N.getOperand(0), N.getNode(), Root) && + IsLegalToFold(N.getOperand(0), N.getNode(), Root, OptLevel)) { + // Okay, this is a zero extending load. Fold it. + LoadSDNode *LD = cast<LoadSDNode>(N.getOperand(0).getOperand(0)); + if (!selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp, Segment)) + return false; + PatternNodeWithChain = SDValue(LD, 0); + return true; + } + return false; +} + + +bool X86DAGToDAGISel::selectMOV64Imm32(SDValue N, SDValue &Imm) { + if (const ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N)) { + uint64_t ImmVal = CN->getZExtValue(); + if ((uint32_t)ImmVal != (uint64_t)ImmVal) + return false; + + Imm = CurDAG->getTargetConstant(ImmVal, SDLoc(N), MVT::i64); + return true; + } + + // In static codegen with small code model, we can get the address of a label + // into a register with 'movl'. TableGen has already made sure we're looking + // at a label of some kind. + assert(N->getOpcode() == X86ISD::Wrapper && + "Unexpected node type for MOV32ri64"); + N = N.getOperand(0); + + if (N->getOpcode() != ISD::TargetConstantPool && + N->getOpcode() != ISD::TargetJumpTable && + N->getOpcode() != ISD::TargetGlobalAddress && + N->getOpcode() != ISD::TargetExternalSymbol && + N->getOpcode() != ISD::MCSymbol && + N->getOpcode() != ISD::TargetBlockAddress) + return false; + + Imm = N; + return TM.getCodeModel() == CodeModel::Small; +} + +bool X86DAGToDAGISel::selectLEA64_32Addr(SDValue N, SDValue &Base, + SDValue &Scale, SDValue &Index, + SDValue &Disp, SDValue &Segment) { + if (!selectLEAAddr(N, Base, Scale, Index, Disp, Segment)) + return false; + + SDLoc DL(N); + RegisterSDNode *RN = dyn_cast<RegisterSDNode>(Base); + if (RN && RN->getReg() == 0) + Base = CurDAG->getRegister(0, MVT::i64); + else if (Base.getValueType() == MVT::i32 && !dyn_cast<FrameIndexSDNode>(Base)) { + // Base could already be %rip, particularly in the x32 ABI. 
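+    // Here the base is a plain 32-bit value, so widen it to 64 bits with
+    // SUBREG_TO_REG for use in the 64-bit addressing mode; the index is
+    // widened the same way below.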
+ Base = SDValue(CurDAG->getMachineNode( + TargetOpcode::SUBREG_TO_REG, DL, MVT::i64, + CurDAG->getTargetConstant(0, DL, MVT::i64), + Base, + CurDAG->getTargetConstant(X86::sub_32bit, DL, MVT::i32)), + 0); + } + + RN = dyn_cast<RegisterSDNode>(Index); + if (RN && RN->getReg() == 0) + Index = CurDAG->getRegister(0, MVT::i64); + else { + assert(Index.getValueType() == MVT::i32 && + "Expect to be extending 32-bit registers for use in LEA"); + Index = SDValue(CurDAG->getMachineNode( + TargetOpcode::SUBREG_TO_REG, DL, MVT::i64, + CurDAG->getTargetConstant(0, DL, MVT::i64), + Index, + CurDAG->getTargetConstant(X86::sub_32bit, DL, + MVT::i32)), + 0); + } + + return true; +} + +/// Calls SelectAddr and determines if the maximal addressing +/// mode it matches can be cost effectively emitted as an LEA instruction. +bool X86DAGToDAGISel::selectLEAAddr(SDValue N, + SDValue &Base, SDValue &Scale, + SDValue &Index, SDValue &Disp, + SDValue &Segment) { + X86ISelAddressMode AM; + + // Set AM.Segment to prevent MatchAddress from using one. LEA doesn't support + // segments. + SDValue Copy = AM.Segment; + SDValue T = CurDAG->getRegister(0, MVT::i32); + AM.Segment = T; + if (matchAddress(N, AM)) + return false; + assert (T == AM.Segment); + AM.Segment = Copy; + + MVT VT = N.getSimpleValueType(); + unsigned Complexity = 0; + if (AM.BaseType == X86ISelAddressMode::RegBase) + if (AM.Base_Reg.getNode()) + Complexity = 1; + else + AM.Base_Reg = CurDAG->getRegister(0, VT); + else if (AM.BaseType == X86ISelAddressMode::FrameIndexBase) + Complexity = 4; + + if (AM.IndexReg.getNode()) + Complexity++; + else + AM.IndexReg = CurDAG->getRegister(0, VT); + + // Don't match just leal(,%reg,2). It's cheaper to do addl %reg, %reg, or with + // a simple shift. + if (AM.Scale > 1) + Complexity++; + + // FIXME: We are artificially lowering the criteria to turn ADD %reg, $GA + // to a LEA. This is determined with some experimentation but is by no means + // optimal (especially for code size consideration). LEA is nice because of + // its three-address nature. Tweak the cost function again when we can run + // convertToThreeAddress() at register allocation time. + if (AM.hasSymbolicDisplacement()) { + // For X86-64, always use LEA to materialize RIP-relative addresses. + if (Subtarget->is64Bit()) + Complexity = 4; + else + Complexity += 2; + } + + if (AM.Disp && (AM.Base_Reg.getNode() || AM.IndexReg.getNode())) + Complexity++; + + // If it isn't worth using an LEA, reject it. + if (Complexity <= 2) + return false; + + getAddressOperands(AM, SDLoc(N), Base, Scale, Index, Disp, Segment); + return true; +} + +/// This is only run on TargetGlobalTLSAddress nodes. 
+bool X86DAGToDAGISel::selectTLSADDRAddr(SDValue N, SDValue &Base, + SDValue &Scale, SDValue &Index, + SDValue &Disp, SDValue &Segment) { + assert(N.getOpcode() == ISD::TargetGlobalTLSAddress); + const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(N); + + X86ISelAddressMode AM; + AM.GV = GA->getGlobal(); + AM.Disp += GA->getOffset(); + AM.Base_Reg = CurDAG->getRegister(0, N.getValueType()); + AM.SymbolFlags = GA->getTargetFlags(); + + if (N.getValueType() == MVT::i32) { + AM.Scale = 1; + AM.IndexReg = CurDAG->getRegister(X86::EBX, MVT::i32); + } else { + AM.IndexReg = CurDAG->getRegister(0, MVT::i64); + } + + getAddressOperands(AM, SDLoc(N), Base, Scale, Index, Disp, Segment); + return true; +} + + +bool X86DAGToDAGISel::tryFoldLoad(SDNode *P, SDValue N, + SDValue &Base, SDValue &Scale, + SDValue &Index, SDValue &Disp, + SDValue &Segment) { + if (!ISD::isNON_EXTLoad(N.getNode()) || + !IsProfitableToFold(N, P, P) || + !IsLegalToFold(N, P, P, OptLevel)) + return false; + + return selectAddr(N.getNode(), + N.getOperand(1), Base, Scale, Index, Disp, Segment); +} + +/// Return an SDNode that returns the value of the global base register. +/// Output instructions required to initialize the global base register, +/// if necessary. +SDNode *X86DAGToDAGISel::getGlobalBaseReg() { + unsigned GlobalBaseReg = getInstrInfo()->getGlobalBaseReg(MF); + auto &DL = MF->getDataLayout(); + return CurDAG->getRegister(GlobalBaseReg, TLI->getPointerTy(DL)).getNode(); +} + +/// Atomic opcode table +/// +enum AtomicOpc { + ADD, + SUB, + INC, + DEC, + OR, + AND, + XOR, + AtomicOpcEnd +}; + +enum AtomicSz { + ConstantI8, + I8, + SextConstantI16, + ConstantI16, + I16, + SextConstantI32, + ConstantI32, + I32, + SextConstantI64, + ConstantI64, + I64, + AtomicSzEnd +}; + +static const uint16_t AtomicOpcTbl[AtomicOpcEnd][AtomicSzEnd] = { + { + X86::LOCK_ADD8mi, + X86::LOCK_ADD8mr, + X86::LOCK_ADD16mi8, + X86::LOCK_ADD16mi, + X86::LOCK_ADD16mr, + X86::LOCK_ADD32mi8, + X86::LOCK_ADD32mi, + X86::LOCK_ADD32mr, + X86::LOCK_ADD64mi8, + X86::LOCK_ADD64mi32, + X86::LOCK_ADD64mr, + }, + { + X86::LOCK_SUB8mi, + X86::LOCK_SUB8mr, + X86::LOCK_SUB16mi8, + X86::LOCK_SUB16mi, + X86::LOCK_SUB16mr, + X86::LOCK_SUB32mi8, + X86::LOCK_SUB32mi, + X86::LOCK_SUB32mr, + X86::LOCK_SUB64mi8, + X86::LOCK_SUB64mi32, + X86::LOCK_SUB64mr, + }, + { + 0, + X86::LOCK_INC8m, + 0, + 0, + X86::LOCK_INC16m, + 0, + 0, + X86::LOCK_INC32m, + 0, + 0, + X86::LOCK_INC64m, + }, + { + 0, + X86::LOCK_DEC8m, + 0, + 0, + X86::LOCK_DEC16m, + 0, + 0, + X86::LOCK_DEC32m, + 0, + 0, + X86::LOCK_DEC64m, + }, + { + X86::LOCK_OR8mi, + X86::LOCK_OR8mr, + X86::LOCK_OR16mi8, + X86::LOCK_OR16mi, + X86::LOCK_OR16mr, + X86::LOCK_OR32mi8, + X86::LOCK_OR32mi, + X86::LOCK_OR32mr, + X86::LOCK_OR64mi8, + X86::LOCK_OR64mi32, + X86::LOCK_OR64mr, + }, + { + X86::LOCK_AND8mi, + X86::LOCK_AND8mr, + X86::LOCK_AND16mi8, + X86::LOCK_AND16mi, + X86::LOCK_AND16mr, + X86::LOCK_AND32mi8, + X86::LOCK_AND32mi, + X86::LOCK_AND32mr, + X86::LOCK_AND64mi8, + X86::LOCK_AND64mi32, + X86::LOCK_AND64mr, + }, + { + X86::LOCK_XOR8mi, + X86::LOCK_XOR8mr, + X86::LOCK_XOR16mi8, + X86::LOCK_XOR16mi, + X86::LOCK_XOR16mr, + X86::LOCK_XOR32mi8, + X86::LOCK_XOR32mi, + X86::LOCK_XOR32mr, + X86::LOCK_XOR64mi8, + X86::LOCK_XOR64mi32, + X86::LOCK_XOR64mr, + } +}; + +// Return the target constant operand for atomic-load-op and do simple +// translations, such as from atomic-load-add to lock-sub. 
The return value is +// one of the following 3 cases: +// + target-constant, the operand could be supported as a target constant. +// + empty, the operand is not needed any more with the new op selected. +// + non-empty, otherwise. +static SDValue getAtomicLoadArithTargetConstant(SelectionDAG *CurDAG, + SDLoc dl, + enum AtomicOpc &Op, MVT NVT, + SDValue Val, + const X86Subtarget *Subtarget) { + if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Val)) { + int64_t CNVal = CN->getSExtValue(); + // Quit if not 32-bit imm. + if ((int32_t)CNVal != CNVal) + return Val; + // Quit if INT32_MIN: it would be negated as it is negative and overflow, + // producing an immediate that does not fit in the 32 bits available for + // an immediate operand to sub. However, it still fits in 32 bits for the + // add (since it is not negated) so we can return target-constant. + if (CNVal == INT32_MIN) + return CurDAG->getTargetConstant(CNVal, dl, NVT); + // For atomic-load-add, we could do some optimizations. + if (Op == ADD) { + // Translate to INC/DEC if ADD by 1 or -1. + if (((CNVal == 1) || (CNVal == -1)) && !Subtarget->slowIncDec()) { + Op = (CNVal == 1) ? INC : DEC; + // No more constant operand after being translated into INC/DEC. + return SDValue(); + } + // Translate to SUB if ADD by negative value. + if (CNVal < 0) { + Op = SUB; + CNVal = -CNVal; + } + } + return CurDAG->getTargetConstant(CNVal, dl, NVT); + } + + // If the value operand is single-used, try to optimize it. + if (Op == ADD && Val.hasOneUse()) { + // Translate (atomic-load-add ptr (sub 0 x)) back to (lock-sub x). + if (Val.getOpcode() == ISD::SUB && X86::isZeroNode(Val.getOperand(0))) { + Op = SUB; + return Val.getOperand(1); + } + // A special case for i16, which needs truncating as, in most cases, it's + // promoted to i32. We will translate + // (atomic-load-add (truncate (sub 0 x))) to (lock-sub (EXTRACT_SUBREG x)) + if (Val.getOpcode() == ISD::TRUNCATE && NVT == MVT::i16 && + Val.getOperand(0).getOpcode() == ISD::SUB && + X86::isZeroNode(Val.getOperand(0).getOperand(0))) { + Op = SUB; + Val = Val.getOperand(0); + return CurDAG->getTargetExtractSubreg(X86::sub_16bit, dl, NVT, + Val.getOperand(1)); + } + } + + return Val; +} + +SDNode *X86DAGToDAGISel::selectAtomicLoadArith(SDNode *Node, MVT NVT) { + if (Node->hasAnyUseOfValue(0)) + return nullptr; + + SDLoc dl(Node); + + // Optimize common patterns for __sync_or_and_fetch and similar arith + // operations where the result is not used. This allows us to use the "lock" + // version of the arithmetic instruction. + SDValue Chain = Node->getOperand(0); + SDValue Ptr = Node->getOperand(1); + SDValue Val = Node->getOperand(2); + SDValue Base, Scale, Index, Disp, Segment; + if (!selectAddr(Node, Ptr, Base, Scale, Index, Disp, Segment)) + return nullptr; + + // Which index into the table. 
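+  // Rows of AtomicOpcTbl are the operations (ADD..XOR) and columns are the
+  // AtomicSz forms, so e.g. AtomicOpcTbl[ADD][SextConstantI32] is
+  // X86::LOCK_ADD32mi8, the 32-bit add of a sign-extended 8-bit immediate.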
+ enum AtomicOpc Op; + switch (Node->getOpcode()) { + default: + return nullptr; + case ISD::ATOMIC_LOAD_OR: + Op = OR; + break; + case ISD::ATOMIC_LOAD_AND: + Op = AND; + break; + case ISD::ATOMIC_LOAD_XOR: + Op = XOR; + break; + case ISD::ATOMIC_LOAD_ADD: + Op = ADD; + break; + } + + Val = getAtomicLoadArithTargetConstant(CurDAG, dl, Op, NVT, Val, Subtarget); + bool isUnOp = !Val.getNode(); + bool isCN = Val.getNode() && (Val.getOpcode() == ISD::TargetConstant); + + unsigned Opc = 0; + switch (NVT.SimpleTy) { + default: return nullptr; + case MVT::i8: + if (isCN) + Opc = AtomicOpcTbl[Op][ConstantI8]; + else + Opc = AtomicOpcTbl[Op][I8]; + break; + case MVT::i16: + if (isCN) { + if (immSext8(Val.getNode())) + Opc = AtomicOpcTbl[Op][SextConstantI16]; + else + Opc = AtomicOpcTbl[Op][ConstantI16]; + } else + Opc = AtomicOpcTbl[Op][I16]; + break; + case MVT::i32: + if (isCN) { + if (immSext8(Val.getNode())) + Opc = AtomicOpcTbl[Op][SextConstantI32]; + else + Opc = AtomicOpcTbl[Op][ConstantI32]; + } else + Opc = AtomicOpcTbl[Op][I32]; + break; + case MVT::i64: + if (isCN) { + if (immSext8(Val.getNode())) + Opc = AtomicOpcTbl[Op][SextConstantI64]; + else if (i64immSExt32(Val.getNode())) + Opc = AtomicOpcTbl[Op][ConstantI64]; + else + llvm_unreachable("True 64 bits constant in SelectAtomicLoadArith"); + } else + Opc = AtomicOpcTbl[Op][I64]; + break; + } + + assert(Opc != 0 && "Invalid arith lock transform!"); + + // Building the new node. + SDValue Ret; + if (isUnOp) { + SDValue Ops[] = { Base, Scale, Index, Disp, Segment, Chain }; + Ret = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops), 0); + } else { + SDValue Ops[] = { Base, Scale, Index, Disp, Segment, Val, Chain }; + Ret = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops), 0); + } + + // Copying the MachineMemOperand. + MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); + MemOp[0] = cast<MemSDNode>(Node)->getMemOperand(); + cast<MachineSDNode>(Ret)->setMemRefs(MemOp, MemOp + 1); + + // We need to have two outputs as that is what the original instruction had. + // So we add a dummy, undefined output. This is safe as we checked first + // that no-one uses our output anyway. + SDValue Undef = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, + dl, NVT), 0); + SDValue RetVals[] = { Undef, Ret }; + return CurDAG->getMergeValues(RetVals, dl).getNode(); +} + +/// Test whether the given X86ISD::CMP node has any uses which require the SF +/// or OF bits to be accurate. +static bool hasNoSignedComparisonUses(SDNode *N) { + // Examine each user of the node. + for (SDNode::use_iterator UI = N->use_begin(), + UE = N->use_end(); UI != UE; ++UI) { + // Only examine CopyToReg uses. + if (UI->getOpcode() != ISD::CopyToReg) + return false; + // Only examine CopyToReg uses that copy to EFLAGS. + if (cast<RegisterSDNode>(UI->getOperand(1))->getReg() != + X86::EFLAGS) + return false; + // Examine each user of the CopyToReg use. + for (SDNode::use_iterator FlagUI = UI->use_begin(), + FlagUE = UI->use_end(); FlagUI != FlagUE; ++FlagUI) { + // Only examine the Flag result. + if (FlagUI.getUse().getResNo() != 1) continue; + // Anything unusual: assume conservatively. + if (!FlagUI->isMachineOpcode()) return false; + // Examine the opcode of the user. + switch (FlagUI->getMachineOpcode()) { + // These comparisons don't treat the most significant bit specially. 
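+      // That is, only the unsigned conditions (A/AE/B/BE), equality (E/NE)
+      // and parity (P/NP) forms are accepted; anything that reads SF or OF is
+      // handled conservatively by the default case below.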
+ case X86::SETAr: case X86::SETAEr: case X86::SETBr: case X86::SETBEr: + case X86::SETEr: case X86::SETNEr: case X86::SETPr: case X86::SETNPr: + case X86::SETAm: case X86::SETAEm: case X86::SETBm: case X86::SETBEm: + case X86::SETEm: case X86::SETNEm: case X86::SETPm: case X86::SETNPm: + case X86::JA_1: case X86::JAE_1: case X86::JB_1: case X86::JBE_1: + case X86::JE_1: case X86::JNE_1: case X86::JP_1: case X86::JNP_1: + case X86::CMOVA16rr: case X86::CMOVA16rm: + case X86::CMOVA32rr: case X86::CMOVA32rm: + case X86::CMOVA64rr: case X86::CMOVA64rm: + case X86::CMOVAE16rr: case X86::CMOVAE16rm: + case X86::CMOVAE32rr: case X86::CMOVAE32rm: + case X86::CMOVAE64rr: case X86::CMOVAE64rm: + case X86::CMOVB16rr: case X86::CMOVB16rm: + case X86::CMOVB32rr: case X86::CMOVB32rm: + case X86::CMOVB64rr: case X86::CMOVB64rm: + case X86::CMOVBE16rr: case X86::CMOVBE16rm: + case X86::CMOVBE32rr: case X86::CMOVBE32rm: + case X86::CMOVBE64rr: case X86::CMOVBE64rm: + case X86::CMOVE16rr: case X86::CMOVE16rm: + case X86::CMOVE32rr: case X86::CMOVE32rm: + case X86::CMOVE64rr: case X86::CMOVE64rm: + case X86::CMOVNE16rr: case X86::CMOVNE16rm: + case X86::CMOVNE32rr: case X86::CMOVNE32rm: + case X86::CMOVNE64rr: case X86::CMOVNE64rm: + case X86::CMOVNP16rr: case X86::CMOVNP16rm: + case X86::CMOVNP32rr: case X86::CMOVNP32rm: + case X86::CMOVNP64rr: case X86::CMOVNP64rm: + case X86::CMOVP16rr: case X86::CMOVP16rm: + case X86::CMOVP32rr: case X86::CMOVP32rm: + case X86::CMOVP64rr: case X86::CMOVP64rm: + continue; + // Anything else: assume conservatively. + default: return false; + } + } + } + return true; +} + +/// Check whether or not the chain ending in StoreNode is suitable for doing +/// the {load; increment or decrement; store} to modify transformation. +static bool isLoadIncOrDecStore(StoreSDNode *StoreNode, unsigned Opc, + SDValue StoredVal, SelectionDAG *CurDAG, + LoadSDNode* &LoadNode, SDValue &InputChain) { + + // is the value stored the result of a DEC or INC? + if (!(Opc == X86ISD::DEC || Opc == X86ISD::INC)) return false; + + // is the stored value result 0 of the load? + if (StoredVal.getResNo() != 0) return false; + + // are there other uses of the loaded value than the inc or dec? + if (!StoredVal.getNode()->hasNUsesOfValue(1, 0)) return false; + + // is the store non-extending and non-indexed? + if (!ISD::isNormalStore(StoreNode) || StoreNode->isNonTemporal()) + return false; + + SDValue Load = StoredVal->getOperand(0); + // Is the stored value a non-extending and non-indexed load? + if (!ISD::isNormalLoad(Load.getNode())) return false; + + // Return LoadNode by reference. + LoadNode = cast<LoadSDNode>(Load); + // is the size of the value one that we can handle? (i.e. 64, 32, 16, or 8) + EVT LdVT = LoadNode->getMemoryVT(); + if (LdVT != MVT::i64 && LdVT != MVT::i32 && LdVT != MVT::i16 && + LdVT != MVT::i8) + return false; + + // Is store the only read of the loaded value? + if (!Load.hasOneUse()) + return false; + + // Is the address of the store the same as the load? + if (LoadNode->getBasePtr() != StoreNode->getBasePtr() || + LoadNode->getOffset() != StoreNode->getOffset()) + return false; + + // Check if the chain is produced by the load or is a TokenFactor with + // the load output chain as an operand. Return InputChain by reference. 
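+  // The TokenFactor case covers chains with other memory operations between
+  // the load and the store; their chain operands (minus the load) are kept in
+  // the new TokenFactor built below.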
+ SDValue Chain = StoreNode->getChain(); + + bool ChainCheck = false; + if (Chain == Load.getValue(1)) { + ChainCheck = true; + InputChain = LoadNode->getChain(); + } else if (Chain.getOpcode() == ISD::TokenFactor) { + SmallVector<SDValue, 4> ChainOps; + for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) { + SDValue Op = Chain.getOperand(i); + if (Op == Load.getValue(1)) { + ChainCheck = true; + continue; + } + + // Make sure using Op as part of the chain would not cause a cycle here. + // In theory, we could check whether the chain node is a predecessor of + // the load. But that can be very expensive. Instead visit the uses and + // make sure they all have smaller node id than the load. + int LoadId = LoadNode->getNodeId(); + for (SDNode::use_iterator UI = Op.getNode()->use_begin(), + UE = UI->use_end(); UI != UE; ++UI) { + if (UI.getUse().getResNo() != 0) + continue; + if (UI->getNodeId() > LoadId) + return false; + } + + ChainOps.push_back(Op); + } + + if (ChainCheck) + // Make a new TokenFactor with all the other input chains except + // for the load. + InputChain = CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain), + MVT::Other, ChainOps); + } + if (!ChainCheck) + return false; + + return true; +} + +/// Get the appropriate X86 opcode for an in-memory increment or decrement. +/// Opc should be X86ISD::DEC or X86ISD::INC. +static unsigned getFusedLdStOpcode(EVT &LdVT, unsigned Opc) { + if (Opc == X86ISD::DEC) { + if (LdVT == MVT::i64) return X86::DEC64m; + if (LdVT == MVT::i32) return X86::DEC32m; + if (LdVT == MVT::i16) return X86::DEC16m; + if (LdVT == MVT::i8) return X86::DEC8m; + } else { + assert(Opc == X86ISD::INC && "unrecognized opcode"); + if (LdVT == MVT::i64) return X86::INC64m; + if (LdVT == MVT::i32) return X86::INC32m; + if (LdVT == MVT::i16) return X86::INC16m; + if (LdVT == MVT::i8) return X86::INC8m; + } + llvm_unreachable("unrecognized size for LdVT"); +} + +/// Customized ISel for GATHER operations. +SDNode *X86DAGToDAGISel::selectGather(SDNode *Node, unsigned Opc) { + // Operands of Gather: VSrc, Base, VIdx, VMask, Scale + SDValue Chain = Node->getOperand(0); + SDValue VSrc = Node->getOperand(2); + SDValue Base = Node->getOperand(3); + SDValue VIdx = Node->getOperand(4); + SDValue VMask = Node->getOperand(5); + ConstantSDNode *Scale = dyn_cast<ConstantSDNode>(Node->getOperand(6)); + if (!Scale) + return nullptr; + + SDVTList VTs = CurDAG->getVTList(VSrc.getValueType(), VSrc.getValueType(), + MVT::Other); + + SDLoc DL(Node); + + // Memory Operands: Base, Scale, Index, Disp, Segment + SDValue Disp = CurDAG->getTargetConstant(0, DL, MVT::i32); + SDValue Segment = CurDAG->getRegister(0, MVT::i32); + const SDValue Ops[] = { VSrc, Base, getI8Imm(Scale->getSExtValue(), DL), VIdx, + Disp, Segment, VMask, Chain}; + SDNode *ResNode = CurDAG->getMachineNode(Opc, DL, VTs, Ops); + // Node has 2 outputs: VDst and MVT::Other. + // ResNode has 3 outputs: VDst, VMask_wb, and MVT::Other. + // We replace VDst of Node with VDst of ResNode, and Other of Node with Other + // of ResNode. 
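+  // ResNode's second result, the write-back mask VMask_wb, has no counterpart
+  // in the intrinsic node and is left unused.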
+ ReplaceUses(SDValue(Node, 0), SDValue(ResNode, 0)); + ReplaceUses(SDValue(Node, 1), SDValue(ResNode, 2)); + return ResNode; +} + +SDNode *X86DAGToDAGISel::Select(SDNode *Node) { + MVT NVT = Node->getSimpleValueType(0); + unsigned Opc, MOpc; + unsigned Opcode = Node->getOpcode(); + SDLoc dl(Node); + + DEBUG(dbgs() << "Selecting: "; Node->dump(CurDAG); dbgs() << '\n'); + + if (Node->isMachineOpcode()) { + DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << '\n'); + Node->setNodeId(-1); + return nullptr; // Already selected. + } + + switch (Opcode) { + default: break; + case ISD::BRIND: { + if (Subtarget->isTargetNaCl()) + // NaCl has its own pass where jmp %r32 are converted to jmp %r64. We + // leave the instruction alone. + break; + if (Subtarget->isTarget64BitILP32()) { + // Converts a 32-bit register to a 64-bit, zero-extended version of + // it. This is needed because x86-64 can do many things, but jmp %r32 + // ain't one of them. + const SDValue &Target = Node->getOperand(1); + assert(Target.getSimpleValueType() == llvm::MVT::i32); + SDValue ZextTarget = CurDAG->getZExtOrTrunc(Target, dl, EVT(MVT::i64)); + SDValue Brind = CurDAG->getNode(ISD::BRIND, dl, MVT::Other, + Node->getOperand(0), ZextTarget); + ReplaceUses(SDValue(Node, 0), Brind); + SelectCode(ZextTarget.getNode()); + SelectCode(Brind.getNode()); + return nullptr; + } + break; + } + case ISD::INTRINSIC_W_CHAIN: { + unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue(); + switch (IntNo) { + default: break; + case Intrinsic::x86_avx2_gather_d_pd: + case Intrinsic::x86_avx2_gather_d_pd_256: + case Intrinsic::x86_avx2_gather_q_pd: + case Intrinsic::x86_avx2_gather_q_pd_256: + case Intrinsic::x86_avx2_gather_d_ps: + case Intrinsic::x86_avx2_gather_d_ps_256: + case Intrinsic::x86_avx2_gather_q_ps: + case Intrinsic::x86_avx2_gather_q_ps_256: + case Intrinsic::x86_avx2_gather_d_q: + case Intrinsic::x86_avx2_gather_d_q_256: + case Intrinsic::x86_avx2_gather_q_q: + case Intrinsic::x86_avx2_gather_q_q_256: + case Intrinsic::x86_avx2_gather_d_d: + case Intrinsic::x86_avx2_gather_d_d_256: + case Intrinsic::x86_avx2_gather_q_d: + case Intrinsic::x86_avx2_gather_q_d_256: { + if (!Subtarget->hasAVX2()) + break; + unsigned Opc; + switch (IntNo) { + default: llvm_unreachable("Impossible intrinsic"); + case Intrinsic::x86_avx2_gather_d_pd: Opc = X86::VGATHERDPDrm; break; + case Intrinsic::x86_avx2_gather_d_pd_256: Opc = X86::VGATHERDPDYrm; break; + case Intrinsic::x86_avx2_gather_q_pd: Opc = X86::VGATHERQPDrm; break; + case Intrinsic::x86_avx2_gather_q_pd_256: Opc = X86::VGATHERQPDYrm; break; + case Intrinsic::x86_avx2_gather_d_ps: Opc = X86::VGATHERDPSrm; break; + case Intrinsic::x86_avx2_gather_d_ps_256: Opc = X86::VGATHERDPSYrm; break; + case Intrinsic::x86_avx2_gather_q_ps: Opc = X86::VGATHERQPSrm; break; + case Intrinsic::x86_avx2_gather_q_ps_256: Opc = X86::VGATHERQPSYrm; break; + case Intrinsic::x86_avx2_gather_d_q: Opc = X86::VPGATHERDQrm; break; + case Intrinsic::x86_avx2_gather_d_q_256: Opc = X86::VPGATHERDQYrm; break; + case Intrinsic::x86_avx2_gather_q_q: Opc = X86::VPGATHERQQrm; break; + case Intrinsic::x86_avx2_gather_q_q_256: Opc = X86::VPGATHERQQYrm; break; + case Intrinsic::x86_avx2_gather_d_d: Opc = X86::VPGATHERDDrm; break; + case Intrinsic::x86_avx2_gather_d_d_256: Opc = X86::VPGATHERDDYrm; break; + case Intrinsic::x86_avx2_gather_q_d: Opc = X86::VPGATHERQDrm; break; + case Intrinsic::x86_avx2_gather_q_d_256: Opc = X86::VPGATHERQDYrm; break; + } + SDNode *RetVal = selectGather(Node, Opc); + if (RetVal) 
+ // We already called ReplaceUses inside SelectGather. + return nullptr; + break; + } + } + break; + } + case X86ISD::GlobalBaseReg: + return getGlobalBaseReg(); + + case X86ISD::SHRUNKBLEND: { + // SHRUNKBLEND selects like a regular VSELECT. + SDValue VSelect = CurDAG->getNode( + ISD::VSELECT, SDLoc(Node), Node->getValueType(0), Node->getOperand(0), + Node->getOperand(1), Node->getOperand(2)); + ReplaceUses(SDValue(Node, 0), VSelect); + SelectCode(VSelect.getNode()); + // We already called ReplaceUses. + return nullptr; + } + + case ISD::ATOMIC_LOAD_XOR: + case ISD::ATOMIC_LOAD_AND: + case ISD::ATOMIC_LOAD_OR: + case ISD::ATOMIC_LOAD_ADD: { + SDNode *RetVal = selectAtomicLoadArith(Node, NVT); + if (RetVal) + return RetVal; + break; + } + case ISD::AND: + case ISD::OR: + case ISD::XOR: { + // For operations of the form (x << C1) op C2, check if we can use a smaller + // encoding for C2 by transforming it into (x op (C2>>C1)) << C1. + SDValue N0 = Node->getOperand(0); + SDValue N1 = Node->getOperand(1); + + if (N0->getOpcode() != ISD::SHL || !N0->hasOneUse()) + break; + + // i8 is unshrinkable, i16 should be promoted to i32. + if (NVT != MVT::i32 && NVT != MVT::i64) + break; + + ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N1); + ConstantSDNode *ShlCst = dyn_cast<ConstantSDNode>(N0->getOperand(1)); + if (!Cst || !ShlCst) + break; + + int64_t Val = Cst->getSExtValue(); + uint64_t ShlVal = ShlCst->getZExtValue(); + + // Make sure that we don't change the operation by removing bits. + // This only matters for OR and XOR, AND is unaffected. + uint64_t RemovedBitsMask = (1ULL << ShlVal) - 1; + if (Opcode != ISD::AND && (Val & RemovedBitsMask) != 0) + break; + + unsigned ShlOp, AddOp, Op; + MVT CstVT = NVT; + + // Check the minimum bitwidth for the new constant. + // TODO: AND32ri is the same as AND64ri32 with zext imm. + // TODO: MOV32ri+OR64r is cheaper than MOV64ri64+OR64rr + // TODO: Using 16 and 8 bit operations is also possible for or32 & xor32. + if (!isInt<8>(Val) && isInt<8>(Val >> ShlVal)) + CstVT = MVT::i8; + else if (!isInt<32>(Val) && isInt<32>(Val >> ShlVal)) + CstVT = MVT::i32; + + // Bail if there is no smaller encoding. + if (NVT == CstVT) + break; + + switch (NVT.SimpleTy) { + default: llvm_unreachable("Unsupported VT!"); + case MVT::i32: + assert(CstVT == MVT::i8); + ShlOp = X86::SHL32ri; + AddOp = X86::ADD32rr; + + switch (Opcode) { + default: llvm_unreachable("Impossible opcode"); + case ISD::AND: Op = X86::AND32ri8; break; + case ISD::OR: Op = X86::OR32ri8; break; + case ISD::XOR: Op = X86::XOR32ri8; break; + } + break; + case MVT::i64: + assert(CstVT == MVT::i8 || CstVT == MVT::i32); + ShlOp = X86::SHL64ri; + AddOp = X86::ADD64rr; + + switch (Opcode) { + default: llvm_unreachable("Impossible opcode"); + case ISD::AND: Op = CstVT==MVT::i8? X86::AND64ri8 : X86::AND64ri32; break; + case ISD::OR: Op = CstVT==MVT::i8? X86::OR64ri8 : X86::OR64ri32; break; + case ISD::XOR: Op = CstVT==MVT::i8? X86::XOR64ri8 : X86::XOR64ri32; break; + } + break; + } + + // Emit the smaller op and the shift. 
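+    // For example, (or (shl x, 8), 0x100) on i32 becomes an OR32ri8 of 1
+    // followed by SHL32ri by 8: the 1-byte immediate replaces the 4-byte
+    // immediate the original constant would have needed.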
+ SDValue NewCst = CurDAG->getTargetConstant(Val >> ShlVal, dl, CstVT); + SDNode *New = CurDAG->getMachineNode(Op, dl, NVT, N0->getOperand(0),NewCst); + if (ShlVal == 1) + return CurDAG->SelectNodeTo(Node, AddOp, NVT, SDValue(New, 0), + SDValue(New, 0)); + return CurDAG->SelectNodeTo(Node, ShlOp, NVT, SDValue(New, 0), + getI8Imm(ShlVal, dl)); + } + case X86ISD::UMUL8: + case X86ISD::SMUL8: { + SDValue N0 = Node->getOperand(0); + SDValue N1 = Node->getOperand(1); + + Opc = (Opcode == X86ISD::SMUL8 ? X86::IMUL8r : X86::MUL8r); + + SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::AL, + N0, SDValue()).getValue(1); + + SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32); + SDValue Ops[] = {N1, InFlag}; + SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops); + + ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0)); + ReplaceUses(SDValue(Node, 1), SDValue(CNode, 1)); + return nullptr; + } + + case X86ISD::UMUL: { + SDValue N0 = Node->getOperand(0); + SDValue N1 = Node->getOperand(1); + + unsigned LoReg; + switch (NVT.SimpleTy) { + default: llvm_unreachable("Unsupported VT!"); + case MVT::i8: LoReg = X86::AL; Opc = X86::MUL8r; break; + case MVT::i16: LoReg = X86::AX; Opc = X86::MUL16r; break; + case MVT::i32: LoReg = X86::EAX; Opc = X86::MUL32r; break; + case MVT::i64: LoReg = X86::RAX; Opc = X86::MUL64r; break; + } + + SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg, + N0, SDValue()).getValue(1); + + SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::i32); + SDValue Ops[] = {N1, InFlag}; + SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops); + + ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0)); + ReplaceUses(SDValue(Node, 1), SDValue(CNode, 1)); + ReplaceUses(SDValue(Node, 2), SDValue(CNode, 2)); + return nullptr; + } + + case ISD::SMUL_LOHI: + case ISD::UMUL_LOHI: { + SDValue N0 = Node->getOperand(0); + SDValue N1 = Node->getOperand(1); + + bool isSigned = Opcode == ISD::SMUL_LOHI; + bool hasBMI2 = Subtarget->hasBMI2(); + if (!isSigned) { + switch (NVT.SimpleTy) { + default: llvm_unreachable("Unsupported VT!"); + case MVT::i8: Opc = X86::MUL8r; MOpc = X86::MUL8m; break; + case MVT::i16: Opc = X86::MUL16r; MOpc = X86::MUL16m; break; + case MVT::i32: Opc = hasBMI2 ? X86::MULX32rr : X86::MUL32r; + MOpc = hasBMI2 ? X86::MULX32rm : X86::MUL32m; break; + case MVT::i64: Opc = hasBMI2 ? X86::MULX64rr : X86::MUL64r; + MOpc = hasBMI2 ? 
X86::MULX64rm : X86::MUL64m; break; + } + } else { + switch (NVT.SimpleTy) { + default: llvm_unreachable("Unsupported VT!"); + case MVT::i8: Opc = X86::IMUL8r; MOpc = X86::IMUL8m; break; + case MVT::i16: Opc = X86::IMUL16r; MOpc = X86::IMUL16m; break; + case MVT::i32: Opc = X86::IMUL32r; MOpc = X86::IMUL32m; break; + case MVT::i64: Opc = X86::IMUL64r; MOpc = X86::IMUL64m; break; + } + } + + unsigned SrcReg, LoReg, HiReg; + switch (Opc) { + default: llvm_unreachable("Unknown MUL opcode!"); + case X86::IMUL8r: + case X86::MUL8r: + SrcReg = LoReg = X86::AL; HiReg = X86::AH; + break; + case X86::IMUL16r: + case X86::MUL16r: + SrcReg = LoReg = X86::AX; HiReg = X86::DX; + break; + case X86::IMUL32r: + case X86::MUL32r: + SrcReg = LoReg = X86::EAX; HiReg = X86::EDX; + break; + case X86::IMUL64r: + case X86::MUL64r: + SrcReg = LoReg = X86::RAX; HiReg = X86::RDX; + break; + case X86::MULX32rr: + SrcReg = X86::EDX; LoReg = HiReg = 0; + break; + case X86::MULX64rr: + SrcReg = X86::RDX; LoReg = HiReg = 0; + break; + } + + SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; + bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4); + // Multiply is commmutative. + if (!foldedLoad) { + foldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4); + if (foldedLoad) + std::swap(N0, N1); + } + + SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, SrcReg, + N0, SDValue()).getValue(1); + SDValue ResHi, ResLo; + + if (foldedLoad) { + SDValue Chain; + SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0), + InFlag }; + if (MOpc == X86::MULX32rm || MOpc == X86::MULX64rm) { + SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::Other, MVT::Glue); + SDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops); + ResHi = SDValue(CNode, 0); + ResLo = SDValue(CNode, 1); + Chain = SDValue(CNode, 2); + InFlag = SDValue(CNode, 3); + } else { + SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue); + SDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops); + Chain = SDValue(CNode, 0); + InFlag = SDValue(CNode, 1); + } + + // Update the chain. + ReplaceUses(N1.getValue(1), Chain); + } else { + SDValue Ops[] = { N1, InFlag }; + if (Opc == X86::MULX32rr || Opc == X86::MULX64rr) { + SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::Glue); + SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops); + ResHi = SDValue(CNode, 0); + ResLo = SDValue(CNode, 1); + InFlag = SDValue(CNode, 2); + } else { + SDVTList VTs = CurDAG->getVTList(MVT::Glue); + SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops); + InFlag = SDValue(CNode, 0); + } + } + + // Prevent use of AH in a REX instruction by referencing AX instead. + if (HiReg == X86::AH && Subtarget->is64Bit() && + !SDValue(Node, 1).use_empty()) { + SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, + X86::AX, MVT::i16, InFlag); + InFlag = Result.getValue(2); + // Get the low part if needed. Don't use getCopyFromReg for aliasing + // registers. + if (!SDValue(Node, 0).use_empty()) + ReplaceUses(SDValue(Node, 1), + CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result)); + + // Shift AX down 8 bits. + Result = SDValue(CurDAG->getMachineNode(X86::SHR16ri, dl, MVT::i16, + Result, + CurDAG->getTargetConstant(8, dl, MVT::i8)), + 0); + // Then truncate it down to i8. + ReplaceUses(SDValue(Node, 1), + CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result)); + } + // Copy the low half of the result, if it is needed. 
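+    // With MULX the low half is already an explicit result (ResLo); for the
+    // plain MUL/IMUL forms it has to be read back from AL/AX/EAX/RAX with a
+    // glued CopyFromReg.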
+ if (!SDValue(Node, 0).use_empty()) { + if (!ResLo.getNode()) { + assert(LoReg && "Register for low half is not defined!"); + ResLo = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, LoReg, NVT, + InFlag); + InFlag = ResLo.getValue(2); + } + ReplaceUses(SDValue(Node, 0), ResLo); + DEBUG(dbgs() << "=> "; ResLo.getNode()->dump(CurDAG); dbgs() << '\n'); + } + // Copy the high half of the result, if it is needed. + if (!SDValue(Node, 1).use_empty()) { + if (!ResHi.getNode()) { + assert(HiReg && "Register for high half is not defined!"); + ResHi = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, HiReg, NVT, + InFlag); + InFlag = ResHi.getValue(2); + } + ReplaceUses(SDValue(Node, 1), ResHi); + DEBUG(dbgs() << "=> "; ResHi.getNode()->dump(CurDAG); dbgs() << '\n'); + } + + return nullptr; + } + + case ISD::SDIVREM: + case ISD::UDIVREM: + case X86ISD::SDIVREM8_SEXT_HREG: + case X86ISD::UDIVREM8_ZEXT_HREG: { + SDValue N0 = Node->getOperand(0); + SDValue N1 = Node->getOperand(1); + + bool isSigned = (Opcode == ISD::SDIVREM || + Opcode == X86ISD::SDIVREM8_SEXT_HREG); + if (!isSigned) { + switch (NVT.SimpleTy) { + default: llvm_unreachable("Unsupported VT!"); + case MVT::i8: Opc = X86::DIV8r; MOpc = X86::DIV8m; break; + case MVT::i16: Opc = X86::DIV16r; MOpc = X86::DIV16m; break; + case MVT::i32: Opc = X86::DIV32r; MOpc = X86::DIV32m; break; + case MVT::i64: Opc = X86::DIV64r; MOpc = X86::DIV64m; break; + } + } else { + switch (NVT.SimpleTy) { + default: llvm_unreachable("Unsupported VT!"); + case MVT::i8: Opc = X86::IDIV8r; MOpc = X86::IDIV8m; break; + case MVT::i16: Opc = X86::IDIV16r; MOpc = X86::IDIV16m; break; + case MVT::i32: Opc = X86::IDIV32r; MOpc = X86::IDIV32m; break; + case MVT::i64: Opc = X86::IDIV64r; MOpc = X86::IDIV64m; break; + } + } + + unsigned LoReg, HiReg, ClrReg; + unsigned SExtOpcode; + switch (NVT.SimpleTy) { + default: llvm_unreachable("Unsupported VT!"); + case MVT::i8: + LoReg = X86::AL; ClrReg = HiReg = X86::AH; + SExtOpcode = X86::CBW; + break; + case MVT::i16: + LoReg = X86::AX; HiReg = X86::DX; + ClrReg = X86::DX; + SExtOpcode = X86::CWD; + break; + case MVT::i32: + LoReg = X86::EAX; ClrReg = HiReg = X86::EDX; + SExtOpcode = X86::CDQ; + break; + case MVT::i64: + LoReg = X86::RAX; ClrReg = HiReg = X86::RDX; + SExtOpcode = X86::CQO; + break; + } + + SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; + bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4); + bool signBitIsZero = CurDAG->SignBitIsZero(N0); + + SDValue InFlag; + if (NVT == MVT::i8 && (!isSigned || signBitIsZero)) { + // Special case for div8, just use a move with zero extension to AX to + // clear the upper 8 bits (AH). + SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Move, Chain; + if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) { + SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) }; + Move = + SDValue(CurDAG->getMachineNode(X86::MOVZX32rm8, dl, MVT::i32, + MVT::Other, Ops), 0); + Chain = Move.getValue(1); + ReplaceUses(N0.getValue(1), Chain); + } else { + Move = + SDValue(CurDAG->getMachineNode(X86::MOVZX32rr8, dl, MVT::i32, N0),0); + Chain = CurDAG->getEntryNode(); + } + Chain = CurDAG->getCopyToReg(Chain, dl, X86::EAX, Move, SDValue()); + InFlag = Chain.getValue(1); + } else { + InFlag = + CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, + LoReg, N0, SDValue()).getValue(1); + if (isSigned && !signBitIsZero) { + // Sign extend the low part into the high part. 
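+        // For i32 this emits CDQ, which replicates the sign bit of EAX into
+        // EDX so that EDX:EAX holds the sign-extended dividend (CBW, CWD and
+        // CQO play the same role for the other widths).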
+ InFlag = + SDValue(CurDAG->getMachineNode(SExtOpcode, dl, MVT::Glue, InFlag),0); + } else { + // Zero out the high part, effectively zero extending the input. + SDValue ClrNode = SDValue(CurDAG->getMachineNode(X86::MOV32r0, dl, NVT), 0); + switch (NVT.SimpleTy) { + case MVT::i16: + ClrNode = + SDValue(CurDAG->getMachineNode( + TargetOpcode::EXTRACT_SUBREG, dl, MVT::i16, ClrNode, + CurDAG->getTargetConstant(X86::sub_16bit, dl, + MVT::i32)), + 0); + break; + case MVT::i32: + break; + case MVT::i64: + ClrNode = + SDValue(CurDAG->getMachineNode( + TargetOpcode::SUBREG_TO_REG, dl, MVT::i64, + CurDAG->getTargetConstant(0, dl, MVT::i64), ClrNode, + CurDAG->getTargetConstant(X86::sub_32bit, dl, + MVT::i32)), + 0); + break; + default: + llvm_unreachable("Unexpected division source"); + } + + InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, ClrReg, + ClrNode, InFlag).getValue(1); + } + } + + if (foldedLoad) { + SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0), + InFlag }; + SDNode *CNode = + CurDAG->getMachineNode(MOpc, dl, MVT::Other, MVT::Glue, Ops); + InFlag = SDValue(CNode, 1); + // Update the chain. + ReplaceUses(N1.getValue(1), SDValue(CNode, 0)); + } else { + InFlag = + SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Glue, N1, InFlag), 0); + } + + // Prevent use of AH in a REX instruction by explicitly copying it to + // an ABCD_L register. + // + // The current assumption of the register allocator is that isel + // won't generate explicit references to the GR8_ABCD_H registers. If + // the allocator and/or the backend get enhanced to be more robust in + // that regard, this can be, and should be, removed. + if (HiReg == X86::AH && !SDValue(Node, 1).use_empty()) { + SDValue AHCopy = CurDAG->getRegister(X86::AH, MVT::i8); + unsigned AHExtOpcode = + isSigned ? X86::MOVSX32_NOREXrr8 : X86::MOVZX32_NOREXrr8; + + SDNode *RNode = CurDAG->getMachineNode(AHExtOpcode, dl, MVT::i32, + MVT::Glue, AHCopy, InFlag); + SDValue Result(RNode, 0); + InFlag = SDValue(RNode, 1); + + if (Opcode == X86ISD::UDIVREM8_ZEXT_HREG || + Opcode == X86ISD::SDIVREM8_SEXT_HREG) { + if (Node->getValueType(1) == MVT::i64) { + // It's not possible to directly movsx AH to a 64bit register, because + // the latter needs the REX prefix, but the former can't have it. + assert(Opcode != X86ISD::SDIVREM8_SEXT_HREG && + "Unexpected i64 sext of h-register"); + Result = + SDValue(CurDAG->getMachineNode( + TargetOpcode::SUBREG_TO_REG, dl, MVT::i64, + CurDAG->getTargetConstant(0, dl, MVT::i64), Result, + CurDAG->getTargetConstant(X86::sub_32bit, dl, + MVT::i32)), + 0); + } + } else { + Result = + CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result); + } + ReplaceUses(SDValue(Node, 1), Result); + DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); dbgs() << '\n'); + } + // Copy the division (low) result, if it is needed. + if (!SDValue(Node, 0).use_empty()) { + SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, + LoReg, NVT, InFlag); + InFlag = Result.getValue(2); + ReplaceUses(SDValue(Node, 0), Result); + DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); dbgs() << '\n'); + } + // Copy the remainder (high) result, if it is needed. 
+ if (!SDValue(Node, 1).use_empty()) { + SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, + HiReg, NVT, InFlag); + InFlag = Result.getValue(2); + ReplaceUses(SDValue(Node, 1), Result); + DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); dbgs() << '\n'); + } + return nullptr; + } + + case X86ISD::CMP: + case X86ISD::SUB: { + // Sometimes a SUB is used to perform comparison. + if (Opcode == X86ISD::SUB && Node->hasAnyUseOfValue(0)) + // This node is not a CMP. + break; + SDValue N0 = Node->getOperand(0); + SDValue N1 = Node->getOperand(1); + + if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse() && + hasNoSignedComparisonUses(Node)) + N0 = N0.getOperand(0); + + // Look for (X86cmp (and $op, $imm), 0) and see if we can convert it to + // use a smaller encoding. + // Look past the truncate if CMP is the only use of it. + if ((N0.getNode()->getOpcode() == ISD::AND || + (N0.getResNo() == 0 && N0.getNode()->getOpcode() == X86ISD::AND)) && + N0.getNode()->hasOneUse() && + N0.getValueType() != MVT::i8 && + X86::isZeroNode(N1)) { + ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getNode()->getOperand(1)); + if (!C) break; + + // For example, convert "testl %eax, $8" to "testb %al, $8" + if ((C->getZExtValue() & ~UINT64_C(0xff)) == 0 && + (!(C->getZExtValue() & 0x80) || + hasNoSignedComparisonUses(Node))) { + SDValue Imm = CurDAG->getTargetConstant(C->getZExtValue(), dl, MVT::i8); + SDValue Reg = N0.getNode()->getOperand(0); + + // On x86-32, only the ABCD registers have 8-bit subregisters. + if (!Subtarget->is64Bit()) { + const TargetRegisterClass *TRC; + switch (N0.getSimpleValueType().SimpleTy) { + case MVT::i32: TRC = &X86::GR32_ABCDRegClass; break; + case MVT::i16: TRC = &X86::GR16_ABCDRegClass; break; + default: llvm_unreachable("Unsupported TEST operand type!"); + } + SDValue RC = CurDAG->getTargetConstant(TRC->getID(), dl, MVT::i32); + Reg = SDValue(CurDAG->getMachineNode(X86::COPY_TO_REGCLASS, dl, + Reg.getValueType(), Reg, RC), 0); + } + + // Extract the l-register. + SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, + MVT::i8, Reg); + + // Emit a testb. + SDNode *NewNode = CurDAG->getMachineNode(X86::TEST8ri, dl, MVT::i32, + Subreg, Imm); + // Replace SUB|CMP with TEST, since SUB has two outputs while TEST has + // one, do not call ReplaceAllUsesWith. + ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)), + SDValue(NewNode, 0)); + return nullptr; + } + + // For example, "testl %eax, $2048" to "testb %ah, $8". + if ((C->getZExtValue() & ~UINT64_C(0xff00)) == 0 && + (!(C->getZExtValue() & 0x8000) || + hasNoSignedComparisonUses(Node))) { + // Shift the immediate right by 8 bits. + SDValue ShiftedImm = CurDAG->getTargetConstant(C->getZExtValue() >> 8, + dl, MVT::i8); + SDValue Reg = N0.getNode()->getOperand(0); + + // Put the value in an ABCD register. + const TargetRegisterClass *TRC; + switch (N0.getSimpleValueType().SimpleTy) { + case MVT::i64: TRC = &X86::GR64_ABCDRegClass; break; + case MVT::i32: TRC = &X86::GR32_ABCDRegClass; break; + case MVT::i16: TRC = &X86::GR16_ABCDRegClass; break; + default: llvm_unreachable("Unsupported TEST operand type!"); + } + SDValue RC = CurDAG->getTargetConstant(TRC->getID(), dl, MVT::i32); + Reg = SDValue(CurDAG->getMachineNode(X86::COPY_TO_REGCLASS, dl, + Reg.getValueType(), Reg, RC), 0); + + // Extract the h-register. + SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_8bit_hi, dl, + MVT::i8, Reg); + + // Emit a testb. 
The EXTRACT_SUBREG becomes a COPY that can only + // target GR8_NOREX registers, so make sure the register class is + // forced. + SDNode *NewNode = CurDAG->getMachineNode(X86::TEST8ri_NOREX, dl, + MVT::i32, Subreg, ShiftedImm); + // Replace SUB|CMP with TEST, since SUB has two outputs while TEST has + // one, do not call ReplaceAllUsesWith. + ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)), + SDValue(NewNode, 0)); + return nullptr; + } + + // For example, "testl %eax, $32776" to "testw %ax, $32776". + if ((C->getZExtValue() & ~UINT64_C(0xffff)) == 0 && + N0.getValueType() != MVT::i16 && + (!(C->getZExtValue() & 0x8000) || + hasNoSignedComparisonUses(Node))) { + SDValue Imm = CurDAG->getTargetConstant(C->getZExtValue(), dl, + MVT::i16); + SDValue Reg = N0.getNode()->getOperand(0); + + // Extract the 16-bit subregister. + SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_16bit, dl, + MVT::i16, Reg); + + // Emit a testw. + SDNode *NewNode = CurDAG->getMachineNode(X86::TEST16ri, dl, MVT::i32, + Subreg, Imm); + // Replace SUB|CMP with TEST, since SUB has two outputs while TEST has + // one, do not call ReplaceAllUsesWith. + ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)), + SDValue(NewNode, 0)); + return nullptr; + } + + // For example, "testq %rax, $268468232" to "testl %eax, $268468232". + if ((C->getZExtValue() & ~UINT64_C(0xffffffff)) == 0 && + N0.getValueType() == MVT::i64 && + (!(C->getZExtValue() & 0x80000000) || + hasNoSignedComparisonUses(Node))) { + SDValue Imm = CurDAG->getTargetConstant(C->getZExtValue(), dl, + MVT::i32); + SDValue Reg = N0.getNode()->getOperand(0); + + // Extract the 32-bit subregister. + SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_32bit, dl, + MVT::i32, Reg); + + // Emit a testl. + SDNode *NewNode = CurDAG->getMachineNode(X86::TEST32ri, dl, MVT::i32, + Subreg, Imm); + // Replace SUB|CMP with TEST, since SUB has two outputs while TEST has + // one, do not call ReplaceAllUsesWith. + ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)), + SDValue(NewNode, 0)); + return nullptr; + } + } + break; + } + case ISD::STORE: { + // Change a chain of {load; incr or dec; store} of the same value into + // a simple increment or decrement through memory of that value, if the + // uses of the modified value and its address are suitable. + // The DEC64m tablegen pattern is currently not able to match the case where + // the EFLAGS on the original DEC are used. (This also applies to + // {INC,DEC}X{64,32,16,8}.) + // We'll need to improve tablegen to allow flags to be transferred from a + // node in the pattern to the result node. 
probably with a new keyword + // for example, we have this + // def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst", + // [(store (add (loadi64 addr:$dst), -1), addr:$dst), + // (implicit EFLAGS)]>; + // but maybe need something like this + // def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst", + // [(store (add (loadi64 addr:$dst), -1), addr:$dst), + // (transferrable EFLAGS)]>; + + StoreSDNode *StoreNode = cast<StoreSDNode>(Node); + SDValue StoredVal = StoreNode->getOperand(1); + unsigned Opc = StoredVal->getOpcode(); + + LoadSDNode *LoadNode = nullptr; + SDValue InputChain; + if (!isLoadIncOrDecStore(StoreNode, Opc, StoredVal, CurDAG, + LoadNode, InputChain)) + break; + + SDValue Base, Scale, Index, Disp, Segment; + if (!selectAddr(LoadNode, LoadNode->getBasePtr(), + Base, Scale, Index, Disp, Segment)) + break; + + MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(2); + MemOp[0] = StoreNode->getMemOperand(); + MemOp[1] = LoadNode->getMemOperand(); + const SDValue Ops[] = { Base, Scale, Index, Disp, Segment, InputChain }; + EVT LdVT = LoadNode->getMemoryVT(); + unsigned newOpc = getFusedLdStOpcode(LdVT, Opc); + MachineSDNode *Result = CurDAG->getMachineNode(newOpc, + SDLoc(Node), + MVT::i32, MVT::Other, Ops); + Result->setMemRefs(MemOp, MemOp + 2); + + ReplaceUses(SDValue(StoreNode, 0), SDValue(Result, 1)); + ReplaceUses(SDValue(StoredVal.getNode(), 1), SDValue(Result, 0)); + + return Result; + } + } + + SDNode *ResNode = SelectCode(Node); + + DEBUG(dbgs() << "=> "; + if (ResNode == nullptr || ResNode == Node) + Node->dump(CurDAG); + else + ResNode->dump(CurDAG); + dbgs() << '\n'); + + return ResNode; +} + +bool X86DAGToDAGISel:: +SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID, + std::vector<SDValue> &OutOps) { + SDValue Op0, Op1, Op2, Op3, Op4; + switch (ConstraintID) { + default: + llvm_unreachable("Unexpected asm memory constraint"); + case InlineAsm::Constraint_i: + // FIXME: It seems strange that 'i' is needed here since it's supposed to + // be an immediate and not a memory constraint. + // Fallthrough. + case InlineAsm::Constraint_o: // offsetable ?? + case InlineAsm::Constraint_v: // not offsetable ?? + case InlineAsm::Constraint_m: // memory + case InlineAsm::Constraint_X: + if (!selectAddr(nullptr, Op, Op0, Op1, Op2, Op3, Op4)) + return true; + break; + } + + OutOps.push_back(Op0); + OutOps.push_back(Op1); + OutOps.push_back(Op2); + OutOps.push_back(Op3); + OutOps.push_back(Op4); + return false; +} + +/// This pass converts a legalized DAG into a X86-specific DAG, +/// ready for instruction scheduling. +FunctionPass *llvm::createX86ISelDag(X86TargetMachine &TM, + CodeGenOpt::Level OptLevel) { + return new X86DAGToDAGISel(TM, OptLevel); +} diff --git a/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp b/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp new file mode 100644 index 0000000..d31aab0 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -0,0 +1,28765 @@ +//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the interfaces that X86 uses to lower LLVM code into a +// selection DAG. 
+// +//===----------------------------------------------------------------------===// + +#include "X86ISelLowering.h" +#include "Utils/X86ShuffleDecode.h" +#include "X86CallingConv.h" +#include "X86FrameLowering.h" +#include "X86InstrBuilder.h" +#include "X86MachineFunctionInfo.h" +#include "X86ShuffleDecodeConstantPool.h" +#include "X86TargetMachine.h" +#include "X86TargetObjectFile.h" +#include "llvm/ADT/SmallBitVector.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringSwitch.h" +#include "llvm/Analysis/EHPersonalities.h" +#include "llvm/CodeGen/IntrinsicLowering.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineJumpTableInfo.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/WinEHFuncInfo.h" +#include "llvm/IR/CallSite.h" +#include "llvm/IR/CallingConv.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalAlias.h" +#include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Target/TargetOptions.h" +#include "X86IntrinsicsInfo.h" +#include <bitset> +#include <numeric> +#include <cctype> +using namespace llvm; + +#define DEBUG_TYPE "x86-isel" + +STATISTIC(NumTailCalls, "Number of tail calls"); + +static cl::opt<bool> ExperimentalVectorWideningLegalization( + "x86-experimental-vector-widening-legalization", cl::init(false), + cl::desc("Enable an experimental vector type legalization through widening " + "rather than promotion."), + cl::Hidden); + +X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, + const X86Subtarget &STI) + : TargetLowering(TM), Subtarget(&STI) { + X86ScalarSSEf64 = Subtarget->hasSSE2(); + X86ScalarSSEf32 = Subtarget->hasSSE1(); + MVT PtrVT = MVT::getIntegerVT(8 * TM.getPointerSize()); + + // Set up the TargetLowering object. + + // X86 is weird. It always uses i8 for shift amounts and setcc results. + setBooleanContents(ZeroOrOneBooleanContent); + // X86-SSE is even stranger. It uses -1 or 0 for vector masks. + setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); + + // For 64-bit, since we have so many registers, use the ILP scheduler. + // For 32-bit, use the register pressure specific scheduling. + // For Atom, always use ILP scheduling. + if (Subtarget->isAtom()) + setSchedulingPreference(Sched::ILP); + else if (Subtarget->is64Bit()) + setSchedulingPreference(Sched::ILP); + else + setSchedulingPreference(Sched::RegPressure); + const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); + setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister()); + + // Bypass expensive divides on Atom when compiling with O2. + if (TM.getOptLevel() >= CodeGenOpt::Default) { + if (Subtarget->hasSlowDivide32()) + addBypassSlowDiv(32, 8); + if (Subtarget->hasSlowDivide64() && Subtarget->is64Bit()) + addBypassSlowDiv(64, 16); + } + + if (Subtarget->isTargetKnownWindowsMSVC()) { + // Setup Windows compiler runtime calls. 
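+    // (e.g. a 64-bit sdiv/udiv/srem/urem/mul on 32-bit MSVC targets becomes a
+    // __stdcall call into the CRT helpers named below.)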
+ setLibcallName(RTLIB::SDIV_I64, "_alldiv"); + setLibcallName(RTLIB::UDIV_I64, "_aulldiv"); + setLibcallName(RTLIB::SREM_I64, "_allrem"); + setLibcallName(RTLIB::UREM_I64, "_aullrem"); + setLibcallName(RTLIB::MUL_I64, "_allmul"); + setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall); + setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall); + setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall); + setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall); + setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall); + } + + if (Subtarget->isTargetDarwin()) { + // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp. + setUseUnderscoreSetJmp(false); + setUseUnderscoreLongJmp(false); + } else if (Subtarget->isTargetWindowsGNU()) { + // MS runtime is weird: it exports _setjmp, but longjmp! + setUseUnderscoreSetJmp(true); + setUseUnderscoreLongJmp(false); + } else { + setUseUnderscoreSetJmp(true); + setUseUnderscoreLongJmp(true); + } + + // Set up the register classes. + addRegisterClass(MVT::i8, &X86::GR8RegClass); + addRegisterClass(MVT::i16, &X86::GR16RegClass); + addRegisterClass(MVT::i32, &X86::GR32RegClass); + if (Subtarget->is64Bit()) + addRegisterClass(MVT::i64, &X86::GR64RegClass); + + for (MVT VT : MVT::integer_valuetypes()) + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); + + // We don't accept any truncstore of integer registers. + setTruncStoreAction(MVT::i64, MVT::i32, Expand); + setTruncStoreAction(MVT::i64, MVT::i16, Expand); + setTruncStoreAction(MVT::i64, MVT::i8 , Expand); + setTruncStoreAction(MVT::i32, MVT::i16, Expand); + setTruncStoreAction(MVT::i32, MVT::i8 , Expand); + setTruncStoreAction(MVT::i16, MVT::i8, Expand); + + setTruncStoreAction(MVT::f64, MVT::f32, Expand); + + // SETOEQ and SETUNE require checking two conditions. + setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand); + setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand); + setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand); + setCondCodeAction(ISD::SETUNE, MVT::f32, Expand); + setCondCodeAction(ISD::SETUNE, MVT::f64, Expand); + setCondCodeAction(ISD::SETUNE, MVT::f80, Expand); + + // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this + // operation. + setOperationAction(ISD::UINT_TO_FP , MVT::i1 , Promote); + setOperationAction(ISD::UINT_TO_FP , MVT::i8 , Promote); + setOperationAction(ISD::UINT_TO_FP , MVT::i16 , Promote); + + if (Subtarget->is64Bit()) { + if (!Subtarget->useSoftFloat() && Subtarget->hasAVX512()) + // f32/f64 are legal, f80 is custom. + setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom); + else + setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote); + setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom); + } else if (!Subtarget->useSoftFloat()) { + // We have an algorithm for SSE2->double, and we turn this into a + // 64-bit FILD followed by conditional FADD for other targets. + setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom); + // We have an algorithm for SSE2, and we turn this into a 64-bit + // FILD or VCVTUSI2SS/SD for other targets. + setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom); + } + + // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have + // this operation. 
+ setOperationAction(ISD::SINT_TO_FP , MVT::i1 , Promote); + setOperationAction(ISD::SINT_TO_FP , MVT::i8 , Promote); + + if (!Subtarget->useSoftFloat()) { + // SSE has no i16 to fp conversion, only i32 + if (X86ScalarSSEf32) { + setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote); + // f32 and f64 cases are Legal, f80 case is not + setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom); + } else { + setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Custom); + setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom); + } + } else { + setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote); + setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Promote); + } + + // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have + // this operation. + setOperationAction(ISD::FP_TO_SINT , MVT::i1 , Promote); + setOperationAction(ISD::FP_TO_SINT , MVT::i8 , Promote); + + if (!Subtarget->useSoftFloat()) { + // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64 + // are Legal, f80 is custom lowered. + setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Custom); + setOperationAction(ISD::SINT_TO_FP , MVT::i64 , Custom); + + if (X86ScalarSSEf32) { + setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote); + // f32 and f64 cases are Legal, f80 case is not + setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom); + } else { + setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Custom); + setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom); + } + } else { + setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote); + setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Expand); + setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Expand); + } + + // Handle FP_TO_UINT by promoting the destination to a larger signed + // conversion. + setOperationAction(ISD::FP_TO_UINT , MVT::i1 , Promote); + setOperationAction(ISD::FP_TO_UINT , MVT::i8 , Promote); + setOperationAction(ISD::FP_TO_UINT , MVT::i16 , Promote); + + if (Subtarget->is64Bit()) { + if (!Subtarget->useSoftFloat() && Subtarget->hasAVX512()) { + // FP_TO_UINT-i32/i64 is legal for f32/f64, but custom for f80. + setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom); + setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom); + } else { + setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote); + setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand); + } + } else if (!Subtarget->useSoftFloat()) { + // Since AVX is a superset of SSE3, only check for SSE here. + if (Subtarget->hasSSE1() && !Subtarget->hasSSE3()) + // Expand FP_TO_UINT into a select. + // FIXME: We would like to use a Custom expander here eventually to do + // the optimal thing for SSE vs. the default expansion in the legalizer. + setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Expand); + else + // With AVX512 we can use vcvts[ds]2usi for f32/f64->i32, f80 is custom. + // With SSE3 we can use fisttpll to convert to a signed i64; without + // SSE, we're stuck with a fistpll. + setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom); + + setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom); + } + + // TODO: when we have SSE, these could be more efficient, by using movd/movq. + if (!X86ScalarSSEf64) { + setOperationAction(ISD::BITCAST , MVT::f32 , Expand); + setOperationAction(ISD::BITCAST , MVT::i32 , Expand); + if (Subtarget->is64Bit()) { + setOperationAction(ISD::BITCAST , MVT::f64 , Expand); + // Without SSE, i64->f64 goes through memory. 
+ setOperationAction(ISD::BITCAST , MVT::i64 , Expand); + } + } + + // Scalar integer divide and remainder are lowered to use operations that + // produce two results, to match the available instructions. This exposes + // the two-result form to trivial CSE, which is able to combine x/y and x%y + // into a single instruction. + // + // Scalar integer multiply-high is also lowered to use two-result + // operations, to match the available instructions. However, plain multiply + // (low) operations are left as Legal, as there are single-result + // instructions for this in x86. Using the two-result multiply instructions + // when both high and low results are needed must be arranged by dagcombine. + for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) { + setOperationAction(ISD::MULHS, VT, Expand); + setOperationAction(ISD::MULHU, VT, Expand); + setOperationAction(ISD::SDIV, VT, Expand); + setOperationAction(ISD::UDIV, VT, Expand); + setOperationAction(ISD::SREM, VT, Expand); + setOperationAction(ISD::UREM, VT, Expand); + + // Add/Sub overflow ops with MVT::Glues are lowered to EFLAGS dependences. + setOperationAction(ISD::ADDC, VT, Custom); + setOperationAction(ISD::ADDE, VT, Custom); + setOperationAction(ISD::SUBC, VT, Custom); + setOperationAction(ISD::SUBE, VT, Custom); + } + + setOperationAction(ISD::BR_JT , MVT::Other, Expand); + setOperationAction(ISD::BRCOND , MVT::Other, Custom); + setOperationAction(ISD::BR_CC , MVT::f32, Expand); + setOperationAction(ISD::BR_CC , MVT::f64, Expand); + setOperationAction(ISD::BR_CC , MVT::f80, Expand); + setOperationAction(ISD::BR_CC , MVT::f128, Expand); + setOperationAction(ISD::BR_CC , MVT::i8, Expand); + setOperationAction(ISD::BR_CC , MVT::i16, Expand); + setOperationAction(ISD::BR_CC , MVT::i32, Expand); + setOperationAction(ISD::BR_CC , MVT::i64, Expand); + setOperationAction(ISD::SELECT_CC , MVT::f32, Expand); + setOperationAction(ISD::SELECT_CC , MVT::f64, Expand); + setOperationAction(ISD::SELECT_CC , MVT::f80, Expand); + setOperationAction(ISD::SELECT_CC , MVT::f128, Expand); + setOperationAction(ISD::SELECT_CC , MVT::i8, Expand); + setOperationAction(ISD::SELECT_CC , MVT::i16, Expand); + setOperationAction(ISD::SELECT_CC , MVT::i32, Expand); + setOperationAction(ISD::SELECT_CC , MVT::i64, Expand); + if (Subtarget->is64Bit()) + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand); + setOperationAction(ISD::FP_ROUND_INREG , MVT::f32 , Expand); + + if (Subtarget->is32Bit() && Subtarget->isTargetKnownWindowsMSVC()) { + // On 32 bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)` + // is. We should promote the value to 64-bits to solve this. + // This is what the CRT headers do - `fmodf` is an inline header + // function casting to f64 and calling `fmod`. + setOperationAction(ISD::FREM , MVT::f32 , Promote); + } else { + setOperationAction(ISD::FREM , MVT::f32 , Expand); + } + + setOperationAction(ISD::FREM , MVT::f64 , Expand); + setOperationAction(ISD::FREM , MVT::f80 , Expand); + setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom); + + // Promote the i8 variants and force them on up to i32 which has a shorter + // encoding. 
+ setOperationAction(ISD::CTTZ , MVT::i8 , Promote); + AddPromotedToType (ISD::CTTZ , MVT::i8 , MVT::i32); + setOperationAction(ISD::CTTZ_ZERO_UNDEF , MVT::i8 , Promote); + AddPromotedToType (ISD::CTTZ_ZERO_UNDEF , MVT::i8 , MVT::i32); + if (Subtarget->hasBMI()) { + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16 , Expand); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32 , Expand); + if (Subtarget->is64Bit()) + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand); + } else { + setOperationAction(ISD::CTTZ , MVT::i16 , Custom); + setOperationAction(ISD::CTTZ , MVT::i32 , Custom); + if (Subtarget->is64Bit()) + setOperationAction(ISD::CTTZ , MVT::i64 , Custom); + } + + if (Subtarget->hasLZCNT()) { + // When promoting the i8 variants, force them to i32 for a shorter + // encoding. + setOperationAction(ISD::CTLZ , MVT::i8 , Promote); + AddPromotedToType (ISD::CTLZ , MVT::i8 , MVT::i32); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , Promote); + AddPromotedToType (ISD::CTLZ_ZERO_UNDEF, MVT::i8 , MVT::i32); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16 , Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32 , Expand); + if (Subtarget->is64Bit()) + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand); + } else { + setOperationAction(ISD::CTLZ , MVT::i8 , Custom); + setOperationAction(ISD::CTLZ , MVT::i16 , Custom); + setOperationAction(ISD::CTLZ , MVT::i32 , Custom); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , Custom); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16 , Custom); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32 , Custom); + if (Subtarget->is64Bit()) { + setOperationAction(ISD::CTLZ , MVT::i64 , Custom); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom); + } + } + + // Special handling for half-precision floating point conversions. + // If we don't have F16C support, then lower half float conversions + // into library calls. + if (Subtarget->useSoftFloat() || !Subtarget->hasF16C()) { + setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand); + setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand); + } + + // There's never any support for operations beyond MVT::f32. + setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand); + setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand); + setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand); + setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand); + + setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand); + setTruncStoreAction(MVT::f32, MVT::f16, Expand); + setTruncStoreAction(MVT::f64, MVT::f16, Expand); + setTruncStoreAction(MVT::f80, MVT::f16, Expand); + + if (Subtarget->hasPOPCNT()) { + setOperationAction(ISD::CTPOP , MVT::i8 , Promote); + } else { + setOperationAction(ISD::CTPOP , MVT::i8 , Expand); + setOperationAction(ISD::CTPOP , MVT::i16 , Expand); + setOperationAction(ISD::CTPOP , MVT::i32 , Expand); + if (Subtarget->is64Bit()) + setOperationAction(ISD::CTPOP , MVT::i64 , Expand); + } + + setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom); + + if (!Subtarget->hasMOVBE()) + setOperationAction(ISD::BSWAP , MVT::i16 , Expand); + + // These should be promoted to a larger select which is supported. + setOperationAction(ISD::SELECT , MVT::i1 , Promote); + // X86 wants to expand cmov itself. 
+ setOperationAction(ISD::SELECT , MVT::i8 , Custom); + setOperationAction(ISD::SELECT , MVT::i16 , Custom); + setOperationAction(ISD::SELECT , MVT::i32 , Custom); + setOperationAction(ISD::SELECT , MVT::f32 , Custom); + setOperationAction(ISD::SELECT , MVT::f64 , Custom); + setOperationAction(ISD::SELECT , MVT::f80 , Custom); + setOperationAction(ISD::SELECT , MVT::f128 , Custom); + setOperationAction(ISD::SETCC , MVT::i8 , Custom); + setOperationAction(ISD::SETCC , MVT::i16 , Custom); + setOperationAction(ISD::SETCC , MVT::i32 , Custom); + setOperationAction(ISD::SETCC , MVT::f32 , Custom); + setOperationAction(ISD::SETCC , MVT::f64 , Custom); + setOperationAction(ISD::SETCC , MVT::f80 , Custom); + setOperationAction(ISD::SETCC , MVT::f128 , Custom); + setOperationAction(ISD::SETCCE , MVT::i8 , Custom); + setOperationAction(ISD::SETCCE , MVT::i16 , Custom); + setOperationAction(ISD::SETCCE , MVT::i32 , Custom); + if (Subtarget->is64Bit()) { + setOperationAction(ISD::SELECT , MVT::i64 , Custom); + setOperationAction(ISD::SETCC , MVT::i64 , Custom); + setOperationAction(ISD::SETCCE , MVT::i64 , Custom); + } + setOperationAction(ISD::EH_RETURN , MVT::Other, Custom); + // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support + // SjLj exception handling but a light-weight setjmp/longjmp replacement to + // support continuation, user-level threading, and etc.. As a result, no + // other SjLj exception interfaces are implemented and please don't build + // your own exception handling based on them. + // LLVM/Clang supports zero-cost DWARF exception handling. + setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom); + setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom); + + // Darwin ABI issue. + setOperationAction(ISD::ConstantPool , MVT::i32 , Custom); + setOperationAction(ISD::JumpTable , MVT::i32 , Custom); + setOperationAction(ISD::GlobalAddress , MVT::i32 , Custom); + setOperationAction(ISD::GlobalTLSAddress, MVT::i32 , Custom); + if (Subtarget->is64Bit()) + setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom); + setOperationAction(ISD::ExternalSymbol , MVT::i32 , Custom); + setOperationAction(ISD::BlockAddress , MVT::i32 , Custom); + if (Subtarget->is64Bit()) { + setOperationAction(ISD::ConstantPool , MVT::i64 , Custom); + setOperationAction(ISD::JumpTable , MVT::i64 , Custom); + setOperationAction(ISD::GlobalAddress , MVT::i64 , Custom); + setOperationAction(ISD::ExternalSymbol, MVT::i64 , Custom); + setOperationAction(ISD::BlockAddress , MVT::i64 , Custom); + } + // 64-bit addm sub, shl, sra, srl (iff 32-bit x86) + setOperationAction(ISD::SHL_PARTS , MVT::i32 , Custom); + setOperationAction(ISD::SRA_PARTS , MVT::i32 , Custom); + setOperationAction(ISD::SRL_PARTS , MVT::i32 , Custom); + if (Subtarget->is64Bit()) { + setOperationAction(ISD::SHL_PARTS , MVT::i64 , Custom); + setOperationAction(ISD::SRA_PARTS , MVT::i64 , Custom); + setOperationAction(ISD::SRL_PARTS , MVT::i64 , Custom); + } + + if (Subtarget->hasSSE1()) + setOperationAction(ISD::PREFETCH , MVT::Other, Legal); + + setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom); + + // Expand certain atomics + for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) { + setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom); + setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom); + setOperationAction(ISD::ATOMIC_STORE, VT, Custom); + } + + if (Subtarget->hasCmpxchg16b()) { + setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom); + } + + // FIXME - use subtarget 
debug flags + if (!Subtarget->isTargetDarwin() && !Subtarget->isTargetELF() && + !Subtarget->isTargetCygMing() && !Subtarget->isTargetWin64()) { + setOperationAction(ISD::EH_LABEL, MVT::Other, Expand); + } + + setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom); + setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom); + + setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom); + setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom); + + setOperationAction(ISD::TRAP, MVT::Other, Legal); + setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal); + + // VASTART needs to be custom lowered to use the VarArgsFrameIndex + setOperationAction(ISD::VASTART , MVT::Other, Custom); + setOperationAction(ISD::VAEND , MVT::Other, Expand); + if (Subtarget->is64Bit()) { + setOperationAction(ISD::VAARG , MVT::Other, Custom); + setOperationAction(ISD::VACOPY , MVT::Other, Custom); + } else { + // TargetInfo::CharPtrBuiltinVaList + setOperationAction(ISD::VAARG , MVT::Other, Expand); + setOperationAction(ISD::VACOPY , MVT::Other, Expand); + } + + setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); + setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); + + setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom); + + // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering. + setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom); + setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom); + + if (!Subtarget->useSoftFloat() && X86ScalarSSEf64) { + // f32 and f64 use SSE. + // Set up the FP register classes. + addRegisterClass(MVT::f32, &X86::FR32RegClass); + addRegisterClass(MVT::f64, &X86::FR64RegClass); + + // Use ANDPD to simulate FABS. + setOperationAction(ISD::FABS , MVT::f64, Custom); + setOperationAction(ISD::FABS , MVT::f32, Custom); + + // Use XORP to simulate FNEG. + setOperationAction(ISD::FNEG , MVT::f64, Custom); + setOperationAction(ISD::FNEG , MVT::f32, Custom); + + // Use ANDPD and ORPD to simulate FCOPYSIGN. + setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom); + setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); + + // Lower this to FGETSIGNx86 plus an AND. + setOperationAction(ISD::FGETSIGN, MVT::i64, Custom); + setOperationAction(ISD::FGETSIGN, MVT::i32, Custom); + + // We don't support sin/cos/fmod + setOperationAction(ISD::FSIN , MVT::f64, Expand); + setOperationAction(ISD::FCOS , MVT::f64, Expand); + setOperationAction(ISD::FSINCOS, MVT::f64, Expand); + setOperationAction(ISD::FSIN , MVT::f32, Expand); + setOperationAction(ISD::FCOS , MVT::f32, Expand); + setOperationAction(ISD::FSINCOS, MVT::f32, Expand); + + // Expand FP immediates into loads from the stack, except for the special + // cases we handle. + addLegalFPImmediate(APFloat(+0.0)); // xorpd + addLegalFPImmediate(APFloat(+0.0f)); // xorps + } else if (!Subtarget->useSoftFloat() && X86ScalarSSEf32) { + // Use SSE for f32, x87 for f64. + // Set up the FP register classes. + addRegisterClass(MVT::f32, &X86::FR32RegClass); + addRegisterClass(MVT::f64, &X86::RFP64RegClass); + + // Use ANDPS to simulate FABS. + setOperationAction(ISD::FABS , MVT::f32, Custom); + + // Use XORP to simulate FNEG. + setOperationAction(ISD::FNEG , MVT::f32, Custom); + + setOperationAction(ISD::UNDEF, MVT::f64, Expand); + + // Use ANDPS and ORPS to simulate FCOPYSIGN. 
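+    // (copysign(x, y) is computed with bit masks:
+    //   (x & ~SignMask) | (y & SignMask)
+    // using ANDPS/ANDNPS and ORPS on a constant-pool sign-bit mask.)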
+ setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); + setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); + + // We don't support sin/cos/fmod + setOperationAction(ISD::FSIN , MVT::f32, Expand); + setOperationAction(ISD::FCOS , MVT::f32, Expand); + setOperationAction(ISD::FSINCOS, MVT::f32, Expand); + + // Special cases we handle for FP constants. + addLegalFPImmediate(APFloat(+0.0f)); // xorps + addLegalFPImmediate(APFloat(+0.0)); // FLD0 + addLegalFPImmediate(APFloat(+1.0)); // FLD1 + addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS + addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS + + if (!TM.Options.UnsafeFPMath) { + setOperationAction(ISD::FSIN , MVT::f64, Expand); + setOperationAction(ISD::FCOS , MVT::f64, Expand); + setOperationAction(ISD::FSINCOS, MVT::f64, Expand); + } + } else if (!Subtarget->useSoftFloat()) { + // f32 and f64 in x87. + // Set up the FP register classes. + addRegisterClass(MVT::f64, &X86::RFP64RegClass); + addRegisterClass(MVT::f32, &X86::RFP32RegClass); + + setOperationAction(ISD::UNDEF, MVT::f64, Expand); + setOperationAction(ISD::UNDEF, MVT::f32, Expand); + setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); + setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); + + if (!TM.Options.UnsafeFPMath) { + setOperationAction(ISD::FSIN , MVT::f64, Expand); + setOperationAction(ISD::FSIN , MVT::f32, Expand); + setOperationAction(ISD::FCOS , MVT::f64, Expand); + setOperationAction(ISD::FCOS , MVT::f32, Expand); + setOperationAction(ISD::FSINCOS, MVT::f64, Expand); + setOperationAction(ISD::FSINCOS, MVT::f32, Expand); + } + addLegalFPImmediate(APFloat(+0.0)); // FLD0 + addLegalFPImmediate(APFloat(+1.0)); // FLD1 + addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS + addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS + addLegalFPImmediate(APFloat(+0.0f)); // FLD0 + addLegalFPImmediate(APFloat(+1.0f)); // FLD1 + addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS + addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS + } + + // We don't support FMA. + setOperationAction(ISD::FMA, MVT::f64, Expand); + setOperationAction(ISD::FMA, MVT::f32, Expand); + + // Long double always uses X87, except f128 in MMX. 
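+  // (f128 here is the IEEE quad "__float128" type: values are carried in XMM
+  // registers via FR128, but its arithmetic is softened to libcalls.)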
+ if (!Subtarget->useSoftFloat()) { + if (Subtarget->is64Bit() && Subtarget->hasMMX()) { + addRegisterClass(MVT::f128, &X86::FR128RegClass); + ValueTypeActions.setTypeAction(MVT::f128, TypeSoftenFloat); + setOperationAction(ISD::FABS , MVT::f128, Custom); + setOperationAction(ISD::FNEG , MVT::f128, Custom); + setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom); + } + + addRegisterClass(MVT::f80, &X86::RFP80RegClass); + setOperationAction(ISD::UNDEF, MVT::f80, Expand); + setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand); + { + APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended); + addLegalFPImmediate(TmpFlt); // FLD0 + TmpFlt.changeSign(); + addLegalFPImmediate(TmpFlt); // FLD0/FCHS + + bool ignored; + APFloat TmpFlt2(+1.0); + TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven, + &ignored); + addLegalFPImmediate(TmpFlt2); // FLD1 + TmpFlt2.changeSign(); + addLegalFPImmediate(TmpFlt2); // FLD1/FCHS + } + + if (!TM.Options.UnsafeFPMath) { + setOperationAction(ISD::FSIN , MVT::f80, Expand); + setOperationAction(ISD::FCOS , MVT::f80, Expand); + setOperationAction(ISD::FSINCOS, MVT::f80, Expand); + } + + setOperationAction(ISD::FFLOOR, MVT::f80, Expand); + setOperationAction(ISD::FCEIL, MVT::f80, Expand); + setOperationAction(ISD::FTRUNC, MVT::f80, Expand); + setOperationAction(ISD::FRINT, MVT::f80, Expand); + setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand); + setOperationAction(ISD::FMA, MVT::f80, Expand); + } + + // Always use a library call for pow. + setOperationAction(ISD::FPOW , MVT::f32 , Expand); + setOperationAction(ISD::FPOW , MVT::f64 , Expand); + setOperationAction(ISD::FPOW , MVT::f80 , Expand); + + setOperationAction(ISD::FLOG, MVT::f80, Expand); + setOperationAction(ISD::FLOG2, MVT::f80, Expand); + setOperationAction(ISD::FLOG10, MVT::f80, Expand); + setOperationAction(ISD::FEXP, MVT::f80, Expand); + setOperationAction(ISD::FEXP2, MVT::f80, Expand); + setOperationAction(ISD::FMINNUM, MVT::f80, Expand); + setOperationAction(ISD::FMAXNUM, MVT::f80, Expand); + + // First set operation action for all vector types to either promote + // (for widening) or expand (for scalarization). Then we will selectively + // turn on ones that can be effectively codegen'd. 
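+  // (The loop below marks essentially every operation Expand on every vector
+  // type; the feature-specific blocks that follow re-mark the profitable ones
+  // as Legal or Custom.)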
+ for (MVT VT : MVT::vector_valuetypes()) { + setOperationAction(ISD::ADD , VT, Expand); + setOperationAction(ISD::SUB , VT, Expand); + setOperationAction(ISD::FADD, VT, Expand); + setOperationAction(ISD::FNEG, VT, Expand); + setOperationAction(ISD::FSUB, VT, Expand); + setOperationAction(ISD::MUL , VT, Expand); + setOperationAction(ISD::FMUL, VT, Expand); + setOperationAction(ISD::SDIV, VT, Expand); + setOperationAction(ISD::UDIV, VT, Expand); + setOperationAction(ISD::FDIV, VT, Expand); + setOperationAction(ISD::SREM, VT, Expand); + setOperationAction(ISD::UREM, VT, Expand); + setOperationAction(ISD::LOAD, VT, Expand); + setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand); + setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand); + setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand); + setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand); + setOperationAction(ISD::FABS, VT, Expand); + setOperationAction(ISD::FSIN, VT, Expand); + setOperationAction(ISD::FSINCOS, VT, Expand); + setOperationAction(ISD::FCOS, VT, Expand); + setOperationAction(ISD::FSINCOS, VT, Expand); + setOperationAction(ISD::FREM, VT, Expand); + setOperationAction(ISD::FMA, VT, Expand); + setOperationAction(ISD::FPOWI, VT, Expand); + setOperationAction(ISD::FSQRT, VT, Expand); + setOperationAction(ISD::FCOPYSIGN, VT, Expand); + setOperationAction(ISD::FFLOOR, VT, Expand); + setOperationAction(ISD::FCEIL, VT, Expand); + setOperationAction(ISD::FTRUNC, VT, Expand); + setOperationAction(ISD::FRINT, VT, Expand); + setOperationAction(ISD::FNEARBYINT, VT, Expand); + setOperationAction(ISD::SMUL_LOHI, VT, Expand); + setOperationAction(ISD::MULHS, VT, Expand); + setOperationAction(ISD::UMUL_LOHI, VT, Expand); + setOperationAction(ISD::MULHU, VT, Expand); + setOperationAction(ISD::SDIVREM, VT, Expand); + setOperationAction(ISD::UDIVREM, VT, Expand); + setOperationAction(ISD::FPOW, VT, Expand); + setOperationAction(ISD::CTPOP, VT, Expand); + setOperationAction(ISD::CTTZ, VT, Expand); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand); + setOperationAction(ISD::CTLZ, VT, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand); + setOperationAction(ISD::SHL, VT, Expand); + setOperationAction(ISD::SRA, VT, Expand); + setOperationAction(ISD::SRL, VT, Expand); + setOperationAction(ISD::ROTL, VT, Expand); + setOperationAction(ISD::ROTR, VT, Expand); + setOperationAction(ISD::BSWAP, VT, Expand); + setOperationAction(ISD::SETCC, VT, Expand); + setOperationAction(ISD::FLOG, VT, Expand); + setOperationAction(ISD::FLOG2, VT, Expand); + setOperationAction(ISD::FLOG10, VT, Expand); + setOperationAction(ISD::FEXP, VT, Expand); + setOperationAction(ISD::FEXP2, VT, Expand); + setOperationAction(ISD::FP_TO_UINT, VT, Expand); + setOperationAction(ISD::FP_TO_SINT, VT, Expand); + setOperationAction(ISD::UINT_TO_FP, VT, Expand); + setOperationAction(ISD::SINT_TO_FP, VT, Expand); + setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand); + setOperationAction(ISD::TRUNCATE, VT, Expand); + setOperationAction(ISD::SIGN_EXTEND, VT, Expand); + setOperationAction(ISD::ZERO_EXTEND, VT, Expand); + setOperationAction(ISD::ANY_EXTEND, VT, Expand); + setOperationAction(ISD::VSELECT, VT, Expand); + setOperationAction(ISD::SELECT_CC, VT, Expand); + for (MVT InnerVT : MVT::vector_valuetypes()) { + setTruncStoreAction(InnerVT, VT, Expand); + + setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand); + setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand); + + // N.b. 
ISD::EXTLOAD legality is basically ignored except for i1-like + // types, we have to deal with them whether we ask for Expansion or not. + // Setting Expand causes its own optimisation problems though, so leave + // them legal. + if (VT.getVectorElementType() == MVT::i1) + setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand); + + // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are + // split/scalarized right now. + if (VT.getVectorElementType() == MVT::f16) + setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand); + } + } + + // FIXME: In order to prevent SSE instructions being expanded to MMX ones + // with -msoft-float, disable use of MMX as well. + if (!Subtarget->useSoftFloat() && Subtarget->hasMMX()) { + addRegisterClass(MVT::x86mmx, &X86::VR64RegClass); + // No operations on x86mmx supported, everything uses intrinsics. + } + + // MMX-sized vectors (other than x86mmx) are expected to be expanded + // into smaller operations. + for (MVT MMXTy : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v1i64}) { + setOperationAction(ISD::MULHS, MMXTy, Expand); + setOperationAction(ISD::AND, MMXTy, Expand); + setOperationAction(ISD::OR, MMXTy, Expand); + setOperationAction(ISD::XOR, MMXTy, Expand); + setOperationAction(ISD::SCALAR_TO_VECTOR, MMXTy, Expand); + setOperationAction(ISD::SELECT, MMXTy, Expand); + setOperationAction(ISD::BITCAST, MMXTy, Expand); + } + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v1i64, Expand); + + if (!Subtarget->useSoftFloat() && Subtarget->hasSSE1()) { + addRegisterClass(MVT::v4f32, &X86::VR128RegClass); + + setOperationAction(ISD::FADD, MVT::v4f32, Legal); + setOperationAction(ISD::FSUB, MVT::v4f32, Legal); + setOperationAction(ISD::FMUL, MVT::v4f32, Legal); + setOperationAction(ISD::FDIV, MVT::v4f32, Legal); + setOperationAction(ISD::FSQRT, MVT::v4f32, Legal); + setOperationAction(ISD::FNEG, MVT::v4f32, Custom); + setOperationAction(ISD::FABS, MVT::v4f32, Custom); + setOperationAction(ISD::LOAD, MVT::v4f32, Legal); + setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom); + setOperationAction(ISD::VSELECT, MVT::v4f32, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); + setOperationAction(ISD::SELECT, MVT::v4f32, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom); + } + + if (!Subtarget->useSoftFloat() && Subtarget->hasSSE2()) { + addRegisterClass(MVT::v2f64, &X86::VR128RegClass); + + // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM + // registers cannot be used even for integer operations. 
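+    // (-no-implicit-float promises that the compiler will not introduce
+    // FP/vector register uses the source did not ask for, e.g. in kernel
+    // code that does not save the SSE state.)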
+ addRegisterClass(MVT::v16i8, &X86::VR128RegClass); + addRegisterClass(MVT::v8i16, &X86::VR128RegClass); + addRegisterClass(MVT::v4i32, &X86::VR128RegClass); + addRegisterClass(MVT::v2i64, &X86::VR128RegClass); + + setOperationAction(ISD::ADD, MVT::v16i8, Legal); + setOperationAction(ISD::ADD, MVT::v8i16, Legal); + setOperationAction(ISD::ADD, MVT::v4i32, Legal); + setOperationAction(ISD::ADD, MVT::v2i64, Legal); + setOperationAction(ISD::MUL, MVT::v16i8, Custom); + setOperationAction(ISD::MUL, MVT::v4i32, Custom); + setOperationAction(ISD::MUL, MVT::v2i64, Custom); + setOperationAction(ISD::UMUL_LOHI, MVT::v4i32, Custom); + setOperationAction(ISD::SMUL_LOHI, MVT::v4i32, Custom); + setOperationAction(ISD::MULHU, MVT::v8i16, Legal); + setOperationAction(ISD::MULHS, MVT::v8i16, Legal); + setOperationAction(ISD::SUB, MVT::v16i8, Legal); + setOperationAction(ISD::SUB, MVT::v8i16, Legal); + setOperationAction(ISD::SUB, MVT::v4i32, Legal); + setOperationAction(ISD::SUB, MVT::v2i64, Legal); + setOperationAction(ISD::MUL, MVT::v8i16, Legal); + setOperationAction(ISD::FADD, MVT::v2f64, Legal); + setOperationAction(ISD::FSUB, MVT::v2f64, Legal); + setOperationAction(ISD::FMUL, MVT::v2f64, Legal); + setOperationAction(ISD::FDIV, MVT::v2f64, Legal); + setOperationAction(ISD::FSQRT, MVT::v2f64, Legal); + setOperationAction(ISD::FNEG, MVT::v2f64, Custom); + setOperationAction(ISD::FABS, MVT::v2f64, Custom); + + setOperationAction(ISD::SMAX, MVT::v8i16, Legal); + setOperationAction(ISD::UMAX, MVT::v16i8, Legal); + setOperationAction(ISD::SMIN, MVT::v8i16, Legal); + setOperationAction(ISD::UMIN, MVT::v16i8, Legal); + + setOperationAction(ISD::SETCC, MVT::v2i64, Custom); + setOperationAction(ISD::SETCC, MVT::v16i8, Custom); + setOperationAction(ISD::SETCC, MVT::v8i16, Custom); + setOperationAction(ISD::SETCC, MVT::v4i32, Custom); + + setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Custom); + setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); + + setOperationAction(ISD::CTPOP, MVT::v16i8, Custom); + setOperationAction(ISD::CTPOP, MVT::v8i16, Custom); + setOperationAction(ISD::CTPOP, MVT::v4i32, Custom); + setOperationAction(ISD::CTPOP, MVT::v2i64, Custom); + + setOperationAction(ISD::CTTZ, MVT::v16i8, Custom); + setOperationAction(ISD::CTTZ, MVT::v8i16, Custom); + setOperationAction(ISD::CTTZ, MVT::v4i32, Custom); + // ISD::CTTZ v2i64 - scalarization is faster. + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i8, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i16, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i32, Custom); + // ISD::CTTZ_ZERO_UNDEF v2i64 - scalarization is faster. + + // Custom lower build_vector, vector_shuffle, and extract_vector_elt. + for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) { + setOperationAction(ISD::BUILD_VECTOR, VT, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); + setOperationAction(ISD::VSELECT, VT, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); + } + + // We support custom legalizing of sext and anyext loads for specific + // memory vector types which we can load as a scalar (or sequence of + // scalars) and extend in-register to a legal 128-bit vector type. For sext + // loads these must work with a single scalar load. 
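+    // (e.g. a sextload of v4i8 can be done with one 32-bit scalar load
+    // followed by an in-register extend: PMOVSXBD with SSE4.1, or
+    // punpck + arithmetic shifts on plain SSE2.)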
+ for (MVT VT : MVT::integer_vector_valuetypes()) { + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Custom); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Custom); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i8, Custom); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Custom); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Custom); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Custom); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Custom); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Custom); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom); + } + + setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom); + setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Custom); + setOperationAction(ISD::VSELECT, MVT::v2f64, Custom); + setOperationAction(ISD::VSELECT, MVT::v2i64, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f64, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom); + + if (Subtarget->is64Bit()) { + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom); + } + + // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64. + for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) { + setOperationAction(ISD::AND, VT, Promote); + AddPromotedToType (ISD::AND, VT, MVT::v2i64); + setOperationAction(ISD::OR, VT, Promote); + AddPromotedToType (ISD::OR, VT, MVT::v2i64); + setOperationAction(ISD::XOR, VT, Promote); + AddPromotedToType (ISD::XOR, VT, MVT::v2i64); + setOperationAction(ISD::LOAD, VT, Promote); + AddPromotedToType (ISD::LOAD, VT, MVT::v2i64); + setOperationAction(ISD::SELECT, VT, Promote); + AddPromotedToType (ISD::SELECT, VT, MVT::v2i64); + } + + // Custom lower v2i64 and v2f64 selects. + setOperationAction(ISD::LOAD, MVT::v2f64, Legal); + setOperationAction(ISD::LOAD, MVT::v2i64, Legal); + setOperationAction(ISD::SELECT, MVT::v2f64, Custom); + setOperationAction(ISD::SELECT, MVT::v2i64, Custom); + + setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal); + setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal); + + setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom); + + setOperationAction(ISD::UINT_TO_FP, MVT::v4i8, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom); + // As there is no 64-bit GPR available, we need build a special custom + // sequence to convert from v2i32 to v2f32. 
+ if (!Subtarget->is64Bit()) + setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom); + + setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom); + setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom); + + for (MVT VT : MVT::fp_vector_valuetypes()) + setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal); + + setOperationAction(ISD::BITCAST, MVT::v2i32, Custom); + setOperationAction(ISD::BITCAST, MVT::v4i16, Custom); + setOperationAction(ISD::BITCAST, MVT::v8i8, Custom); + } + + if (!Subtarget->useSoftFloat() && Subtarget->hasSSE41()) { + for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) { + setOperationAction(ISD::FFLOOR, RoundedTy, Legal); + setOperationAction(ISD::FCEIL, RoundedTy, Legal); + setOperationAction(ISD::FTRUNC, RoundedTy, Legal); + setOperationAction(ISD::FRINT, RoundedTy, Legal); + setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal); + } + + setOperationAction(ISD::SMAX, MVT::v16i8, Legal); + setOperationAction(ISD::SMAX, MVT::v4i32, Legal); + setOperationAction(ISD::UMAX, MVT::v8i16, Legal); + setOperationAction(ISD::UMAX, MVT::v4i32, Legal); + setOperationAction(ISD::SMIN, MVT::v16i8, Legal); + setOperationAction(ISD::SMIN, MVT::v4i32, Legal); + setOperationAction(ISD::UMIN, MVT::v8i16, Legal); + setOperationAction(ISD::UMIN, MVT::v4i32, Legal); + + // FIXME: Do we need to handle scalar-to-vector here? + setOperationAction(ISD::MUL, MVT::v4i32, Legal); + + // We directly match byte blends in the backend as they match the VSELECT + // condition form. + setOperationAction(ISD::VSELECT, MVT::v16i8, Legal); + + // SSE41 brings specific instructions for doing vector sign extend even in + // cases where we don't have SRA. + for (MVT VT : MVT::integer_vector_valuetypes()) { + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom); + } + + // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X + setLoadExtAction(ISD::SEXTLOAD, MVT::v8i16, MVT::v8i8, Legal); + setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Legal); + setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i8, Legal); + setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i16, Legal); + setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i16, Legal); + setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i32, Legal); + + setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i16, MVT::v8i8, Legal); + setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Legal); + setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i8, Legal); + setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i16, Legal); + setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i16, Legal); + setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i32, Legal); + + // i8 and i16 vectors are custom because the source register and source + // source memory operand types are not the same width. f32 vectors are + // custom since the immediate controlling the insert encodes additional + // information. 
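+    // (PINSRB/PINSRW read their scalar from a 32-bit GPR, and the INSERTPS
+    // immediate packs source/destination lane selectors plus a zero mask.)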
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); + + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); + + // FIXME: these should be Legal, but that's only for the case where + // the index is constant. For now custom expand to deal with that. + if (Subtarget->is64Bit()) { + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom); + } + } + + if (Subtarget->hasSSE2()) { + setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom); + setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom); + setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom); + + setOperationAction(ISD::SRL, MVT::v8i16, Custom); + setOperationAction(ISD::SRL, MVT::v16i8, Custom); + + setOperationAction(ISD::SHL, MVT::v8i16, Custom); + setOperationAction(ISD::SHL, MVT::v16i8, Custom); + + setOperationAction(ISD::SRA, MVT::v8i16, Custom); + setOperationAction(ISD::SRA, MVT::v16i8, Custom); + + // In the customized shift lowering, the legal cases in AVX2 will be + // recognized. + setOperationAction(ISD::SRL, MVT::v2i64, Custom); + setOperationAction(ISD::SRL, MVT::v4i32, Custom); + + setOperationAction(ISD::SHL, MVT::v2i64, Custom); + setOperationAction(ISD::SHL, MVT::v4i32, Custom); + + setOperationAction(ISD::SRA, MVT::v2i64, Custom); + setOperationAction(ISD::SRA, MVT::v4i32, Custom); + } + + if (Subtarget->hasXOP()) { + setOperationAction(ISD::ROTL, MVT::v16i8, Custom); + setOperationAction(ISD::ROTL, MVT::v8i16, Custom); + setOperationAction(ISD::ROTL, MVT::v4i32, Custom); + setOperationAction(ISD::ROTL, MVT::v2i64, Custom); + setOperationAction(ISD::ROTL, MVT::v32i8, Custom); + setOperationAction(ISD::ROTL, MVT::v16i16, Custom); + setOperationAction(ISD::ROTL, MVT::v8i32, Custom); + setOperationAction(ISD::ROTL, MVT::v4i64, Custom); + } + + if (!Subtarget->useSoftFloat() && Subtarget->hasFp256()) { + addRegisterClass(MVT::v32i8, &X86::VR256RegClass); + addRegisterClass(MVT::v16i16, &X86::VR256RegClass); + addRegisterClass(MVT::v8i32, &X86::VR256RegClass); + addRegisterClass(MVT::v8f32, &X86::VR256RegClass); + addRegisterClass(MVT::v4i64, &X86::VR256RegClass); + addRegisterClass(MVT::v4f64, &X86::VR256RegClass); + + setOperationAction(ISD::LOAD, MVT::v8f32, Legal); + setOperationAction(ISD::LOAD, MVT::v4f64, Legal); + setOperationAction(ISD::LOAD, MVT::v4i64, Legal); + + setOperationAction(ISD::FADD, MVT::v8f32, Legal); + setOperationAction(ISD::FSUB, MVT::v8f32, Legal); + setOperationAction(ISD::FMUL, MVT::v8f32, Legal); + setOperationAction(ISD::FDIV, MVT::v8f32, Legal); + setOperationAction(ISD::FSQRT, MVT::v8f32, Legal); + setOperationAction(ISD::FFLOOR, MVT::v8f32, Legal); + setOperationAction(ISD::FCEIL, MVT::v8f32, Legal); + setOperationAction(ISD::FTRUNC, MVT::v8f32, Legal); + setOperationAction(ISD::FRINT, MVT::v8f32, Legal); + setOperationAction(ISD::FNEARBYINT, MVT::v8f32, Legal); + setOperationAction(ISD::FNEG, MVT::v8f32, Custom); + setOperationAction(ISD::FABS, MVT::v8f32, Custom); + + setOperationAction(ISD::FADD, MVT::v4f64, Legal); + setOperationAction(ISD::FSUB, 
MVT::v4f64, Legal); + setOperationAction(ISD::FMUL, MVT::v4f64, Legal); + setOperationAction(ISD::FDIV, MVT::v4f64, Legal); + setOperationAction(ISD::FSQRT, MVT::v4f64, Legal); + setOperationAction(ISD::FFLOOR, MVT::v4f64, Legal); + setOperationAction(ISD::FCEIL, MVT::v4f64, Legal); + setOperationAction(ISD::FTRUNC, MVT::v4f64, Legal); + setOperationAction(ISD::FRINT, MVT::v4f64, Legal); + setOperationAction(ISD::FNEARBYINT, MVT::v4f64, Legal); + setOperationAction(ISD::FNEG, MVT::v4f64, Custom); + setOperationAction(ISD::FABS, MVT::v4f64, Custom); + + // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted + // even though v8i16 is a legal type. + setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Promote); + setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Promote); + setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal); + + setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Promote); + setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal); + setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal); + + setOperationAction(ISD::UINT_TO_FP, MVT::v8i8, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom); + + for (MVT VT : MVT::fp_vector_valuetypes()) + setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal); + + setOperationAction(ISD::SRL, MVT::v16i16, Custom); + setOperationAction(ISD::SRL, MVT::v32i8, Custom); + + setOperationAction(ISD::SHL, MVT::v16i16, Custom); + setOperationAction(ISD::SHL, MVT::v32i8, Custom); + + setOperationAction(ISD::SRA, MVT::v16i16, Custom); + setOperationAction(ISD::SRA, MVT::v32i8, Custom); + + setOperationAction(ISD::SETCC, MVT::v32i8, Custom); + setOperationAction(ISD::SETCC, MVT::v16i16, Custom); + setOperationAction(ISD::SETCC, MVT::v8i32, Custom); + setOperationAction(ISD::SETCC, MVT::v4i64, Custom); + + setOperationAction(ISD::SELECT, MVT::v4f64, Custom); + setOperationAction(ISD::SELECT, MVT::v4i64, Custom); + setOperationAction(ISD::SELECT, MVT::v8f32, Custom); + + setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom); + setOperationAction(ISD::SIGN_EXTEND, MVT::v8i32, Custom); + setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom); + setOperationAction(ISD::ZERO_EXTEND, MVT::v4i64, Custom); + setOperationAction(ISD::ZERO_EXTEND, MVT::v8i32, Custom); + setOperationAction(ISD::ZERO_EXTEND, MVT::v16i16, Custom); + setOperationAction(ISD::ANY_EXTEND, MVT::v4i64, Custom); + setOperationAction(ISD::ANY_EXTEND, MVT::v8i32, Custom); + setOperationAction(ISD::ANY_EXTEND, MVT::v16i16, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom); + + setOperationAction(ISD::CTPOP, MVT::v32i8, Custom); + setOperationAction(ISD::CTPOP, MVT::v16i16, Custom); + setOperationAction(ISD::CTPOP, MVT::v8i32, Custom); + setOperationAction(ISD::CTPOP, MVT::v4i64, Custom); + + setOperationAction(ISD::CTTZ, MVT::v32i8, Custom); + setOperationAction(ISD::CTTZ, MVT::v16i16, Custom); + setOperationAction(ISD::CTTZ, MVT::v8i32, Custom); + setOperationAction(ISD::CTTZ, MVT::v4i64, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v32i8, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i16, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i32, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i64, Custom); + + if (Subtarget->hasAnyFMA()) { + setOperationAction(ISD::FMA, MVT::v8f32, Legal); + setOperationAction(ISD::FMA, MVT::v4f64, Legal); + setOperationAction(ISD::FMA, MVT::v4f32, Legal); + 
setOperationAction(ISD::FMA, MVT::v2f64, Legal); + setOperationAction(ISD::FMA, MVT::f32, Legal); + setOperationAction(ISD::FMA, MVT::f64, Legal); + } + + if (Subtarget->hasInt256()) { + setOperationAction(ISD::ADD, MVT::v4i64, Legal); + setOperationAction(ISD::ADD, MVT::v8i32, Legal); + setOperationAction(ISD::ADD, MVT::v16i16, Legal); + setOperationAction(ISD::ADD, MVT::v32i8, Legal); + + setOperationAction(ISD::SUB, MVT::v4i64, Legal); + setOperationAction(ISD::SUB, MVT::v8i32, Legal); + setOperationAction(ISD::SUB, MVT::v16i16, Legal); + setOperationAction(ISD::SUB, MVT::v32i8, Legal); + + setOperationAction(ISD::MUL, MVT::v4i64, Custom); + setOperationAction(ISD::MUL, MVT::v8i32, Legal); + setOperationAction(ISD::MUL, MVT::v16i16, Legal); + setOperationAction(ISD::MUL, MVT::v32i8, Custom); + + setOperationAction(ISD::UMUL_LOHI, MVT::v8i32, Custom); + setOperationAction(ISD::SMUL_LOHI, MVT::v8i32, Custom); + setOperationAction(ISD::MULHU, MVT::v16i16, Legal); + setOperationAction(ISD::MULHS, MVT::v16i16, Legal); + + setOperationAction(ISD::SMAX, MVT::v32i8, Legal); + setOperationAction(ISD::SMAX, MVT::v16i16, Legal); + setOperationAction(ISD::SMAX, MVT::v8i32, Legal); + setOperationAction(ISD::UMAX, MVT::v32i8, Legal); + setOperationAction(ISD::UMAX, MVT::v16i16, Legal); + setOperationAction(ISD::UMAX, MVT::v8i32, Legal); + setOperationAction(ISD::SMIN, MVT::v32i8, Legal); + setOperationAction(ISD::SMIN, MVT::v16i16, Legal); + setOperationAction(ISD::SMIN, MVT::v8i32, Legal); + setOperationAction(ISD::UMIN, MVT::v32i8, Legal); + setOperationAction(ISD::UMIN, MVT::v16i16, Legal); + setOperationAction(ISD::UMIN, MVT::v8i32, Legal); + + // The custom lowering for UINT_TO_FP for v8i32 becomes interesting + // when we have a 256bit-wide blend with immediate. 
+ setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom); + + // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X + setLoadExtAction(ISD::SEXTLOAD, MVT::v16i16, MVT::v16i8, Legal); + setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32, MVT::v8i8, Legal); + setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64, MVT::v4i8, Legal); + setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32, MVT::v8i16, Legal); + setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64, MVT::v4i16, Legal); + setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64, MVT::v4i32, Legal); + + setLoadExtAction(ISD::ZEXTLOAD, MVT::v16i16, MVT::v16i8, Legal); + setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32, MVT::v8i8, Legal); + setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64, MVT::v4i8, Legal); + setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32, MVT::v8i16, Legal); + setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64, MVT::v4i16, Legal); + setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64, MVT::v4i32, Legal); + } else { + setOperationAction(ISD::ADD, MVT::v4i64, Custom); + setOperationAction(ISD::ADD, MVT::v8i32, Custom); + setOperationAction(ISD::ADD, MVT::v16i16, Custom); + setOperationAction(ISD::ADD, MVT::v32i8, Custom); + + setOperationAction(ISD::SUB, MVT::v4i64, Custom); + setOperationAction(ISD::SUB, MVT::v8i32, Custom); + setOperationAction(ISD::SUB, MVT::v16i16, Custom); + setOperationAction(ISD::SUB, MVT::v32i8, Custom); + + setOperationAction(ISD::MUL, MVT::v4i64, Custom); + setOperationAction(ISD::MUL, MVT::v8i32, Custom); + setOperationAction(ISD::MUL, MVT::v16i16, Custom); + setOperationAction(ISD::MUL, MVT::v32i8, Custom); + + setOperationAction(ISD::SMAX, MVT::v32i8, Custom); + setOperationAction(ISD::SMAX, MVT::v16i16, Custom); + setOperationAction(ISD::SMAX, MVT::v8i32, Custom); + setOperationAction(ISD::UMAX, MVT::v32i8, Custom); + setOperationAction(ISD::UMAX, MVT::v16i16, Custom); + setOperationAction(ISD::UMAX, MVT::v8i32, Custom); + setOperationAction(ISD::SMIN, MVT::v32i8, Custom); + setOperationAction(ISD::SMIN, MVT::v16i16, Custom); + setOperationAction(ISD::SMIN, MVT::v8i32, Custom); + setOperationAction(ISD::UMIN, MVT::v32i8, Custom); + setOperationAction(ISD::UMIN, MVT::v16i16, Custom); + setOperationAction(ISD::UMIN, MVT::v8i32, Custom); + } + + // In the customized shift lowering, the legal cases in AVX2 will be + // recognized. + setOperationAction(ISD::SRL, MVT::v4i64, Custom); + setOperationAction(ISD::SRL, MVT::v8i32, Custom); + + setOperationAction(ISD::SHL, MVT::v4i64, Custom); + setOperationAction(ISD::SHL, MVT::v8i32, Custom); + + setOperationAction(ISD::SRA, MVT::v4i64, Custom); + setOperationAction(ISD::SRA, MVT::v8i32, Custom); + + // Custom lower several nodes for 256-bit types. + for (MVT VT : MVT::vector_valuetypes()) { + if (VT.getScalarSizeInBits() >= 32) { + setOperationAction(ISD::MLOAD, VT, Legal); + setOperationAction(ISD::MSTORE, VT, Legal); + } + // Extract subvector is special because the value type + // (result) is 128-bit but the source is 256-bit wide. 
+ if (VT.is128BitVector()) { + setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); + } + // Do not attempt to custom lower other non-256-bit vectors + if (!VT.is256BitVector()) + continue; + + setOperationAction(ISD::BUILD_VECTOR, VT, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); + setOperationAction(ISD::VSELECT, VT, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); + setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); + setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); + } + + if (Subtarget->hasInt256()) + setOperationAction(ISD::VSELECT, MVT::v32i8, Legal); + + // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64. + for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) { + setOperationAction(ISD::AND, VT, Promote); + AddPromotedToType (ISD::AND, VT, MVT::v4i64); + setOperationAction(ISD::OR, VT, Promote); + AddPromotedToType (ISD::OR, VT, MVT::v4i64); + setOperationAction(ISD::XOR, VT, Promote); + AddPromotedToType (ISD::XOR, VT, MVT::v4i64); + setOperationAction(ISD::LOAD, VT, Promote); + AddPromotedToType (ISD::LOAD, VT, MVT::v4i64); + setOperationAction(ISD::SELECT, VT, Promote); + AddPromotedToType (ISD::SELECT, VT, MVT::v4i64); + } + } + + if (!Subtarget->useSoftFloat() && Subtarget->hasAVX512()) { + addRegisterClass(MVT::v16i32, &X86::VR512RegClass); + addRegisterClass(MVT::v16f32, &X86::VR512RegClass); + addRegisterClass(MVT::v8i64, &X86::VR512RegClass); + addRegisterClass(MVT::v8f64, &X86::VR512RegClass); + + addRegisterClass(MVT::i1, &X86::VK1RegClass); + addRegisterClass(MVT::v8i1, &X86::VK8RegClass); + addRegisterClass(MVT::v16i1, &X86::VK16RegClass); + + for (MVT VT : MVT::fp_vector_valuetypes()) + setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal); + + setLoadExtAction(ISD::ZEXTLOAD, MVT::v16i32, MVT::v16i8, Legal); + setLoadExtAction(ISD::SEXTLOAD, MVT::v16i32, MVT::v16i8, Legal); + setLoadExtAction(ISD::ZEXTLOAD, MVT::v16i32, MVT::v16i16, Legal); + setLoadExtAction(ISD::SEXTLOAD, MVT::v16i32, MVT::v16i16, Legal); + setLoadExtAction(ISD::ZEXTLOAD, MVT::v32i16, MVT::v32i8, Legal); + setLoadExtAction(ISD::SEXTLOAD, MVT::v32i16, MVT::v32i8, Legal); + setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i64, MVT::v8i8, Legal); + setLoadExtAction(ISD::SEXTLOAD, MVT::v8i64, MVT::v8i8, Legal); + setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i64, MVT::v8i16, Legal); + setLoadExtAction(ISD::SEXTLOAD, MVT::v8i64, MVT::v8i16, Legal); + setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i64, MVT::v8i32, Legal); + setLoadExtAction(ISD::SEXTLOAD, MVT::v8i64, MVT::v8i32, Legal); + + setOperationAction(ISD::BR_CC, MVT::i1, Expand); + setOperationAction(ISD::SETCC, MVT::i1, Custom); + setOperationAction(ISD::SELECT_CC, MVT::i1, Expand); + setOperationAction(ISD::XOR, MVT::i1, Legal); + setOperationAction(ISD::OR, MVT::i1, Legal); + setOperationAction(ISD::AND, MVT::i1, Legal); + setOperationAction(ISD::SUB, MVT::i1, Custom); + setOperationAction(ISD::ADD, MVT::i1, Custom); + setOperationAction(ISD::MUL, MVT::i1, Custom); + setOperationAction(ISD::LOAD, MVT::v16f32, Legal); + setOperationAction(ISD::LOAD, MVT::v8f64, Legal); + setOperationAction(ISD::LOAD, MVT::v8i64, Legal); + setOperationAction(ISD::LOAD, MVT::v16i32, Legal); + setOperationAction(ISD::LOAD, MVT::v16i1, Legal); + + setOperationAction(ISD::FADD, MVT::v16f32, Legal); + setOperationAction(ISD::FSUB, MVT::v16f32, Legal); + setOperationAction(ISD::FMUL, MVT::v16f32, Legal); + 
setOperationAction(ISD::FDIV, MVT::v16f32, Legal); + setOperationAction(ISD::FSQRT, MVT::v16f32, Legal); + setOperationAction(ISD::FNEG, MVT::v16f32, Custom); + setOperationAction(ISD::FABS, MVT::v16f32, Custom); + + setOperationAction(ISD::FADD, MVT::v8f64, Legal); + setOperationAction(ISD::FSUB, MVT::v8f64, Legal); + setOperationAction(ISD::FMUL, MVT::v8f64, Legal); + setOperationAction(ISD::FDIV, MVT::v8f64, Legal); + setOperationAction(ISD::FSQRT, MVT::v8f64, Legal); + setOperationAction(ISD::FNEG, MVT::v8f64, Custom); + setOperationAction(ISD::FABS, MVT::v8f64, Custom); + setOperationAction(ISD::FMA, MVT::v8f64, Legal); + setOperationAction(ISD::FMA, MVT::v16f32, Legal); + + setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal); + setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal); + setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal); + setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal); + setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal); + setOperationAction(ISD::SINT_TO_FP, MVT::v8i1, Custom); + setOperationAction(ISD::SINT_TO_FP, MVT::v16i1, Custom); + setOperationAction(ISD::SINT_TO_FP, MVT::v16i8, Promote); + setOperationAction(ISD::SINT_TO_FP, MVT::v16i16, Promote); + setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal); + setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal); + setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal); + setOperationAction(ISD::UINT_TO_FP, MVT::v16i8, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::v16i16, Custom); + setOperationAction(ISD::FP_ROUND, MVT::v8f32, Legal); + setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal); + + setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal); + setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal); + setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal); + setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal); + setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal); + if (Subtarget->hasVLX()){ + setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal); + setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal); + setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal); + setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal); + setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal); + + setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal); + setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal); + setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal); + setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal); + setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal); + } else { + setOperationAction(ISD::MLOAD, MVT::v8i32, Custom); + setOperationAction(ISD::MLOAD, MVT::v8f32, Custom); + setOperationAction(ISD::MSTORE, MVT::v8i32, Custom); + setOperationAction(ISD::MSTORE, MVT::v8f32, Custom); + } + setOperationAction(ISD::TRUNCATE, MVT::i1, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i1, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i1, Custom); + if (Subtarget->hasDQI()) { + setOperationAction(ISD::TRUNCATE, MVT::v2i1, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v4i1, Custom); + + setOperationAction(ISD::SINT_TO_FP, MVT::v8i64, Legal); + setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal); + setOperationAction(ISD::FP_TO_SINT, MVT::v8i64, Legal); + setOperationAction(ISD::FP_TO_UINT, MVT::v8i64, Legal); + if (Subtarget->hasVLX()) { + setOperationAction(ISD::SINT_TO_FP, MVT::v4i64, Legal); + setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Legal); + setOperationAction(ISD::UINT_TO_FP, 
MVT::v4i64, Legal); + setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Legal); + setOperationAction(ISD::FP_TO_SINT, MVT::v4i64, Legal); + setOperationAction(ISD::FP_TO_SINT, MVT::v2i64, Legal); + setOperationAction(ISD::FP_TO_UINT, MVT::v4i64, Legal); + setOperationAction(ISD::FP_TO_UINT, MVT::v2i64, Legal); + } + } + if (Subtarget->hasVLX()) { + setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal); + setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal); + setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal); + setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal); + setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal); + setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal); + setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal); + setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal); + } + setOperationAction(ISD::TRUNCATE, MVT::v8i1, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v16i1, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom); + setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom); + setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom); + setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom); + setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom); + setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom); + setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom); + setOperationAction(ISD::SIGN_EXTEND, MVT::v16i8, Custom); + setOperationAction(ISD::SIGN_EXTEND, MVT::v8i16, Custom); + setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom); + if (Subtarget->hasDQI()) { + setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Custom); + setOperationAction(ISD::SIGN_EXTEND, MVT::v2i64, Custom); + } + setOperationAction(ISD::FFLOOR, MVT::v16f32, Legal); + setOperationAction(ISD::FFLOOR, MVT::v8f64, Legal); + setOperationAction(ISD::FCEIL, MVT::v16f32, Legal); + setOperationAction(ISD::FCEIL, MVT::v8f64, Legal); + setOperationAction(ISD::FTRUNC, MVT::v16f32, Legal); + setOperationAction(ISD::FTRUNC, MVT::v8f64, Legal); + setOperationAction(ISD::FRINT, MVT::v16f32, Legal); + setOperationAction(ISD::FRINT, MVT::v8f64, Legal); + setOperationAction(ISD::FNEARBYINT, MVT::v16f32, Legal); + setOperationAction(ISD::FNEARBYINT, MVT::v8f64, Legal); + + setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f64, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i1, Custom); + + setOperationAction(ISD::SETCC, MVT::v16i1, Custom); + setOperationAction(ISD::SETCC, MVT::v8i1, Custom); + + setOperationAction(ISD::MUL, MVT::v8i64, Custom); + + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i1, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i1, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16i1, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i1, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i1, Custom); + setOperationAction(ISD::BUILD_VECTOR, MVT::v8i1, Custom); + setOperationAction(ISD::BUILD_VECTOR, MVT::v16i1, Custom); + setOperationAction(ISD::SELECT, MVT::v8f64, Custom); + setOperationAction(ISD::SELECT, MVT::v8i64, Custom); + setOperationAction(ISD::SELECT, MVT::v16f32, Custom); + setOperationAction(ISD::SELECT, MVT::v16i1, Custom); + setOperationAction(ISD::SELECT, MVT::v8i1, Custom); + + setOperationAction(ISD::SMAX, MVT::v16i32, Legal); + setOperationAction(ISD::SMAX, MVT::v8i64, Legal); + 
setOperationAction(ISD::UMAX, MVT::v16i32, Legal); + setOperationAction(ISD::UMAX, MVT::v8i64, Legal); + setOperationAction(ISD::SMIN, MVT::v16i32, Legal); + setOperationAction(ISD::SMIN, MVT::v8i64, Legal); + setOperationAction(ISD::UMIN, MVT::v16i32, Legal); + setOperationAction(ISD::UMIN, MVT::v8i64, Legal); + + setOperationAction(ISD::ADD, MVT::v8i64, Legal); + setOperationAction(ISD::ADD, MVT::v16i32, Legal); + + setOperationAction(ISD::SUB, MVT::v8i64, Legal); + setOperationAction(ISD::SUB, MVT::v16i32, Legal); + + setOperationAction(ISD::MUL, MVT::v16i32, Legal); + + setOperationAction(ISD::SRL, MVT::v8i64, Custom); + setOperationAction(ISD::SRL, MVT::v16i32, Custom); + + setOperationAction(ISD::SHL, MVT::v8i64, Custom); + setOperationAction(ISD::SHL, MVT::v16i32, Custom); + + setOperationAction(ISD::SRA, MVT::v8i64, Custom); + setOperationAction(ISD::SRA, MVT::v16i32, Custom); + + setOperationAction(ISD::AND, MVT::v8i64, Legal); + setOperationAction(ISD::OR, MVT::v8i64, Legal); + setOperationAction(ISD::XOR, MVT::v8i64, Legal); + setOperationAction(ISD::AND, MVT::v16i32, Legal); + setOperationAction(ISD::OR, MVT::v16i32, Legal); + setOperationAction(ISD::XOR, MVT::v16i32, Legal); + + if (Subtarget->hasCDI()) { + setOperationAction(ISD::CTLZ, MVT::v8i64, Legal); + setOperationAction(ISD::CTLZ, MVT::v16i32, Legal); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v8i64, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v16i32, Expand); + + setOperationAction(ISD::CTLZ, MVT::v8i16, Custom); + setOperationAction(ISD::CTLZ, MVT::v16i8, Custom); + setOperationAction(ISD::CTLZ, MVT::v16i16, Custom); + setOperationAction(ISD::CTLZ, MVT::v32i8, Custom); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v8i16, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v16i8, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v16i16, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v32i8, Expand); + + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i64, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i32, Custom); + + if (Subtarget->hasVLX()) { + setOperationAction(ISD::CTLZ, MVT::v4i64, Legal); + setOperationAction(ISD::CTLZ, MVT::v8i32, Legal); + setOperationAction(ISD::CTLZ, MVT::v2i64, Legal); + setOperationAction(ISD::CTLZ, MVT::v4i32, Legal); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v4i64, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v8i32, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v2i64, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v4i32, Expand); + + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i64, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i32, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i64, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i32, Custom); + } else { + setOperationAction(ISD::CTLZ, MVT::v4i64, Custom); + setOperationAction(ISD::CTLZ, MVT::v8i32, Custom); + setOperationAction(ISD::CTLZ, MVT::v2i64, Custom); + setOperationAction(ISD::CTLZ, MVT::v4i32, Custom); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v4i64, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v8i32, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v2i64, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v4i32, Expand); + } + } // Subtarget->hasCDI() + + if (Subtarget->hasDQI()) { + setOperationAction(ISD::MUL, MVT::v2i64, Legal); + setOperationAction(ISD::MUL, MVT::v4i64, Legal); + setOperationAction(ISD::MUL, MVT::v8i64, Legal); + } + // Custom lower several 
nodes. + for (MVT VT : MVT::vector_valuetypes()) { + unsigned EltSize = VT.getVectorElementType().getSizeInBits(); + if (EltSize == 1) { + setOperationAction(ISD::AND, VT, Legal); + setOperationAction(ISD::OR, VT, Legal); + setOperationAction(ISD::XOR, VT, Legal); + } + if ((VT.is128BitVector() || VT.is256BitVector()) && EltSize >= 32) { + setOperationAction(ISD::MGATHER, VT, Custom); + setOperationAction(ISD::MSCATTER, VT, Custom); + } + // Extract subvector is special because the value type + // (result) is 256/128-bit but the source is 512-bit wide. + if (VT.is128BitVector() || VT.is256BitVector()) { + setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); + } + if (VT.getVectorElementType() == MVT::i1) + setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal); + + // Do not attempt to custom lower other non-512-bit vectors + if (!VT.is512BitVector()) + continue; + + if (EltSize >= 32) { + setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); + setOperationAction(ISD::BUILD_VECTOR, VT, Custom); + setOperationAction(ISD::VSELECT, VT, Legal); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); + setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); + setOperationAction(ISD::MLOAD, VT, Legal); + setOperationAction(ISD::MSTORE, VT, Legal); + setOperationAction(ISD::MGATHER, VT, Legal); + setOperationAction(ISD::MSCATTER, VT, Custom); + } + } + for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32 }) { + setOperationAction(ISD::SELECT, VT, Promote); + AddPromotedToType (ISD::SELECT, VT, MVT::v8i64); + } + }// has AVX-512 + + if (!Subtarget->useSoftFloat() && Subtarget->hasBWI()) { + addRegisterClass(MVT::v32i16, &X86::VR512RegClass); + addRegisterClass(MVT::v64i8, &X86::VR512RegClass); + + addRegisterClass(MVT::v32i1, &X86::VK32RegClass); + addRegisterClass(MVT::v64i1, &X86::VK64RegClass); + + setOperationAction(ISD::LOAD, MVT::v32i16, Legal); + setOperationAction(ISD::LOAD, MVT::v64i8, Legal); + setOperationAction(ISD::SETCC, MVT::v32i1, Custom); + setOperationAction(ISD::SETCC, MVT::v64i1, Custom); + setOperationAction(ISD::ADD, MVT::v32i16, Legal); + setOperationAction(ISD::ADD, MVT::v64i8, Legal); + setOperationAction(ISD::SUB, MVT::v32i16, Legal); + setOperationAction(ISD::SUB, MVT::v64i8, Legal); + setOperationAction(ISD::MUL, MVT::v32i16, Legal); + setOperationAction(ISD::MULHS, MVT::v32i16, Legal); + setOperationAction(ISD::MULHU, MVT::v32i16, Legal); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i1, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i1, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i16, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i8, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i1, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i1, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i16, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i8, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i16, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i8, Custom); + setOperationAction(ISD::SELECT, MVT::v32i1, Custom); + setOperationAction(ISD::SELECT, MVT::v64i1, Custom); + setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom); + setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom); + setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom); + setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom); + 
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i16, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i8, Custom); + setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom); + setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i1, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i1, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i16, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i8, Custom); + setOperationAction(ISD::VSELECT, MVT::v32i16, Legal); + setOperationAction(ISD::VSELECT, MVT::v64i8, Legal); + setOperationAction(ISD::TRUNCATE, MVT::v32i1, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v64i1, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i1, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i1, Custom); + + setOperationAction(ISD::SMAX, MVT::v64i8, Legal); + setOperationAction(ISD::SMAX, MVT::v32i16, Legal); + setOperationAction(ISD::UMAX, MVT::v64i8, Legal); + setOperationAction(ISD::UMAX, MVT::v32i16, Legal); + setOperationAction(ISD::SMIN, MVT::v64i8, Legal); + setOperationAction(ISD::SMIN, MVT::v32i16, Legal); + setOperationAction(ISD::UMIN, MVT::v64i8, Legal); + setOperationAction(ISD::UMIN, MVT::v32i16, Legal); + + setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal); + setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal); + if (Subtarget->hasVLX()) + setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal); + + if (Subtarget->hasCDI()) { + setOperationAction(ISD::CTLZ, MVT::v32i16, Custom); + setOperationAction(ISD::CTLZ, MVT::v64i8, Custom); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v32i16, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v64i8, Expand); + } + + for (auto VT : { MVT::v64i8, MVT::v32i16 }) { + setOperationAction(ISD::BUILD_VECTOR, VT, Custom); + setOperationAction(ISD::VSELECT, VT, Legal); + setOperationAction(ISD::SRL, VT, Custom); + setOperationAction(ISD::SHL, VT, Custom); + setOperationAction(ISD::SRA, VT, Custom); + + setOperationAction(ISD::AND, VT, Promote); + AddPromotedToType (ISD::AND, VT, MVT::v8i64); + setOperationAction(ISD::OR, VT, Promote); + AddPromotedToType (ISD::OR, VT, MVT::v8i64); + setOperationAction(ISD::XOR, VT, Promote); + AddPromotedToType (ISD::XOR, VT, MVT::v8i64); + } + } + + if (!Subtarget->useSoftFloat() && Subtarget->hasVLX()) { + addRegisterClass(MVT::v4i1, &X86::VK4RegClass); + addRegisterClass(MVT::v2i1, &X86::VK2RegClass); + + setOperationAction(ISD::SETCC, MVT::v4i1, Custom); + setOperationAction(ISD::SETCC, MVT::v2i1, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i1, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i1, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i1, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i1, Custom); + setOperationAction(ISD::SELECT, MVT::v4i1, Custom); + setOperationAction(ISD::SELECT, MVT::v2i1, Custom); + setOperationAction(ISD::BUILD_VECTOR, MVT::v4i1, Custom); + setOperationAction(ISD::BUILD_VECTOR, MVT::v2i1, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i1, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i1, Custom); + + setOperationAction(ISD::AND, MVT::v8i32, Legal); + setOperationAction(ISD::OR, MVT::v8i32, Legal); + setOperationAction(ISD::XOR, MVT::v8i32, Legal); + setOperationAction(ISD::AND, MVT::v4i32, Legal); + setOperationAction(ISD::OR, MVT::v4i32, Legal); + setOperationAction(ISD::XOR, MVT::v4i32, Legal); + 
setOperationAction(ISD::SRA, MVT::v2i64, Custom); + setOperationAction(ISD::SRA, MVT::v4i64, Custom); + + setOperationAction(ISD::SMAX, MVT::v2i64, Legal); + setOperationAction(ISD::SMAX, MVT::v4i64, Legal); + setOperationAction(ISD::UMAX, MVT::v2i64, Legal); + setOperationAction(ISD::UMAX, MVT::v4i64, Legal); + setOperationAction(ISD::SMIN, MVT::v2i64, Legal); + setOperationAction(ISD::SMIN, MVT::v4i64, Legal); + setOperationAction(ISD::UMIN, MVT::v2i64, Legal); + setOperationAction(ISD::UMIN, MVT::v4i64, Legal); + } + + // We want to custom lower some of our intrinsics. + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); + setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); + setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); + if (!Subtarget->is64Bit()) { + setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom); + } + + // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't + // handle type legalization for these operations here. + // + // FIXME: We really should do custom legalization for addition and + // subtraction on x86-32 once PR3203 is fixed. We really can't do much better + // than generic legalization for 64-bit multiplication-with-overflow, though. + for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) { + if (VT == MVT::i64 && !Subtarget->is64Bit()) + continue; + // Add/Sub/Mul with overflow operations are custom lowered. + setOperationAction(ISD::SADDO, VT, Custom); + setOperationAction(ISD::UADDO, VT, Custom); + setOperationAction(ISD::SSUBO, VT, Custom); + setOperationAction(ISD::USUBO, VT, Custom); + setOperationAction(ISD::SMULO, VT, Custom); + setOperationAction(ISD::UMULO, VT, Custom); + } + + if (!Subtarget->is64Bit()) { + // These libcalls are not available in 32-bit. + setLibcallName(RTLIB::SHL_I128, nullptr); + setLibcallName(RTLIB::SRL_I128, nullptr); + setLibcallName(RTLIB::SRA_I128, nullptr); + } + + // Combine sin / cos into one node or libcall if possible. + if (Subtarget->hasSinCos()) { + setLibcallName(RTLIB::SINCOS_F32, "sincosf"); + setLibcallName(RTLIB::SINCOS_F64, "sincos"); + if (Subtarget->isTargetDarwin()) { + // For MacOSX, we don't want the normal expansion of a libcall to sincos. + // We want to issue a libcall to __sincos_stret to avoid memory traffic. 
+ setOperationAction(ISD::FSINCOS, MVT::f64, Custom); + setOperationAction(ISD::FSINCOS, MVT::f32, Custom); + } + } + + if (Subtarget->isTargetWin64()) { + setOperationAction(ISD::SDIV, MVT::i128, Custom); + setOperationAction(ISD::UDIV, MVT::i128, Custom); + setOperationAction(ISD::SREM, MVT::i128, Custom); + setOperationAction(ISD::UREM, MVT::i128, Custom); + setOperationAction(ISD::SDIVREM, MVT::i128, Custom); + setOperationAction(ISD::UDIVREM, MVT::i128, Custom); + } + + // We have target-specific dag combine patterns for the following nodes: + setTargetDAGCombine(ISD::VECTOR_SHUFFLE); + setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); + setTargetDAGCombine(ISD::BITCAST); + setTargetDAGCombine(ISD::VSELECT); + setTargetDAGCombine(ISD::SELECT); + setTargetDAGCombine(ISD::SHL); + setTargetDAGCombine(ISD::SRA); + setTargetDAGCombine(ISD::SRL); + setTargetDAGCombine(ISD::OR); + setTargetDAGCombine(ISD::AND); + setTargetDAGCombine(ISD::ADD); + setTargetDAGCombine(ISD::FADD); + setTargetDAGCombine(ISD::FSUB); + setTargetDAGCombine(ISD::FNEG); + setTargetDAGCombine(ISD::FMA); + setTargetDAGCombine(ISD::FMINNUM); + setTargetDAGCombine(ISD::FMAXNUM); + setTargetDAGCombine(ISD::SUB); + setTargetDAGCombine(ISD::LOAD); + setTargetDAGCombine(ISD::MLOAD); + setTargetDAGCombine(ISD::STORE); + setTargetDAGCombine(ISD::MSTORE); + setTargetDAGCombine(ISD::TRUNCATE); + setTargetDAGCombine(ISD::ZERO_EXTEND); + setTargetDAGCombine(ISD::ANY_EXTEND); + setTargetDAGCombine(ISD::SIGN_EXTEND); + setTargetDAGCombine(ISD::SIGN_EXTEND_INREG); + setTargetDAGCombine(ISD::SINT_TO_FP); + setTargetDAGCombine(ISD::UINT_TO_FP); + setTargetDAGCombine(ISD::SETCC); + setTargetDAGCombine(ISD::BUILD_VECTOR); + setTargetDAGCombine(ISD::MUL); + setTargetDAGCombine(ISD::XOR); + setTargetDAGCombine(ISD::MSCATTER); + setTargetDAGCombine(ISD::MGATHER); + + computeRegisterProperties(Subtarget->getRegisterInfo()); + + MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores + MaxStoresPerMemsetOptSize = 8; + MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores + MaxStoresPerMemcpyOptSize = 4; + MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores + MaxStoresPerMemmoveOptSize = 4; + setPrefLoopAlignment(4); // 2^4 bytes. + + // A predictable cmov does not hurt on an in-order CPU. + // FIXME: Use a CPU attribute to trigger this, not a CPU model. + PredictableSelectIsExpensive = !Subtarget->isAtom(); + EnableExtLdPromotion = true; + setPrefFunctionAlignment(4); // 2^4 bytes. + + verifyIntrinsicTables(); +} + +// This has so far only been implemented for 64-bit MachO. +bool X86TargetLowering::useLoadStackGuardNode() const { + return Subtarget->isTargetMachO() && Subtarget->is64Bit(); +} + +TargetLoweringBase::LegalizeTypeAction +X86TargetLowering::getPreferredVectorAction(EVT VT) const { + if (ExperimentalVectorWideningLegalization && + VT.getVectorNumElements() != 1 && + VT.getVectorElementType().getSimpleVT() != MVT::i1) + return TypeWidenVector; + + return TargetLoweringBase::getPreferredVectorAction(VT); +} + +EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &, + EVT VT) const { + if (!VT.isVector()) + return Subtarget->hasAVX512() ? 
MVT::i1: MVT::i8; + + if (VT.isSimple()) { + MVT VVT = VT.getSimpleVT(); + const unsigned NumElts = VVT.getVectorNumElements(); + const MVT EltVT = VVT.getVectorElementType(); + if (VVT.is512BitVector()) { + if (Subtarget->hasAVX512()) + if (EltVT == MVT::i32 || EltVT == MVT::i64 || + EltVT == MVT::f32 || EltVT == MVT::f64) + switch(NumElts) { + case 8: return MVT::v8i1; + case 16: return MVT::v16i1; + } + if (Subtarget->hasBWI()) + if (EltVT == MVT::i8 || EltVT == MVT::i16) + switch(NumElts) { + case 32: return MVT::v32i1; + case 64: return MVT::v64i1; + } + } + + if (VVT.is256BitVector() || VVT.is128BitVector()) { + if (Subtarget->hasVLX()) + if (EltVT == MVT::i32 || EltVT == MVT::i64 || + EltVT == MVT::f32 || EltVT == MVT::f64) + switch(NumElts) { + case 2: return MVT::v2i1; + case 4: return MVT::v4i1; + case 8: return MVT::v8i1; + } + if (Subtarget->hasBWI() && Subtarget->hasVLX()) + if (EltVT == MVT::i8 || EltVT == MVT::i16) + switch(NumElts) { + case 8: return MVT::v8i1; + case 16: return MVT::v16i1; + case 32: return MVT::v32i1; + } + } + } + + return VT.changeVectorElementTypeToInteger(); +} + +/// Helper for getByValTypeAlignment to determine +/// the desired ByVal argument alignment. +static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) { + if (MaxAlign == 16) + return; + if (VectorType *VTy = dyn_cast<VectorType>(Ty)) { + if (VTy->getBitWidth() == 128) + MaxAlign = 16; + } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) { + unsigned EltAlign = 0; + getMaxByValAlign(ATy->getElementType(), EltAlign); + if (EltAlign > MaxAlign) + MaxAlign = EltAlign; + } else if (StructType *STy = dyn_cast<StructType>(Ty)) { + for (auto *EltTy : STy->elements()) { + unsigned EltAlign = 0; + getMaxByValAlign(EltTy, EltAlign); + if (EltAlign > MaxAlign) + MaxAlign = EltAlign; + if (MaxAlign == 16) + break; + } + } +} + +/// Return the desired alignment for ByVal aggregate +/// function arguments in the caller parameter area. For X86, aggregates +/// that contain SSE vectors are placed at 16-byte boundaries while the rest +/// are at 4-byte boundaries. +unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty, + const DataLayout &DL) const { + if (Subtarget->is64Bit()) { + // Max of 8 and alignment of type. + unsigned TyAlign = DL.getABITypeAlignment(Ty); + if (TyAlign > 8) + return TyAlign; + return 8; + } + + unsigned Align = 4; + if (Subtarget->hasSSE1()) + getMaxByValAlign(Ty, Align); + return Align; +} + +/// Returns the target specific optimal type for load +/// and store operations as a result of memset, memcpy, and memmove +/// lowering. If DstAlign is zero that means it's safe to destination +/// alignment can satisfy any constraint. Similarly if SrcAlign is zero it +/// means there isn't a need to check it against alignment requirement, +/// probably because the source does not need to be loaded. If 'IsMemset' is +/// true, that means it's expanding a memset. If 'ZeroMemset' is true, that +/// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy +/// source is constant so it does not need to be loaded. +/// It returns EVT::Other if the type should be determined using generic +/// target-independent logic. 
+EVT +X86TargetLowering::getOptimalMemOpType(uint64_t Size, + unsigned DstAlign, unsigned SrcAlign, + bool IsMemset, bool ZeroMemset, + bool MemcpyStrSrc, + MachineFunction &MF) const { + const Function *F = MF.getFunction(); + if ((!IsMemset || ZeroMemset) && + !F->hasFnAttribute(Attribute::NoImplicitFloat)) { + if (Size >= 16 && + (!Subtarget->isUnalignedMem16Slow() || + ((DstAlign == 0 || DstAlign >= 16) && + (SrcAlign == 0 || SrcAlign >= 16)))) { + if (Size >= 32) { + // FIXME: Check if unaligned 32-byte accesses are slow. + if (Subtarget->hasInt256()) + return MVT::v8i32; + if (Subtarget->hasFp256()) + return MVT::v8f32; + } + if (Subtarget->hasSSE2()) + return MVT::v4i32; + if (Subtarget->hasSSE1()) + return MVT::v4f32; + } else if (!MemcpyStrSrc && Size >= 8 && + !Subtarget->is64Bit() && + Subtarget->hasSSE2()) { + // Do not use f64 to lower memcpy if source is string constant. It's + // better to use i32 to avoid the loads. + return MVT::f64; + } + } + // This is a compromise. If we reach here, unaligned accesses may be slow on + // this target. However, creating smaller, aligned accesses could be even + // slower and would certainly be a lot more code. + if (Subtarget->is64Bit() && Size >= 8) + return MVT::i64; + return MVT::i32; +} + +bool X86TargetLowering::isSafeMemOpType(MVT VT) const { + if (VT == MVT::f32) + return X86ScalarSSEf32; + else if (VT == MVT::f64) + return X86ScalarSSEf64; + return true; +} + +bool +X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT, + unsigned, + unsigned, + bool *Fast) const { + if (Fast) { + switch (VT.getSizeInBits()) { + default: + // 8-byte and under are always assumed to be fast. + *Fast = true; + break; + case 128: + *Fast = !Subtarget->isUnalignedMem16Slow(); + break; + case 256: + *Fast = !Subtarget->isUnalignedMem32Slow(); + break; + // TODO: What about AVX-512 (512-bit) accesses? + } + } + // Misaligned accesses of any size are always allowed. + return true; +} + +/// Return the entry encoding for a jump table in the +/// current function. The returned value is a member of the +/// MachineJumpTableInfo::JTEntryKind enum. +unsigned X86TargetLowering::getJumpTableEncoding() const { + // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF + // symbol. + if (getTargetMachine().getRelocationModel() == Reloc::PIC_ && + Subtarget->isPICStyleGOT()) + return MachineJumpTableInfo::EK_Custom32; + + // Otherwise, use the normal jump table encoding heuristics. + return TargetLowering::getJumpTableEncoding(); +} + +bool X86TargetLowering::useSoftFloat() const { + return Subtarget->useSoftFloat(); +} + +const MCExpr * +X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI, + const MachineBasicBlock *MBB, + unsigned uid,MCContext &Ctx) const{ + assert(MBB->getParent()->getTarget().getRelocationModel() == Reloc::PIC_ && + Subtarget->isPICStyleGOT()); + // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF + // entries. + return MCSymbolRefExpr::create(MBB->getSymbol(), + MCSymbolRefExpr::VK_GOTOFF, Ctx); +} + +/// Returns relocation base for the given PIC jumptable. +SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table, + SelectionDAG &DAG) const { + if (!Subtarget->is64Bit()) + // This doesn't have SDLoc associated with it, but is not really the + // same as a Register. 
+    return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
+                       getPointerTy(DAG.getDataLayout()));
+  return Table;
+}
+
+/// This returns the relocation base for the given PIC jumptable,
+/// the same as getPICJumpTableRelocBase, but as an MCExpr.
+const MCExpr *X86TargetLowering::
+getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
+                             MCContext &Ctx) const {
+  // X86-64 uses RIP relative addressing based on the jump table label.
+  if (Subtarget->isPICStyleRIPRel())
+    return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
+
+  // Otherwise, the reference is relative to the PIC base.
+  return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
+}
+
+std::pair<const TargetRegisterClass *, uint8_t>
+X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
+                                           MVT VT) const {
+  const TargetRegisterClass *RRC = nullptr;
+  uint8_t Cost = 1;
+  switch (VT.SimpleTy) {
+  default:
+    return TargetLowering::findRepresentativeClass(TRI, VT);
+  case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
+    RRC = Subtarget->is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
+    break;
+  case MVT::x86mmx:
+    RRC = &X86::VR64RegClass;
+    break;
+  case MVT::f32: case MVT::f64:
+  case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
+  case MVT::v4f32: case MVT::v2f64:
+  case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32:
+  case MVT::v4f64:
+    RRC = &X86::VR128RegClass;
+    break;
+  }
+  return std::make_pair(RRC, Cost);
+}
+
+bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace,
+                                               unsigned &Offset) const {
+  if (!Subtarget->isTargetLinux())
+    return false;
+
+  if (Subtarget->is64Bit()) {
+    // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs:
+    Offset = 0x28;
+    if (getTargetMachine().getCodeModel() == CodeModel::Kernel)
+      AddressSpace = 256;
+    else
+      AddressSpace = 257;
+  } else {
+    // %gs:0x14 on i386
+    Offset = 0x14;
+    AddressSpace = 256;
+  }
+  return true;
+}
+
+Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
+  if (!Subtarget->isTargetAndroid())
+    return TargetLowering::getSafeStackPointerLocation(IRB);
+
+  // Android provides a fixed TLS slot for the SafeStack pointer. See the
+  // definition of TLS_SLOT_SAFESTACK in
+  // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
+  unsigned AddressSpace, Offset;
+  if (Subtarget->is64Bit()) {
+    // %fs:0x48, unless we're using a Kernel code model, in which case it's %gs:
+    Offset = 0x48;
+    if (getTargetMachine().getCodeModel() == CodeModel::Kernel)
+      AddressSpace = 256;
+    else
+      AddressSpace = 257;
+  } else {
+    // %gs:0x24 on i386
+    Offset = 0x24;
+    AddressSpace = 256;
+  }
+
+  return ConstantExpr::getIntToPtr(
+      ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
+      Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
+}
+
+bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
+                                            unsigned DestAS) const {
+  assert(SrcAS != DestAS && "Expected different address spaces!");
+
+  return SrcAS < 256 && DestAS < 256;
+}
+
+//===----------------------------------------------------------------------===//
+// Return Value Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+#include "X86GenCallingConv.inc"
+
+bool X86TargetLowering::CanLowerReturn(
+    CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
+    const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
+  SmallVector<CCValAssign, 16> RVLocs;
+  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
+  return CCInfo.CheckReturn(Outs, RetCC_X86);
+}
+
+const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
+  static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
+  return ScratchRegs;
+}
+
+SDValue
+X86TargetLowering::LowerReturn(SDValue Chain,
+                               CallingConv::ID CallConv, bool isVarArg,
+                               const SmallVectorImpl<ISD::OutputArg> &Outs,
+                               const SmallVectorImpl<SDValue> &OutVals,
+                               SDLoc dl, SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
+
+  if (CallConv == CallingConv::X86_INTR && !Outs.empty())
+    report_fatal_error("X86 interrupts may not return any value");
+
+  SmallVector<CCValAssign, 16> RVLocs;
+  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
+  CCInfo.AnalyzeReturn(Outs, RetCC_X86);
+
+  SDValue Flag;
+  SmallVector<SDValue, 6> RetOps;
+  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
+  // Operand #1 = Bytes To Pop
+  RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
+                                         MVT::i16));
+
+  // Copy the result values into the output registers.
+  for (unsigned i = 0; i != RVLocs.size(); ++i) {
+    CCValAssign &VA = RVLocs[i];
+    assert(VA.isRegLoc() && "Can only return in registers!");
+    SDValue ValToCopy = OutVals[i];
+    EVT ValVT = ValToCopy.getValueType();
+
+    // Promote values to the appropriate types.
+ if (VA.getLocInfo() == CCValAssign::SExt) + ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy); + else if (VA.getLocInfo() == CCValAssign::ZExt) + ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy); + else if (VA.getLocInfo() == CCValAssign::AExt) { + if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1) + ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy); + else + ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy); + } + else if (VA.getLocInfo() == CCValAssign::BCvt) + ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy); + + assert(VA.getLocInfo() != CCValAssign::FPExt && + "Unexpected FP-extend for return value."); + + // If this is x86-64, and we disabled SSE, we can't return FP values, + // or SSE or MMX vectors. + if ((ValVT == MVT::f32 || ValVT == MVT::f64 || + VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) && + (Subtarget->is64Bit() && !Subtarget->hasSSE1())) { + report_fatal_error("SSE register return with SSE disabled"); + } + // Likewise we can't return F64 values with SSE1 only. gcc does so, but + // llvm-gcc has never done it right and no one has noticed, so this + // should be OK for now. + if (ValVT == MVT::f64 && + (Subtarget->is64Bit() && !Subtarget->hasSSE2())) + report_fatal_error("SSE2 register return with SSE2 disabled"); + + // Returns in ST0/ST1 are handled specially: these are pushed as operands to + // the RET instruction and handled by the FP Stackifier. + if (VA.getLocReg() == X86::FP0 || + VA.getLocReg() == X86::FP1) { + // If this is a copy from an xmm register to ST(0), use an FPExtend to + // change the value to the FP stack register class. + if (isScalarFPTypeInSSEReg(VA.getValVT())) + ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy); + RetOps.push_back(ValToCopy); + // Don't emit a copytoreg. + continue; + } + + // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64 + // which is returned in RAX / RDX. + if (Subtarget->is64Bit()) { + if (ValVT == MVT::x86mmx) { + if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) { + ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy); + ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, + ValToCopy); + // If we don't have SSE2 available, convert to v4f32 so the generated + // register is legal. + if (!Subtarget->hasSSE2()) + ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy); + } + } + } + + Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag); + Flag = Chain.getValue(1); + RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); + } + + // All x86 ABIs require that for returning structs by value we copy + // the sret argument into %rax/%eax (depending on ABI) for the return. + // We saved the argument into a virtual register in the entry block, + // so now we copy the value out and into %rax/%eax. + // + // Checking Function.hasStructRetAttr() here is insufficient because the IR + // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is + // false, then an sret argument may be implicitly inserted in the SelDAG. In + // either case FuncInfo->setSRetReturnReg() will have been called. + if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) { + SDValue Val = DAG.getCopyFromReg(Chain, dl, SRetReg, + getPointerTy(MF.getDataLayout())); + + unsigned RetValReg + = (Subtarget->is64Bit() && !Subtarget->isTarget64BitILP32()) ? 
+ X86::RAX : X86::EAX; + Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag); + Flag = Chain.getValue(1); + + // RAX/EAX now acts like a return value. + RetOps.push_back( + DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout()))); + } + + RetOps[0] = Chain; // Update chain. + + // Add the flag if we have it. + if (Flag.getNode()) + RetOps.push_back(Flag); + + X86ISD::NodeType opcode = X86ISD::RET_FLAG; + if (CallConv == CallingConv::X86_INTR) + opcode = X86ISD::IRET; + return DAG.getNode(opcode, dl, MVT::Other, RetOps); +} + +bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const { + if (N->getNumValues() != 1) + return false; + if (!N->hasNUsesOfValue(1, 0)) + return false; + + SDValue TCChain = Chain; + SDNode *Copy = *N->use_begin(); + if (Copy->getOpcode() == ISD::CopyToReg) { + // If the copy has a glue operand, we conservatively assume it isn't safe to + // perform a tail call. + if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue) + return false; + TCChain = Copy->getOperand(0); + } else if (Copy->getOpcode() != ISD::FP_EXTEND) + return false; + + bool HasRet = false; + for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end(); + UI != UE; ++UI) { + if (UI->getOpcode() != X86ISD::RET_FLAG) + return false; + // If we are returning more than one value, we can definitely + // not make a tail call see PR19530 + if (UI->getNumOperands() > 4) + return false; + if (UI->getNumOperands() == 4 && + UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue) + return false; + HasRet = true; + } + + if (!HasRet) + return false; + + Chain = TCChain; + return true; +} + +EVT +X86TargetLowering::getTypeForExtArgOrReturn(LLVMContext &Context, EVT VT, + ISD::NodeType ExtendKind) const { + MVT ReturnMVT; + // TODO: Is this also valid on 32-bit? + if (Subtarget->is64Bit() && VT == MVT::i1 && ExtendKind == ISD::ZERO_EXTEND) + ReturnMVT = MVT::i8; + else + ReturnMVT = MVT::i32; + + EVT MinVT = getRegisterType(Context, ReturnMVT); + return VT.bitsLT(MinVT) ? MinVT : VT; +} + +/// Lower the result values of a call into the +/// appropriate copies out of appropriate physical registers. +/// +SDValue +X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, + CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::InputArg> &Ins, + SDLoc dl, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) const { + + // Assign locations to each value returned by this call. + SmallVector<CCValAssign, 16> RVLocs; + bool Is64Bit = Subtarget->is64Bit(); + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, + *DAG.getContext()); + CCInfo.AnalyzeCallResult(Ins, RetCC_X86); + + // Copy all of the result registers out of their specified physreg. + for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { + CCValAssign &VA = RVLocs[i]; + EVT CopyVT = VA.getLocVT(); + + // If this is x86-64, and we disabled SSE, we can't return FP values + if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) && + ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) { + report_fatal_error("SSE register return with SSE disabled"); + } + + // If we prefer to use the value in xmm registers, copy it out as f80 and + // use a truncate to move it from fp stack reg to xmm reg. 
+ bool RoundAfterCopy = false; + if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) && + isScalarFPTypeInSSEReg(VA.getValVT())) { + CopyVT = MVT::f80; + RoundAfterCopy = (CopyVT != VA.getLocVT()); + } + + Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), + CopyVT, InFlag).getValue(1); + SDValue Val = Chain.getValue(0); + + if (RoundAfterCopy) + Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val, + // This truncation won't change the value. + DAG.getIntPtrConstant(1, dl)); + + if (VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1) + Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val); + + InFlag = Chain.getValue(2); + InVals.push_back(Val); + } + + return Chain; +} + +//===----------------------------------------------------------------------===// +// C & StdCall & Fast Calling Convention implementation +//===----------------------------------------------------------------------===// +// StdCall calling convention seems to be standard for many Windows' API +// routines and around. It differs from C calling convention just a little: +// callee should clean up the stack, not caller. Symbols should be also +// decorated in some fancy way :) It doesn't support any vector arguments. +// For info on fast calling convention see Fast Calling Convention (tail call) +// implementation LowerX86_32FastCCCallTo. + +/// CallIsStructReturn - Determines whether a call uses struct return +/// semantics. +enum StructReturnType { + NotStructReturn, + RegStructReturn, + StackStructReturn +}; +static StructReturnType +callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs, bool IsMCU) { + if (Outs.empty()) + return NotStructReturn; + + const ISD::ArgFlagsTy &Flags = Outs[0].Flags; + if (!Flags.isSRet()) + return NotStructReturn; + if (Flags.isInReg() || IsMCU) + return RegStructReturn; + return StackStructReturn; +} + +/// Determines whether a function uses struct return semantics. +static StructReturnType +argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins, bool IsMCU) { + if (Ins.empty()) + return NotStructReturn; + + const ISD::ArgFlagsTy &Flags = Ins[0].Flags; + if (!Flags.isSRet()) + return NotStructReturn; + if (Flags.isInReg() || IsMCU) + return RegStructReturn; + return StackStructReturn; +} + +/// Make a copy of an aggregate at address specified by "Src" to address +/// "Dst" with size and alignment information specified by the specific +/// parameter attribute. The copy will be passed as a byval function parameter. +static SDValue +CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain, + ISD::ArgFlagsTy Flags, SelectionDAG &DAG, + SDLoc dl) { + SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32); + + return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(), + /*isVolatile*/false, /*AlwaysInline=*/true, + /*isTailCall*/false, + MachinePointerInfo(), MachinePointerInfo()); +} + +/// Return true if the calling convention is one that we can guarantee TCO for. +static bool canGuaranteeTCO(CallingConv::ID CC) { + return (CC == CallingConv::Fast || CC == CallingConv::GHC || + CC == CallingConv::HiPE || CC == CallingConv::HHVM); +} + +/// Return true if we might ever do TCO for calls with this calling convention. 
+static bool mayTailCallThisCC(CallingConv::ID CC) { + switch (CC) { + // C calling conventions: + case CallingConv::C: + case CallingConv::X86_64_Win64: + case CallingConv::X86_64_SysV: + // Callee pop conventions: + case CallingConv::X86_ThisCall: + case CallingConv::X86_StdCall: + case CallingConv::X86_VectorCall: + case CallingConv::X86_FastCall: + return true; + default: + return canGuaranteeTCO(CC); + } +} + +/// Return true if the function is being made into a tailcall target by +/// changing its ABI. +static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) { + return GuaranteedTailCallOpt && canGuaranteeTCO(CC); +} + +bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const { + auto Attr = + CI->getParent()->getParent()->getFnAttribute("disable-tail-calls"); + if (!CI->isTailCall() || Attr.getValueAsString() == "true") + return false; + + CallSite CS(CI); + CallingConv::ID CalleeCC = CS.getCallingConv(); + if (!mayTailCallThisCC(CalleeCC)) + return false; + + return true; +} + +SDValue +X86TargetLowering::LowerMemArgument(SDValue Chain, + CallingConv::ID CallConv, + const SmallVectorImpl<ISD::InputArg> &Ins, + SDLoc dl, SelectionDAG &DAG, + const CCValAssign &VA, + MachineFrameInfo *MFI, + unsigned i) const { + // Create the nodes corresponding to a load from this parameter slot. + ISD::ArgFlagsTy Flags = Ins[i].Flags; + bool AlwaysUseMutable = shouldGuaranteeTCO( + CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt); + bool isImmutable = !AlwaysUseMutable && !Flags.isByVal(); + EVT ValVT; + + // If value is passed by pointer we have address passed instead of the value + // itself. + bool ExtendedInMem = VA.isExtInLoc() && + VA.getValVT().getScalarType() == MVT::i1; + + if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem) + ValVT = VA.getLocVT(); + else + ValVT = VA.getValVT(); + + // Calculate SP offset of interrupt parameter, re-arrange the slot normally + // taken by a return address. + int Offset = 0; + if (CallConv == CallingConv::X86_INTR) { + const X86Subtarget& Subtarget = + static_cast<const X86Subtarget&>(DAG.getSubtarget()); + // X86 interrupts may take one or two arguments. + // On the stack there will be no return address as in regular call. + // Offset of last argument need to be set to -4/-8 bytes. + // Where offset of the first argument out of two, should be set to 0 bytes. + Offset = (Subtarget.is64Bit() ? 8 : 4) * ((i + 1) % Ins.size() - 1); + } + + // FIXME: For now, all byval parameter objects are marked mutable. This can be + // changed with more analysis. + // In case of tail call optimization mark all arguments mutable. Since they + // could be overwritten by lowering of arguments in case of a tail call. + if (Flags.isByVal()) { + unsigned Bytes = Flags.getByValSize(); + if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects. + int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable); + // Adjust SP offset of interrupt parameter. + if (CallConv == CallingConv::X86_INTR) { + MFI->setObjectOffset(FI, Offset); + } + return DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); + } else { + int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8, + VA.getLocMemOffset(), isImmutable); + // Adjust SP offset of interrupt parameter. 
+ if (CallConv == CallingConv::X86_INTR) { + MFI->setObjectOffset(FI, Offset); + } + + SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); + SDValue Val = DAG.getLoad( + ValVT, dl, Chain, FIN, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), false, + false, false, 0); + return ExtendedInMem ? + DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val) : Val; + } +} + +// FIXME: Get this from tablegen. +static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv, + const X86Subtarget *Subtarget) { + assert(Subtarget->is64Bit()); + + if (Subtarget->isCallingConvWin64(CallConv)) { + static const MCPhysReg GPR64ArgRegsWin64[] = { + X86::RCX, X86::RDX, X86::R8, X86::R9 + }; + return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64)); + } + + static const MCPhysReg GPR64ArgRegs64Bit[] = { + X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9 + }; + return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit)); +} + +// FIXME: Get this from tablegen. +static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF, + CallingConv::ID CallConv, + const X86Subtarget *Subtarget) { + assert(Subtarget->is64Bit()); + if (Subtarget->isCallingConvWin64(CallConv)) { + // The XMM registers which might contain var arg parameters are shadowed + // in their paired GPR. So we only need to save the GPR to their home + // slots. + // TODO: __vectorcall will change this. + return None; + } + + const Function *Fn = MF.getFunction(); + bool NoImplicitFloatOps = Fn->hasFnAttribute(Attribute::NoImplicitFloat); + bool isSoftFloat = Subtarget->useSoftFloat(); + assert(!(isSoftFloat && NoImplicitFloatOps) && + "SSE register cannot be used when SSE is disabled!"); + if (isSoftFloat || NoImplicitFloatOps || !Subtarget->hasSSE1()) + // Kernel mode asks for SSE to be disabled, so there are no XMM argument + // registers. + return None; + + static const MCPhysReg XMMArgRegs64Bit[] = { + X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, + X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 + }; + return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit)); +} + +SDValue X86TargetLowering::LowerFormalArguments( + SDValue Chain, CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc dl, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) const { + MachineFunction &MF = DAG.getMachineFunction(); + X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); + const TargetFrameLowering &TFI = *Subtarget->getFrameLowering(); + + const Function* Fn = MF.getFunction(); + if (Fn->hasExternalLinkage() && + Subtarget->isTargetCygMing() && + Fn->getName() == "main") + FuncInfo->setForceFramePointer(true); + + MachineFrameInfo *MFI = MF.getFrameInfo(); + bool Is64Bit = Subtarget->is64Bit(); + bool IsWin64 = Subtarget->isCallingConvWin64(CallConv); + + assert(!(isVarArg && canGuaranteeTCO(CallConv)) && + "Var args not supported with calling convention fastcc, ghc or hipe"); + + if (CallConv == CallingConv::X86_INTR) { + bool isLegal = Ins.size() == 1 || + (Ins.size() == 2 && ((Is64Bit && Ins[1].VT == MVT::i64) || + (!Is64Bit && Ins[1].VT == MVT::i32))); + if (!isLegal) + report_fatal_error("X86 interrupts may take one or two arguments"); + } + + // Assign locations to all of the incoming arguments. 
+ SmallVector<CCValAssign, 16> ArgLocs; + CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext()); + + // Allocate shadow area for Win64 + if (IsWin64) + CCInfo.AllocateStack(32, 8); + + CCInfo.AnalyzeFormalArguments(Ins, CC_X86); + + unsigned LastVal = ~0U; + SDValue ArgValue; + for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { + CCValAssign &VA = ArgLocs[i]; + // TODO: If an arg is passed in two places (e.g. reg and stack), skip later + // places. + assert(VA.getValNo() != LastVal && + "Don't support value assigned to multiple locs yet"); + (void)LastVal; + LastVal = VA.getValNo(); + + if (VA.isRegLoc()) { + EVT RegVT = VA.getLocVT(); + const TargetRegisterClass *RC; + if (RegVT == MVT::i32) + RC = &X86::GR32RegClass; + else if (Is64Bit && RegVT == MVT::i64) + RC = &X86::GR64RegClass; + else if (RegVT == MVT::f32) + RC = &X86::FR32RegClass; + else if (RegVT == MVT::f64) + RC = &X86::FR64RegClass; + else if (RegVT == MVT::f128) + RC = &X86::FR128RegClass; + else if (RegVT.is512BitVector()) + RC = &X86::VR512RegClass; + else if (RegVT.is256BitVector()) + RC = &X86::VR256RegClass; + else if (RegVT.is128BitVector()) + RC = &X86::VR128RegClass; + else if (RegVT == MVT::x86mmx) + RC = &X86::VR64RegClass; + else if (RegVT == MVT::i1) + RC = &X86::VK1RegClass; + else if (RegVT == MVT::v8i1) + RC = &X86::VK8RegClass; + else if (RegVT == MVT::v16i1) + RC = &X86::VK16RegClass; + else if (RegVT == MVT::v32i1) + RC = &X86::VK32RegClass; + else if (RegVT == MVT::v64i1) + RC = &X86::VK64RegClass; + else + llvm_unreachable("Unknown argument type!"); + + unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); + ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT); + + // If this is an 8 or 16-bit value, it is really passed promoted to 32 + // bits. Insert an assert[sz]ext to capture this, then truncate to the + // right size. + if (VA.getLocInfo() == CCValAssign::SExt) + ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue, + DAG.getValueType(VA.getValVT())); + else if (VA.getLocInfo() == CCValAssign::ZExt) + ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue, + DAG.getValueType(VA.getValVT())); + else if (VA.getLocInfo() == CCValAssign::BCvt) + ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue); + + if (VA.isExtInLoc()) { + // Handle MMX values passed in XMM regs. + if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1) + ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue); + else + ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); + } + } else { + assert(VA.isMemLoc()); + ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i); + } + + // If value is passed via pointer - do a load. + if (VA.getLocInfo() == CCValAssign::Indirect) + ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, + MachinePointerInfo(), false, false, false, 0); + + InVals.push_back(ArgValue); + } + + for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { + // All x86 ABIs require that for returning structs by value we copy the + // sret argument into %rax/%eax (depending on ABI) for the return. Save + // the argument into a virtual register so that we can access it from the + // return points. 
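+    // For illustration: when a struct is returned in memory, the front end
+    // adds a hidden 'sret' pointer argument; the code below stashes that
+    // pointer in SRetReturnReg so the return lowering can copy it back into
+    // %rax (64-bit) or %eax (32-bit) before returning.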
+ if (Ins[i].Flags.isSRet()) { + unsigned Reg = FuncInfo->getSRetReturnReg(); + if (!Reg) { + MVT PtrTy = getPointerTy(DAG.getDataLayout()); + Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy)); + FuncInfo->setSRetReturnReg(Reg); + } + SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[i]); + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain); + break; + } + } + + unsigned StackSize = CCInfo.getNextStackOffset(); + // Align stack specially for tail calls. + if (shouldGuaranteeTCO(CallConv, + MF.getTarget().Options.GuaranteedTailCallOpt)) + StackSize = GetAlignedArgumentStackSize(StackSize, DAG); + + // If the function takes variable number of arguments, make a frame index for + // the start of the first vararg value... for expansion of llvm.va_start. We + // can skip this if there are no va_start calls. + if (MFI->hasVAStart() && + (Is64Bit || (CallConv != CallingConv::X86_FastCall && + CallConv != CallingConv::X86_ThisCall))) { + FuncInfo->setVarArgsFrameIndex( + MFI->CreateFixedObject(1, StackSize, true)); + } + + // Figure out if XMM registers are in use. + assert(!(Subtarget->useSoftFloat() && + Fn->hasFnAttribute(Attribute::NoImplicitFloat)) && + "SSE register cannot be used when SSE is disabled!"); + + // 64-bit calling conventions support varargs and register parameters, so we + // have to do extra work to spill them in the prologue. + if (Is64Bit && isVarArg && MFI->hasVAStart()) { + // Find the first unallocated argument registers. + ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget); + ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget); + unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs); + unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs); + assert(!(NumXMMRegs && !Subtarget->hasSSE1()) && + "SSE register cannot be used when SSE is disabled!"); + + // Gather all the live in physical registers. + SmallVector<SDValue, 6> LiveGPRs; + SmallVector<SDValue, 8> LiveXMMRegs; + SDValue ALVal; + for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) { + unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass); + LiveGPRs.push_back( + DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64)); + } + if (!ArgXMMs.empty()) { + unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass); + ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8); + for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) { + unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass); + LiveXMMRegs.push_back( + DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32)); + } + } + + if (IsWin64) { + // Get to the caller-allocated home save location. Add 8 to account + // for the return address. + int HomeOffset = TFI.getOffsetOfLocalArea() + 8; + FuncInfo->setRegSaveFrameIndex( + MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false)); + // Fixup to set vararg frame on shadow area (4 x i64). + if (NumIntRegs < 4) + FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex()); + } else { + // For X86-64, if there are vararg parameters that are passed via + // registers, then we must store them to their spots on the stack so + // they may be loaded by deferencing the result of va_next. + FuncInfo->setVarArgsGPOffset(NumIntRegs * 8); + FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16); + FuncInfo->setRegSaveFrameIndex(MFI->CreateStackObject( + ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false)); + } + + // Store the integer parameter registers. 
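+    // For illustration, on SysV x86-64 the register save area is
+    // 6*8 + 8*16 = 176 bytes.  If a varargs function consumes two GPRs and
+    // one XMM register for its named parameters, the unnamed ones start at
+    // gp_offset = 16 and fp_offset = 48 + 16 = 64, matching the
+    // gp_offset/fp_offset bookkeeping that va_arg performs on the va_list.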
+ SmallVector<SDValue, 8> MemOps; + SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), + getPointerTy(DAG.getDataLayout())); + unsigned Offset = FuncInfo->getVarArgsGPOffset(); + for (SDValue Val : LiveGPRs) { + SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), + RSFIN, DAG.getIntPtrConstant(Offset, dl)); + SDValue Store = + DAG.getStore(Val.getValue(1), dl, Val, FIN, + MachinePointerInfo::getFixedStack( + DAG.getMachineFunction(), + FuncInfo->getRegSaveFrameIndex(), Offset), + false, false, 0); + MemOps.push_back(Store); + Offset += 8; + } + + if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) { + // Now store the XMM (fp + vector) parameter registers. + SmallVector<SDValue, 12> SaveXMMOps; + SaveXMMOps.push_back(Chain); + SaveXMMOps.push_back(ALVal); + SaveXMMOps.push_back(DAG.getIntPtrConstant( + FuncInfo->getRegSaveFrameIndex(), dl)); + SaveXMMOps.push_back(DAG.getIntPtrConstant( + FuncInfo->getVarArgsFPOffset(), dl)); + SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(), + LiveXMMRegs.end()); + MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl, + MVT::Other, SaveXMMOps)); + } + + if (!MemOps.empty()) + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps); + } + + if (isVarArg && MFI->hasMustTailInVarArgFunc()) { + // Find the largest legal vector type. + MVT VecVT = MVT::Other; + // FIXME: Only some x86_32 calling conventions support AVX512. + if (Subtarget->hasAVX512() && + (Is64Bit || (CallConv == CallingConv::X86_VectorCall || + CallConv == CallingConv::Intel_OCL_BI))) + VecVT = MVT::v16f32; + else if (Subtarget->hasAVX()) + VecVT = MVT::v8f32; + else if (Subtarget->hasSSE2()) + VecVT = MVT::v4f32; + + // We forward some GPRs and some vector types. + SmallVector<MVT, 2> RegParmTypes; + MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32; + RegParmTypes.push_back(IntVT); + if (VecVT != MVT::Other) + RegParmTypes.push_back(VecVT); + + // Compute the set of forwarded registers. The rest are scratch. + SmallVectorImpl<ForwardedRegister> &Forwards = + FuncInfo->getForwardedMustTailRegParms(); + CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86); + + // Conservatively forward AL on x86_64, since it might be used for varargs. + if (Is64Bit && !CCInfo.isAllocated(X86::AL)) { + unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass); + Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8)); + } + + // Copy all forwards from physical to virtual registers. + for (ForwardedRegister &F : Forwards) { + // FIXME: Can we use a less constrained schedule? + SDValue RegVal = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT); + F.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(F.VT)); + Chain = DAG.getCopyToReg(Chain, dl, F.VReg, RegVal); + } + } + + // Some CCs need callee pop. + if (X86::isCalleePop(CallConv, Is64Bit, isVarArg, + MF.getTarget().Options.GuaranteedTailCallOpt)) { + FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything. + } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) { + // X86 interrupts must pop the error code if present + FuncInfo->setBytesToPopOnReturn(Is64Bit ? 8 : 4); + } else { + FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing. + // If this is an sret function, the return should pop the hidden pointer. 
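+    // For illustration: on 32-bit Darwin/Linux/MinGW a function returning a
+    // struct through a stack-passed sret pointer ends in 'ret $4', discarding
+    // the hidden pointer its caller pushed; MSVC-based targets leave that to
+    // the caller instead, hence the isOSMSVCRT() exclusion below.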
+ if (!Is64Bit && !canGuaranteeTCO(CallConv) && + !Subtarget->getTargetTriple().isOSMSVCRT() && + argsAreStructReturn(Ins, Subtarget->isTargetMCU()) == StackStructReturn) + FuncInfo->setBytesToPopOnReturn(4); + } + + if (!Is64Bit) { + // RegSaveFrameIndex is X86-64 only. + FuncInfo->setRegSaveFrameIndex(0xAAAAAAA); + if (CallConv == CallingConv::X86_FastCall || + CallConv == CallingConv::X86_ThisCall) + // fastcc functions can't have varargs. + FuncInfo->setVarArgsFrameIndex(0xAAAAAAA); + } + + FuncInfo->setArgumentStackSize(StackSize); + + if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) { + EHPersonality Personality = classifyEHPersonality(Fn->getPersonalityFn()); + if (Personality == EHPersonality::CoreCLR) { + assert(Is64Bit); + // TODO: Add a mechanism to frame lowering that will allow us to indicate + // that we'd prefer this slot be allocated towards the bottom of the frame + // (i.e. near the stack pointer after allocating the frame). Every + // funclet needs a copy of this slot in its (mostly empty) frame, and the + // offset from the bottom of this and each funclet's frame must be the + // same, so the size of funclets' (mostly empty) frames is dictated by + // how far this slot is from the bottom (since they allocate just enough + // space to accomodate holding this slot at the correct offset). + int PSPSymFI = MFI->CreateStackObject(8, 8, /*isSS=*/false); + EHInfo->PSPSymFrameIdx = PSPSymFI; + } + } + + return Chain; +} + +SDValue +X86TargetLowering::LowerMemOpCallTo(SDValue Chain, + SDValue StackPtr, SDValue Arg, + SDLoc dl, SelectionDAG &DAG, + const CCValAssign &VA, + ISD::ArgFlagsTy Flags) const { + unsigned LocMemOffset = VA.getLocMemOffset(); + SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl); + PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), + StackPtr, PtrOff); + if (Flags.isByVal()) + return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl); + + return DAG.getStore( + Chain, dl, Arg, PtrOff, + MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset), + false, false, 0); +} + +/// Emit a load of return address if tail call +/// optimization is performed and it is required. +SDValue +X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG, + SDValue &OutRetAddr, SDValue Chain, + bool IsTailCall, bool Is64Bit, + int FPDiff, SDLoc dl) const { + // Adjust the Return address stack slot. + EVT VT = getPointerTy(DAG.getDataLayout()); + OutRetAddr = getReturnAddressFrameIndex(DAG); + + // Load the "old" Return address. + OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo(), + false, false, false, 0); + return SDValue(OutRetAddr.getNode(), 1); +} + +/// Emit a store of the return address if tail call +/// optimization is performed and it is required (FPDiff!=0). +static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF, + SDValue Chain, SDValue RetAddrFrIdx, + EVT PtrVT, unsigned SlotSize, + int FPDiff, SDLoc dl) { + // Store the return address to the appropriate stack slot. + if (!FPDiff) return Chain; + // Calculate the new stack slot for the return address. 
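+  // For illustration: with an 8-byte slot and FPDiff = -16 (the callee needs
+  // 16 more bytes of argument space than the caller provided), the return
+  // address is re-stored at fixed offset -16 - 8 = -24, i.e. 16 bytes below
+  // its usual slot at -8 (see getReturnAddressFrameIndex).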
+ int NewReturnAddrFI = + MF.getFrameInfo()->CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize, + false); + SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT); + Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx, + MachinePointerInfo::getFixedStack( + DAG.getMachineFunction(), NewReturnAddrFI), + false, false, 0); + return Chain; +} + +/// Returns a vector_shuffle mask for an movs{s|d}, movd +/// operation of specified width. +static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1, + SDValue V2) { + unsigned NumElems = VT.getVectorNumElements(); + SmallVector<int, 8> Mask; + Mask.push_back(NumElems); + for (unsigned i = 1; i != NumElems; ++i) + Mask.push_back(i); + return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); +} + +SDValue +X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, + SmallVectorImpl<SDValue> &InVals) const { + SelectionDAG &DAG = CLI.DAG; + SDLoc &dl = CLI.DL; + SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs; + SmallVectorImpl<SDValue> &OutVals = CLI.OutVals; + SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins; + SDValue Chain = CLI.Chain; + SDValue Callee = CLI.Callee; + CallingConv::ID CallConv = CLI.CallConv; + bool &isTailCall = CLI.IsTailCall; + bool isVarArg = CLI.IsVarArg; + + MachineFunction &MF = DAG.getMachineFunction(); + bool Is64Bit = Subtarget->is64Bit(); + bool IsWin64 = Subtarget->isCallingConvWin64(CallConv); + StructReturnType SR = callIsStructReturn(Outs, Subtarget->isTargetMCU()); + bool IsSibcall = false; + X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>(); + auto Attr = MF.getFunction()->getFnAttribute("disable-tail-calls"); + + if (CallConv == CallingConv::X86_INTR) + report_fatal_error("X86 interrupts may not be called directly"); + + if (Attr.getValueAsString() == "true") + isTailCall = false; + + if (Subtarget->isPICStyleGOT() && + !MF.getTarget().Options.GuaranteedTailCallOpt) { + // If we are using a GOT, disable tail calls to external symbols with + // default visibility. Tail calling such a symbol requires using a GOT + // relocation, which forces early binding of the symbol. This breaks code + // that require lazy function symbol resolution. Using musttail or + // GuaranteedTailCallOpt will override this. + GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee); + if (!G || (!G->getGlobal()->hasLocalLinkage() && + G->getGlobal()->hasDefaultVisibility())) + isTailCall = false; + } + + bool IsMustTail = CLI.CS && CLI.CS->isMustTailCall(); + if (IsMustTail) { + // Force this to be a tail call. The verifier rules are enough to ensure + // that we can lower this successfully without moving the return address + // around. + isTailCall = true; + } else if (isTailCall) { + // Check if it's really possible to do a tail call. + isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, + isVarArg, SR != NotStructReturn, + MF.getFunction()->hasStructRetAttr(), CLI.RetTy, + Outs, OutVals, Ins, DAG); + + // Sibcalls are automatically detected tailcalls which do not require + // ABI changes. + if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall) + IsSibcall = true; + + if (isTailCall) + ++NumTailCalls; + } + + assert(!(isVarArg && canGuaranteeTCO(CallConv)) && + "Var args not supported with calling convention fastcc, ghc or hipe"); + + // Analyze operands of the call, assigning locations to each operand. 
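+  // The Win64 shadow (home) area handled below reserves the first
+  // 4 * 8 = 32 bytes of the outgoing argument space so the callee may spill
+  // RCX, RDX, R8 and R9; marking those bytes allocated up front keeps
+  // AnalyzeCallOperands from assigning stack arguments to them.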
+ SmallVector<CCValAssign, 16> ArgLocs; + CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext()); + + // Allocate shadow area for Win64 + if (IsWin64) + CCInfo.AllocateStack(32, 8); + + CCInfo.AnalyzeCallOperands(Outs, CC_X86); + + // Get a count of how many bytes are to be pushed on the stack. + unsigned NumBytes = CCInfo.getAlignedCallFrameSize(); + if (IsSibcall) + // This is a sibcall. The memory operands are available in caller's + // own caller's stack. + NumBytes = 0; + else if (MF.getTarget().Options.GuaranteedTailCallOpt && + canGuaranteeTCO(CallConv)) + NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG); + + int FPDiff = 0; + if (isTailCall && !IsSibcall && !IsMustTail) { + // Lower arguments at fp - stackoffset + fpdiff. + unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn(); + + FPDiff = NumBytesCallerPushed - NumBytes; + + // Set the delta of movement of the returnaddr stackslot. + // But only set if delta is greater than previous delta. + if (FPDiff < X86Info->getTCReturnAddrDelta()) + X86Info->setTCReturnAddrDelta(FPDiff); + } + + unsigned NumBytesToPush = NumBytes; + unsigned NumBytesToPop = NumBytes; + + // If we have an inalloca argument, all stack space has already been allocated + // for us and be right at the top of the stack. We don't support multiple + // arguments passed in memory when using inalloca. + if (!Outs.empty() && Outs.back().Flags.isInAlloca()) { + NumBytesToPush = 0; + if (!ArgLocs.back().isMemLoc()) + report_fatal_error("cannot use inalloca attribute on a register " + "parameter"); + if (ArgLocs.back().getLocMemOffset() != 0) + report_fatal_error("any parameter with the inalloca attribute must be " + "the only memory argument"); + } + + if (!IsSibcall) + Chain = DAG.getCALLSEQ_START( + Chain, DAG.getIntPtrConstant(NumBytesToPush, dl, true), dl); + + SDValue RetAddrFrIdx; + // Load return address for tail calls. + if (isTailCall && FPDiff) + Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall, + Is64Bit, FPDiff, dl); + + SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass; + SmallVector<SDValue, 8> MemOpChains; + SDValue StackPtr; + + // Walk the register/memloc assignments, inserting copies/loads. In the case + // of tail call optimization arguments are handle later. + const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); + for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { + // Skip inalloca arguments, they have already been written. + ISD::ArgFlagsTy Flags = Outs[i].Flags; + if (Flags.isInAlloca()) + continue; + + CCValAssign &VA = ArgLocs[i]; + EVT RegVT = VA.getLocVT(); + SDValue Arg = OutVals[i]; + bool isByVal = Flags.isByVal(); + + // Promote the value if needed. + switch (VA.getLocInfo()) { + default: llvm_unreachable("Unknown loc info!"); + case CCValAssign::Full: break; + case CCValAssign::SExt: + Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg); + break; + case CCValAssign::ZExt: + Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg); + break; + case CCValAssign::AExt: + if (Arg.getValueType().isVector() && + Arg.getValueType().getVectorElementType() == MVT::i1) + Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg); + else if (RegVT.is128BitVector()) { + // Special case: passing MMX values in XMM registers. 
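+        // The x86mmx value is reinterpreted as i64, widened to v2i64, and
+        // then merged into lane 0 of an undef vector with a MOVQ/MOVSD-style
+        // shuffle (getMOVL), so only the low 64 bits of the XMM register are
+        // defined.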
+ Arg = DAG.getBitcast(MVT::i64, Arg); + Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg); + Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg); + } else + Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg); + break; + case CCValAssign::BCvt: + Arg = DAG.getBitcast(RegVT, Arg); + break; + case CCValAssign::Indirect: { + // Store the argument. + SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT()); + int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex(); + Chain = DAG.getStore( + Chain, dl, Arg, SpillSlot, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), + false, false, 0); + Arg = SpillSlot; + break; + } + } + + if (VA.isRegLoc()) { + RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); + if (isVarArg && IsWin64) { + // Win64 ABI requires argument XMM reg to be copied to the corresponding + // shadow reg if callee is a varargs function. + unsigned ShadowReg = 0; + switch (VA.getLocReg()) { + case X86::XMM0: ShadowReg = X86::RCX; break; + case X86::XMM1: ShadowReg = X86::RDX; break; + case X86::XMM2: ShadowReg = X86::R8; break; + case X86::XMM3: ShadowReg = X86::R9; break; + } + if (ShadowReg) + RegsToPass.push_back(std::make_pair(ShadowReg, Arg)); + } + } else if (!IsSibcall && (!isTailCall || isByVal)) { + assert(VA.isMemLoc()); + if (!StackPtr.getNode()) + StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(), + getPointerTy(DAG.getDataLayout())); + MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg, + dl, DAG, VA, Flags)); + } + } + + if (!MemOpChains.empty()) + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains); + + if (Subtarget->isPICStyleGOT()) { + // ELF / PIC requires GOT in the EBX register before function calls via PLT + // GOT pointer. + if (!isTailCall) { + RegsToPass.push_back(std::make_pair( + unsigned(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), + getPointerTy(DAG.getDataLayout())))); + } else { + // If we are tail calling and generating PIC/GOT style code load the + // address of the callee into ECX. The value in ecx is used as target of + // the tail jump. This is done to circumvent the ebx/callee-saved problem + // for tail calls on PIC/GOT architectures. Normally we would just put the + // address of GOT into ebx and then call target@PLT. But for tail calls + // ebx would be restored (since ebx is callee saved) before jumping to the + // target@PLT. + + // Note: The actual moving to ECX is done further down. + GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee); + if (G && !G->getGlobal()->hasLocalLinkage() && + G->getGlobal()->hasDefaultVisibility()) + Callee = LowerGlobalAddress(Callee, DAG); + else if (isa<ExternalSymbolSDNode>(Callee)) + Callee = LowerExternalSymbol(Callee, DAG); + } + } + + if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) { + // From AMD64 ABI document: + // For calls that may call functions that use varargs or stdargs + // (prototype-less calls or calls to functions containing ellipsis (...) in + // the declaration) %al is used as hidden argument to specify the number + // of SSE registers used. The contents of %al do not need to match exactly + // the number of registers, but must be an ubound on the number of SSE + // registers used and is in the range 0 - 8 inclusive. + + // Count the number of XMM registers allocated. 
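+    // For illustration: a call such as printf("%f %f", x, y) passes the two
+    // doubles in XMM0 and XMM1, so getFirstUnallocated returns 2 and AL is
+    // loaded with 2 before the call; passing no FP/vector arguments leaves
+    // AL at 0, and 8 is the maximum the ABI allows.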
+ static const MCPhysReg XMMArgRegs[] = { + X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, + X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 + }; + unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs); + assert((Subtarget->hasSSE1() || !NumXMMRegs) + && "SSE registers cannot be used when SSE is disabled"); + + RegsToPass.push_back(std::make_pair(unsigned(X86::AL), + DAG.getConstant(NumXMMRegs, dl, + MVT::i8))); + } + + if (isVarArg && IsMustTail) { + const auto &Forwards = X86Info->getForwardedMustTailRegParms(); + for (const auto &F : Forwards) { + SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT); + RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val)); + } + } + + // For tail calls lower the arguments to the 'real' stack slots. Sibcalls + // don't need this because the eligibility check rejects calls that require + // shuffling arguments passed in memory. + if (!IsSibcall && isTailCall) { + // Force all the incoming stack arguments to be loaded from the stack + // before any new outgoing arguments are stored to the stack, because the + // outgoing stack slots may alias the incoming argument stack slots, and + // the alias isn't otherwise explicit. This is slightly more conservative + // than necessary, because it means that each store effectively depends + // on every argument instead of just those arguments it would clobber. + SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain); + + SmallVector<SDValue, 8> MemOpChains2; + SDValue FIN; + int FI = 0; + for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { + CCValAssign &VA = ArgLocs[i]; + if (VA.isRegLoc()) + continue; + assert(VA.isMemLoc()); + SDValue Arg = OutVals[i]; + ISD::ArgFlagsTy Flags = Outs[i].Flags; + // Skip inalloca arguments. They don't require any work. + if (Flags.isInAlloca()) + continue; + // Create frame index. + int32_t Offset = VA.getLocMemOffset()+FPDiff; + uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8; + FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true); + FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); + + if (Flags.isByVal()) { + // Copy relative to framepointer. + SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl); + if (!StackPtr.getNode()) + StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(), + getPointerTy(DAG.getDataLayout())); + Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), + StackPtr, Source); + + MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN, + ArgChain, + Flags, DAG, dl)); + } else { + // Store relative to framepointer. + MemOpChains2.push_back(DAG.getStore( + ArgChain, dl, Arg, FIN, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), + false, false, 0)); + } + } + + if (!MemOpChains2.empty()) + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2); + + // Store the return address to the appropriate stack slot. + Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, + getPointerTy(DAG.getDataLayout()), + RegInfo->getSlotSize(), FPDiff, dl); + } + + // Build a sequence of copy-to-reg nodes chained together with token chain + // and flag operands which copy the outgoing args into registers. 
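+  // Threading InFlag (a glue value) from one CopyToReg to the next and into
+  // the call node keeps these copies scheduled immediately before the call,
+  // which helps ensure the argument registers are not clobbered in between.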
+ SDValue InFlag; + for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { + Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, + RegsToPass[i].second, InFlag); + InFlag = Chain.getValue(1); + } + + if (DAG.getTarget().getCodeModel() == CodeModel::Large) { + assert(Is64Bit && "Large code model is only legal in 64-bit mode."); + // In the 64-bit large code model, we have to make all calls + // through a register, since the call instruction's 32-bit + // pc-relative offset may not be large enough to hold the whole + // address. + } else if (Callee->getOpcode() == ISD::GlobalAddress) { + // If the callee is a GlobalAddress node (quite common, every direct call + // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack + // it. + GlobalAddressSDNode* G = cast<GlobalAddressSDNode>(Callee); + + // We should use extra load for direct calls to dllimported functions in + // non-JIT mode. + const GlobalValue *GV = G->getGlobal(); + if (!GV->hasDLLImportStorageClass()) { + unsigned char OpFlags = 0; + bool ExtraLoad = false; + unsigned WrapperKind = ISD::DELETED_NODE; + + // On ELF targets, in both X86-64 and X86-32 mode, direct calls to + // external symbols most go through the PLT in PIC mode. If the symbol + // has hidden or protected visibility, or if it is static or local, then + // we don't need to use the PLT - we can directly call it. + if (Subtarget->isTargetELF() && + DAG.getTarget().getRelocationModel() == Reloc::PIC_ && + GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) { + OpFlags = X86II::MO_PLT; + } else if (Subtarget->isPICStyleStubAny() && + !GV->isStrongDefinitionForLinker() && + (!Subtarget->getTargetTriple().isMacOSX() || + Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) { + // PC-relative references to external symbols should go through $stub, + // unless we're building with the leopard linker or later, which + // automatically synthesizes these stubs. + OpFlags = X86II::MO_DARWIN_STUB; + } else if (Subtarget->isPICStyleRIPRel() && isa<Function>(GV) && + cast<Function>(GV)->hasFnAttribute(Attribute::NonLazyBind)) { + // If the function is marked as non-lazy, generate an indirect call + // which loads from the GOT directly. This avoids runtime overhead + // at the cost of eager binding (and one extra byte of encoding). + OpFlags = X86II::MO_GOTPCREL; + WrapperKind = X86ISD::WrapperRIP; + ExtraLoad = true; + } + + Callee = DAG.getTargetGlobalAddress( + GV, dl, getPointerTy(DAG.getDataLayout()), G->getOffset(), OpFlags); + + // Add a wrapper if needed. + if (WrapperKind != ISD::DELETED_NODE) + Callee = DAG.getNode(X86ISD::WrapperRIP, dl, + getPointerTy(DAG.getDataLayout()), Callee); + // Add extra indirection if needed. + if (ExtraLoad) + Callee = DAG.getLoad( + getPointerTy(DAG.getDataLayout()), dl, DAG.getEntryNode(), Callee, + MachinePointerInfo::getGOT(DAG.getMachineFunction()), false, false, + false, 0); + } + } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { + unsigned char OpFlags = 0; + + // On ELF targets, in either X86-64 or X86-32 mode, direct calls to + // external symbols should go through the PLT. 
+ if (Subtarget->isTargetELF() && + DAG.getTarget().getRelocationModel() == Reloc::PIC_) { + OpFlags = X86II::MO_PLT; + } else if (Subtarget->isPICStyleStubAny() && + (!Subtarget->getTargetTriple().isMacOSX() || + Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) { + // PC-relative references to external symbols should go through $stub, + // unless we're building with the leopard linker or later, which + // automatically synthesizes these stubs. + OpFlags = X86II::MO_DARWIN_STUB; + } + + Callee = DAG.getTargetExternalSymbol( + S->getSymbol(), getPointerTy(DAG.getDataLayout()), OpFlags); + } else if (Subtarget->isTarget64BitILP32() && + Callee->getValueType(0) == MVT::i32) { + // Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI + Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee); + } + + // Returns a chain & a flag for retval copy to use. + SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); + SmallVector<SDValue, 8> Ops; + + if (!IsSibcall && isTailCall) { + Chain = DAG.getCALLSEQ_END(Chain, + DAG.getIntPtrConstant(NumBytesToPop, dl, true), + DAG.getIntPtrConstant(0, dl, true), InFlag, dl); + InFlag = Chain.getValue(1); + } + + Ops.push_back(Chain); + Ops.push_back(Callee); + + if (isTailCall) + Ops.push_back(DAG.getConstant(FPDiff, dl, MVT::i32)); + + // Add argument registers to the end of the list so that they are known live + // into the call. + for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) + Ops.push_back(DAG.getRegister(RegsToPass[i].first, + RegsToPass[i].second.getValueType())); + + // Add a register mask operand representing the call-preserved registers. + const uint32_t *Mask = RegInfo->getCallPreservedMask(MF, CallConv); + assert(Mask && "Missing call preserved mask for calling convention"); + + // If this is an invoke in a 32-bit function using a funclet-based + // personality, assume the function clobbers all registers. If an exception + // is thrown, the runtime will not restore CSRs. + // FIXME: Model this more precisely so that we can register allocate across + // the normal edge and spill and fill across the exceptional edge. + if (!Is64Bit && CLI.CS && CLI.CS->isInvoke()) { + const Function *CallerFn = MF.getFunction(); + EHPersonality Pers = + CallerFn->hasPersonalityFn() + ? classifyEHPersonality(CallerFn->getPersonalityFn()) + : EHPersonality::Unknown; + if (isFuncletEHPersonality(Pers)) + Mask = RegInfo->getNoPreservedMask(); + } + + Ops.push_back(DAG.getRegisterMask(Mask)); + + if (InFlag.getNode()) + Ops.push_back(InFlag); + + if (isTailCall) { + // We used to do: + //// If this is the first return lowered for this function, add the regs + //// to the liveout set for the function. + // This isn't right, although it's probably harmless on x86; liveouts + // should be computed from returns not tail calls. Consider a void + // function making a tail call to a function returning int. + MF.getFrameInfo()->setHasTailCall(); + return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops); + } + + Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops); + InFlag = Chain.getValue(1); + + // Create the CALLSEQ_END node. 
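+  // For illustration: a 32-bit stdcall callee taking 12 bytes of arguments
+  // returns with 'ret $12', so NumBytesForCalleeToPop is the full NumBytes;
+  // a cdecl callee pops nothing, and the sret-on-stack case below accounts
+  // for the 4-byte hidden pointer popped by the callee.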
+ unsigned NumBytesForCalleeToPop; + if (X86::isCalleePop(CallConv, Is64Bit, isVarArg, + DAG.getTarget().Options.GuaranteedTailCallOpt)) + NumBytesForCalleeToPop = NumBytes; // Callee pops everything + else if (!Is64Bit && !canGuaranteeTCO(CallConv) && + !Subtarget->getTargetTriple().isOSMSVCRT() && + SR == StackStructReturn) + // If this is a call to a struct-return function, the callee + // pops the hidden struct pointer, so we have to push it back. + // This is common for Darwin/X86, Linux & Mingw32 targets. + // For MSVC Win32 targets, the caller pops the hidden struct pointer. + NumBytesForCalleeToPop = 4; + else + NumBytesForCalleeToPop = 0; // Callee pops nothing. + + // Returns a flag for retval copy to use. + if (!IsSibcall) { + Chain = DAG.getCALLSEQ_END(Chain, + DAG.getIntPtrConstant(NumBytesToPop, dl, true), + DAG.getIntPtrConstant(NumBytesForCalleeToPop, dl, + true), + InFlag, dl); + InFlag = Chain.getValue(1); + } + + // Handle result values, copying them out of physregs into vregs that we + // return. + return LowerCallResult(Chain, InFlag, CallConv, isVarArg, + Ins, dl, DAG, InVals); +} + +//===----------------------------------------------------------------------===// +// Fast Calling Convention (tail call) implementation +//===----------------------------------------------------------------------===// + +// Like std call, callee cleans arguments, convention except that ECX is +// reserved for storing the tail called function address. Only 2 registers are +// free for argument passing (inreg). Tail call optimization is performed +// provided: +// * tailcallopt is enabled +// * caller/callee are fastcc +// On X86_64 architecture with GOT-style position independent code only local +// (within module) calls are supported at the moment. +// To keep the stack aligned according to platform abi the function +// GetAlignedArgumentStackSize ensures that argument delta is always multiples +// of stack alignment. (Dynamic linkers need this - darwin's dyld for example) +// If a tail called function callee has more arguments than the caller the +// caller needs to make sure that there is room to move the RETADDR to. This is +// achieved by reserving an area the size of the argument delta right after the +// original RETADDR, but before the saved framepointer or the spilled registers +// e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4) +// stack layout: +// arg1 +// arg2 +// RETADDR +// [ new RETADDR +// move area ] +// (possible EBP) +// ESI +// EDI +// local1 .. + +/// Make the stack size align e.g 16n + 12 aligned for a 16-byte align +/// requirement. +unsigned +X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize, + SelectionDAG& DAG) const { + const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); + const TargetFrameLowering &TFI = *Subtarget->getFrameLowering(); + unsigned StackAlignment = TFI.getStackAlignment(); + uint64_t AlignMask = StackAlignment - 1; + int64_t Offset = StackSize; + unsigned SlotSize = RegInfo->getSlotSize(); + if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) { + // Number smaller than 12 so just add the difference. + Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask)); + } else { + // Mask out lower bits, add stackalignment once plus the 12 bytes. + Offset = ((~AlignMask) & Offset) + StackAlignment + + (StackAlignment-SlotSize); + } + return Offset; +} + +/// Return true if the given stack call argument is already available in the +/// same position (relatively) of the caller's incoming argument stack. 
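+/// When this holds for every stack argument, a sibcall can simply reuse the
+/// caller's incoming argument area instead of storing the values again; e.g.
+/// on 32-bit targets 'int f(int a, int b) { return g(a, b); }' can forward
+/// both fixed stack slots unchanged.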
+static +bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, + MachineFrameInfo *MFI, const MachineRegisterInfo *MRI, + const X86InstrInfo *TII) { + unsigned Bytes = Arg.getValueType().getSizeInBits() / 8; + int FI = INT_MAX; + if (Arg.getOpcode() == ISD::CopyFromReg) { + unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg(); + if (!TargetRegisterInfo::isVirtualRegister(VR)) + return false; + MachineInstr *Def = MRI->getVRegDef(VR); + if (!Def) + return false; + if (!Flags.isByVal()) { + if (!TII->isLoadFromStackSlot(Def, FI)) + return false; + } else { + unsigned Opcode = Def->getOpcode(); + if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r || + Opcode == X86::LEA64_32r) && + Def->getOperand(1).isFI()) { + FI = Def->getOperand(1).getIndex(); + Bytes = Flags.getByValSize(); + } else + return false; + } + } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) { + if (Flags.isByVal()) + // ByVal argument is passed in as a pointer but it's now being + // dereferenced. e.g. + // define @foo(%struct.X* %A) { + // tail call @bar(%struct.X* byval %A) + // } + return false; + SDValue Ptr = Ld->getBasePtr(); + FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr); + if (!FINode) + return false; + FI = FINode->getIndex(); + } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) { + FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg); + FI = FINode->getIndex(); + Bytes = Flags.getByValSize(); + } else + return false; + + assert(FI != INT_MAX); + if (!MFI->isFixedObjectIndex(FI)) + return false; + return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI); +} + +/// Check whether the call is eligible for tail call optimization. Targets +/// that want to do tail call optimization should implement this function. +bool X86TargetLowering::IsEligibleForTailCallOptimization( + SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, + bool isCalleeStructRet, bool isCallerStructRet, Type *RetTy, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, + const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const { + if (!mayTailCallThisCC(CalleeCC)) + return false; + + // If -tailcallopt is specified, make fastcc functions tail-callable. + MachineFunction &MF = DAG.getMachineFunction(); + const Function *CallerF = MF.getFunction(); + + // If the function return type is x86_fp80 and the callee return type is not, + // then the FP_EXTEND of the call result is not a nop. It's not safe to + // perform a tailcall optimization here. + if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty()) + return false; + + CallingConv::ID CallerCC = CallerF->getCallingConv(); + bool CCMatch = CallerCC == CalleeCC; + bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC); + bool IsCallerWin64 = Subtarget->isCallingConvWin64(CallerCC); + + // Win64 functions have extra shadow space for argument homing. Don't do the + // sibcall if the caller and callee have mismatched expectations for this + // space. + if (IsCalleeWin64 != IsCallerWin64) + return false; + + if (DAG.getTarget().Options.GuaranteedTailCallOpt) { + if (canGuaranteeTCO(CalleeCC) && CCMatch) + return true; + return false; + } + + // Look for obvious safe cases to perform tail call optimization that do not + // require ABI changes. This is what gcc calls sibcall. + + // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to + // emit a special epilogue. 
+ const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); + if (RegInfo->needsStackRealignment(MF)) + return false; + + // Also avoid sibcall optimization if either caller or callee uses struct + // return semantics. + if (isCalleeStructRet || isCallerStructRet) + return false; + + // Do not sibcall optimize vararg calls unless all arguments are passed via + // registers. + if (isVarArg && !Outs.empty()) { + // Optimizing for varargs on Win64 is unlikely to be safe without + // additional testing. + if (IsCalleeWin64 || IsCallerWin64) + return false; + + SmallVector<CCValAssign, 16> ArgLocs; + CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs, + *DAG.getContext()); + + CCInfo.AnalyzeCallOperands(Outs, CC_X86); + for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) + if (!ArgLocs[i].isRegLoc()) + return false; + } + + // If the call result is in ST0 / ST1, it needs to be popped off the x87 + // stack. Therefore, if it's not used by the call it is not safe to optimize + // this into a sibcall. + bool Unused = false; + for (unsigned i = 0, e = Ins.size(); i != e; ++i) { + if (!Ins[i].Used) { + Unused = true; + break; + } + } + if (Unused) { + SmallVector<CCValAssign, 16> RVLocs; + CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(), RVLocs, + *DAG.getContext()); + CCInfo.AnalyzeCallResult(Ins, RetCC_X86); + for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { + CCValAssign &VA = RVLocs[i]; + if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) + return false; + } + } + + // If the calling conventions do not match, then we'd better make sure the + // results are returned in the same way as what the caller expects. + if (!CCMatch) { + SmallVector<CCValAssign, 16> RVLocs1; + CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), RVLocs1, + *DAG.getContext()); + CCInfo1.AnalyzeCallResult(Ins, RetCC_X86); + + SmallVector<CCValAssign, 16> RVLocs2; + CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), RVLocs2, + *DAG.getContext()); + CCInfo2.AnalyzeCallResult(Ins, RetCC_X86); + + if (RVLocs1.size() != RVLocs2.size()) + return false; + for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) { + if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc()) + return false; + if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo()) + return false; + if (RVLocs1[i].isRegLoc()) { + if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg()) + return false; + } else { + if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset()) + return false; + } + } + } + + unsigned StackArgsSize = 0; + + // If the callee takes no arguments then go on to check the results of the + // call. + if (!Outs.empty()) { + // Check if stack adjustment is needed. For now, do not do this if any + // argument is passed on the stack. + SmallVector<CCValAssign, 16> ArgLocs; + CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs, + *DAG.getContext()); + + // Allocate shadow area for Win64 + if (IsCalleeWin64) + CCInfo.AllocateStack(32, 8); + + CCInfo.AnalyzeCallOperands(Outs, CC_X86); + StackArgsSize = CCInfo.getNextStackOffset(); + + if (CCInfo.getNextStackOffset()) { + // Check if the arguments are already laid out in the right way as + // the caller's fixed stack objects. 
+ MachineFrameInfo *MFI = MF.getFrameInfo(); + const MachineRegisterInfo *MRI = &MF.getRegInfo(); + const X86InstrInfo *TII = Subtarget->getInstrInfo(); + for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { + CCValAssign &VA = ArgLocs[i]; + SDValue Arg = OutVals[i]; + ISD::ArgFlagsTy Flags = Outs[i].Flags; + if (VA.getLocInfo() == CCValAssign::Indirect) + return false; + if (!VA.isRegLoc()) { + if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags, + MFI, MRI, TII)) + return false; + } + } + } + + // If the tailcall address may be in a register, then make sure it's + // possible to register allocate for it. In 32-bit, the call address can + // only target EAX, EDX, or ECX since the tail call must be scheduled after + // callee-saved registers are restored. These happen to be the same + // registers used to pass 'inreg' arguments so watch out for those. + if (!Subtarget->is64Bit() && + ((!isa<GlobalAddressSDNode>(Callee) && + !isa<ExternalSymbolSDNode>(Callee)) || + DAG.getTarget().getRelocationModel() == Reloc::PIC_)) { + unsigned NumInRegs = 0; + // In PIC we need an extra register to formulate the address computation + // for the callee. + unsigned MaxInRegs = + (DAG.getTarget().getRelocationModel() == Reloc::PIC_) ? 2 : 3; + + for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { + CCValAssign &VA = ArgLocs[i]; + if (!VA.isRegLoc()) + continue; + unsigned Reg = VA.getLocReg(); + switch (Reg) { + default: break; + case X86::EAX: case X86::EDX: case X86::ECX: + if (++NumInRegs == MaxInRegs) + return false; + break; + } + } + } + } + + bool CalleeWillPop = + X86::isCalleePop(CalleeCC, Subtarget->is64Bit(), isVarArg, + MF.getTarget().Options.GuaranteedTailCallOpt); + + if (unsigned BytesToPop = + MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) { + // If we have bytes to pop, the callee must pop them. + bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize; + if (!CalleePopMatches) + return false; + } else if (CalleeWillPop && StackArgsSize > 0) { + // If we don't have bytes to pop, make sure the callee doesn't pop any. 
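+    // e.g. a cdecl caller (which pops nothing itself) tail-calling a stdcall
+    // callee with stack arguments would leave the stack unbalanced after the
+    // callee's 'ret $N', so the sibcall is rejected.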
+ return false; + } + + return true; +} + +FastISel * +X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo, + const TargetLibraryInfo *libInfo) const { + return X86::createFastISel(funcInfo, libInfo); +} + +//===----------------------------------------------------------------------===// +// Other Lowering Hooks +//===----------------------------------------------------------------------===// + +static bool MayFoldLoad(SDValue Op) { + return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode()); +} + +static bool MayFoldIntoStore(SDValue Op) { + return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin()); +} + +static bool isTargetShuffle(unsigned Opcode) { + switch(Opcode) { + default: return false; + case X86ISD::BLENDI: + case X86ISD::PSHUFB: + case X86ISD::PSHUFD: + case X86ISD::PSHUFHW: + case X86ISD::PSHUFLW: + case X86ISD::SHUFP: + case X86ISD::PALIGNR: + case X86ISD::MOVLHPS: + case X86ISD::MOVLHPD: + case X86ISD::MOVHLPS: + case X86ISD::MOVLPS: + case X86ISD::MOVLPD: + case X86ISD::MOVSHDUP: + case X86ISD::MOVSLDUP: + case X86ISD::MOVDDUP: + case X86ISD::MOVSS: + case X86ISD::MOVSD: + case X86ISD::UNPCKL: + case X86ISD::UNPCKH: + case X86ISD::VPERMILPI: + case X86ISD::VPERM2X128: + case X86ISD::VPERMI: + case X86ISD::VPERMV: + case X86ISD::VPERMV3: + return true; + } +} + +static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, MVT VT, + SDValue V1, unsigned TargetMask, + SelectionDAG &DAG) { + switch(Opc) { + default: llvm_unreachable("Unknown x86 shuffle node"); + case X86ISD::PSHUFD: + case X86ISD::PSHUFHW: + case X86ISD::PSHUFLW: + case X86ISD::VPERMILPI: + case X86ISD::VPERMI: + return DAG.getNode(Opc, dl, VT, V1, + DAG.getConstant(TargetMask, dl, MVT::i8)); + } +} + +static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, MVT VT, + SDValue V1, SDValue V2, SelectionDAG &DAG) { + switch(Opc) { + default: llvm_unreachable("Unknown x86 shuffle node"); + case X86ISD::MOVLHPS: + case X86ISD::MOVLHPD: + case X86ISD::MOVHLPS: + case X86ISD::MOVLPS: + case X86ISD::MOVLPD: + case X86ISD::MOVSS: + case X86ISD::MOVSD: + case X86ISD::UNPCKL: + case X86ISD::UNPCKH: + return DAG.getNode(Opc, dl, VT, V1, V2); + } +} + +SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const { + MachineFunction &MF = DAG.getMachineFunction(); + const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); + X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); + int ReturnAddrIndex = FuncInfo->getRAIndex(); + + if (ReturnAddrIndex == 0) { + // Set up a frame object for the return address. + unsigned SlotSize = RegInfo->getSlotSize(); + ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, + -(int64_t)SlotSize, + false); + FuncInfo->setRAIndex(ReturnAddrIndex); + } + + return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout())); +} + +bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M, + bool hasSymbolicDisplacement) { + // Offset should fit into 32 bit immediate field. + if (!isInt<32>(Offset)) + return false; + + // If we don't have a symbolic displacement - we don't have any extra + // restrictions. + if (!hasSymbolicDisplacement) + return true; + + // FIXME: Some tweaks might be needed for medium code model. + if (M != CodeModel::Small && M != CodeModel::Kernel) + return false; + + // For small code model we assume that latest object is 16MB before end of 31 + // bits boundary. 
We may also accept pretty large negative constants knowing + // that all objects are in the positive half of address space. + if (M == CodeModel::Small && Offset < 16*1024*1024) + return true; + + // For kernel code model we know that all object resist in the negative half + // of 32bits address space. We may not accept negative offsets, since they may + // be just off and we may accept pretty large positive ones. + if (M == CodeModel::Kernel && Offset >= 0) + return true; + + return false; +} + +/// Determines whether the callee is required to pop its own arguments. +/// Callee pop is necessary to support tail calls. +bool X86::isCalleePop(CallingConv::ID CallingConv, + bool is64Bit, bool IsVarArg, bool GuaranteeTCO) { + // If GuaranteeTCO is true, we force some calls to be callee pop so that we + // can guarantee TCO. + if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO)) + return true; + + switch (CallingConv) { + default: + return false; + case CallingConv::X86_StdCall: + case CallingConv::X86_FastCall: + case CallingConv::X86_ThisCall: + case CallingConv::X86_VectorCall: + return !is64Bit; + } +} + +/// \brief Return true if the condition is an unsigned comparison operation. +static bool isX86CCUnsigned(unsigned X86CC) { + switch (X86CC) { + default: llvm_unreachable("Invalid integer condition!"); + case X86::COND_E: return true; + case X86::COND_G: return false; + case X86::COND_GE: return false; + case X86::COND_L: return false; + case X86::COND_LE: return false; + case X86::COND_NE: return true; + case X86::COND_B: return true; + case X86::COND_A: return true; + case X86::COND_BE: return true; + case X86::COND_AE: return true; + } +} + +static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) { + switch (SetCCOpcode) { + default: llvm_unreachable("Invalid integer condition!"); + case ISD::SETEQ: return X86::COND_E; + case ISD::SETGT: return X86::COND_G; + case ISD::SETGE: return X86::COND_GE; + case ISD::SETLT: return X86::COND_L; + case ISD::SETLE: return X86::COND_LE; + case ISD::SETNE: return X86::COND_NE; + case ISD::SETULT: return X86::COND_B; + case ISD::SETUGT: return X86::COND_A; + case ISD::SETULE: return X86::COND_BE; + case ISD::SETUGE: return X86::COND_AE; + } +} + +/// Do a one-to-one translation of a ISD::CondCode to the X86-specific +/// condition code, returning the condition code and the LHS/RHS of the +/// comparison to make. +static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, SDLoc DL, bool isFP, + SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) { + if (!isFP) { + if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) { + if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) { + // X > -1 -> X == 0, jump !sign. + RHS = DAG.getConstant(0, DL, RHS.getValueType()); + return X86::COND_NS; + } + if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) { + // X < 0 -> X == 0, jump on sign. + return X86::COND_S; + } + if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) { + // X < 1 -> X <= 0 + RHS = DAG.getConstant(0, DL, RHS.getValueType()); + return X86::COND_LE; + } + } + + return TranslateIntegerX86CC(SetCCOpcode); + } + + // First determine if it is required or is profitable to flip the operands. + + // If LHS is a foldable load, but RHS is not, flip the condition. 
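+  // e.g. 'setolt (load %p), %y' becomes 'setogt %y, (load %p)', which
+  // typically lets instruction selection fold the load into the memory
+  // operand of the comparison (UCOMISS/UCOMISD) instead of issuing a
+  // separate load.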
+ if (ISD::isNON_EXTLoad(LHS.getNode()) && + !ISD::isNON_EXTLoad(RHS.getNode())) { + SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode); + std::swap(LHS, RHS); + } + + switch (SetCCOpcode) { + default: break; + case ISD::SETOLT: + case ISD::SETOLE: + case ISD::SETUGT: + case ISD::SETUGE: + std::swap(LHS, RHS); + break; + } + + // On a floating point condition, the flags are set as follows: + // ZF PF CF op + // 0 | 0 | 0 | X > Y + // 0 | 0 | 1 | X < Y + // 1 | 0 | 0 | X == Y + // 1 | 1 | 1 | unordered + switch (SetCCOpcode) { + default: llvm_unreachable("Condcode should be pre-legalized away"); + case ISD::SETUEQ: + case ISD::SETEQ: return X86::COND_E; + case ISD::SETOLT: // flipped + case ISD::SETOGT: + case ISD::SETGT: return X86::COND_A; + case ISD::SETOLE: // flipped + case ISD::SETOGE: + case ISD::SETGE: return X86::COND_AE; + case ISD::SETUGT: // flipped + case ISD::SETULT: + case ISD::SETLT: return X86::COND_B; + case ISD::SETUGE: // flipped + case ISD::SETULE: + case ISD::SETLE: return X86::COND_BE; + case ISD::SETONE: + case ISD::SETNE: return X86::COND_NE; + case ISD::SETUO: return X86::COND_P; + case ISD::SETO: return X86::COND_NP; + case ISD::SETOEQ: + case ISD::SETUNE: return X86::COND_INVALID; + } +} + +/// Is there a floating point cmov for the specific X86 condition code? +/// Current x86 isa includes the following FP cmov instructions: +/// fcmovb, fcomvbe, fcomve, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu. +static bool hasFPCMov(unsigned X86CC) { + switch (X86CC) { + default: + return false; + case X86::COND_B: + case X86::COND_BE: + case X86::COND_E: + case X86::COND_P: + case X86::COND_A: + case X86::COND_AE: + case X86::COND_NE: + case X86::COND_NP: + return true; + } +} + +/// Returns true if the target can instruction select the +/// specified FP immediate natively. If false, the legalizer will +/// materialize the FP immediate as a load from a constant pool. +bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { + for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) { + if (Imm.bitwiseIsEqual(LegalFPImmediates[i])) + return true; + } + return false; +} + +bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load, + ISD::LoadExtType ExtTy, + EVT NewVT) const { + // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF + // relocation target a movq or addq instruction: don't let the load shrink. + SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr(); + if (BasePtr.getOpcode() == X86ISD::WrapperRIP) + if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0))) + return GA->getTargetFlags() != X86II::MO_GOTTPOFF; + return true; +} + +/// \brief Returns true if it is beneficial to convert a load of a constant +/// to just the constant itself. +bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, + Type *Ty) const { + assert(Ty->isIntegerTy()); + + unsigned BitSize = Ty->getPrimitiveSizeInBits(); + if (BitSize == 0 || BitSize > 64) + return false; + return true; +} + +bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, + unsigned Index) const { + if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT)) + return false; + + return (Index == 0 || Index == ResVT.getVectorNumElements()); +} + +bool X86TargetLowering::isCheapToSpeculateCttz() const { + // Speculate cttz only if we can directly use TZCNT. + return Subtarget->hasBMI(); +} + +bool X86TargetLowering::isCheapToSpeculateCtlz() const { + // Speculate ctlz only if we can directly use LZCNT. 
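+  // Without LZCNT the expansion has to go through BSR, whose destination is
+  // undefined for a zero input, so the zero-check branch cannot be removed
+  // and speculation buys nothing.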
+  return Subtarget->hasLZCNT();
+}
+
+/// Return true if every element in Mask, beginning
+/// from position Pos and ending in Pos+Size is undef.
+static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
+  for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
+    if (0 <= Mask[i])
+      return false;
+  return true;
+}
+
+/// Return true if Val is undef or if its value falls within the
+/// specified range [Low, Hi).
+static bool isUndefOrInRange(int Val, int Low, int Hi) {
+  return (Val < 0) || (Val >= Low && Val < Hi);
+}
+
+/// Val is either less than zero (undef) or equal to the specified value.
+static bool isUndefOrEqual(int Val, int CmpVal) {
+  return (Val < 0 || Val == CmpVal);
+}
+
+/// Return true if every element in Mask, beginning
+/// from position Pos and ending in Pos+Size, falls within the specified
+/// sequential range [Low, Low+Size), or is undef.
+static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
+                                       unsigned Pos, unsigned Size, int Low) {
+  for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
+    if (!isUndefOrEqual(Mask[i], Low))
+      return false;
+  return true;
+}
+
+/// Return true if the specified EXTRACT_SUBVECTOR operand specifies a vector
+/// extract that is suitable for instructions that extract 128 or 256-bit
+/// vectors.
+static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) {
+  assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
+  if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
+    return false;
+
+  // The index should be aligned on a vecWidth-bit boundary.
+  uint64_t Index =
+    cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
+
+  MVT VT = N->getSimpleValueType(0);
+  unsigned ElSize = VT.getVectorElementType().getSizeInBits();
+  bool Result = (Index * ElSize) % vecWidth == 0;
+
+  return Result;
+}
+
+/// Return true if the specified INSERT_SUBVECTOR
+/// operand specifies a subvector insert that is suitable for input to
+/// insertion of 128 or 256-bit subvectors.
+static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) {
+  assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
+  if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
+    return false;
+  // The index should be aligned on a vecWidth-bit boundary.
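+  // E.g. for a 128-bit insert into a v8i32 result (32-bit elements), index 4
+  // gives a bit offset of 4 * 32 == 128, a multiple of 128, and is accepted,
+  // while index 2 (bit offset 64) is rejected.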
+ uint64_t Index = + cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue(); + + MVT VT = N->getSimpleValueType(0); + unsigned ElSize = VT.getVectorElementType().getSizeInBits(); + bool Result = (Index * ElSize) % vecWidth == 0; + + return Result; +} + +bool X86::isVINSERT128Index(SDNode *N) { + return isVINSERTIndex(N, 128); +} + +bool X86::isVINSERT256Index(SDNode *N) { + return isVINSERTIndex(N, 256); +} + +bool X86::isVEXTRACT128Index(SDNode *N) { + return isVEXTRACTIndex(N, 128); +} + +bool X86::isVEXTRACT256Index(SDNode *N) { + return isVEXTRACTIndex(N, 256); +} + +static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) { + assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width"); + assert(isa<ConstantSDNode>(N->getOperand(1).getNode()) && + "Illegal extract subvector for VEXTRACT"); + + uint64_t Index = + cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue(); + + MVT VecVT = N->getOperand(0).getSimpleValueType(); + MVT ElVT = VecVT.getVectorElementType(); + + unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits(); + return Index / NumElemsPerChunk; +} + +static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) { + assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width"); + assert(isa<ConstantSDNode>(N->getOperand(2).getNode()) && + "Illegal insert subvector for VINSERT"); + + uint64_t Index = + cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue(); + + MVT VecVT = N->getSimpleValueType(0); + MVT ElVT = VecVT.getVectorElementType(); + + unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits(); + return Index / NumElemsPerChunk; +} + +/// Return the appropriate immediate to extract the specified +/// EXTRACT_SUBVECTOR index with VEXTRACTF128 and VINSERTI128 instructions. +unsigned X86::getExtractVEXTRACT128Immediate(SDNode *N) { + return getExtractVEXTRACTImmediate(N, 128); +} + +/// Return the appropriate immediate to extract the specified +/// EXTRACT_SUBVECTOR index with VEXTRACTF64x4 and VINSERTI64x4 instructions. +unsigned X86::getExtractVEXTRACT256Immediate(SDNode *N) { + return getExtractVEXTRACTImmediate(N, 256); +} + +/// Return the appropriate immediate to insert at the specified +/// INSERT_SUBVECTOR index with VINSERTF128 and VINSERTI128 instructions. +unsigned X86::getInsertVINSERT128Immediate(SDNode *N) { + return getInsertVINSERTImmediate(N, 128); +} + +/// Return the appropriate immediate to insert at the specified +/// INSERT_SUBVECTOR index with VINSERTF46x4 and VINSERTI64x4 instructions. +unsigned X86::getInsertVINSERT256Immediate(SDNode *N) { + return getInsertVINSERTImmediate(N, 256); +} + +/// Returns true if Elt is a constant zero or a floating point constant +0.0. +bool X86::isZeroNode(SDValue Elt) { + return isNullConstant(Elt) || isNullFPConstant(Elt); +} + +// Build a vector of constants +// Use an UNDEF node if MaskElt == -1. +// Spilt 64-bit constants in the 32-bit mode. 
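+// For example, when i64 is not a legal type (32-bit mode), the v2i64 constant
+// vector <1, 2> is materialized as the v4i32 vector <1, 0, 2, 0> and then
+// bitcast back to v2i64.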
+static SDValue getConstVector(ArrayRef<int> Values, MVT VT, + SelectionDAG &DAG, + SDLoc dl, bool IsMask = false) { + + SmallVector<SDValue, 32> Ops; + bool Split = false; + + MVT ConstVecVT = VT; + unsigned NumElts = VT.getVectorNumElements(); + bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64); + if (!In64BitMode && VT.getVectorElementType() == MVT::i64) { + ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2); + Split = true; + } + + MVT EltVT = ConstVecVT.getVectorElementType(); + for (unsigned i = 0; i < NumElts; ++i) { + bool IsUndef = Values[i] < 0 && IsMask; + SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) : + DAG.getConstant(Values[i], dl, EltVT); + Ops.push_back(OpNode); + if (Split) + Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) : + DAG.getConstant(0, dl, EltVT)); + } + SDValue ConstsNode = DAG.getNode(ISD::BUILD_VECTOR, dl, ConstVecVT, Ops); + if (Split) + ConstsNode = DAG.getBitcast(VT, ConstsNode); + return ConstsNode; +} + +/// Returns a vector of specified type with all zero elements. +static SDValue getZeroVector(MVT VT, const X86Subtarget *Subtarget, + SelectionDAG &DAG, SDLoc dl) { + assert(VT.isVector() && "Expected a vector type"); + + // Always build SSE zero vectors as <4 x i32> bitcasted + // to their dest type. This ensures they get CSE'd. + SDValue Vec; + if (VT.is128BitVector()) { // SSE + if (Subtarget->hasSSE2()) { // SSE2 + SDValue Cst = DAG.getConstant(0, dl, MVT::i32); + Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); + } else { // SSE1 + SDValue Cst = DAG.getConstantFP(+0.0, dl, MVT::f32); + Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst); + } + } else if (VT.is256BitVector()) { // AVX + if (Subtarget->hasInt256()) { // AVX2 + SDValue Cst = DAG.getConstant(0, dl, MVT::i32); + SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; + Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops); + } else { + // 256-bit logic and arithmetic instructions in AVX are all + // floating-point, no support for integer ops. Emit fp zeroed vectors. + SDValue Cst = DAG.getConstantFP(+0.0, dl, MVT::f32); + SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; + Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops); + } + } else if (VT.is512BitVector()) { // AVX-512 + SDValue Cst = DAG.getConstant(0, dl, MVT::i32); + SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, + Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; + Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops); + } else if (VT.getVectorElementType() == MVT::i1) { + + assert((Subtarget->hasBWI() || VT.getVectorNumElements() <= 16) + && "Unexpected vector type"); + assert((Subtarget->hasVLX() || VT.getVectorNumElements() >= 8) + && "Unexpected vector type"); + SDValue Cst = DAG.getConstant(0, dl, MVT::i1); + SmallVector<SDValue, 64> Ops(VT.getVectorNumElements(), Cst); + return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); + } else + llvm_unreachable("Unexpected vector type"); + + return DAG.getBitcast(VT, Vec); +} + +static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal, + SelectionDAG &DAG, SDLoc dl, + unsigned vectorWidth) { + assert((vectorWidth == 128 || vectorWidth == 256) && + "Unsupported vector width"); + EVT VT = Vec.getValueType(); + EVT ElVT = VT.getVectorElementType(); + unsigned Factor = VT.getSizeInBits()/vectorWidth; + EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT, + VT.getVectorNumElements()/Factor); + + // Extract from UNDEF is UNDEF. 
+ if (Vec.getOpcode() == ISD::UNDEF) + return DAG.getUNDEF(ResultVT); + + // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR + unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits(); + assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2"); + + // This is the index of the first element of the vectorWidth-bit chunk + // we want. Since ElemsPerChunk is a power of 2 just need to clear bits. + IdxVal &= ~(ElemsPerChunk - 1); + + // If the input is a buildvector just emit a smaller one. + if (Vec.getOpcode() == ISD::BUILD_VECTOR) + return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT, + makeArrayRef(Vec->op_begin() + IdxVal, ElemsPerChunk)); + + SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx); +} + +/// Generate a DAG to grab 128-bits from a vector > 128 bits. This +/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128 +/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4 +/// instructions or a simple subregister reference. Idx is an index in the +/// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes +/// lowering EXTRACT_VECTOR_ELT operations easier. +static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal, + SelectionDAG &DAG, SDLoc dl) { + assert((Vec.getValueType().is256BitVector() || + Vec.getValueType().is512BitVector()) && "Unexpected vector size!"); + return ExtractSubVector(Vec, IdxVal, DAG, dl, 128); +} + +/// Generate a DAG to grab 256-bits from a 512-bit vector. +static SDValue Extract256BitVector(SDValue Vec, unsigned IdxVal, + SelectionDAG &DAG, SDLoc dl) { + assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!"); + return ExtractSubVector(Vec, IdxVal, DAG, dl, 256); +} + +static SDValue InsertSubVector(SDValue Result, SDValue Vec, + unsigned IdxVal, SelectionDAG &DAG, + SDLoc dl, unsigned vectorWidth) { + assert((vectorWidth == 128 || vectorWidth == 256) && + "Unsupported vector width"); + // Inserting UNDEF is Result + if (Vec.getOpcode() == ISD::UNDEF) + return Result; + EVT VT = Vec.getValueType(); + EVT ElVT = VT.getVectorElementType(); + EVT ResultVT = Result.getValueType(); + + // Insert the relevant vectorWidth bits. + unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits(); + assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2"); + + // This is the index of the first element of the vectorWidth-bit chunk + // we want. Since ElemsPerChunk is a power of 2 just need to clear bits. + IdxVal &= ~(ElemsPerChunk - 1); + + SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl); + return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx); +} + +/// Generate a DAG to put 128-bits into a vector > 128 bits. This +/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or +/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a +/// simple superregister reference. Idx is an index in the 128 bits +/// we want. It need not be aligned to a 128-bit boundary. That makes +/// lowering INSERT_VECTOR_ELT operations easier. +static SDValue Insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal, + SelectionDAG &DAG, SDLoc dl) { + assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!"); + + // For insertion into the zero index (low half) of a 256-bit vector, it is + // more efficient to generate a blend with immediate instead of an insert*128. 
+ // We are still creating an INSERT_SUBVECTOR below with an undef node to + // extend the subvector to the size of the result vector. Make sure that + // we are not recursing on that node by checking for undef here. + if (IdxVal == 0 && Result.getValueType().is256BitVector() && + Result.getOpcode() != ISD::UNDEF) { + EVT ResultVT = Result.getValueType(); + SDValue ZeroIndex = DAG.getIntPtrConstant(0, dl); + SDValue Undef = DAG.getUNDEF(ResultVT); + SDValue Vec256 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Undef, + Vec, ZeroIndex); + + // The blend instruction, and therefore its mask, depend on the data type. + MVT ScalarType = ResultVT.getVectorElementType().getSimpleVT(); + if (ScalarType.isFloatingPoint()) { + // Choose either vblendps (float) or vblendpd (double). + unsigned ScalarSize = ScalarType.getSizeInBits(); + assert((ScalarSize == 64 || ScalarSize == 32) && "Unknown float type"); + unsigned MaskVal = (ScalarSize == 64) ? 0x03 : 0x0f; + SDValue Mask = DAG.getConstant(MaskVal, dl, MVT::i8); + return DAG.getNode(X86ISD::BLENDI, dl, ResultVT, Result, Vec256, Mask); + } + + const X86Subtarget &Subtarget = + static_cast<const X86Subtarget &>(DAG.getSubtarget()); + + // AVX2 is needed for 256-bit integer blend support. + // Integers must be cast to 32-bit because there is only vpblendd; + // vpblendw can't be used for this because it has a handicapped mask. + + // If we don't have AVX2, then cast to float. Using a wrong domain blend + // is still more efficient than using the wrong domain vinsertf128 that + // will be created by InsertSubVector(). + MVT CastVT = Subtarget.hasAVX2() ? MVT::v8i32 : MVT::v8f32; + + SDValue Mask = DAG.getConstant(0x0f, dl, MVT::i8); + Result = DAG.getBitcast(CastVT, Result); + Vec256 = DAG.getBitcast(CastVT, Vec256); + Vec256 = DAG.getNode(X86ISD::BLENDI, dl, CastVT, Result, Vec256, Mask); + return DAG.getBitcast(ResultVT, Vec256); + } + + return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 128); +} + +static SDValue Insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal, + SelectionDAG &DAG, SDLoc dl) { + assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!"); + return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 256); +} + +/// Insert i1-subvector to i1-vector. +static SDValue Insert1BitVector(SDValue Op, SelectionDAG &DAG) { + + SDLoc dl(Op); + SDValue Vec = Op.getOperand(0); + SDValue SubVec = Op.getOperand(1); + SDValue Idx = Op.getOperand(2); + + if (!isa<ConstantSDNode>(Idx)) + return SDValue(); + + unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); + if (IdxVal == 0 && Vec.isUndef()) // the operation is legal + return Op; + + MVT OpVT = Op.getSimpleValueType(); + MVT SubVecVT = SubVec.getSimpleValueType(); + unsigned NumElems = OpVT.getVectorNumElements(); + unsigned SubVecNumElems = SubVecVT.getVectorNumElements(); + + assert(IdxVal + SubVecNumElems <= NumElems && + IdxVal % SubVecVT.getSizeInBits() == 0 && + "Unexpected index value in INSERT_SUBVECTOR"); + + // There are 3 possible cases: + // 1. Subvector should be inserted in the lower part (IdxVal == 0) + // 2. Subvector should be inserted in the upper part + // (IdxVal + SubVecNumElems == NumElems) + // 3. 
Subvector should be inserted in the middle (for example v2i1 + // to v16i1, index 2) + + SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl); + SDValue Undef = DAG.getUNDEF(OpVT); + SDValue WideSubVec = + DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Undef, SubVec, ZeroIdx); + if (Vec.isUndef()) + return DAG.getNode(X86ISD::VSHLI, dl, OpVT, WideSubVec, + DAG.getConstant(IdxVal, dl, MVT::i8)); + + if (ISD::isBuildVectorAllZeros(Vec.getNode())) { + unsigned ShiftLeft = NumElems - SubVecNumElems; + unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal; + WideSubVec = DAG.getNode(X86ISD::VSHLI, dl, OpVT, WideSubVec, + DAG.getConstant(ShiftLeft, dl, MVT::i8)); + return ShiftRight ? DAG.getNode(X86ISD::VSRLI, dl, OpVT, WideSubVec, + DAG.getConstant(ShiftRight, dl, MVT::i8)) : WideSubVec; + } + + if (IdxVal == 0) { + // Zero lower bits of the Vec + SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8); + Vec = DAG.getNode(X86ISD::VSRLI, dl, OpVT, Vec, ShiftBits); + Vec = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec, ShiftBits); + // Merge them together + return DAG.getNode(ISD::OR, dl, OpVT, Vec, WideSubVec); + } + + // Simple case when we put subvector in the upper part + if (IdxVal + SubVecNumElems == NumElems) { + // Zero upper bits of the Vec + WideSubVec = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec, + DAG.getConstant(IdxVal, dl, MVT::i8)); + SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8); + Vec = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec, ShiftBits); + Vec = DAG.getNode(X86ISD::VSRLI, dl, OpVT, Vec, ShiftBits); + return DAG.getNode(ISD::OR, dl, OpVT, Vec, WideSubVec); + } + // Subvector should be inserted in the middle - use shuffle + SmallVector<int, 64> Mask; + for (unsigned i = 0; i < NumElems; ++i) + Mask.push_back(i >= IdxVal && i < IdxVal + SubVecNumElems ? + i : i + NumElems); + return DAG.getVectorShuffle(OpVT, dl, WideSubVec, Vec, Mask); +} + +/// Concat two 128-bit vectors into a 256 bit vector using VINSERTF128 +/// instructions. This is used because creating CONCAT_VECTOR nodes of +/// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower +/// large BUILD_VECTORS. +static SDValue Concat128BitVectors(SDValue V1, SDValue V2, EVT VT, + unsigned NumElems, SelectionDAG &DAG, + SDLoc dl) { + SDValue V = Insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl); + return Insert128BitVector(V, V2, NumElems/2, DAG, dl); +} + +static SDValue Concat256BitVectors(SDValue V1, SDValue V2, EVT VT, + unsigned NumElems, SelectionDAG &DAG, + SDLoc dl) { + SDValue V = Insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl); + return Insert256BitVector(V, V2, NumElems/2, DAG, dl); +} + +/// Returns a vector of specified type with all bits set. +/// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with +/// no AVX2 supprt, use two <4 x i32> inserted in a <8 x i32> appropriately. +/// Then bitcast to their original type, ensuring they get CSE'd. 
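+/// For example, an all-ones v4f64 on an AVX-only target is built as a
+/// <4 x i32> splat of ~0U concatenated with itself into a v8i32 and then
+/// bitcast to v4f64.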
+static SDValue getOnesVector(EVT VT, const X86Subtarget *Subtarget, + SelectionDAG &DAG, SDLoc dl) { + assert(VT.isVector() && "Expected a vector type"); + + SDValue Cst = DAG.getConstant(~0U, dl, MVT::i32); + SDValue Vec; + if (VT.is512BitVector()) { + SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, + Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; + Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops); + } else if (VT.is256BitVector()) { + if (Subtarget->hasInt256()) { // AVX2 + SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; + Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops); + } else { // AVX + Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); + Vec = Concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl); + } + } else if (VT.is128BitVector()) { + Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); + } else + llvm_unreachable("Unexpected vector type"); + + return DAG.getBitcast(VT, Vec); +} + +/// Returns a vector_shuffle node for an unpackl operation. +static SDValue getUnpackl(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1, + SDValue V2) { + unsigned NumElems = VT.getVectorNumElements(); + SmallVector<int, 8> Mask; + for (unsigned i = 0, e = NumElems/2; i != e; ++i) { + Mask.push_back(i); + Mask.push_back(i + NumElems); + } + return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); +} + +/// Returns a vector_shuffle node for an unpackh operation. +static SDValue getUnpackh(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1, + SDValue V2) { + unsigned NumElems = VT.getVectorNumElements(); + SmallVector<int, 8> Mask; + for (unsigned i = 0, Half = NumElems/2; i != Half; ++i) { + Mask.push_back(i + Half); + Mask.push_back(i + NumElems + Half); + } + return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); +} + +/// Return a vector_shuffle of the specified vector of zero or undef vector. +/// This produces a shuffle where the low element of V2 is swizzled into the +/// zero/undef vector, landing at element Idx. +/// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3). +static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx, + bool IsZero, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + MVT VT = V2.getSimpleValueType(); + SDValue V1 = IsZero + ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT); + unsigned NumElems = VT.getVectorNumElements(); + SmallVector<int, 16> MaskVec; + for (unsigned i = 0; i != NumElems; ++i) + // If this is the insertion idx, put the low elt of V2 here. + MaskVec.push_back(i == Idx ? NumElems : i); + return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, &MaskVec[0]); +} + +/// Calculates the shuffle mask corresponding to the target-specific opcode. +/// Returns true if the Mask could be calculated. Sets IsUnary to true if only +/// uses one source. Note that this will set IsUnary for shuffles which use a +/// single input multiple times, and in those cases it will +/// adjust the mask to only have indices within that single input. +/// FIXME: Add support for Decode*Mask functions that return SM_SentinelZero. 
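+/// For example, (X86ISD::UNPCKL v4i32 %a, %a) decodes to the mask <0,4,1,5>;
+/// because both inputs are the same node the shuffle is reported as unary and
+/// the mask is remapped to <0,0,1,1>.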
+static bool getTargetShuffleMask(SDNode *N, MVT VT, + SmallVectorImpl<int> &Mask, bool &IsUnary) { + unsigned NumElems = VT.getVectorNumElements(); + SDValue ImmN; + + IsUnary = false; + bool IsFakeUnary = false; + switch(N->getOpcode()) { + case X86ISD::BLENDI: + ImmN = N->getOperand(N->getNumOperands()-1); + DecodeBLENDMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); + break; + case X86ISD::SHUFP: + ImmN = N->getOperand(N->getNumOperands()-1); + DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); + IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); + break; + case X86ISD::UNPCKH: + DecodeUNPCKHMask(VT, Mask); + IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); + break; + case X86ISD::UNPCKL: + DecodeUNPCKLMask(VT, Mask); + IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); + break; + case X86ISD::MOVHLPS: + DecodeMOVHLPSMask(NumElems, Mask); + IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); + break; + case X86ISD::MOVLHPS: + DecodeMOVLHPSMask(NumElems, Mask); + IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); + break; + case X86ISD::PALIGNR: + ImmN = N->getOperand(N->getNumOperands()-1); + DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); + break; + case X86ISD::PSHUFD: + case X86ISD::VPERMILPI: + ImmN = N->getOperand(N->getNumOperands()-1); + DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); + IsUnary = true; + break; + case X86ISD::PSHUFHW: + ImmN = N->getOperand(N->getNumOperands()-1); + DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); + IsUnary = true; + break; + case X86ISD::PSHUFLW: + ImmN = N->getOperand(N->getNumOperands()-1); + DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); + IsUnary = true; + break; + case X86ISD::PSHUFB: { + IsUnary = true; + SDValue MaskNode = N->getOperand(1); + while (MaskNode->getOpcode() == ISD::BITCAST) + MaskNode = MaskNode->getOperand(0); + + if (MaskNode->getOpcode() == ISD::BUILD_VECTOR) { + // If we have a build-vector, then things are easy. + MVT VT = MaskNode.getSimpleValueType(); + assert(VT.isVector() && + "Can't produce a non-vector with a build_vector!"); + if (!VT.isInteger()) + return false; + + int NumBytesPerElement = VT.getVectorElementType().getSizeInBits() / 8; + + SmallVector<uint64_t, 32> RawMask; + for (int i = 0, e = MaskNode->getNumOperands(); i < e; ++i) { + SDValue Op = MaskNode->getOperand(i); + if (Op->getOpcode() == ISD::UNDEF) { + RawMask.push_back((uint64_t)SM_SentinelUndef); + continue; + } + auto *CN = dyn_cast<ConstantSDNode>(Op.getNode()); + if (!CN) + return false; + APInt MaskElement = CN->getAPIntValue(); + + // We now have to decode the element which could be any integer size and + // extract each byte of it. + for (int j = 0; j < NumBytesPerElement; ++j) { + // Note that this is x86 and so always little endian: the low byte is + // the first byte of the mask. 
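+          // E.g. a 32-bit mask element 0x03020100 contributes the bytes
+          // 0x00, 0x01, 0x02, 0x03, in that order.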
+ RawMask.push_back(MaskElement.getLoBits(8).getZExtValue()); + MaskElement = MaskElement.lshr(8); + } + } + DecodePSHUFBMask(RawMask, Mask); + break; + } + + auto *MaskLoad = dyn_cast<LoadSDNode>(MaskNode); + if (!MaskLoad) + return false; + + SDValue Ptr = MaskLoad->getBasePtr(); + if (Ptr->getOpcode() == X86ISD::Wrapper || + Ptr->getOpcode() == X86ISD::WrapperRIP) + Ptr = Ptr->getOperand(0); + + auto *MaskCP = dyn_cast<ConstantPoolSDNode>(Ptr); + if (!MaskCP || MaskCP->isMachineConstantPoolEntry()) + return false; + + if (auto *C = dyn_cast<Constant>(MaskCP->getConstVal())) { + DecodePSHUFBMask(C, Mask); + break; + } + + return false; + } + case X86ISD::VPERMI: + ImmN = N->getOperand(N->getNumOperands()-1); + DecodeVPERMMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); + IsUnary = true; + break; + case X86ISD::MOVSS: + case X86ISD::MOVSD: + DecodeScalarMoveMask(VT, /* IsLoad */ false, Mask); + break; + case X86ISD::VPERM2X128: + ImmN = N->getOperand(N->getNumOperands()-1); + DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); + // Mask only contains negative index if an element is zero. + if (std::any_of(Mask.begin(), Mask.end(), + [](int M){ return M == SM_SentinelZero; })) + return false; + break; + case X86ISD::MOVSLDUP: + DecodeMOVSLDUPMask(VT, Mask); + IsUnary = true; + break; + case X86ISD::MOVSHDUP: + DecodeMOVSHDUPMask(VT, Mask); + IsUnary = true; + break; + case X86ISD::MOVDDUP: + DecodeMOVDDUPMask(VT, Mask); + IsUnary = true; + break; + case X86ISD::MOVLHPD: + case X86ISD::MOVLPD: + case X86ISD::MOVLPS: + // Not yet implemented + return false; + case X86ISD::VPERMV: { + IsUnary = true; + SDValue MaskNode = N->getOperand(0); + while (MaskNode->getOpcode() == ISD::BITCAST) + MaskNode = MaskNode->getOperand(0); + + unsigned MaskLoBits = Log2_64(VT.getVectorNumElements()); + SmallVector<uint64_t, 32> RawMask; + if (MaskNode->getOpcode() == ISD::BUILD_VECTOR) { + // If we have a build-vector, then things are easy. 
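+      // E.g. a constant v8i32 mask <7,6,5,4,3,2,1,0> reverses the source
+      // vector; only the low Log2(NumElts) bits of each element are kept.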
+ assert(MaskNode.getSimpleValueType().isInteger() && + MaskNode.getSimpleValueType().getVectorNumElements() == + VT.getVectorNumElements()); + + for (unsigned i = 0; i < MaskNode->getNumOperands(); ++i) { + SDValue Op = MaskNode->getOperand(i); + if (Op->getOpcode() == ISD::UNDEF) + RawMask.push_back((uint64_t)SM_SentinelUndef); + else if (isa<ConstantSDNode>(Op)) { + APInt MaskElement = cast<ConstantSDNode>(Op)->getAPIntValue(); + RawMask.push_back(MaskElement.getLoBits(MaskLoBits).getZExtValue()); + } else + return false; + } + DecodeVPERMVMask(RawMask, Mask); + break; + } + if (MaskNode->getOpcode() == X86ISD::VBROADCAST) { + unsigned NumEltsInMask = MaskNode->getNumOperands(); + MaskNode = MaskNode->getOperand(0); + if (auto *CN = dyn_cast<ConstantSDNode>(MaskNode)) { + APInt MaskEltValue = CN->getAPIntValue(); + for (unsigned i = 0; i < NumEltsInMask; ++i) + RawMask.push_back(MaskEltValue.getLoBits(MaskLoBits).getZExtValue()); + DecodeVPERMVMask(RawMask, Mask); + break; + } + // It may be a scalar load + } + + auto *MaskLoad = dyn_cast<LoadSDNode>(MaskNode); + if (!MaskLoad) + return false; + + SDValue Ptr = MaskLoad->getBasePtr(); + if (Ptr->getOpcode() == X86ISD::Wrapper || + Ptr->getOpcode() == X86ISD::WrapperRIP) + Ptr = Ptr->getOperand(0); + + auto *MaskCP = dyn_cast<ConstantPoolSDNode>(Ptr); + if (!MaskCP || MaskCP->isMachineConstantPoolEntry()) + return false; + + if (auto *C = dyn_cast<Constant>(MaskCP->getConstVal())) { + DecodeVPERMVMask(C, VT, Mask); + break; + } + return false; + } + case X86ISD::VPERMV3: { + IsUnary = false; + SDValue MaskNode = N->getOperand(1); + while (MaskNode->getOpcode() == ISD::BITCAST) + MaskNode = MaskNode->getOperand(1); + + if (MaskNode->getOpcode() == ISD::BUILD_VECTOR) { + // If we have a build-vector, then things are easy. + assert(MaskNode.getSimpleValueType().isInteger() && + MaskNode.getSimpleValueType().getVectorNumElements() == + VT.getVectorNumElements()); + + SmallVector<uint64_t, 32> RawMask; + unsigned MaskLoBits = Log2_64(VT.getVectorNumElements()*2); + + for (unsigned i = 0; i < MaskNode->getNumOperands(); ++i) { + SDValue Op = MaskNode->getOperand(i); + if (Op->getOpcode() == ISD::UNDEF) + RawMask.push_back((uint64_t)SM_SentinelUndef); + else { + auto *CN = dyn_cast<ConstantSDNode>(Op.getNode()); + if (!CN) + return false; + APInt MaskElement = CN->getAPIntValue(); + RawMask.push_back(MaskElement.getLoBits(MaskLoBits).getZExtValue()); + } + } + DecodeVPERMV3Mask(RawMask, Mask); + break; + } + + auto *MaskLoad = dyn_cast<LoadSDNode>(MaskNode); + if (!MaskLoad) + return false; + + SDValue Ptr = MaskLoad->getBasePtr(); + if (Ptr->getOpcode() == X86ISD::Wrapper || + Ptr->getOpcode() == X86ISD::WrapperRIP) + Ptr = Ptr->getOperand(0); + + auto *MaskCP = dyn_cast<ConstantPoolSDNode>(Ptr); + if (!MaskCP || MaskCP->isMachineConstantPoolEntry()) + return false; + + if (auto *C = dyn_cast<Constant>(MaskCP->getConstVal())) { + DecodeVPERMV3Mask(C, VT, Mask); + break; + } + return false; + } + default: llvm_unreachable("unknown target shuffle node"); + } + + // Empty mask indicates the decode failed. + if (Mask.empty()) + return false; + + // If we have a fake unary shuffle, the shuffle mask is spread across two + // inputs that are actually the same node. Re-map the mask to always point + // into the first input. + if (IsFakeUnary) + for (int &M : Mask) + if (M >= (int)Mask.size()) + M -= Mask.size(); + + return true; +} + +/// Returns the scalar element that will make up the ith +/// element of the result of the vector shuffle. 
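+/// For example, querying index 2 of (vector_shuffle<4,1,6,3> %a, %b) recurses
+/// into element 2 of the second operand %b, since mask entry 6 selects the
+/// third element of %b.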
+static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG, + unsigned Depth) { + if (Depth == 6) + return SDValue(); // Limit search depth. + + SDValue V = SDValue(N, 0); + EVT VT = V.getValueType(); + unsigned Opcode = V.getOpcode(); + + // Recurse into ISD::VECTOR_SHUFFLE node to find scalars. + if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) { + int Elt = SV->getMaskElt(Index); + + if (Elt < 0) + return DAG.getUNDEF(VT.getVectorElementType()); + + unsigned NumElems = VT.getVectorNumElements(); + SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0) + : SV->getOperand(1); + return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1); + } + + // Recurse into target specific vector shuffles to find scalars. + if (isTargetShuffle(Opcode)) { + MVT ShufVT = V.getSimpleValueType(); + unsigned NumElems = ShufVT.getVectorNumElements(); + SmallVector<int, 16> ShuffleMask; + bool IsUnary; + + if (!getTargetShuffleMask(N, ShufVT, ShuffleMask, IsUnary)) + return SDValue(); + + int Elt = ShuffleMask[Index]; + if (Elt < 0) + return DAG.getUNDEF(ShufVT.getVectorElementType()); + + SDValue NewV = (Elt < (int)NumElems) ? N->getOperand(0) + : N->getOperand(1); + return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, + Depth+1); + } + + // Actual nodes that may contain scalar elements + if (Opcode == ISD::BITCAST) { + V = V.getOperand(0); + EVT SrcVT = V.getValueType(); + unsigned NumElems = VT.getVectorNumElements(); + + if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems) + return SDValue(); + } + + if (V.getOpcode() == ISD::SCALAR_TO_VECTOR) + return (Index == 0) ? V.getOperand(0) + : DAG.getUNDEF(VT.getVectorElementType()); + + if (V.getOpcode() == ISD::BUILD_VECTOR) + return V.getOperand(Index); + + return SDValue(); +} + +/// Custom lower build_vector of v16i8. +static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, + unsigned NumNonZero, unsigned NumZero, + SelectionDAG &DAG, + const X86Subtarget* Subtarget, + const TargetLowering &TLI) { + if (NumNonZero > 8) + return SDValue(); + + SDLoc dl(Op); + SDValue V; + bool First = true; + + // SSE4.1 - use PINSRB to insert each byte directly. + if (Subtarget->hasSSE41()) { + for (unsigned i = 0; i < 16; ++i) { + bool isNonZero = (NonZeros & (1 << i)) != 0; + if (isNonZero) { + if (First) { + if (NumZero) + V = getZeroVector(MVT::v16i8, Subtarget, DAG, dl); + else + V = DAG.getUNDEF(MVT::v16i8); + First = false; + } + V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, + MVT::v16i8, V, Op.getOperand(i), + DAG.getIntPtrConstant(i, dl)); + } + } + + return V; + } + + // Pre-SSE4.1 - merge byte pairs and insert with PINSRW. 
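+  // Each adjacent byte pair is zero-extended to i16, the odd byte is shifted
+  // left by 8 and OR'ed with the even byte (zero bytes are simply skipped),
+  // and the combined i16 is inserted into a v8i16 that is bitcast back to
+  // v16i8 at the end.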
+ for (unsigned i = 0; i < 16; ++i) { + bool ThisIsNonZero = (NonZeros & (1 << i)) != 0; + if (ThisIsNonZero && First) { + if (NumZero) + V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl); + else + V = DAG.getUNDEF(MVT::v8i16); + First = false; + } + + if ((i & 1) != 0) { + SDValue ThisElt, LastElt; + bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0; + if (LastIsNonZero) { + LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl, + MVT::i16, Op.getOperand(i-1)); + } + if (ThisIsNonZero) { + ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i)); + ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, + ThisElt, DAG.getConstant(8, dl, MVT::i8)); + if (LastIsNonZero) + ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt); + } else + ThisElt = LastElt; + + if (ThisElt.getNode()) + V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt, + DAG.getIntPtrConstant(i/2, dl)); + } + } + + return DAG.getBitcast(MVT::v16i8, V); +} + +/// Custom lower build_vector of v8i16. +static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, + unsigned NumNonZero, unsigned NumZero, + SelectionDAG &DAG, + const X86Subtarget* Subtarget, + const TargetLowering &TLI) { + if (NumNonZero > 4) + return SDValue(); + + SDLoc dl(Op); + SDValue V; + bool First = true; + for (unsigned i = 0; i < 8; ++i) { + bool isNonZero = (NonZeros & (1 << i)) != 0; + if (isNonZero) { + if (First) { + if (NumZero) + V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl); + else + V = DAG.getUNDEF(MVT::v8i16); + First = false; + } + V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, + MVT::v8i16, V, Op.getOperand(i), + DAG.getIntPtrConstant(i, dl)); + } + } + + return V; +} + +/// Custom lower build_vector of v4i32 or v4f32. +static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG, + const X86Subtarget *Subtarget, + const TargetLowering &TLI) { + // Find all zeroable elements. + std::bitset<4> Zeroable; + for (int i=0; i < 4; ++i) { + SDValue Elt = Op->getOperand(i); + Zeroable[i] = (Elt.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elt)); + } + assert(Zeroable.size() - Zeroable.count() > 1 && + "We expect at least two non-zero elements!"); + + // We only know how to deal with build_vector nodes where elements are either + // zeroable or extract_vector_elt with constant index. + SDValue FirstNonZero; + unsigned FirstNonZeroIdx; + for (unsigned i=0; i < 4; ++i) { + if (Zeroable[i]) + continue; + SDValue Elt = Op->getOperand(i); + if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT || + !isa<ConstantSDNode>(Elt.getOperand(1))) + return SDValue(); + // Make sure that this node is extracting from a 128-bit vector. + MVT VT = Elt.getOperand(0).getSimpleValueType(); + if (!VT.is128BitVector()) + return SDValue(); + if (!FirstNonZero.getNode()) { + FirstNonZero = Elt; + FirstNonZeroIdx = i; + } + } + + assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!"); + SDValue V1 = FirstNonZero.getOperand(0); + MVT VT = V1.getSimpleValueType(); + + // See if this build_vector can be lowered as a blend with zero. + SDValue Elt; + unsigned EltMaskIdx, EltIdx; + int Mask[4]; + for (EltIdx = 0; EltIdx < 4; ++EltIdx) { + if (Zeroable[EltIdx]) { + // The zero vector will be on the right hand side. + Mask[EltIdx] = EltIdx+4; + continue; + } + + Elt = Op->getOperand(EltIdx); + // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index. 
+ EltMaskIdx = cast<ConstantSDNode>(Elt.getOperand(1))->getZExtValue(); + if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx) + break; + Mask[EltIdx] = EltIdx; + } + + if (EltIdx == 4) { + // Let the shuffle legalizer deal with blend operations. + SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op)); + if (V1.getSimpleValueType() != VT) + V1 = DAG.getNode(ISD::BITCAST, SDLoc(V1), VT, V1); + return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, &Mask[0]); + } + + // See if we can lower this build_vector to a INSERTPS. + if (!Subtarget->hasSSE41()) + return SDValue(); + + SDValue V2 = Elt.getOperand(0); + if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx) + V1 = SDValue(); + + bool CanFold = true; + for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) { + if (Zeroable[i]) + continue; + + SDValue Current = Op->getOperand(i); + SDValue SrcVector = Current->getOperand(0); + if (!V1.getNode()) + V1 = SrcVector; + CanFold = SrcVector == V1 && + cast<ConstantSDNode>(Current.getOperand(1))->getZExtValue() == i; + } + + if (!CanFold) + return SDValue(); + + assert(V1.getNode() && "Expected at least two non-zero elements!"); + if (V1.getSimpleValueType() != MVT::v4f32) + V1 = DAG.getNode(ISD::BITCAST, SDLoc(V1), MVT::v4f32, V1); + if (V2.getSimpleValueType() != MVT::v4f32) + V2 = DAG.getNode(ISD::BITCAST, SDLoc(V2), MVT::v4f32, V2); + + // Ok, we can emit an INSERTPS instruction. + unsigned ZMask = Zeroable.to_ulong(); + + unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask; + assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!"); + SDLoc DL(Op); + SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2, + DAG.getIntPtrConstant(InsertPSMask, DL)); + return DAG.getBitcast(VT, Result); +} + +/// Return a vector logical shift node. +static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, + unsigned NumBits, SelectionDAG &DAG, + const TargetLowering &TLI, SDLoc dl) { + assert(VT.is128BitVector() && "Unknown type for VShift"); + MVT ShVT = MVT::v2i64; + unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ; + SrcOp = DAG.getBitcast(ShVT, SrcOp); + MVT ScalarShiftTy = TLI.getScalarShiftAmountTy(DAG.getDataLayout(), VT); + assert(NumBits % 8 == 0 && "Only support byte sized shifts"); + SDValue ShiftVal = DAG.getConstant(NumBits/8, dl, ScalarShiftTy); + return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal)); +} + +static SDValue +LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, SDLoc dl, SelectionDAG &DAG) { + + // Check if the scalar load can be widened into a vector load. And if + // the address is "base + cst" see if the cst can be "absorbed" into + // the shuffle mask. + if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) { + SDValue Ptr = LD->getBasePtr(); + if (!ISD::isNormalLoad(LD) || LD->isVolatile()) + return SDValue(); + EVT PVT = LD->getValueType(0); + if (PVT != MVT::i32 && PVT != MVT::f32) + return SDValue(); + + int FI = -1; + int64_t Offset = 0; + if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) { + FI = FINode->getIndex(); + Offset = 0; + } else if (DAG.isBaseWithConstantOffset(Ptr) && + isa<FrameIndexSDNode>(Ptr.getOperand(0))) { + FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex(); + Offset = Ptr.getConstantOperandVal(1); + Ptr = Ptr.getOperand(0); + } else { + return SDValue(); + } + + // FIXME: 256-bit vector instructions don't require a strict alignment, + // improve this code to support it better. 
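+    // Illustrative example: an f32 load at frame-object offset 8 that is
+    // splatted to v4f32 becomes a 16-byte load of the (re-aligned) object at
+    // offset 0 followed by a splat shuffle of element 2.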
+ unsigned RequiredAlign = VT.getSizeInBits()/8; + SDValue Chain = LD->getChain(); + // Make sure the stack object alignment is at least 16 or 32. + MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); + if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) { + if (MFI->isFixedObjectIndex(FI)) { + // Can't change the alignment. FIXME: It's possible to compute + // the exact stack offset and reference FI + adjust offset instead. + // If someone *really* cares about this. That's the way to implement it. + return SDValue(); + } else { + MFI->setObjectAlignment(FI, RequiredAlign); + } + } + + // (Offset % 16 or 32) must be multiple of 4. Then address is then + // Ptr + (Offset & ~15). + if (Offset < 0) + return SDValue(); + if ((Offset % RequiredAlign) & 3) + return SDValue(); + int64_t StartOffset = Offset & ~int64_t(RequiredAlign - 1); + if (StartOffset) { + SDLoc DL(Ptr); + Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, + DAG.getConstant(StartOffset, DL, Ptr.getValueType())); + } + + int EltNo = (Offset - StartOffset) >> 2; + unsigned NumElems = VT.getVectorNumElements(); + + EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems); + SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr, + LD->getPointerInfo().getWithOffset(StartOffset), + false, false, false, 0); + + SmallVector<int, 8> Mask(NumElems, EltNo); + + return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &Mask[0]); + } + + return SDValue(); +} + +/// Given the initializing elements 'Elts' of a vector of type 'VT', see if the +/// elements can be replaced by a single large load which has the same value as +/// a build_vector or insert_subvector whose loaded operands are 'Elts'. +/// +/// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a +/// +/// FIXME: we'd also like to handle the case where the last elements are zero +/// rather than undef via VZEXT_LOAD, but we do not detect that case today. +/// There's even a handy isZeroNode for that purpose. +static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts, + SDLoc &DL, SelectionDAG &DAG, + bool isAfterLegalize) { + unsigned NumElems = Elts.size(); + + LoadSDNode *LDBase = nullptr; + unsigned LastLoadedElt = -1U; + + // For each element in the initializer, see if we've found a load or an undef. + // If we don't find an initial load element, or later load elements are + // non-consecutive, bail out. + for (unsigned i = 0; i < NumElems; ++i) { + SDValue Elt = Elts[i]; + // Look through a bitcast. + if (Elt.getNode() && Elt.getOpcode() == ISD::BITCAST) + Elt = Elt.getOperand(0); + if (!Elt.getNode() || + (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode()))) + return SDValue(); + if (!LDBase) { + if (Elt.getNode()->getOpcode() == ISD::UNDEF) + return SDValue(); + LDBase = cast<LoadSDNode>(Elt.getNode()); + LastLoadedElt = i; + continue; + } + if (Elt.getOpcode() == ISD::UNDEF) + continue; + + LoadSDNode *LD = cast<LoadSDNode>(Elt); + EVT LdVT = Elt.getValueType(); + // Each loaded element must be the correct fractional portion of the + // requested vector load. + if (LdVT.getSizeInBits() != VT.getSizeInBits() / NumElems) + return SDValue(); + if (!DAG.isConsecutiveLoad(LD, LDBase, LdVT.getSizeInBits() / 8, i)) + return SDValue(); + LastLoadedElt = i; + } + + // If we have found an entire vector of loads and undefs, then return a large + // load of the entire vector width starting at the base pointer. If we found + // consecutive loads for the low half, generate a vzext_load node. 
+ if (LastLoadedElt == NumElems - 1) { + assert(LDBase && "Did not find base load for merging consecutive loads"); + EVT EltVT = LDBase->getValueType(0); + // Ensure that the input vector size for the merged loads matches the + // cumulative size of the input elements. + if (VT.getSizeInBits() != EltVT.getSizeInBits() * NumElems) + return SDValue(); + + if (isAfterLegalize && + !DAG.getTargetLoweringInfo().isOperationLegal(ISD::LOAD, VT)) + return SDValue(); + + SDValue NewLd = SDValue(); + + NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(), + LDBase->getPointerInfo(), LDBase->isVolatile(), + LDBase->isNonTemporal(), LDBase->isInvariant(), + LDBase->getAlignment()); + + if (LDBase->hasAnyUseOfValue(1)) { + SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, + SDValue(LDBase, 1), + SDValue(NewLd.getNode(), 1)); + DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain); + DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1), + SDValue(NewLd.getNode(), 1)); + } + + return NewLd; + } + + //TODO: The code below fires only for for loading the low v2i32 / v2f32 + //of a v4i32 / v4f32. It's probably worth generalizing. + EVT EltVT = VT.getVectorElementType(); + if (NumElems == 4 && LastLoadedElt == 1 && (EltVT.getSizeInBits() == 32) && + DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) { + SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other); + SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() }; + SDValue ResNode = + DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, MVT::i64, + LDBase->getPointerInfo(), + LDBase->getAlignment(), + false/*isVolatile*/, true/*ReadMem*/, + false/*WriteMem*/); + + // Make sure the newly-created LOAD is in the same position as LDBase in + // terms of dependency. We create a TokenFactor for LDBase and ResNode, and + // update uses of LDBase's output chain to use the TokenFactor. + if (LDBase->hasAnyUseOfValue(1)) { + SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, + SDValue(LDBase, 1), SDValue(ResNode.getNode(), 1)); + DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain); + DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1), + SDValue(ResNode.getNode(), 1)); + } + + return DAG.getBitcast(VT, ResNode); + } + return SDValue(); +} + +/// LowerVectorBroadcast - Attempt to use the vbroadcast instruction +/// to generate a splat value for the following cases: +/// 1. A splat BUILD_VECTOR which uses a single scalar load, or a constant. +/// 2. A splat shuffle which uses a scalar_to_vector node which comes from +/// a scalar load, or a constant. +/// The VBROADCAST node is returned when a pattern is found, +/// or SDValue() otherwise. +static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget, + SelectionDAG &DAG) { + // VBROADCAST requires AVX. + // TODO: Splats could be generated for non-AVX CPUs using SSE + // instructions, but there's less potential gain for only 128-bit vectors. + if (!Subtarget->hasAVX()) + return SDValue(); + + MVT VT = Op.getSimpleValueType(); + SDLoc dl(Op); + + assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) && + "Unsupported vector type for broadcast."); + + SDValue Ld; + bool ConstSplatVal; + + switch (Op.getOpcode()) { + default: + // Unknown pattern found. 
+ return SDValue(); + + case ISD::BUILD_VECTOR: { + auto *BVOp = cast<BuildVectorSDNode>(Op.getNode()); + BitVector UndefElements; + SDValue Splat = BVOp->getSplatValue(&UndefElements); + + // We need a splat of a single value to use broadcast, and it doesn't + // make any sense if the value is only in one element of the vector. + if (!Splat || (VT.getVectorNumElements() - UndefElements.count()) <= 1) + return SDValue(); + + Ld = Splat; + ConstSplatVal = (Ld.getOpcode() == ISD::Constant || + Ld.getOpcode() == ISD::ConstantFP); + + // Make sure that all of the users of a non-constant load are from the + // BUILD_VECTOR node. + if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode())) + return SDValue(); + break; + } + + case ISD::VECTOR_SHUFFLE: { + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + + // Shuffles must have a splat mask where the first element is + // broadcasted. + if ((!SVOp->isSplat()) || SVOp->getMaskElt(0) != 0) + return SDValue(); + + SDValue Sc = Op.getOperand(0); + if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR && + Sc.getOpcode() != ISD::BUILD_VECTOR) { + + if (!Subtarget->hasInt256()) + return SDValue(); + + // Use the register form of the broadcast instruction available on AVX2. + if (VT.getSizeInBits() >= 256) + Sc = Extract128BitVector(Sc, 0, DAG, dl); + return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Sc); + } + + Ld = Sc.getOperand(0); + ConstSplatVal = (Ld.getOpcode() == ISD::Constant || + Ld.getOpcode() == ISD::ConstantFP); + + // The scalar_to_vector node and the suspected + // load node must have exactly one user. + // Constants may have multiple users. + + // AVX-512 has register version of the broadcast + bool hasRegVer = Subtarget->hasAVX512() && VT.is512BitVector() && + Ld.getValueType().getSizeInBits() >= 32; + if (!ConstSplatVal && ((!Sc.hasOneUse() || !Ld.hasOneUse()) && + !hasRegVer)) + return SDValue(); + break; + } + } + + unsigned ScalarSize = Ld.getValueType().getSizeInBits(); + bool IsGE256 = (VT.getSizeInBits() >= 256); + + // When optimizing for size, generate up to 5 extra bytes for a broadcast + // instruction to save 8 or more bytes of constant pool data. + // TODO: If multiple splats are generated to load the same constant, + // it may be detrimental to overall size. There needs to be a way to detect + // that condition to know if this is truly a size win. + bool OptForSize = DAG.getMachineFunction().getFunction()->optForSize(); + + // Handle broadcasting a single constant scalar from the constant pool + // into a vector. + // On Sandybridge (no AVX2), it is still better to load a constant vector + // from the constant pool and not to broadcast it from a scalar. + // But override that restriction when optimizing for size. + // TODO: Check if splatting is recommended for other AVX-capable CPUs. + if (ConstSplatVal && (Subtarget->hasAVX2() || OptForSize)) { + EVT CVT = Ld.getValueType(); + assert(!CVT.isVector() && "Must not broadcast a vector type"); + + // Splat f32, i32, v4f64, v4i64 in all cases with AVX2. + // For size optimization, also splat v2f64 and v2i64, and for size opt + // with AVX2, also splat i8 and i16. + // With pattern matching, the VBROADCAST node may become a VMOVDDUP. 
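+    // E.g. a v8f32 build_vector splatting the constant 42.0f becomes a
+    // VBROADCAST of a single f32 constant-pool load here, instead of a load
+    // of a full 32-byte constant-pool vector.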
+ if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) || + (OptForSize && (ScalarSize == 64 || Subtarget->hasAVX2()))) { + const Constant *C = nullptr; + if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld)) + C = CI->getConstantIntValue(); + else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld)) + C = CF->getConstantFPValue(); + + assert(C && "Invalid constant type"); + + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + SDValue CP = + DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout())); + unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment(); + Ld = DAG.getLoad( + CVT, dl, DAG.getEntryNode(), CP, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, + false, false, Alignment); + + return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); + } + } + + bool IsLoad = ISD::isNormalLoad(Ld.getNode()); + + // Handle AVX2 in-register broadcasts. + if (!IsLoad && Subtarget->hasInt256() && + (ScalarSize == 32 || (IsGE256 && ScalarSize == 64))) + return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); + + // The scalar source must be a normal load. + if (!IsLoad) + return SDValue(); + + if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) || + (Subtarget->hasVLX() && ScalarSize == 64)) + return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); + + // The integer check is needed for the 64-bit into 128-bit so it doesn't match + // double since there is no vbroadcastsd xmm + if (Subtarget->hasInt256() && Ld.getValueType().isInteger()) { + if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64) + return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); + } + + // Unsupported broadcast. + return SDValue(); +} + +/// \brief For an EXTRACT_VECTOR_ELT with a constant index return the real +/// underlying vector and index. +/// +/// Modifies \p ExtractedFromVec to the real vector and returns the real +/// index. +static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec, + SDValue ExtIdx) { + int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue(); + if (!isa<ShuffleVectorSDNode>(ExtractedFromVec)) + return Idx; + + // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already + // lowered this: + // (extract_vector_elt (v8f32 %vreg1), Constant<6>) + // to: + // (extract_vector_elt (vector_shuffle<2,u,u,u> + // (extract_subvector (v8f32 %vreg0), Constant<4>), + // undef) + // Constant<0>) + // In this case the vector is the extract_subvector expression and the index + // is 2, as specified by the shuffle. + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec); + SDValue ShuffleVec = SVOp->getOperand(0); + MVT ShuffleVecVT = ShuffleVec.getSimpleValueType(); + assert(ShuffleVecVT.getVectorElementType() == + ExtractedFromVec.getSimpleValueType().getVectorElementType()); + + int ShuffleIdx = SVOp->getMaskElt(Idx); + if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) { + ExtractedFromVec = ShuffleVec; + return ShuffleIdx; + } + return Idx; +} + +static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) { + MVT VT = Op.getSimpleValueType(); + + // Skip if insert_vec_elt is not supported. 
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT)) + return SDValue(); + + SDLoc DL(Op); + unsigned NumElems = Op.getNumOperands(); + + SDValue VecIn1; + SDValue VecIn2; + SmallVector<unsigned, 4> InsertIndices; + SmallVector<int, 8> Mask(NumElems, -1); + + for (unsigned i = 0; i != NumElems; ++i) { + unsigned Opc = Op.getOperand(i).getOpcode(); + + if (Opc == ISD::UNDEF) + continue; + + if (Opc != ISD::EXTRACT_VECTOR_ELT) { + // Quit if more than 1 elements need inserting. + if (InsertIndices.size() > 1) + return SDValue(); + + InsertIndices.push_back(i); + continue; + } + + SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0); + SDValue ExtIdx = Op.getOperand(i).getOperand(1); + // Quit if non-constant index. + if (!isa<ConstantSDNode>(ExtIdx)) + return SDValue(); + int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx); + + // Quit if extracted from vector of different type. + if (ExtractedFromVec.getValueType() != VT) + return SDValue(); + + if (!VecIn1.getNode()) + VecIn1 = ExtractedFromVec; + else if (VecIn1 != ExtractedFromVec) { + if (!VecIn2.getNode()) + VecIn2 = ExtractedFromVec; + else if (VecIn2 != ExtractedFromVec) + // Quit if more than 2 vectors to shuffle + return SDValue(); + } + + if (ExtractedFromVec == VecIn1) + Mask[i] = Idx; + else if (ExtractedFromVec == VecIn2) + Mask[i] = Idx + NumElems; + } + + if (!VecIn1.getNode()) + return SDValue(); + + VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT); + SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, &Mask[0]); + for (unsigned i = 0, e = InsertIndices.size(); i != e; ++i) { + unsigned Idx = InsertIndices[i]; + NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx), + DAG.getIntPtrConstant(Idx, DL)); + } + + return NV; +} + +static SDValue ConvertI1VectorToInteger(SDValue Op, SelectionDAG &DAG) { + assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) && + Op.getScalarValueSizeInBits() == 1 && + "Can not convert non-constant vector"); + uint64_t Immediate = 0; + for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) { + SDValue In = Op.getOperand(idx); + if (In.getOpcode() != ISD::UNDEF) + Immediate |= cast<ConstantSDNode>(In)->getZExtValue() << idx; + } + SDLoc dl(Op); + MVT VT = + MVT::getIntegerVT(std::max((int)Op.getValueType().getSizeInBits(), 8)); + return DAG.getConstant(Immediate, dl, VT); +} +// Lower BUILD_VECTOR operation for v8i1 and v16i1 types. 
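+// For example, the constant v8i1 vector <1,0,1,1,0,0,0,0> is packed into the
+// i8 immediate 0b00001101 (13) and bitcast to v8i1; any non-constant elements
+// are inserted individually afterwards.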
+SDValue +X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const { + + MVT VT = Op.getSimpleValueType(); + assert((VT.getVectorElementType() == MVT::i1) && + "Unexpected type in LowerBUILD_VECTORvXi1!"); + + SDLoc dl(Op); + if (ISD::isBuildVectorAllZeros(Op.getNode())) { + SDValue Cst = DAG.getTargetConstant(0, dl, MVT::i1); + SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst); + return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); + } + + if (ISD::isBuildVectorAllOnes(Op.getNode())) { + SDValue Cst = DAG.getTargetConstant(1, dl, MVT::i1); + SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst); + return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); + } + + if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) { + SDValue Imm = ConvertI1VectorToInteger(Op, DAG); + if (Imm.getValueSizeInBits() == VT.getSizeInBits()) + return DAG.getBitcast(VT, Imm); + SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec, + DAG.getIntPtrConstant(0, dl)); + } + + // Vector has one or more non-const elements + uint64_t Immediate = 0; + SmallVector<unsigned, 16> NonConstIdx; + bool IsSplat = true; + bool HasConstElts = false; + int SplatIdx = -1; + for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) { + SDValue In = Op.getOperand(idx); + if (In.getOpcode() == ISD::UNDEF) + continue; + if (!isa<ConstantSDNode>(In)) + NonConstIdx.push_back(idx); + else { + Immediate |= cast<ConstantSDNode>(In)->getZExtValue() << idx; + HasConstElts = true; + } + if (SplatIdx == -1) + SplatIdx = idx; + else if (In != Op.getOperand(SplatIdx)) + IsSplat = false; + } + + // for splat use " (select i1 splat_elt, all-ones, all-zeroes)" + if (IsSplat) + return DAG.getNode(ISD::SELECT, dl, VT, Op.getOperand(SplatIdx), + DAG.getConstant(1, dl, VT), + DAG.getConstant(0, dl, VT)); + + // insert elements one by one + SDValue DstVec; + SDValue Imm; + if (Immediate) { + MVT ImmVT = MVT::getIntegerVT(std::max((int)VT.getSizeInBits(), 8)); + Imm = DAG.getConstant(Immediate, dl, ImmVT); + } + else if (HasConstElts) + Imm = DAG.getConstant(0, dl, VT); + else + Imm = DAG.getUNDEF(VT); + if (Imm.getValueSizeInBits() == VT.getSizeInBits()) + DstVec = DAG.getBitcast(VT, Imm); + else { + SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm); + DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec, + DAG.getIntPtrConstant(0, dl)); + } + + for (unsigned i = 0; i < NonConstIdx.size(); ++i) { + unsigned InsertIdx = NonConstIdx[i]; + DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec, + Op.getOperand(InsertIdx), + DAG.getIntPtrConstant(InsertIdx, dl)); + } + return DstVec; +} + +/// \brief Return true if \p N implements a horizontal binop and return the +/// operands for the horizontal binop into V0 and V1. +/// +/// This is a helper function of LowerToHorizontalOp(). +/// This function checks that the build_vector \p N in input implements a +/// horizontal operation. Parameter \p Opcode defines the kind of horizontal +/// operation to match. +/// For example, if \p Opcode is equal to ISD::ADD, then this function +/// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode +/// is equal to ISD::SUB, then this function checks if this is a horizontal +/// arithmetic sub. +/// +/// This function only analyzes elements of \p N whose indices are +/// in range [BaseIdx, LastIdx). 
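+/// For example, a v4f32 build_vector of
+///   (fadd (extractelt %a, 0), (extractelt %a, 1)),
+///   (fadd (extractelt %a, 2), (extractelt %a, 3)),
+///   (fadd (extractelt %b, 0), (extractelt %b, 1)),
+///   (fadd (extractelt %b, 2), (extractelt %b, 3))
+/// matches a horizontal FADD with V0 = %a and V1 = %b.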
+static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode, + SelectionDAG &DAG, + unsigned BaseIdx, unsigned LastIdx, + SDValue &V0, SDValue &V1) { + EVT VT = N->getValueType(0); + + assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!"); + assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx && + "Invalid Vector in input!"); + + bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD); + bool CanFold = true; + unsigned ExpectedVExtractIdx = BaseIdx; + unsigned NumElts = LastIdx - BaseIdx; + V0 = DAG.getUNDEF(VT); + V1 = DAG.getUNDEF(VT); + + // Check if N implements a horizontal binop. + for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) { + SDValue Op = N->getOperand(i + BaseIdx); + + // Skip UNDEFs. + if (Op->getOpcode() == ISD::UNDEF) { + // Update the expected vector extract index. + if (i * 2 == NumElts) + ExpectedVExtractIdx = BaseIdx; + ExpectedVExtractIdx += 2; + continue; + } + + CanFold = Op->getOpcode() == Opcode && Op->hasOneUse(); + + if (!CanFold) + break; + + SDValue Op0 = Op.getOperand(0); + SDValue Op1 = Op.getOperand(1); + + // Try to match the following pattern: + // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1)) + CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT && + Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT && + Op0.getOperand(0) == Op1.getOperand(0) && + isa<ConstantSDNode>(Op0.getOperand(1)) && + isa<ConstantSDNode>(Op1.getOperand(1))); + if (!CanFold) + break; + + unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue(); + unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue(); + + if (i * 2 < NumElts) { + if (V0.getOpcode() == ISD::UNDEF) { + V0 = Op0.getOperand(0); + if (V0.getValueType() != VT) + return false; + } + } else { + if (V1.getOpcode() == ISD::UNDEF) { + V1 = Op0.getOperand(0); + if (V1.getValueType() != VT) + return false; + } + if (i * 2 == NumElts) + ExpectedVExtractIdx = BaseIdx; + } + + SDValue Expected = (i * 2 < NumElts) ? V0 : V1; + if (I0 == ExpectedVExtractIdx) + CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected; + else if (IsCommutable && I1 == ExpectedVExtractIdx) { + // Try to match the following dag sequence: + // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I)) + CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected; + } else + CanFold = false; + + ExpectedVExtractIdx += 2; + } + + return CanFold; +} + +/// \brief Emit a sequence of two 128-bit horizontal add/sub followed by +/// a concat_vector. +/// +/// This is a helper function of LowerToHorizontalOp(). +/// This function expects two 256-bit vectors called V0 and V1. +/// At first, each vector is split into two separate 128-bit vectors. +/// Then, the resulting 128-bit vectors are used to implement two +/// horizontal binary operations. +/// +/// The kind of horizontal binary operation is defined by \p X86Opcode. +/// +/// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to +/// the two new horizontal binop. +/// When Mode is set, the first horizontal binop dag node would take as input +/// the lower 128-bit of V0 and the upper 128-bit of V0. The second +/// horizontal binop dag node would take as input the lower 128-bit of V1 +/// and the upper 128-bit of V1. 
+/// Example: +/// HADD V0_LO, V0_HI +/// HADD V1_LO, V1_HI +/// +/// Otherwise, the first horizontal binop dag node takes as input the lower +/// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop +/// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1. +/// Example: +/// HADD V0_LO, V1_LO +/// HADD V0_HI, V1_HI +/// +/// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower +/// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to +/// the upper 128-bits of the result. +static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1, + SDLoc DL, SelectionDAG &DAG, + unsigned X86Opcode, bool Mode, + bool isUndefLO, bool isUndefHI) { + EVT VT = V0.getValueType(); + assert(VT.is256BitVector() && VT == V1.getValueType() && + "Invalid nodes in input!"); + + unsigned NumElts = VT.getVectorNumElements(); + SDValue V0_LO = Extract128BitVector(V0, 0, DAG, DL); + SDValue V0_HI = Extract128BitVector(V0, NumElts/2, DAG, DL); + SDValue V1_LO = Extract128BitVector(V1, 0, DAG, DL); + SDValue V1_HI = Extract128BitVector(V1, NumElts/2, DAG, DL); + EVT NewVT = V0_LO.getValueType(); + + SDValue LO = DAG.getUNDEF(NewVT); + SDValue HI = DAG.getUNDEF(NewVT); + + if (Mode) { + // Don't emit a horizontal binop if the result is expected to be UNDEF. + if (!isUndefLO && V0->getOpcode() != ISD::UNDEF) + LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI); + if (!isUndefHI && V1->getOpcode() != ISD::UNDEF) + HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI); + } else { + // Don't emit a horizontal binop if the result is expected to be UNDEF. + if (!isUndefLO && (V0_LO->getOpcode() != ISD::UNDEF || + V1_LO->getOpcode() != ISD::UNDEF)) + LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO); + + if (!isUndefHI && (V0_HI->getOpcode() != ISD::UNDEF || + V1_HI->getOpcode() != ISD::UNDEF)) + HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI); + } + + return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI); +} + +/// Try to fold a build_vector that performs an 'addsub' to an X86ISD::ADDSUB +/// node. +static SDValue LowerToAddSub(const BuildVectorSDNode *BV, + const X86Subtarget *Subtarget, SelectionDAG &DAG) { + MVT VT = BV->getSimpleValueType(0); + if ((!Subtarget->hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) && + (!Subtarget->hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64))) + return SDValue(); + + SDLoc DL(BV); + unsigned NumElts = VT.getVectorNumElements(); + SDValue InVec0 = DAG.getUNDEF(VT); + SDValue InVec1 = DAG.getUNDEF(VT); + + assert((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v4f32 || + VT == MVT::v2f64) && "build_vector with an invalid type found!"); + + // Odd-numbered elements in the input build vector are obtained from + // adding two integer/float elements. + // Even-numbered elements in the input build vector are obtained from + // subtracting two integer/float elements. + unsigned ExpectedOpcode = ISD::FSUB; + unsigned NextExpectedOpcode = ISD::FADD; + bool AddFound = false; + bool SubFound = false; + + for (unsigned i = 0, e = NumElts; i != e; ++i) { + SDValue Op = BV->getOperand(i); + + // Skip 'undef' values. + unsigned Opcode = Op.getOpcode(); + if (Opcode == ISD::UNDEF) { + std::swap(ExpectedOpcode, NextExpectedOpcode); + continue; + } + + // Early exit if we found an unexpected opcode. 
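// [Editor's note] The expected opcode alternates between FSUB at even lanes
// and FADD at odd lanes, so for example a v4f32 build_vector of
//   (fsub a0, b0), (fadd a1, b1), (fsub a2, b2), (fadd a3, b3)
// with each operand extracted from lane i of A and B is the ADDSUBPS pattern
// this loop is looking for (illustration only, not part of the imported file).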
+ if (Opcode != ExpectedOpcode) + return SDValue(); + + SDValue Op0 = Op.getOperand(0); + SDValue Op1 = Op.getOperand(1); + + // Try to match the following pattern: + // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i)) + // Early exit if we cannot match that sequence. + if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT || + Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT || + !isa<ConstantSDNode>(Op0.getOperand(1)) || + !isa<ConstantSDNode>(Op1.getOperand(1)) || + Op0.getOperand(1) != Op1.getOperand(1)) + return SDValue(); + + unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue(); + if (I0 != i) + return SDValue(); + + // We found a valid add/sub node. Update the information accordingly. + if (i & 1) + AddFound = true; + else + SubFound = true; + + // Update InVec0 and InVec1. + if (InVec0.getOpcode() == ISD::UNDEF) { + InVec0 = Op0.getOperand(0); + if (InVec0.getSimpleValueType() != VT) + return SDValue(); + } + if (InVec1.getOpcode() == ISD::UNDEF) { + InVec1 = Op1.getOperand(0); + if (InVec1.getSimpleValueType() != VT) + return SDValue(); + } + + // Make sure that operands in input to each add/sub node always + // come from a same pair of vectors. + if (InVec0 != Op0.getOperand(0)) { + if (ExpectedOpcode == ISD::FSUB) + return SDValue(); + + // FADD is commutable. Try to commute the operands + // and then test again. + std::swap(Op0, Op1); + if (InVec0 != Op0.getOperand(0)) + return SDValue(); + } + + if (InVec1 != Op1.getOperand(0)) + return SDValue(); + + // Update the pair of expected opcodes. + std::swap(ExpectedOpcode, NextExpectedOpcode); + } + + // Don't try to fold this build_vector into an ADDSUB if the inputs are undef. + if (AddFound && SubFound && InVec0.getOpcode() != ISD::UNDEF && + InVec1.getOpcode() != ISD::UNDEF) + return DAG.getNode(X86ISD::ADDSUB, DL, VT, InVec0, InVec1); + + return SDValue(); +} + +/// Lower BUILD_VECTOR to a horizontal add/sub operation if possible. +static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + MVT VT = BV->getSimpleValueType(0); + unsigned NumElts = VT.getVectorNumElements(); + unsigned NumUndefsLO = 0; + unsigned NumUndefsHI = 0; + unsigned Half = NumElts/2; + + // Count the number of UNDEF operands in the build_vector in input. + for (unsigned i = 0, e = Half; i != e; ++i) + if (BV->getOperand(i)->getOpcode() == ISD::UNDEF) + NumUndefsLO++; + + for (unsigned i = Half, e = NumElts; i != e; ++i) + if (BV->getOperand(i)->getOpcode() == ISD::UNDEF) + NumUndefsHI++; + + // Early exit if this is either a build_vector of all UNDEFs or all the + // operands but one are UNDEF. + if (NumUndefsLO + NumUndefsHI + 1 >= NumElts) + return SDValue(); + + SDLoc DL(BV); + SDValue InVec0, InVec1; + if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget->hasSSE3()) { + // Try to match an SSE3 float HADD/HSUB. + if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1)) + return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1); + + if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1)) + return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1); + } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget->hasSSSE3()) { + // Try to match an SSSE3 integer HADD/HSUB. 
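// [Editor's note] That is, PHADDW/PHSUBW for v8i16 and PHADDD/PHSUBD for
// v4i32; the integer matching below mirrors the float case above.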
+ if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1)) + return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1); + + if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1)) + return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1); + } + + if (!Subtarget->hasAVX()) + return SDValue(); + + if ((VT == MVT::v8f32 || VT == MVT::v4f64)) { + // Try to match an AVX horizontal add/sub of packed single/double + // precision floating point values from 256-bit vectors. + SDValue InVec2, InVec3; + if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) && + isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) && + ((InVec0.getOpcode() == ISD::UNDEF || + InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) && + ((InVec1.getOpcode() == ISD::UNDEF || + InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3)) + return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1); + + if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) && + isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) && + ((InVec0.getOpcode() == ISD::UNDEF || + InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) && + ((InVec1.getOpcode() == ISD::UNDEF || + InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3)) + return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1); + } else if (VT == MVT::v8i32 || VT == MVT::v16i16) { + // Try to match an AVX2 horizontal add/sub of signed integers. + SDValue InVec2, InVec3; + unsigned X86Opcode; + bool CanFold = true; + + if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) && + isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) && + ((InVec0.getOpcode() == ISD::UNDEF || + InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) && + ((InVec1.getOpcode() == ISD::UNDEF || + InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3)) + X86Opcode = X86ISD::HADD; + else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) && + isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2, InVec3) && + ((InVec0.getOpcode() == ISD::UNDEF || + InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) && + ((InVec1.getOpcode() == ISD::UNDEF || + InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3)) + X86Opcode = X86ISD::HSUB; + else + CanFold = false; + + if (CanFold) { + // Fold this build_vector into a single horizontal add/sub. + // Do this only if the target has AVX2. + if (Subtarget->hasAVX2()) + return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1); + + // Do not try to expand this build_vector into a pair of horizontal + // add/sub if we can emit a pair of scalar add/sub. + if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half) + return SDValue(); + + // Convert this build_vector into a pair of horizontal binop followed by + // a concat vector. 
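// [Editor's note] For example, a v8i32 horizontal add on a pre-AVX2 target
// becomes two 128-bit PHADDDs, one on the low halves of the inputs and one on
// the high halves, whose results ExpandHorizontalBinOp concatenates back into
// a single 256-bit vector (illustration only, not part of the imported file).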
+ bool isUndefLO = NumUndefsLO == Half; + bool isUndefHI = NumUndefsHI == Half; + return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false, + isUndefLO, isUndefHI); + } + } + + if ((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 || + VT == MVT::v16i16) && Subtarget->hasAVX()) { + unsigned X86Opcode; + if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1)) + X86Opcode = X86ISD::HADD; + else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1)) + X86Opcode = X86ISD::HSUB; + else if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1)) + X86Opcode = X86ISD::FHADD; + else if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1)) + X86Opcode = X86ISD::FHSUB; + else + return SDValue(); + + // Don't try to expand this build_vector into a pair of horizontal add/sub + // if we can simply emit a pair of scalar add/sub. + if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half) + return SDValue(); + + // Convert this build_vector into two horizontal add/sub followed by + // a concat vector. + bool isUndefLO = NumUndefsLO == Half; + bool isUndefHI = NumUndefsHI == Half; + return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true, + isUndefLO, isUndefHI); + } + + return SDValue(); +} + +SDValue +X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { + SDLoc dl(Op); + + MVT VT = Op.getSimpleValueType(); + MVT ExtVT = VT.getVectorElementType(); + unsigned NumElems = Op.getNumOperands(); + + // Generate vectors for predicate vectors. + if (VT.getVectorElementType() == MVT::i1 && Subtarget->hasAVX512()) + return LowerBUILD_VECTORvXi1(Op, DAG); + + // Vectors containing all zeros can be matched by pxor and xorps later + if (ISD::isBuildVectorAllZeros(Op.getNode())) { + // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd + // and 2) ensure that i64 scalars are eliminated on x86-32 hosts. + if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) + return Op; + + return getZeroVector(VT, Subtarget, DAG, dl); + } + + // Vectors containing all ones can be matched by pcmpeqd on 128-bit width + // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use + // vpcmpeqd on 256-bit vectors. + if (Subtarget->hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) { + if (VT == MVT::v4i32 || (VT == MVT::v8i32 && Subtarget->hasInt256())) + return Op; + + if (!VT.is512BitVector()) + return getOnesVector(VT, Subtarget, DAG, dl); + } + + BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode()); + if (SDValue AddSub = LowerToAddSub(BV, Subtarget, DAG)) + return AddSub; + if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG)) + return HorizontalOp; + if (SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG)) + return Broadcast; + + unsigned EVTBits = ExtVT.getSizeInBits(); + + unsigned NumZero = 0; + unsigned NumNonZero = 0; + uint64_t NonZeros = 0; + bool IsAllConstants = true; + SmallSet<SDValue, 8> Values; + for (unsigned i = 0; i < NumElems; ++i) { + SDValue Elt = Op.getOperand(i); + if (Elt.getOpcode() == ISD::UNDEF) + continue; + Values.insert(Elt); + if (Elt.getOpcode() != ISD::Constant && + Elt.getOpcode() != ISD::ConstantFP) + IsAllConstants = false; + if (X86::isZeroNode(Elt)) + NumZero++; + else { + assert(i < sizeof(NonZeros) * 8); // Make sure the shift is within range. + NonZeros |= ((uint64_t)1 << i); + NumNonZero++; + } + } + + // All undef vector. Return an UNDEF. All zero vectors were handled above. 
+ if (NumNonZero == 0) + return DAG.getUNDEF(VT); + + // Special case for single non-zero, non-undef, element. + if (NumNonZero == 1) { + unsigned Idx = countTrailingZeros(NonZeros); + SDValue Item = Op.getOperand(Idx); + + // If this is an insertion of an i64 value on x86-32, and if the top bits of + // the value are obviously zero, truncate the value to i32 and do the + // insertion that way. Only do this if the value is non-constant or if the + // value is a constant being inserted into element 0. It is cheaper to do + // a constant pool load than it is to do a movd + shuffle. + if (ExtVT == MVT::i64 && !Subtarget->is64Bit() && + (!IsAllConstants || Idx == 0)) { + if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) { + // Handle SSE only. + assert(VT == MVT::v2i64 && "Expected an SSE value type!"); + MVT VecVT = MVT::v4i32; + + // Truncate the value (which may itself be a constant) to i32, and + // convert it to a vector with movd (S2V+shuffle to zero extend). + Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item); + Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item); + return DAG.getBitcast(VT, getShuffleVectorZeroOrUndef( + Item, Idx * 2, true, Subtarget, DAG)); + } + } + + // If we have a constant or non-constant insertion into the low element of + // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into + // the rest of the elements. This will be matched as movd/movq/movss/movsd + // depending on what the source datatype is. + if (Idx == 0) { + if (NumZero == 0) + return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); + + if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 || + (ExtVT == MVT::i64 && Subtarget->is64Bit())) { + if (VT.is512BitVector()) { + SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl); + return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec, + Item, DAG.getIntPtrConstant(0, dl)); + } + assert((VT.is128BitVector() || VT.is256BitVector()) && + "Expected an SSE value type!"); + Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); + // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector. + return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); + } + + // We can't directly insert an i8 or i16 into a vector, so zero extend + // it to i32 first. + if (ExtVT == MVT::i16 || ExtVT == MVT::i8) { + Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item); + if (VT.is256BitVector()) { + if (Subtarget->hasAVX()) { + Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v8i32, Item); + Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); + } else { + // Without AVX, we need to extend to a 128-bit vector and then + // insert into the 256-bit vector. + Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item); + SDValue ZeroVec = getZeroVector(MVT::v8i32, Subtarget, DAG, dl); + Item = Insert128BitVector(ZeroVec, Item, 0, DAG, dl); + } + } else { + assert(VT.is128BitVector() && "Expected an SSE value type!"); + Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item); + Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); + } + return DAG.getBitcast(VT, Item); + } + } + + // Is it a vector logical left shift? 
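// [Editor's note] For example, a v2i64 build_vector (0, x): SCALAR_TO_VECTOR
// places x in element 0, and a whole-register byte shift left by 8 bytes
// (PSLLDQ) moves it to element 1 while shifting zeros into element 0, giving
// exactly <0, x> (illustration only, not part of the imported file).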
+ if (NumElems == 2 && Idx == 1 && + X86::isZeroNode(Op.getOperand(0)) && + !X86::isZeroNode(Op.getOperand(1))) { + unsigned NumBits = VT.getSizeInBits(); + return getVShift(true, VT, + DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, + VT, Op.getOperand(1)), + NumBits/2, DAG, *this, dl); + } + + if (IsAllConstants) // Otherwise, it's better to do a constpool load. + return SDValue(); + + // Otherwise, if this is a vector with i32 or f32 elements, and the element + // is a non-constant being inserted into an element other than the low one, + // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka + // movd/movss) to move this into the low element, then shuffle it into + // place. + if (EVTBits == 32) { + Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); + return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG); + } + } + + // Splat is obviously ok. Let legalizer expand it to a shuffle. + if (Values.size() == 1) { + if (EVTBits == 32) { + // Instead of a shuffle like this: + // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0> + // Check if it's possible to issue this instead. + // shuffle (vload ptr)), undef, <1, 1, 1, 1> + unsigned Idx = countTrailingZeros(NonZeros); + SDValue Item = Op.getOperand(Idx); + if (Op.getNode()->isOnlyUserOf(Item.getNode())) + return LowerAsSplatVectorLoad(Item, VT, dl, DAG); + } + return SDValue(); + } + + // A vector full of immediates; various special cases are already + // handled, so this is best done with a single constant-pool load. + if (IsAllConstants) + return SDValue(); + + // For AVX-length vectors, see if we can use a vector load to get all of the + // elements, otherwise build the individual 128-bit pieces and use + // shuffles to put them in place. + if (VT.is256BitVector() || VT.is512BitVector()) { + SmallVector<SDValue, 64> V(Op->op_begin(), Op->op_begin() + NumElems); + + // Check for a build vector of consecutive loads. + if (SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false)) + return LD; + + EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2); + + // Build both the lower and upper subvector. + SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, + makeArrayRef(&V[0], NumElems/2)); + SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, + makeArrayRef(&V[NumElems / 2], NumElems/2)); + + // Recreate the wider vector with the lower and upper part. + if (VT.is256BitVector()) + return Concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl); + return Concat256BitVectors(Lower, Upper, VT, NumElems, DAG, dl); + } + + // Let legalizer expand 2-wide build_vectors. + if (EVTBits == 64) { + if (NumNonZero == 1) { + // One half is zero or undef. + unsigned Idx = countTrailingZeros(NonZeros); + SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, + Op.getOperand(Idx)); + return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG); + } + return SDValue(); + } + + // If element VT is < 32 bits, convert it to inserts into a zero vector. 
+ if (EVTBits == 8 && NumElems == 16) + if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero, + DAG, Subtarget, *this)) + return V; + + if (EVTBits == 16 && NumElems == 8) + if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero, + DAG, Subtarget, *this)) + return V; + + // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS + if (EVTBits == 32 && NumElems == 4) + if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget, *this)) + return V; + + // If element VT is == 32 bits, turn it into a number of shuffles. + SmallVector<SDValue, 8> V(NumElems); + if (NumElems == 4 && NumZero > 0) { + for (unsigned i = 0; i < 4; ++i) { + bool isZero = !(NonZeros & (1ULL << i)); + if (isZero) + V[i] = getZeroVector(VT, Subtarget, DAG, dl); + else + V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); + } + + for (unsigned i = 0; i < 2; ++i) { + switch ((NonZeros & (0x3 << i*2)) >> (i*2)) { + default: break; + case 0: + V[i] = V[i*2]; // Must be a zero vector. + break; + case 1: + V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]); + break; + case 2: + V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]); + break; + case 3: + V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]); + break; + } + } + + bool Reverse1 = (NonZeros & 0x3) == 2; + bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2; + int MaskVec[] = { + Reverse1 ? 1 : 0, + Reverse1 ? 0 : 1, + static_cast<int>(Reverse2 ? NumElems+1 : NumElems), + static_cast<int>(Reverse2 ? NumElems : NumElems+1) + }; + return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]); + } + + if (Values.size() > 1 && VT.is128BitVector()) { + // Check for a build vector of consecutive loads. + for (unsigned i = 0; i < NumElems; ++i) + V[i] = Op.getOperand(i); + + // Check for elements which are consecutive loads. + if (SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false)) + return LD; + + // Check for a build vector from mostly shuffle plus few inserting. + if (SDValue Sh = buildFromShuffleMostly(Op, DAG)) + return Sh; + + // For SSE 4.1, use insertps to put the high elements into the low element. + if (Subtarget->hasSSE41()) { + SDValue Result; + if (Op.getOperand(0).getOpcode() != ISD::UNDEF) + Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0)); + else + Result = DAG.getUNDEF(VT); + + for (unsigned i = 1; i < NumElems; ++i) { + if (Op.getOperand(i).getOpcode() == ISD::UNDEF) continue; + Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result, + Op.getOperand(i), DAG.getIntPtrConstant(i, dl)); + } + return Result; + } + + // Otherwise, expand into a number of unpckl*, start by extending each of + // our (non-undef) elements to the full vector width with the element in the + // bottom slot of the vector (which generates no code for SSE). + for (unsigned i = 0; i < NumElems; ++i) { + if (Op.getOperand(i).getOpcode() != ISD::UNDEF) + V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); + else + V[i] = DAG.getUNDEF(VT); + } + + // Next, we iteratively mix elements, e.g. for v4f32: + // Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0> + // : unpcklps 1, 3 ==> Y: <?, ?, 3, 1> + // Step 2: unpcklps X, Y ==> <3, 2, 1, 0> + unsigned EltStride = NumElems >> 1; + while (EltStride != 0) { + for (unsigned i = 0; i < EltStride; ++i) { + // If V[i+EltStride] is undef and this is the first round of mixing, + // then it is safe to just drop this shuffle: V[i] is already in the + // right place, the one element (since it's the first round) being + // inserted as undef can be dropped. 
This isn't safe for successive + // rounds because they will permute elements within both vectors. + if (V[i+EltStride].getOpcode() == ISD::UNDEF && + EltStride == NumElems/2) + continue; + + V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + EltStride]); + } + EltStride >>= 1; + } + return V[0]; + } + return SDValue(); +} + +// 256-bit AVX can use the vinsertf128 instruction +// to create 256-bit vectors from two other 128-bit ones. +static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { + SDLoc dl(Op); + MVT ResVT = Op.getSimpleValueType(); + + assert((ResVT.is256BitVector() || + ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide"); + + SDValue V1 = Op.getOperand(0); + SDValue V2 = Op.getOperand(1); + unsigned NumElems = ResVT.getVectorNumElements(); + if (ResVT.is256BitVector()) + return Concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl); + + if (Op.getNumOperands() == 4) { + MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(), + ResVT.getVectorNumElements()/2); + SDValue V3 = Op.getOperand(2); + SDValue V4 = Op.getOperand(3); + return Concat256BitVectors(Concat128BitVectors(V1, V2, HalfVT, NumElems/2, DAG, dl), + Concat128BitVectors(V3, V4, HalfVT, NumElems/2, DAG, dl), ResVT, NumElems, DAG, dl); + } + return Concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl); +} + +static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, + const X86Subtarget *Subtarget, + SelectionDAG & DAG) { + SDLoc dl(Op); + MVT ResVT = Op.getSimpleValueType(); + unsigned NumOfOperands = Op.getNumOperands(); + + assert(isPowerOf2_32(NumOfOperands) && + "Unexpected number of operands in CONCAT_VECTORS"); + + SDValue Undef = DAG.getUNDEF(ResVT); + if (NumOfOperands > 2) { + // Specialize the cases when all, or all but one, of the operands are undef. 
+ unsigned NumOfDefinedOps = 0; + unsigned OpIdx = 0; + for (unsigned i = 0; i < NumOfOperands; i++) + if (!Op.getOperand(i).isUndef()) { + NumOfDefinedOps++; + OpIdx = i; + } + if (NumOfDefinedOps == 0) + return Undef; + if (NumOfDefinedOps == 1) { + unsigned SubVecNumElts = + Op.getOperand(OpIdx).getValueType().getVectorNumElements(); + SDValue IdxVal = DAG.getIntPtrConstant(SubVecNumElts * OpIdx, dl); + return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, + Op.getOperand(OpIdx), IdxVal); + } + + MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(), + ResVT.getVectorNumElements()/2); + SmallVector<SDValue, 2> Ops; + for (unsigned i = 0; i < NumOfOperands/2; i++) + Ops.push_back(Op.getOperand(i)); + SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops); + Ops.clear(); + for (unsigned i = NumOfOperands/2; i < NumOfOperands; i++) + Ops.push_back(Op.getOperand(i)); + SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops); + return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi); + } + + // 2 operands + SDValue V1 = Op.getOperand(0); + SDValue V2 = Op.getOperand(1); + unsigned NumElems = ResVT.getVectorNumElements(); + assert(V1.getValueType() == V2.getValueType() && + V1.getValueType().getVectorNumElements() == NumElems/2 && + "Unexpected operands in CONCAT_VECTORS"); + + if (ResVT.getSizeInBits() >= 16) + return Op; // The operation is legal with KUNPCK + + bool IsZeroV1 = ISD::isBuildVectorAllZeros(V1.getNode()); + bool IsZeroV2 = ISD::isBuildVectorAllZeros(V2.getNode()); + SDValue ZeroVec = getZeroVector(ResVT, Subtarget, DAG, dl); + if (IsZeroV1 && IsZeroV2) + return ZeroVec; + + SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl); + if (V2.isUndef()) + return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx); + if (IsZeroV2) + return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ZeroVec, V1, ZeroIdx); + + SDValue IdxVal = DAG.getIntPtrConstant(NumElems/2, dl); + if (V1.isUndef()) + V2 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V2, IdxVal); + + if (IsZeroV1) + return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ZeroVec, V2, IdxVal); + + V1 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx); + return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, V1, V2, IdxVal); +} + +static SDValue LowerCONCAT_VECTORS(SDValue Op, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + MVT VT = Op.getSimpleValueType(); + if (VT.getVectorElementType() == MVT::i1) + return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG); + + assert((VT.is256BitVector() && Op.getNumOperands() == 2) || + (VT.is512BitVector() && (Op.getNumOperands() == 2 || + Op.getNumOperands() == 4))); + + // AVX can use the vinsertf128 instruction to create 256-bit vectors + // from two other 128-bit ones. + + // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors + return LowerAVXCONCAT_VECTORS(Op, DAG); +} + +//===----------------------------------------------------------------------===// +// Vector shuffle lowering +// +// This is an experimental code path for lowering vector shuffles on x86. It is +// designed to handle arbitrary vector shuffles and blends, gracefully +// degrading performance as necessary. It works hard to recognize idiomatic +// shuffles and lower them to optimal instruction patterns without leaving +// a framework that allows reasonably efficient handling of all vector shuffle +// patterns. +//===----------------------------------------------------------------------===// + +/// \brief Tiny helper function to identify a no-op mask. 
+/// +/// This is a somewhat boring predicate function. It checks whether the mask +/// array input, which is assumed to be a single-input shuffle mask of the kind +/// used by the X86 shuffle instructions (not a fully general +/// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an +/// in-place shuffle are 'no-op's. +static bool isNoopShuffleMask(ArrayRef<int> Mask) { + for (int i = 0, Size = Mask.size(); i < Size; ++i) + if (Mask[i] != -1 && Mask[i] != i) + return false; + return true; +} + +/// \brief Helper function to classify a mask as a single-input mask. +/// +/// This isn't a generic single-input test because in the vector shuffle +/// lowering we canonicalize single inputs to be the first input operand. This +/// means we can more quickly test for a single input by only checking whether +/// an input from the second operand exists. We also assume that the size of +/// mask corresponds to the size of the input vectors which isn't true in the +/// fully general case. +static bool isSingleInputShuffleMask(ArrayRef<int> Mask) { + for (int M : Mask) + if (M >= (int)Mask.size()) + return false; + return true; +} + +/// \brief Test whether there are elements crossing 128-bit lanes in this +/// shuffle mask. +/// +/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations +/// and we routinely test for these. +static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) { + int LaneSize = 128 / VT.getScalarSizeInBits(); + int Size = Mask.size(); + for (int i = 0; i < Size; ++i) + if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize) + return true; + return false; +} + +/// \brief Test whether a shuffle mask is equivalent within each 128-bit lane. +/// +/// This checks a shuffle mask to see if it is performing the same +/// 128-bit lane-relative shuffle in each 128-bit lane. This trivially implies +/// that it is also not lane-crossing. It may however involve a blend from the +/// same lane of a second vector. +/// +/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is +/// non-trivial to compute in the face of undef lanes. The representation is +/// *not* suitable for use with existing 128-bit shuffles as it will contain +/// entries from both V1 and V2 inputs to the wider mask. +static bool +is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask, + SmallVectorImpl<int> &RepeatedMask) { + int LaneSize = 128 / VT.getScalarSizeInBits(); + RepeatedMask.resize(LaneSize, -1); + int Size = Mask.size(); + for (int i = 0; i < Size; ++i) { + if (Mask[i] < 0) + continue; + if ((Mask[i] % Size) / LaneSize != i / LaneSize) + // This entry crosses lanes, so there is no way to model this shuffle. + return false; + + // Ok, handle the in-lane shuffles by detecting if and when they repeat. + if (RepeatedMask[i % LaneSize] == -1) + // This is the first non-undef entry in this slot of a 128-bit lane. + RepeatedMask[i % LaneSize] = + Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + Size; + else if (RepeatedMask[i % LaneSize] + (i / LaneSize) * LaneSize != Mask[i]) + // Found a mismatch with the repeated mask. + return false; + } + return true; +} + +/// \brief Checks whether a shuffle mask is equivalent to an explicit list of +/// arguments. +/// +/// This is a fast way to test a shuffle mask against a fixed pattern: +/// +/// if (isShuffleEquivalent(Mask, 3, 2, {1, 0})) { ... 
} +/// +/// It returns true if the mask is exactly as wide as the argument list, and +/// each element of the mask is either -1 (signifying undef) or the value given +/// in the argument. +static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask, + ArrayRef<int> ExpectedMask) { + if (Mask.size() != ExpectedMask.size()) + return false; + + int Size = Mask.size(); + + // If the values are build vectors, we can look through them to find + // equivalent inputs that make the shuffles equivalent. + auto *BV1 = dyn_cast<BuildVectorSDNode>(V1); + auto *BV2 = dyn_cast<BuildVectorSDNode>(V2); + + for (int i = 0; i < Size; ++i) + if (Mask[i] != -1 && Mask[i] != ExpectedMask[i]) { + auto *MaskBV = Mask[i] < Size ? BV1 : BV2; + auto *ExpectedBV = ExpectedMask[i] < Size ? BV1 : BV2; + if (!MaskBV || !ExpectedBV || + MaskBV->getOperand(Mask[i] % Size) != + ExpectedBV->getOperand(ExpectedMask[i] % Size)) + return false; + } + + return true; +} + +/// \brief Get a 4-lane 8-bit shuffle immediate for a mask. +/// +/// This helper function produces an 8-bit shuffle immediate corresponding to +/// the ubiquitous shuffle encoding scheme used in x86 instructions for +/// shuffling 4 lanes. It can be used with most of the PSHUF instructions for +/// example. +/// +/// NB: We rely heavily on "undef" masks preserving the input lane. +static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, SDLoc DL, + SelectionDAG &DAG) { + assert(Mask.size() == 4 && "Only 4-lane shuffle masks"); + assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!"); + assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!"); + assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!"); + assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!"); + + unsigned Imm = 0; + Imm |= (Mask[0] == -1 ? 0 : Mask[0]) << 0; + Imm |= (Mask[1] == -1 ? 1 : Mask[1]) << 2; + Imm |= (Mask[2] == -1 ? 2 : Mask[2]) << 4; + Imm |= (Mask[3] == -1 ? 3 : Mask[3]) << 6; + return DAG.getConstant(Imm, DL, MVT::i8); +} + +/// \brief Compute whether each element of a shuffle is zeroable. +/// +/// A "zeroable" vector shuffle element is one which can be lowered to zero. +/// Either it is an undef element in the shuffle mask, the element of the input +/// referenced is undef, or the element of the input referenced is known to be +/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle +/// as many lanes with this technique as possible to simplify the remaining +/// shuffle. +static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask, + SDValue V1, SDValue V2) { + SmallBitVector Zeroable(Mask.size(), false); + + while (V1.getOpcode() == ISD::BITCAST) + V1 = V1->getOperand(0); + while (V2.getOpcode() == ISD::BITCAST) + V2 = V2->getOperand(0); + + bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode()); + bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode()); + + for (int i = 0, Size = Mask.size(); i < Size; ++i) { + int M = Mask[i]; + // Handle the easy cases. + if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) { + Zeroable[i] = true; + continue; + } + + // If this is an index into a build_vector node (which has the same number + // of elements), dig out the input value and use it. + SDValue V = M < Size ? 
V1 : V2; + if (V.getOpcode() != ISD::BUILD_VECTOR || Size != (int)V.getNumOperands()) + continue; + + SDValue Input = V.getOperand(M % Size); + // The UNDEF opcode check really should be dead code here, but not quite + // worth asserting on (it isn't invalid, just unexpected). + if (Input.getOpcode() == ISD::UNDEF || X86::isZeroNode(Input)) + Zeroable[i] = true; + } + + return Zeroable; +} + +// X86 has dedicated unpack instructions that can handle specific blend +// operations: UNPCKH and UNPCKL. +static SDValue lowerVectorShuffleWithUNPCK(SDLoc DL, MVT VT, ArrayRef<int> Mask, + SDValue V1, SDValue V2, + SelectionDAG &DAG) { + int NumElts = VT.getVectorNumElements(); + int NumEltsInLane = 128 / VT.getScalarSizeInBits(); + SmallVector<int, 8> Unpckl; + SmallVector<int, 8> Unpckh; + + for (int i = 0; i < NumElts; ++i) { + unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane; + int LoPos = (i % NumEltsInLane) / 2 + LaneStart + NumElts * (i % 2); + int HiPos = LoPos + NumEltsInLane / 2; + Unpckl.push_back(LoPos); + Unpckh.push_back(HiPos); + } + + if (isShuffleEquivalent(V1, V2, Mask, Unpckl)) + return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2); + if (isShuffleEquivalent(V1, V2, Mask, Unpckh)) + return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2); + + // Commute and try again. + ShuffleVectorSDNode::commuteMask(Unpckl); + if (isShuffleEquivalent(V1, V2, Mask, Unpckl)) + return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1); + + ShuffleVectorSDNode::commuteMask(Unpckh); + if (isShuffleEquivalent(V1, V2, Mask, Unpckh)) + return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1); + + return SDValue(); +} + +/// \brief Try to emit a bitmask instruction for a shuffle. +/// +/// This handles cases where we can model a blend exactly as a bitmask due to +/// one of the inputs being zeroable. +static SDValue lowerVectorShuffleAsBitMask(SDLoc DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef<int> Mask, + SelectionDAG &DAG) { + MVT EltVT = VT.getVectorElementType(); + int NumEltBits = EltVT.getSizeInBits(); + MVT IntEltVT = MVT::getIntegerVT(NumEltBits); + SDValue Zero = DAG.getConstant(0, DL, IntEltVT); + SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), DL, + IntEltVT); + if (EltVT.isFloatingPoint()) { + Zero = DAG.getBitcast(EltVT, Zero); + AllOnes = DAG.getBitcast(EltVT, AllOnes); + } + SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero); + SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); + SDValue V; + for (int i = 0, Size = Mask.size(); i < Size; ++i) { + if (Zeroable[i]) + continue; + if (Mask[i] % Size != i) + return SDValue(); // Not a blend. + if (!V) + V = Mask[i] < Size ? V1 : V2; + else if (V != (Mask[i] < Size ? V1 : V2)) + return SDValue(); // Can only let one input through the mask. + + VMaskOps[i] = AllOnes; + } + if (!V) + return SDValue(); // No non-zeroable elements! + + SDValue VMask = DAG.getNode(ISD::BUILD_VECTOR, DL, VT, VMaskOps); + V = DAG.getNode(VT.isFloatingPoint() + ? (unsigned) X86ISD::FAND : (unsigned) ISD::AND, + DL, VT, V, VMask); + return V; +} + +/// \brief Try to emit a blend instruction for a shuffle using bit math. +/// +/// This is used as a fallback approach when first class blend instructions are +/// unavailable. Currently it is only suitable for integer vectors, but could +/// be generalized for floating point vectors if desirable. 
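/// (Editor's illustration, not part of the imported file: a v4i32 blend with
/// mask <0, 5, 2, 7> builds V1Mask = <-1, 0, -1, 0> and computes
/// (V1 & V1Mask) | (V2 & ~V1Mask), i.e. one PAND, one PANDN and one POR.)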
+static SDValue lowerVectorShuffleAsBitBlend(SDLoc DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef<int> Mask, + SelectionDAG &DAG) { + assert(VT.isInteger() && "Only supports integer vector types!"); + MVT EltVT = VT.getVectorElementType(); + int NumEltBits = EltVT.getSizeInBits(); + SDValue Zero = DAG.getConstant(0, DL, EltVT); + SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), DL, + EltVT); + SmallVector<SDValue, 16> MaskOps; + for (int i = 0, Size = Mask.size(); i < Size; ++i) { + if (Mask[i] != -1 && Mask[i] != i && Mask[i] != i + Size) + return SDValue(); // Shuffled input! + MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero); + } + + SDValue V1Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, VT, MaskOps); + V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask); + // We have to cast V2 around. + MVT MaskVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64); + V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::ANDNP, DL, MaskVT, + DAG.getBitcast(MaskVT, V1Mask), + DAG.getBitcast(MaskVT, V2))); + return DAG.getNode(ISD::OR, DL, VT, V1, V2); +} + +/// \brief Try to emit a blend instruction for a shuffle. +/// +/// This doesn't do any checks for the availability of instructions for blending +/// these values. It relies on the availability of the X86ISD::BLENDI pattern to +/// be matched in the backend with the type given. What it does check for is +/// that the shuffle mask is a blend, or convertible into a blend with zero. +static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef<int> Original, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode()); + bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode()); + SmallVector<int, 8> Mask(Original.begin(), Original.end()); + SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); + bool ForceV1Zero = false, ForceV2Zero = false; + + // Attempt to generate the binary blend mask. If an input is zero then + // we can use any lane. + // TODO: generalize the zero matching to any scalar like isShuffleEquivalent. + unsigned BlendMask = 0; + for (int i = 0, Size = Mask.size(); i < Size; ++i) { + int M = Mask[i]; + if (M < 0) + continue; + if (M == i) + continue; + if (M == i + Size) { + BlendMask |= 1u << i; + continue; + } + if (Zeroable[i]) { + if (V1IsZero) { + ForceV1Zero = true; + Mask[i] = i; + continue; + } + if (V2IsZero) { + ForceV2Zero = true; + BlendMask |= 1u << i; + Mask[i] = i + Size; + continue; + } + } + return SDValue(); // Shuffled input! + } + + // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs. + if (ForceV1Zero) + V1 = getZeroVector(VT, Subtarget, DAG, DL); + if (ForceV2Zero) + V2 = getZeroVector(VT, Subtarget, DAG, DL); + + auto ScaleBlendMask = [](unsigned BlendMask, int Size, int Scale) { + unsigned ScaledMask = 0; + for (int i = 0; i != Size; ++i) + if (BlendMask & (1u << i)) + for (int j = 0; j != Scale; ++j) + ScaledMask |= 1u << (i * Scale + j); + return ScaledMask; + }; + + switch (VT.SimpleTy) { + case MVT::v2f64: + case MVT::v4f32: + case MVT::v4f64: + case MVT::v8f32: + return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2, + DAG.getConstant(BlendMask, DL, MVT::i8)); + + case MVT::v4i64: + case MVT::v8i32: + assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!"); + // FALLTHROUGH + case MVT::v2i64: + case MVT::v4i32: + // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into + // that instruction. 
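// [Editor's note] The blend immediate is rescaled from the original element
// width to 32-bit lanes: for example, a v4i64 blend mask of 0b0110 (take
// elements 1 and 2 from V2) becomes the VPBLENDD dword mask 0b00111100
// (illustration only, not part of the imported file).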
+ if (Subtarget->hasAVX2()) { + // Scale the blend by the number of 32-bit dwords per element. + int Scale = VT.getScalarSizeInBits() / 32; + BlendMask = ScaleBlendMask(BlendMask, Mask.size(), Scale); + MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32; + V1 = DAG.getBitcast(BlendVT, V1); + V2 = DAG.getBitcast(BlendVT, V2); + return DAG.getBitcast( + VT, DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2, + DAG.getConstant(BlendMask, DL, MVT::i8))); + } + // FALLTHROUGH + case MVT::v8i16: { + // For integer shuffles we need to expand the mask and cast the inputs to + // v8i16s prior to blending. + int Scale = 8 / VT.getVectorNumElements(); + BlendMask = ScaleBlendMask(BlendMask, Mask.size(), Scale); + V1 = DAG.getBitcast(MVT::v8i16, V1); + V2 = DAG.getBitcast(MVT::v8i16, V2); + return DAG.getBitcast(VT, + DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2, + DAG.getConstant(BlendMask, DL, MVT::i8))); + } + + case MVT::v16i16: { + assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!"); + SmallVector<int, 8> RepeatedMask; + if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) { + // We can lower these with PBLENDW which is mirrored across 128-bit lanes. + assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!"); + BlendMask = 0; + for (int i = 0; i < 8; ++i) + if (RepeatedMask[i] >= 16) + BlendMask |= 1u << i; + return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2, + DAG.getConstant(BlendMask, DL, MVT::i8)); + } + } + // FALLTHROUGH + case MVT::v16i8: + case MVT::v32i8: { + assert((VT.is128BitVector() || Subtarget->hasAVX2()) && + "256-bit byte-blends require AVX2 support!"); + + // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB. + if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, DAG)) + return Masked; + + // Scale the blend by the number of bytes per element. + int Scale = VT.getScalarSizeInBits() / 8; + + // This form of blend is always done on bytes. Compute the byte vector + // type. + MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8); + + // Compute the VSELECT mask. Note that VSELECT is really confusing in the + // mix of LLVM's code generator and the x86 backend. We tell the code + // generator that boolean values in the elements of an x86 vector register + // are -1 for true and 0 for false. We then use the LLVM semantics of 'true' + // mapping a select to operand #1, and 'false' mapping to operand #2. The + // reality in x86 is that vector masks (pre-AVX-512) use only the high bit + // of the element (the remaining are ignored) and 0 in that high bit would + // mean operand #1 while 1 in the high bit would mean operand #2. So while + // the LLVM model for boolean values in vector elements gets the relevant + // bit set, it is set backwards and over constrained relative to x86's + // actual model. + SmallVector<SDValue, 32> VSELECTMask; + for (int i = 0, Size = Mask.size(); i < Size; ++i) + for (int j = 0; j < Scale; ++j) + VSELECTMask.push_back( + Mask[i] < 0 ? DAG.getUNDEF(MVT::i8) + : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL, + MVT::i8)); + + V1 = DAG.getBitcast(BlendVT, V1); + V2 = DAG.getBitcast(BlendVT, V2); + return DAG.getBitcast(VT, DAG.getNode(ISD::VSELECT, DL, BlendVT, + DAG.getNode(ISD::BUILD_VECTOR, DL, + BlendVT, VSELECTMask), + V1, V2)); + } + + default: + llvm_unreachable("Not a supported integer vector type!"); + } +} + +/// \brief Try to lower as a blend of elements from two inputs followed by +/// a single-input permutation. 
+/// +/// This matches the pattern where we can blend elements from two inputs and +/// then reduce the shuffle to a single-input permutation. +static SDValue lowerVectorShuffleAsBlendAndPermute(SDLoc DL, MVT VT, SDValue V1, + SDValue V2, + ArrayRef<int> Mask, + SelectionDAG &DAG) { + // We build up the blend mask while checking whether a blend is a viable way + // to reduce the shuffle. + SmallVector<int, 32> BlendMask(Mask.size(), -1); + SmallVector<int, 32> PermuteMask(Mask.size(), -1); + + for (int i = 0, Size = Mask.size(); i < Size; ++i) { + if (Mask[i] < 0) + continue; + + assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds."); + + if (BlendMask[Mask[i] % Size] == -1) + BlendMask[Mask[i] % Size] = Mask[i]; + else if (BlendMask[Mask[i] % Size] != Mask[i]) + return SDValue(); // Can't blend in the needed input! + + PermuteMask[i] = Mask[i] % Size; + } + + SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask); + return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask); +} + +/// \brief Generic routine to decompose a shuffle and blend into indepndent +/// blends and permutes. +/// +/// This matches the extremely common pattern for handling combined +/// shuffle+blend operations on newer X86 ISAs where we have very fast blend +/// operations. It will try to pick the best arrangement of shuffles and +/// blends. +static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(SDLoc DL, MVT VT, + SDValue V1, + SDValue V2, + ArrayRef<int> Mask, + SelectionDAG &DAG) { + // Shuffle the input elements into the desired positions in V1 and V2 and + // blend them together. + SmallVector<int, 32> V1Mask(Mask.size(), -1); + SmallVector<int, 32> V2Mask(Mask.size(), -1); + SmallVector<int, 32> BlendMask(Mask.size(), -1); + for (int i = 0, Size = Mask.size(); i < Size; ++i) + if (Mask[i] >= 0 && Mask[i] < Size) { + V1Mask[i] = Mask[i]; + BlendMask[i] = i; + } else if (Mask[i] >= Size) { + V2Mask[i] = Mask[i] - Size; + BlendMask[i] = i + Size; + } + + // Try to lower with the simpler initial blend strategy unless one of the + // input shuffles would be a no-op. We prefer to shuffle inputs as the + // shuffle may be able to fold with a load or other benefit. However, when + // we'll have to do 2x as many shuffles in order to achieve this, blending + // first is a better strategy. + if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) + if (SDValue BlendPerm = + lowerVectorShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG)) + return BlendPerm; + + V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask); + V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask); + return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask); +} + +/// \brief Try to lower a vector shuffle as a byte rotation. +/// +/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary +/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use +/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will +/// try to generically lower a vector shuffle through such an pattern. It +/// does not check for the profitability of lowering either as PALIGNR or +/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form. +/// This matches shuffle vectors that look like: +/// +/// v8i16 [11, 12, 13, 14, 15, 0, 1, 2] +/// +/// Essentially it concatenates V1 and V2, shifts right by some number of +/// elements, and takes the low elements as the result. 
Note that while this is +/// specified as a *right shift* because x86 is little-endian, it is a *left +/// rotate* of the vector lanes. +static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1, + SDValue V2, + ArrayRef<int> Mask, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!"); + + int NumElts = Mask.size(); + int NumLanes = VT.getSizeInBits() / 128; + int NumLaneElts = NumElts / NumLanes; + + // We need to detect various ways of spelling a rotation: + // [11, 12, 13, 14, 15, 0, 1, 2] + // [-1, 12, 13, 14, -1, -1, 1, -1] + // [-1, -1, -1, -1, -1, -1, 1, 2] + // [ 3, 4, 5, 6, 7, 8, 9, 10] + // [-1, 4, 5, 6, -1, -1, 9, -1] + // [-1, 4, 5, 6, -1, -1, -1, -1] + int Rotation = 0; + SDValue Lo, Hi; + for (int l = 0; l < NumElts; l += NumLaneElts) { + for (int i = 0; i < NumLaneElts; ++i) { + if (Mask[l + i] == -1) + continue; + assert(Mask[l + i] >= 0 && "Only -1 is a valid negative mask element!"); + + // Get the mod-Size index and lane correct it. + int LaneIdx = (Mask[l + i] % NumElts) - l; + // Make sure it was in this lane. + if (LaneIdx < 0 || LaneIdx >= NumLaneElts) + return SDValue(); + + // Determine where a rotated vector would have started. + int StartIdx = i - LaneIdx; + if (StartIdx == 0) + // The identity rotation isn't interesting, stop. + return SDValue(); + + // If we found the tail of a vector the rotation must be the missing + // front. If we found the head of a vector, it must be how much of the + // head. + int CandidateRotation = StartIdx < 0 ? -StartIdx : NumLaneElts - StartIdx; + + if (Rotation == 0) + Rotation = CandidateRotation; + else if (Rotation != CandidateRotation) + // The rotations don't match, so we can't match this mask. + return SDValue(); + + // Compute which value this mask is pointing at. + SDValue MaskV = Mask[l + i] < NumElts ? V1 : V2; + + // Compute which of the two target values this index should be assigned + // to. This reflects whether the high elements are remaining or the low + // elements are remaining. + SDValue &TargetV = StartIdx < 0 ? Hi : Lo; + + // Either set up this value if we've not encountered it before, or check + // that it remains consistent. + if (!TargetV) + TargetV = MaskV; + else if (TargetV != MaskV) + // This may be a rotation, but it pulls from the inputs in some + // unsupported interleaving. + return SDValue(); + } + } + + // Check that we successfully analyzed the mask, and normalize the results. + assert(Rotation != 0 && "Failed to locate a viable rotation!"); + assert((Lo || Hi) && "Failed to find a rotated input vector!"); + if (!Lo) + Lo = Hi; + else if (!Hi) + Hi = Lo; + + // The actual rotate instruction rotates bytes, so we need to scale the + // rotation based on how many bytes are in the vector lane. + int Scale = 16 / NumLaneElts; + + // SSSE3 targets can use the palignr instruction. + if (Subtarget->hasSSSE3()) { + // Cast the inputs to i8 vector of correct length to match PALIGNR. 
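// [Editor's note] Scale converts the rotation amount from elements to bytes
// (Scale = 16 / NumLaneElts), so for example a 3-element rotation of a v8i16
// shuffle is emitted as PALIGNR with immediate 6 (illustration only, not part
// of the imported file).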
+ MVT AlignVT = MVT::getVectorVT(MVT::i8, 16 * NumLanes); + Lo = DAG.getBitcast(AlignVT, Lo); + Hi = DAG.getBitcast(AlignVT, Hi); + + return DAG.getBitcast( + VT, DAG.getNode(X86ISD::PALIGNR, DL, AlignVT, Lo, Hi, + DAG.getConstant(Rotation * Scale, DL, MVT::i8))); + } + + assert(VT.is128BitVector() && + "Rotate-based lowering only supports 128-bit lowering!"); + assert(Mask.size() <= 16 && + "Can shuffle at most 16 bytes in a 128-bit vector!"); + + // Default SSE2 implementation + int LoByteShift = 16 - Rotation * Scale; + int HiByteShift = Rotation * Scale; + + // Cast the inputs to v2i64 to match PSLLDQ/PSRLDQ. + Lo = DAG.getBitcast(MVT::v2i64, Lo); + Hi = DAG.getBitcast(MVT::v2i64, Hi); + + SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v2i64, Lo, + DAG.getConstant(LoByteShift, DL, MVT::i8)); + SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v2i64, Hi, + DAG.getConstant(HiByteShift, DL, MVT::i8)); + return DAG.getBitcast(VT, + DAG.getNode(ISD::OR, DL, MVT::v2i64, LoShift, HiShift)); +} + +/// \brief Try to lower a vector shuffle as a bit shift (shifts in zeros). +/// +/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and +/// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function +/// matches elements from one of the input vectors shuffled to the left or +/// right with zeroable elements 'shifted in'. It handles both the strictly +/// bit-wise element shifts and the byte shift across an entire 128-bit double +/// quad word lane. +/// +/// PSHL : (little-endian) left bit shift. +/// [ zz, 0, zz, 2 ] +/// [ -1, 4, zz, -1 ] +/// PSRL : (little-endian) right bit shift. +/// [ 1, zz, 3, zz] +/// [ -1, -1, 7, zz] +/// PSLLDQ : (little-endian) left byte shift +/// [ zz, 0, 1, 2, 3, 4, 5, 6] +/// [ zz, zz, -1, -1, 2, 3, 4, -1] +/// [ zz, zz, zz, zz, zz, zz, -1, 1] +/// PSRLDQ : (little-endian) right byte shift +/// [ 5, 6, 7, zz, zz, zz, zz, zz] +/// [ -1, 5, 6, 7, zz, zz, zz, zz] +/// [ 1, 2, -1, -1, -1, -1, zz, zz] +static SDValue lowerVectorShuffleAsShift(SDLoc DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef<int> Mask, + SelectionDAG &DAG) { + SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); + + int Size = Mask.size(); + assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size"); + + auto CheckZeros = [&](int Shift, int Scale, bool Left) { + for (int i = 0; i < Size; i += Scale) + for (int j = 0; j < Shift; ++j) + if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))]) + return false; + + return true; + }; + + auto MatchShift = [&](int Shift, int Scale, bool Left, SDValue V) { + for (int i = 0; i != Size; i += Scale) { + unsigned Pos = Left ? i + Shift : i; + unsigned Low = Left ? i : i + Shift; + unsigned Len = Scale - Shift; + if (!isSequentialOrUndefInRange(Mask, Pos, Len, + Low + (V == V1 ? 0 : Size))) + return SDValue(); + } + + int ShiftEltBits = VT.getScalarSizeInBits() * Scale; + bool ByteShift = ShiftEltBits > 64; + unsigned OpCode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI) + : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI); + int ShiftAmt = Shift * VT.getScalarSizeInBits() / (ByteShift ? 8 : 1); + + // Normalize the scale for byte shifts to still produce an i64 element + // type. + Scale = ByteShift ? Scale / 2 : Scale; + + // We need to round trip through the appropriate type for the shift. 
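// [Editor's note] For example, the v8i16 mask [zz, 0, zz, 2, zz, 4, zz, 6]
// matches with Scale = 2 and Shift = 1: the input is bitcast to v4i32 and
// shifted left by 16 bits (PSLLD $16), which moves each even word up one
// position and shifts zeros into the vacated lanes (illustration only, not
// part of the imported file).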
+ MVT ShiftSVT = MVT::getIntegerVT(VT.getScalarSizeInBits() * Scale); + MVT ShiftVT = MVT::getVectorVT(ShiftSVT, Size / Scale); + assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) && + "Illegal integer vector type"); + V = DAG.getBitcast(ShiftVT, V); + + V = DAG.getNode(OpCode, DL, ShiftVT, V, + DAG.getConstant(ShiftAmt, DL, MVT::i8)); + return DAG.getBitcast(VT, V); + }; + + // SSE/AVX supports logical shifts up to 64-bit integers - so we can just + // keep doubling the size of the integer elements up to that. We can + // then shift the elements of the integer vector by whole multiples of + // their width within the elements of the larger integer vector. Test each + // multiple to see if we can find a match with the moved element indices + // and that the shifted in elements are all zeroable. + for (int Scale = 2; Scale * VT.getScalarSizeInBits() <= 128; Scale *= 2) + for (int Shift = 1; Shift != Scale; ++Shift) + for (bool Left : {true, false}) + if (CheckZeros(Shift, Scale, Left)) + for (SDValue V : {V1, V2}) + if (SDValue Match = MatchShift(Shift, Scale, Left, V)) + return Match; + + // no match + return SDValue(); +} + +/// \brief Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ. +static SDValue lowerVectorShuffleWithSSE4A(SDLoc DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef<int> Mask, + SelectionDAG &DAG) { + SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); + assert(!Zeroable.all() && "Fully zeroable shuffle mask"); + + int Size = Mask.size(); + int HalfSize = Size / 2; + assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size"); + + // Upper half must be undefined. + if (!isUndefInRange(Mask, HalfSize, HalfSize)) + return SDValue(); + + // EXTRQ: Extract Len elements from lower half of source, starting at Idx. + // Remainder of lower half result is zero and upper half is all undef. + auto LowerAsEXTRQ = [&]() { + // Determine the extraction length from the part of the + // lower half that isn't zeroable. + int Len = HalfSize; + for (; Len > 0; --Len) + if (!Zeroable[Len - 1]) + break; + assert(Len > 0 && "Zeroable shuffle mask"); + + // Attempt to match first Len sequential elements from the lower half. + SDValue Src; + int Idx = -1; + for (int i = 0; i != Len; ++i) { + int M = Mask[i]; + if (M < 0) + continue; + SDValue &V = (M < Size ? V1 : V2); + M = M % Size; + + // The extracted elements must start at a valid index and all mask + // elements must be in the lower half. + if (i > M || M >= HalfSize) + return SDValue(); + + if (Idx < 0 || (Src == V && Idx == (M - i))) { + Src = V; + Idx = M - i; + continue; + } + return SDValue(); + } + + if (Idx < 0) + return SDValue(); + + assert((Idx + Len) <= HalfSize && "Illegal extraction mask"); + int BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f; + int BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f; + return DAG.getNode(X86ISD::EXTRQI, DL, VT, Src, + DAG.getConstant(BitLen, DL, MVT::i8), + DAG.getConstant(BitIdx, DL, MVT::i8)); + }; + + if (SDValue ExtrQ = LowerAsEXTRQ()) + return ExtrQ; + + // INSERTQ: Extract lowest Len elements from lower half of second source and + // insert over first source, starting at Idx. + // { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... } + auto LowerAsInsertQ = [&]() { + for (int Idx = 0; Idx != HalfSize; ++Idx) { + SDValue Base; + + // Attempt to match first source from mask before insertion point. 
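Both EXTRQI above and INSERTQI below take a 6-bit bit length and a 6-bit bit index, which is why the element counts are scaled by the element width and masked with 0x3f. A minimal sketch of that packing (makeExtrqImm is a made-up name):

// Minimal sketch, not part of the patch: pack the EXTRQI/INSERTQI immediates
// from an element-count length and index.
#include <cstdio>

struct ExtrqImm { int BitLen, BitIdx; };

static ExtrqImm makeExtrqImm(int Len, int Idx, int EltBits) {
  return { (Len * EltBits) & 0x3f, (Idx * EltBits) & 0x3f };
}

int main() {
  // Extract 3 x i16 starting at element 1: 48 bits at bit offset 16.
  ExtrqImm Imm = makeExtrqImm(3, 1, 16);
  std::printf("BitLen=%d BitIdx=%d\n", Imm.BitLen, Imm.BitIdx);
  return 0;
}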
+ if (isUndefInRange(Mask, 0, Idx)) { + /* EMPTY */ + } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) { + Base = V1; + } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) { + Base = V2; + } else { + continue; + } + + // Extend the extraction length looking to match both the insertion of + // the second source and the remaining elements of the first. + for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) { + SDValue Insert; + int Len = Hi - Idx; + + // Match insertion. + if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) { + Insert = V1; + } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) { + Insert = V2; + } else { + continue; + } + + // Match the remaining elements of the lower half. + if (isUndefInRange(Mask, Hi, HalfSize - Hi)) { + /* EMPTY */ + } else if ((!Base || (Base == V1)) && + isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) { + Base = V1; + } else if ((!Base || (Base == V2)) && + isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, + Size + Hi)) { + Base = V2; + } else { + continue; + } + + // We may not have a base (first source) - this can safely be undefined. + if (!Base) + Base = DAG.getUNDEF(VT); + + int BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f; + int BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f; + return DAG.getNode(X86ISD::INSERTQI, DL, VT, Base, Insert, + DAG.getConstant(BitLen, DL, MVT::i8), + DAG.getConstant(BitIdx, DL, MVT::i8)); + } + } + + return SDValue(); + }; + + if (SDValue InsertQ = LowerAsInsertQ()) + return InsertQ; + + return SDValue(); +} + +/// \brief Lower a vector shuffle as a zero or any extension. +/// +/// Given a specific number of elements, element bit width, and extension +/// stride, produce either a zero or any extension based on the available +/// features of the subtarget. The extended elements are consecutive and +/// begin and can start from an offseted element index in the input; to +/// avoid excess shuffling the offset must either being in the bottom lane +/// or at the start of a higher lane. All extended elements must be from +/// the same lane. +static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend( + SDLoc DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV, + ArrayRef<int> Mask, const X86Subtarget *Subtarget, SelectionDAG &DAG) { + assert(Scale > 1 && "Need a scale to extend."); + int EltBits = VT.getScalarSizeInBits(); + int NumElements = VT.getVectorNumElements(); + int NumEltsPerLane = 128 / EltBits; + int OffsetLane = Offset / NumEltsPerLane; + assert((EltBits == 8 || EltBits == 16 || EltBits == 32) && + "Only 8, 16, and 32 bit elements can be extended."); + assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits."); + assert(0 <= Offset && "Extension offset must be positive."); + assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) && + "Extension offset must be in the first lane or start an upper lane."); + + // Check that an index is in same lane as the base offset. + auto SafeOffset = [&](int Idx) { + return OffsetLane == (Idx / NumEltsPerLane); + }; + + // Shift along an input so that the offset base moves to the first element. + auto ShuffleOffset = [&](SDValue V) { + if (!Offset) + return V; + + SmallVector<int, 8> ShMask((unsigned)NumElements, -1); + for (int i = 0; i * Scale < NumElements; ++i) { + int SrcIdx = i + Offset; + ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1; + } + return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask); + }; + + // Found a valid zext mask! 
Try various lowering strategies based on the + // input type and available ISA extensions. + if (Subtarget->hasSSE41()) { + // Not worth offseting 128-bit vectors if scale == 2, a pattern using + // PUNPCK will catch this in a later shuffle match. + if (Offset && Scale == 2 && VT.is128BitVector()) + return SDValue(); + MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale), + NumElements / Scale); + InputV = DAG.getNode(X86ISD::VZEXT, DL, ExtVT, ShuffleOffset(InputV)); + return DAG.getBitcast(VT, InputV); + } + + assert(VT.is128BitVector() && "Only 128-bit vectors can be extended."); + + // For any extends we can cheat for larger element sizes and use shuffle + // instructions that can fold with a load and/or copy. + if (AnyExt && EltBits == 32) { + int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1, + -1}; + return DAG.getBitcast( + VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, + DAG.getBitcast(MVT::v4i32, InputV), + getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG))); + } + if (AnyExt && EltBits == 16 && Scale > 2) { + int PSHUFDMask[4] = {Offset / 2, -1, + SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1}; + InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, + DAG.getBitcast(MVT::v4i32, InputV), + getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)); + int PSHUFWMask[4] = {1, -1, -1, -1}; + unsigned OddEvenOp = (Offset & 1 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW); + return DAG.getBitcast( + VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16, + DAG.getBitcast(MVT::v8i16, InputV), + getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG))); + } + + // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes + // to 64-bits. + if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget->hasSSE4A()) { + assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!"); + assert(VT.is128BitVector() && "Unexpected vector width!"); + + int LoIdx = Offset * EltBits; + SDValue Lo = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, + DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV, + DAG.getConstant(EltBits, DL, MVT::i8), + DAG.getConstant(LoIdx, DL, MVT::i8))); + + if (isUndefInRange(Mask, NumElements / 2, NumElements / 2) || + !SafeOffset(Offset + 1)) + return DAG.getNode(ISD::BITCAST, DL, VT, Lo); + + int HiIdx = (Offset + 1) * EltBits; + SDValue Hi = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, + DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV, + DAG.getConstant(EltBits, DL, MVT::i8), + DAG.getConstant(HiIdx, DL, MVT::i8))); + return DAG.getNode(ISD::BITCAST, DL, VT, + DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi)); + } + + // If this would require more than 2 unpack instructions to expand, use + // pshufb when available. We can only use more than 2 unpack instructions + // when zero extending i8 elements which also makes it easier to use pshufb. + if (Scale > 4 && EltBits == 8 && Subtarget->hasSSSE3()) { + assert(NumElements == 16 && "Unexpected byte vector width!"); + SDValue PSHUFBMask[16]; + for (int i = 0; i < 16; ++i) { + int Idx = Offset + (i / Scale); + PSHUFBMask[i] = DAG.getConstant( + (i % Scale == 0 && SafeOffset(Idx)) ? Idx : 0x80, DL, MVT::i8); + } + InputV = DAG.getBitcast(MVT::v16i8, InputV); + return DAG.getBitcast(VT, + DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV, + DAG.getNode(ISD::BUILD_VECTOR, DL, + MVT::v16i8, PSHUFBMask))); + } + + // If we are extending from an offset, ensure we start on a boundary that + // we can unpack from. 
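The PSHUFB path above builds a 16-byte control vector in which 0x80 zeroes a destination byte and any other value selects a source byte. A minimal sketch of that control for a zero extension of i8 elements, assuming the offset stays in the low lane so the SafeOffset check can be dropped (zextPshufbMask is a made-up name):

// Minimal sketch, not part of the patch.
#include <array>
#include <cstdio>

static std::array<unsigned char, 16> zextPshufbMask(int Scale, int Offset) {
  std::array<unsigned char, 16> M{};
  for (int i = 0; i < 16; ++i)
    M[i] = (i % Scale == 0) ? (unsigned char)(Offset + i / Scale) : 0x80;
  return M;
}

int main() {
  // Zero-extend the low 4 bytes to 32-bit elements (Scale = 4, Offset = 0):
  // control = [00 80 80 80 01 80 80 80 02 80 80 80 03 80 80 80].
  for (unsigned char B : zextPshufbMask(4, 0))
    std::printf("%02x ", B);
  std::printf("\n");
  return 0;
}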
+ int AlignToUnpack = Offset % (NumElements / Scale); + if (AlignToUnpack) { + SmallVector<int, 8> ShMask((unsigned)NumElements, -1); + for (int i = AlignToUnpack; i < NumElements; ++i) + ShMask[i - AlignToUnpack] = i; + InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask); + Offset -= AlignToUnpack; + } + + // Otherwise emit a sequence of unpacks. + do { + unsigned UnpackLoHi = X86ISD::UNPCKL; + if (Offset >= (NumElements / 2)) { + UnpackLoHi = X86ISD::UNPCKH; + Offset -= (NumElements / 2); + } + + MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements); + SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT) + : getZeroVector(InputVT, Subtarget, DAG, DL); + InputV = DAG.getBitcast(InputVT, InputV); + InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext); + Scale /= 2; + EltBits *= 2; + NumElements /= 2; + } while (Scale > 1); + return DAG.getBitcast(VT, InputV); +} + +/// \brief Try to lower a vector shuffle as a zero extension on any microarch. +/// +/// This routine will try to do everything in its power to cleverly lower +/// a shuffle which happens to match the pattern of a zero extend. It doesn't +/// check for the profitability of this lowering, it tries to aggressively +/// match this pattern. It will use all of the micro-architectural details it +/// can to emit an efficient lowering. It handles both blends with all-zero +/// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to +/// masking out later). +/// +/// The reason we have dedicated lowering for zext-style shuffles is that they +/// are both incredibly common and often quite performance sensitive. +static SDValue lowerVectorShuffleAsZeroOrAnyExtend( + SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, + const X86Subtarget *Subtarget, SelectionDAG &DAG) { + SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); + + int Bits = VT.getSizeInBits(); + int NumLanes = Bits / 128; + int NumElements = VT.getVectorNumElements(); + int NumEltsPerLane = NumElements / NumLanes; + assert(VT.getScalarSizeInBits() <= 32 && + "Exceeds 32-bit integer zero extension limit"); + assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size"); + + // Define a helper function to check a particular ext-scale and lower to it if + // valid. + auto Lower = [&](int Scale) -> SDValue { + SDValue InputV; + bool AnyExt = true; + int Offset = 0; + int Matches = 0; + for (int i = 0; i < NumElements; ++i) { + int M = Mask[i]; + if (M == -1) + continue; // Valid anywhere but doesn't tell us anything. + if (i % Scale != 0) { + // Each of the extended elements need to be zeroable. + if (!Zeroable[i]) + return SDValue(); + + // We no longer are in the anyext case. + AnyExt = false; + continue; + } + + // Each of the base elements needs to be consecutive indices into the + // same input vector. + SDValue V = M < NumElements ? V1 : V2; + M = M % NumElements; + if (!InputV) { + InputV = V; + Offset = M - (i / Scale); + } else if (InputV != V) + return SDValue(); // Flip-flopping inputs. + + // Offset must start in the lowest 128-bit lane or at the start of an + // upper lane. + // FIXME: Is it ever worth allowing a negative base offset? + if (!((0 <= Offset && Offset < NumEltsPerLane) || + (Offset % NumEltsPerLane) == 0)) + return SDValue(); + + // If we are offsetting, all referenced entries must come from the same + // lane. 
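Stripped of the offset and two-input handling, the per-element checks in this lambda amount to: every element off the Scale stride must be zeroable, and every element on the stride must be the next consecutive source element. A minimal sketch, with -2 marking a zeroable element and -1 an undef one (isZExtByScale is a made-up name):

// Minimal sketch, not part of the patch.
#include <cstdio>
#include <vector>

static bool isZExtByScale(const std::vector<int> &Mask, int Scale) {
  for (int i = 0, e = (int)Mask.size(); i < e; ++i) {
    int M = Mask[i];
    if (M == -1)
      continue; // Undef is fine anywhere.
    if (i % Scale != 0) {
      if (M != -2)
        return false; // Extended slots must be zeroable.
    } else if (M != i / Scale) {
      return false;   // Base slots must be consecutive elements 0, 1, 2, ...
    }
  }
  return true;
}

int main() {
  // A v8i16 zext of the low 4 elements: [0, zz, 1, zz, 2, zz, 3, zz].
  std::vector<int> Mask = {0, -2, 1, -2, 2, -2, 3, -2};
  std::printf("zext by 2: %s\n", isZExtByScale(Mask, 2) ? "yes" : "no");
  return 0;
}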
+ if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane)) + return SDValue(); + + if ((M % NumElements) != (Offset + (i / Scale))) + return SDValue(); // Non-consecutive strided elements. + Matches++; + } + + // If we fail to find an input, we have a zero-shuffle which should always + // have already been handled. + // FIXME: Maybe handle this here in case during blending we end up with one? + if (!InputV) + return SDValue(); + + // If we are offsetting, don't extend if we only match a single input, we + // can always do better by using a basic PSHUF or PUNPCK. + if (Offset != 0 && Matches < 2) + return SDValue(); + + return lowerVectorShuffleAsSpecificZeroOrAnyExtend( + DL, VT, Scale, Offset, AnyExt, InputV, Mask, Subtarget, DAG); + }; + + // The widest scale possible for extending is to a 64-bit integer. + assert(Bits % 64 == 0 && + "The number of bits in a vector must be divisible by 64 on x86!"); + int NumExtElements = Bits / 64; + + // Each iteration, try extending the elements half as much, but into twice as + // many elements. + for (; NumExtElements < NumElements; NumExtElements *= 2) { + assert(NumElements % NumExtElements == 0 && + "The input vector size must be divisible by the extended size."); + if (SDValue V = Lower(NumElements / NumExtElements)) + return V; + } + + // General extends failed, but 128-bit vectors may be able to use MOVQ. + if (Bits != 128) + return SDValue(); + + // Returns one of the source operands if the shuffle can be reduced to a + // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits. + auto CanZExtLowHalf = [&]() { + for (int i = NumElements / 2; i != NumElements; ++i) + if (!Zeroable[i]) + return SDValue(); + if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0)) + return V1; + if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements)) + return V2; + return SDValue(); + }; + + if (SDValue V = CanZExtLowHalf()) { + V = DAG.getBitcast(MVT::v2i64, V); + V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V); + return DAG.getBitcast(VT, V); + } + + // No viable ext lowering found. + return SDValue(); +} + +/// \brief Try to get a scalar value for a specific element of a vector. +/// +/// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar. +static SDValue getScalarValueForVectorElement(SDValue V, int Idx, + SelectionDAG &DAG) { + MVT VT = V.getSimpleValueType(); + MVT EltVT = VT.getVectorElementType(); + while (V.getOpcode() == ISD::BITCAST) + V = V.getOperand(0); + // If the bitcasts shift the element size, we can't extract an equivalent + // element from it. + MVT NewVT = V.getSimpleValueType(); + if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits()) + return SDValue(); + + if (V.getOpcode() == ISD::BUILD_VECTOR || + (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) { + // Ensure the scalar operand is the same size as the destination. + // FIXME: Add support for scalar truncation where possible. + SDValue S = V.getOperand(Idx); + if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits()) + return DAG.getNode(ISD::BITCAST, SDLoc(V), EltVT, S); + } + + return SDValue(); +} + +/// \brief Helper to test for a load that can be folded with x86 shuffles. +/// +/// This is particularly important because the set of instructions varies +/// significantly based on whether the operand is a load or not. 
+static bool isShuffleFoldableLoad(SDValue V) { + while (V.getOpcode() == ISD::BITCAST) + V = V.getOperand(0); + + return ISD::isNON_EXTLoad(V.getNode()); +} + +/// \brief Try to lower insertion of a single element into a zero vector. +/// +/// This is a common pattern that we have especially efficient patterns to lower +/// across all subtarget feature sets. +static SDValue lowerVectorShuffleAsElementInsertion( + SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, + const X86Subtarget *Subtarget, SelectionDAG &DAG) { + SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); + MVT ExtVT = VT; + MVT EltVT = VT.getVectorElementType(); + + int V2Index = std::find_if(Mask.begin(), Mask.end(), + [&Mask](int M) { return M >= (int)Mask.size(); }) - + Mask.begin(); + bool IsV1Zeroable = true; + for (int i = 0, Size = Mask.size(); i < Size; ++i) + if (i != V2Index && !Zeroable[i]) { + IsV1Zeroable = false; + break; + } + + // Check for a single input from a SCALAR_TO_VECTOR node. + // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and + // all the smarts here sunk into that routine. However, the current + // lowering of BUILD_VECTOR makes that nearly impossible until the old + // vector shuffle lowering is dead. + SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(), + DAG); + if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) { + // We need to zext the scalar if it is smaller than an i32. + V2S = DAG.getBitcast(EltVT, V2S); + if (EltVT == MVT::i8 || EltVT == MVT::i16) { + // Using zext to expand a narrow element won't work for non-zero + // insertions. + if (!IsV1Zeroable) + return SDValue(); + + // Zero-extend directly to i32. + ExtVT = MVT::v4i32; + V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S); + } + V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S); + } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 || + EltVT == MVT::i16) { + // Either not inserting from the low element of the input or the input + // element size is too small to use VZEXT_MOVL to clear the high bits. + return SDValue(); + } + + if (!IsV1Zeroable) { + // If V1 can't be treated as a zero vector we have fewer options to lower + // this. We can't support integer vectors or non-zero targets cheaply, and + // the V1 elements can't be permuted in any way. + assert(VT == ExtVT && "Cannot change extended type when non-zeroable!"); + if (!VT.isFloatingPoint() || V2Index != 0) + return SDValue(); + SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end()); + V1Mask[V2Index] = -1; + if (!isNoopShuffleMask(V1Mask)) + return SDValue(); + // This is essentially a special case blend operation, but if we have + // general purpose blend operations, they are always faster. Bail and let + // the rest of the lowering handle these as blends. + if (Subtarget->hasSSE41()) + return SDValue(); + + // Otherwise, use MOVSD or MOVSS. + assert((EltVT == MVT::f32 || EltVT == MVT::f64) && + "Only two types of floating point element types to handle!"); + return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL, + ExtVT, V1, V2); + } + + // This lowering only works for the low element with floating point vectors. + if (VT.isFloatingPoint() && V2Index != 0) + return SDValue(); + + V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2); + if (ExtVT != VT) + V2 = DAG.getBitcast(VT, V2); + + if (V2Index != 0) { + // If we have 4 or fewer lanes we can cheaply shuffle the element into + // the desired position. 
Otherwise it is more efficient to do a vector + // shift left. We know that we can do a vector shift left because all + // the inputs are zero. + if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) { + SmallVector<int, 4> V2Shuffle(Mask.size(), 1); + V2Shuffle[V2Index] = 0; + V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle); + } else { + V2 = DAG.getBitcast(MVT::v2i64, V2); + V2 = DAG.getNode( + X86ISD::VSHLDQ, DL, MVT::v2i64, V2, + DAG.getConstant(V2Index * EltVT.getSizeInBits() / 8, DL, + DAG.getTargetLoweringInfo().getScalarShiftAmountTy( + DAG.getDataLayout(), VT))); + V2 = DAG.getBitcast(VT, V2); + } + } + return V2; +} + +/// \brief Try to lower broadcast of a single - truncated - integer element, +/// coming from a scalar_to_vector/build_vector node \p V0 with larger elements. +/// +/// This assumes we have AVX2. +static SDValue lowerVectorShuffleAsTruncBroadcast(SDLoc DL, MVT VT, SDValue V0, + int BroadcastIdx, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + assert(Subtarget->hasAVX2() && + "We can only lower integer broadcasts with AVX2!"); + + EVT EltVT = VT.getVectorElementType(); + EVT V0VT = V0.getValueType(); + + assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!"); + assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!"); + + EVT V0EltVT = V0VT.getVectorElementType(); + if (!V0EltVT.isInteger()) + return SDValue(); + + const unsigned EltSize = EltVT.getSizeInBits(); + const unsigned V0EltSize = V0EltVT.getSizeInBits(); + + // This is only a truncation if the original element type is larger. + if (V0EltSize <= EltSize) + return SDValue(); + + assert(((V0EltSize % EltSize) == 0) && + "Scalar type sizes must all be powers of 2 on x86!"); + + const unsigned V0Opc = V0.getOpcode(); + const unsigned Scale = V0EltSize / EltSize; + const unsigned V0BroadcastIdx = BroadcastIdx / Scale; + + if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) && + V0Opc != ISD::BUILD_VECTOR) + return SDValue(); + + SDValue Scalar = V0.getOperand(V0BroadcastIdx); + + // If we're extracting non-least-significant bits, shift so we can truncate. + // Hopefully, we can fold away the trunc/srl/load into the broadcast. + // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer + // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd. + if (const int OffsetIdx = BroadcastIdx % Scale) + Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar, + DAG.getConstant(OffsetIdx * EltSize, DL, Scalar.getValueType())); + + return DAG.getNode(X86ISD::VBROADCAST, DL, VT, + DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar)); +} + +/// \brief Try to lower broadcast of a single element. +/// +/// For convenience, this code also bundles all of the subtarget feature set +/// filtering. While a little annoying to re-dispatch on type here, there isn't +/// a convenient way to factor it out. +/// FIXME: This is very similar to LowerVectorBroadcast - can we merge them? +static SDValue lowerVectorShuffleAsBroadcast(SDLoc DL, MVT VT, SDValue V, + ArrayRef<int> Mask, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + if (!Subtarget->hasAVX()) + return SDValue(); + if (VT.isInteger() && !Subtarget->hasAVX2()) + return SDValue(); + + // Check that the mask is a broadcast. 
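The truncated-broadcast path above turns a broadcast of a narrow element of a wider scalar into a right shift by OffsetIdx * EltSize bits followed by a truncate. A minimal sketch of that arithmetic for an i8 element of an i32 scalar (truncatedElement is a made-up name):

// Minimal sketch, not part of the patch.
#include <cstdint>
#include <cstdio>

static uint8_t truncatedElement(uint32_t WideScalar, unsigned BroadcastIdx) {
  const unsigned EltSize = 8;          // i8 result elements.
  const unsigned Scale = 32 / EltSize; // 4 narrow elements per i32 scalar.
  const unsigned OffsetIdx = BroadcastIdx % Scale;
  return (uint8_t)(WideScalar >> (OffsetIdx * EltSize)); // Truncate to i8.
}

int main() {
  // Byte 2 of 0xAABBCCDD is 0xBB; that value would then feed the broadcast.
  std::printf("0x%02X\n", (unsigned)truncatedElement(0xAABBCCDDu, 2));
  return 0;
}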
+ int BroadcastIdx = -1; + for (int M : Mask) + if (M >= 0 && BroadcastIdx == -1) + BroadcastIdx = M; + else if (M >= 0 && M != BroadcastIdx) + return SDValue(); + + assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with " + "a sorted mask where the broadcast " + "comes from V1."); + + // Go up the chain of (vector) values to find a scalar load that we can + // combine with the broadcast. + for (;;) { + switch (V.getOpcode()) { + case ISD::CONCAT_VECTORS: { + int OperandSize = Mask.size() / V.getNumOperands(); + V = V.getOperand(BroadcastIdx / OperandSize); + BroadcastIdx %= OperandSize; + continue; + } + + case ISD::INSERT_SUBVECTOR: { + SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1); + auto ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(2)); + if (!ConstantIdx) + break; + + int BeginIdx = (int)ConstantIdx->getZExtValue(); + int EndIdx = + BeginIdx + (int)VInner.getSimpleValueType().getVectorNumElements(); + if (BroadcastIdx >= BeginIdx && BroadcastIdx < EndIdx) { + BroadcastIdx -= BeginIdx; + V = VInner; + } else { + V = VOuter; + } + continue; + } + } + break; + } + + // Check if this is a broadcast of a scalar. We special case lowering + // for scalars so that we can more effectively fold with loads. + // First, look through bitcast: if the original value has a larger element + // type than the shuffle, the broadcast element is in essence truncated. + // Make that explicit to ease folding. + if (V.getOpcode() == ISD::BITCAST && VT.isInteger()) + if (SDValue TruncBroadcast = lowerVectorShuffleAsTruncBroadcast( + DL, VT, V.getOperand(0), BroadcastIdx, Subtarget, DAG)) + return TruncBroadcast; + + // Also check the simpler case, where we can directly reuse the scalar. + if (V.getOpcode() == ISD::BUILD_VECTOR || + (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) { + V = V.getOperand(BroadcastIdx); + + // If the scalar isn't a load, we can't broadcast from it in AVX1. + // Only AVX2 has register broadcasts. + if (!Subtarget->hasAVX2() && !isShuffleFoldableLoad(V)) + return SDValue(); + } else if (MayFoldLoad(V) && !cast<LoadSDNode>(V)->isVolatile()) { + // If we are broadcasting a load that is only used by the shuffle + // then we can reduce the vector load to the broadcasted scalar load. + LoadSDNode *Ld = cast<LoadSDNode>(V); + SDValue BaseAddr = Ld->getOperand(1); + EVT AddrVT = BaseAddr.getValueType(); + EVT SVT = VT.getScalarType(); + unsigned Offset = BroadcastIdx * SVT.getStoreSize(); + SDValue NewAddr = DAG.getNode( + ISD::ADD, DL, AddrVT, BaseAddr, + DAG.getConstant(Offset, DL, AddrVT)); + V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr, + DAG.getMachineFunction().getMachineMemOperand( + Ld->getMemOperand(), Offset, SVT.getStoreSize())); + } else if (BroadcastIdx != 0 || !Subtarget->hasAVX2()) { + // We can't broadcast from a vector register without AVX2, and we can only + // broadcast from the zero-element of a vector register. + return SDValue(); + } + + return DAG.getNode(X86ISD::VBROADCAST, DL, VT, V); +} + +// Check for whether we can use INSERTPS to perform the shuffle. We only use +// INSERTPS when the V1 elements are already in the correct locations +// because otherwise we can just always use two SHUFPS instructions which +// are much smaller to encode than a SHUFPS and an INSERTPS. We can also +// perform INSERTPS if a single V1 element is out of place and all V2 +// elements are zeroable. 
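The INSERTPS immediate built in the routine below packs three fields: bits [7:6] select the source element of the second operand, bits [5:4] the destination slot, and bits [3:0] a zero mask of destination slots. A minimal sketch of that encoding (insertPSImm is a made-up name):

// Minimal sketch, not part of the patch.
#include <cstdio>

static unsigned insertPSImm(unsigned SrcIdx, unsigned DstIdx, unsigned ZMask) {
  return (SrcIdx << 6) | (DstIdx << 4) | (ZMask & 0xF);
}

int main() {
  // Insert element 2 of the second operand into slot 1 and zero slot 3: 0x98.
  std::printf("imm = 0x%02X\n", insertPSImm(2, 1, 0x8));
  return 0;
}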
+static SDValue lowerVectorShuffleAsInsertPS(SDValue Op, SDValue V1, SDValue V2, + ArrayRef<int> Mask, + SelectionDAG &DAG) { + assert(Op.getSimpleValueType() == MVT::v4f32 && "Bad shuffle type!"); + assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); + assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); + + SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); + + unsigned ZMask = 0; + int V1DstIndex = -1; + int V2DstIndex = -1; + bool V1UsedInPlace = false; + + for (int i = 0; i < 4; ++i) { + // Synthesize a zero mask from the zeroable elements (includes undefs). + if (Zeroable[i]) { + ZMask |= 1 << i; + continue; + } + + // Flag if we use any V1 inputs in place. + if (i == Mask[i]) { + V1UsedInPlace = true; + continue; + } + + // We can only insert a single non-zeroable element. + if (V1DstIndex != -1 || V2DstIndex != -1) + return SDValue(); + + if (Mask[i] < 4) { + // V1 input out of place for insertion. + V1DstIndex = i; + } else { + // V2 input for insertion. + V2DstIndex = i; + } + } + + // Don't bother if we have no (non-zeroable) element for insertion. + if (V1DstIndex == -1 && V2DstIndex == -1) + return SDValue(); + + // Determine element insertion src/dst indices. The src index is from the + // start of the inserted vector, not the start of the concatenated vector. + unsigned V2SrcIndex = 0; + if (V1DstIndex != -1) { + // If we have a V1 input out of place, we use V1 as the V2 element insertion + // and don't use the original V2 at all. + V2SrcIndex = Mask[V1DstIndex]; + V2DstIndex = V1DstIndex; + V2 = V1; + } else { + V2SrcIndex = Mask[V2DstIndex] - 4; + } + + // If no V1 inputs are used in place, then the result is created only from + // the zero mask and the V2 insertion - so remove V1 dependency. + if (!V1UsedInPlace) + V1 = DAG.getUNDEF(MVT::v4f32); + + unsigned InsertPSMask = V2SrcIndex << 6 | V2DstIndex << 4 | ZMask; + assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!"); + + // Insert the V2 element into the desired position. + SDLoc DL(Op); + return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2, + DAG.getConstant(InsertPSMask, DL, MVT::i8)); +} + +/// \brief Try to lower a shuffle as a permute of the inputs followed by an +/// UNPCK instruction. +/// +/// This specifically targets cases where we end up with alternating between +/// the two inputs, and so can permute them into something that feeds a single +/// UNPCK instruction. Note that this routine only targets integer vectors +/// because for floating point vectors we have a generalized SHUFPS lowering +/// strategy that handles everything that doesn't *exactly* match an unpack, +/// making this clever lowering unnecessary. 
+static SDValue lowerVectorShuffleAsPermuteAndUnpack(SDLoc DL, MVT VT, + SDValue V1, SDValue V2, + ArrayRef<int> Mask, + SelectionDAG &DAG) { + assert(!VT.isFloatingPoint() && + "This routine only supports integer vectors."); + assert(!isSingleInputShuffleMask(Mask) && + "This routine should only be used when blending two inputs."); + assert(Mask.size() >= 2 && "Single element masks are invalid."); + + int Size = Mask.size(); + + int NumLoInputs = std::count_if(Mask.begin(), Mask.end(), [Size](int M) { + return M >= 0 && M % Size < Size / 2; + }); + int NumHiInputs = std::count_if( + Mask.begin(), Mask.end(), [Size](int M) { return M % Size >= Size / 2; }); + + bool UnpackLo = NumLoInputs >= NumHiInputs; + + auto TryUnpack = [&](MVT UnpackVT, int Scale) { + SmallVector<int, 32> V1Mask(Mask.size(), -1); + SmallVector<int, 32> V2Mask(Mask.size(), -1); + + for (int i = 0; i < Size; ++i) { + if (Mask[i] < 0) + continue; + + // Each element of the unpack contains Scale elements from this mask. + int UnpackIdx = i / Scale; + + // We only handle the case where V1 feeds the first slots of the unpack. + // We rely on canonicalization to ensure this is the case. + if ((UnpackIdx % 2 == 0) != (Mask[i] < Size)) + return SDValue(); + + // Setup the mask for this input. The indexing is tricky as we have to + // handle the unpack stride. + SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask; + VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] = + Mask[i] % Size; + } + + // If we will have to shuffle both inputs to use the unpack, check whether + // we can just unpack first and shuffle the result. If so, skip this unpack. + if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) && + !isNoopShuffleMask(V2Mask)) + return SDValue(); + + // Shuffle the inputs into place. + V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask); + V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask); + + // Cast the inputs to the type we will use to unpack them. + V1 = DAG.getBitcast(UnpackVT, V1); + V2 = DAG.getBitcast(UnpackVT, V2); + + // Unpack the inputs and cast the result back to the desired type. + return DAG.getBitcast( + VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL, + UnpackVT, V1, V2)); + }; + + // We try each unpack from the largest to the smallest to try and find one + // that fits this mask. + int OrigNumElements = VT.getVectorNumElements(); + int OrigScalarSize = VT.getScalarSizeInBits(); + for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2) { + int Scale = ScalarSize / OrigScalarSize; + int NumElements = OrigNumElements / Scale; + MVT UnpackVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), NumElements); + if (SDValue Unpack = TryUnpack(UnpackVT, Scale)) + return Unpack; + } + + // If none of the unpack-rooted lowerings worked (or were profitable) try an + // initial unpack. + if (NumLoInputs == 0 || NumHiInputs == 0) { + assert((NumLoInputs > 0 || NumHiInputs > 0) && + "We have to have *some* inputs!"); + int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0; + + // FIXME: We could consider the total complexity of the permute of each + // possible unpacking. Or at the least we should consider how many + // half-crossings are created. + // FIXME: We could consider commuting the unpacks. 
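The fallback below relies on the fact that, after an UNPCKL-style interleave of the two low halves, element j of V1 sits at position 2*j and element j of V2 at position 2*j + 1, so a single follow-up permute can reproduce the original two-input shuffle. A minimal sketch that checks this for a concrete mask, with HalfOffset equal to 0:

// Minimal sketch, not part of the patch.
#include <cassert>
#include <cstdio>
#include <vector>

int main() {
  const int Size = 8;
  std::vector<int> V1(Size), V2(Size);
  for (int i = 0; i < Size; ++i) { V1[i] = i; V2[i] = 100 + i; }

  // A two-input mask that only references the low halves (0..3 and 8..11).
  std::vector<int> Mask = {0, 9, 1, 8, 2, 11, 3, 10};

  // UNPCKL: interleave the low halves of V1 and V2.
  std::vector<int> Unpacked(Size);
  for (int j = 0; j < Size / 2; ++j) {
    Unpacked[2 * j] = V1[j];
    Unpacked[2 * j + 1] = V2[j];
  }

  // Reconstruct the shuffle with a single permute of the unpacked vector.
  for (int i = 0; i < Size; ++i) {
    int PermIdx = 2 * (Mask[i] % Size) + (Mask[i] < Size ? 0 : 1);
    int Direct = Mask[i] < Size ? V1[Mask[i]] : V2[Mask[i] - Size];
    assert(Unpacked[PermIdx] == Direct);
    std::printf("%d ", Unpacked[PermIdx]);
  }
  std::printf("\n");
  return 0;
}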
+ + SmallVector<int, 32> PermMask; + PermMask.assign(Size, -1); + for (int i = 0; i < Size; ++i) { + if (Mask[i] < 0) + continue; + + assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!"); + + PermMask[i] = + 2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1); + } + return DAG.getVectorShuffle( + VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL, + DL, VT, V1, V2), + DAG.getUNDEF(VT), PermMask); + } + + return SDValue(); +} + +/// \brief Handle lowering of 2-lane 64-bit floating point shuffles. +/// +/// This is the basis function for the 2-lane 64-bit shuffles as we have full +/// support for floating point shuffles but not integer shuffles. These +/// instructions will incur a domain crossing penalty on some chips though so +/// it is better to avoid lowering through this for integer vectors where +/// possible. +static SDValue lowerV2F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(Op.getSimpleValueType() == MVT::v2f64 && "Bad shuffle type!"); + assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!"); + + if (isSingleInputShuffleMask(Mask)) { + // Use low duplicate instructions for masks that match their pattern. + if (Subtarget->hasSSE3()) + if (isShuffleEquivalent(V1, V2, Mask, {0, 0})) + return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, V1); + + // Straight shuffle of a single input vector. Simulate this by using the + // single input as both of the "inputs" to this instruction.. + unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1); + + if (Subtarget->hasAVX()) { + // If we have AVX, we can use VPERMILPS which will allow folding a load + // into the shuffle. + return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1, + DAG.getConstant(SHUFPDMask, DL, MVT::i8)); + } + + return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V1, + DAG.getConstant(SHUFPDMask, DL, MVT::i8)); + } + assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!"); + assert(Mask[1] >= 2 && "Non-canonicalized blend!"); + + // If we have a single input, insert that into V1 if we can do so cheaply. + if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) { + if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( + DL, MVT::v2f64, V1, V2, Mask, Subtarget, DAG)) + return Insertion; + // Try inverting the insertion since for v2 masks it is easy to do and we + // can't reliably sort the mask one way or the other. + int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2), + Mask[1] < 0 ? -1 : (Mask[1] ^ 2)}; + if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( + DL, MVT::v2f64, V2, V1, InverseMask, Subtarget, DAG)) + return Insertion; + } + + // Try to use one of the special instruction patterns to handle two common + // blend patterns if a zero-blend above didn't work. + if (isShuffleEquivalent(V1, V2, Mask, {0, 3}) || + isShuffleEquivalent(V1, V2, Mask, {1, 3})) + if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG)) + // We can either use a special instruction to load over the low double or + // to move just the low double. + return DAG.getNode( + isShuffleFoldableLoad(V1S) ? 
X86ISD::MOVLPD : X86ISD::MOVSD, + DL, MVT::v2f64, V2, + DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S)); + + if (Subtarget->hasSSE41()) + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask, + Subtarget, DAG)) + return Blend; + + // Use dedicated unpack instructions for masks that match their pattern. + if (SDValue V = + lowerVectorShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG)) + return V; + + unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1); + return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2, + DAG.getConstant(SHUFPDMask, DL, MVT::i8)); +} + +/// \brief Handle lowering of 2-lane 64-bit integer shuffles. +/// +/// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by +/// the integer unit to minimize domain crossing penalties. However, for blends +/// it falls back to the floating point shuffle operation with appropriate bit +/// casting. +static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(Op.getSimpleValueType() == MVT::v2i64 && "Bad shuffle type!"); + assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!"); + + if (isSingleInputShuffleMask(Mask)) { + // Check for being able to broadcast a single element. + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v2i64, V1, + Mask, Subtarget, DAG)) + return Broadcast; + + // Straight shuffle of a single input vector. For everything from SSE2 + // onward this has a single fast instruction with no scary immediates. + // We have to map the mask as it is actually a v4i32 shuffle instruction. + V1 = DAG.getBitcast(MVT::v4i32, V1); + int WidenedMask[4] = { + std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1, + std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1}; + return DAG.getBitcast( + MVT::v2i64, + DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1, + getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG))); + } + assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!"); + assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!"); + assert(Mask[0] < 2 && "We sort V1 to be the first input."); + assert(Mask[1] >= 2 && "We sort V2 to be the second input."); + + // If we have a blend of two PACKUS operations an the blend aligns with the + // low and half halves, we can just merge the PACKUS operations. This is + // particularly important as it lets us merge shuffles that this routine itself + // creates. + auto GetPackNode = [](SDValue V) { + while (V.getOpcode() == ISD::BITCAST) + V = V.getOperand(0); + + return V.getOpcode() == X86ISD::PACKUS ? V : SDValue(); + }; + if (SDValue V1Pack = GetPackNode(V1)) + if (SDValue V2Pack = GetPackNode(V2)) + return DAG.getBitcast(MVT::v2i64, + DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, + Mask[0] == 0 ? V1Pack.getOperand(0) + : V1Pack.getOperand(1), + Mask[1] == 2 ? V2Pack.getOperand(0) + : V2Pack.getOperand(1))); + + // Try to use shift instructions. + if (SDValue Shift = + lowerVectorShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask, DAG)) + return Shift; + + // When loading a scalar and then shuffling it into a vector we can often do + // the insertion cheaply. 
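The single-input v2i64 path above widens the 2-element mask to a 4-element PSHUFD mask and then packs the usual 2-bits-per-element shuffle immediate, treating undef as 0. A minimal sketch of that widening and packing:

// Minimal sketch, not part of the patch.
#include <cstdio>

int main() {
  int Mask[2] = {1, 0}; // Swap the two 64-bit elements.
  int Widened[4];
  for (int i = 0; i < 2; ++i) {
    int M = Mask[i] < 0 ? 0 : Mask[i]; // Undef lanes are treated as 0.
    Widened[2 * i] = M * 2;
    Widened[2 * i + 1] = M * 2 + 1;
  }
  unsigned Imm = 0;
  for (int i = 0; i < 4; ++i)
    Imm |= (unsigned)Widened[i] << (2 * i);
  // {1, 0} widens to {2, 3, 0, 1}, i.e. a PSHUFD with immediate 0x4E.
  std::printf("imm = 0x%02X\n", Imm);
  return 0;
}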
+ if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( + DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG)) + return Insertion; + // Try inverting the insertion since for v2 masks it is easy to do and we + // can't reliably sort the mask one way or the other. + int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2}; + if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( + DL, MVT::v2i64, V2, V1, InverseMask, Subtarget, DAG)) + return Insertion; + + // We have different paths for blend lowering, but they all must use the + // *exact* same predicate. + bool IsBlendSupported = Subtarget->hasSSE41(); + if (IsBlendSupported) + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask, + Subtarget, DAG)) + return Blend; + + // Use dedicated unpack instructions for masks that match their pattern. + if (SDValue V = + lowerVectorShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG)) + return V; + + // Try to use byte rotation instructions. + // Its more profitable for pre-SSSE3 to use shuffles/unpacks. + if (Subtarget->hasSSSE3()) + if (SDValue Rotate = lowerVectorShuffleAsByteRotate( + DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG)) + return Rotate; + + // If we have direct support for blends, we should lower by decomposing into + // a permute. That will be faster than the domain cross. + if (IsBlendSupported) + return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v2i64, V1, V2, + Mask, DAG); + + // We implement this with SHUFPD which is pretty lame because it will likely + // incur 2 cycles of stall for integer vectors on Nehalem and older chips. + // However, all the alternatives are still more cycles and newer chips don't + // have this problem. It would be really nice if x86 had better shuffles here. + V1 = DAG.getBitcast(MVT::v2f64, V1); + V2 = DAG.getBitcast(MVT::v2f64, V2); + return DAG.getBitcast(MVT::v2i64, + DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask)); +} + +/// \brief Test whether this can be lowered with a single SHUFPS instruction. +/// +/// This is used to disable more specialized lowerings when the shufps lowering +/// will happen to be efficient. +static bool isSingleSHUFPSMask(ArrayRef<int> Mask) { + // This routine only handles 128-bit shufps. + assert(Mask.size() == 4 && "Unsupported mask size!"); + + // To lower with a single SHUFPS we need to have the low half and high half + // each requiring a single input. + if (Mask[0] != -1 && Mask[1] != -1 && (Mask[0] < 4) != (Mask[1] < 4)) + return false; + if (Mask[2] != -1 && Mask[3] != -1 && (Mask[2] < 4) != (Mask[3] < 4)) + return false; + + return true; +} + +/// \brief Lower a vector shuffle using the SHUFPS instruction. +/// +/// This is a helper routine dedicated to lowering vector shuffles using SHUFPS. +/// It makes no assumptions about whether this is the *best* lowering, it simply +/// uses it. +static SDValue lowerVectorShuffleWithSHUFPS(SDLoc DL, MVT VT, + ArrayRef<int> Mask, SDValue V1, + SDValue V2, SelectionDAG &DAG) { + SDValue LowV = V1, HighV = V2; + int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]}; + + int NumV2Elements = + std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }); + + if (NumV2Elements == 1) { + int V2Index = + std::find_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }) - + Mask.begin(); + + // Compute the index adjacent to V2Index and in the same half by toggling + // the low bit. + int V2AdjIndex = V2Index ^ 1; + + if (Mask[V2AdjIndex] == -1) { + // Handles all the cases where we have a single V2 element and an undef. 
+ // This will only ever happen in the high lanes because we commute the + // vector otherwise. + if (V2Index < 2) + std::swap(LowV, HighV); + NewMask[V2Index] -= 4; + } else { + // Handle the case where the V2 element ends up adjacent to a V1 element. + // To make this work, blend them together as the first step. + int V1Index = V2AdjIndex; + int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0}; + V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1, + getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG)); + + // Now proceed to reconstruct the final blend as we have the necessary + // high or low half formed. + if (V2Index < 2) { + LowV = V2; + HighV = V1; + } else { + HighV = V2; + } + NewMask[V1Index] = 2; // We put the V1 element in V2[2]. + NewMask[V2Index] = 0; // We shifted the V2 element into V2[0]. + } + } else if (NumV2Elements == 2) { + if (Mask[0] < 4 && Mask[1] < 4) { + // Handle the easy case where we have V1 in the low lanes and V2 in the + // high lanes. + NewMask[2] -= 4; + NewMask[3] -= 4; + } else if (Mask[2] < 4 && Mask[3] < 4) { + // We also handle the reversed case because this utility may get called + // when we detect a SHUFPS pattern but can't easily commute the shuffle to + // arrange things in the right direction. + NewMask[0] -= 4; + NewMask[1] -= 4; + HighV = V1; + LowV = V2; + } else { + // We have a mixture of V1 and V2 in both low and high lanes. Rather than + // trying to place elements directly, just blend them and set up the final + // shuffle to place them. + + // The first two blend mask elements are for V1, the second two are for + // V2. + int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1], + Mask[2] < 4 ? Mask[2] : Mask[3], + (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4, + (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4}; + V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2, + getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG)); + + // Now we do a normal shuffle of V1 by giving V1 as both operands to + // a blend. + LowV = HighV = V1; + NewMask[0] = Mask[0] < 4 ? 0 : 2; + NewMask[1] = Mask[0] < 4 ? 2 : 0; + NewMask[2] = Mask[2] < 4 ? 1 : 3; + NewMask[3] = Mask[2] < 4 ? 3 : 1; + } + } + return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV, + getV4X86ShuffleImm8ForMask(NewMask, DL, DAG)); +} + +/// \brief Lower 4-lane 32-bit floating point shuffles. +/// +/// Uses instructions exclusively from the floating point unit to minimize +/// domain crossing penalties, as these are sufficient to implement all v4f32 +/// shuffles. +static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(Op.getSimpleValueType() == MVT::v4f32 && "Bad shuffle type!"); + assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); + + int NumV2Elements = + std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }); + + if (NumV2Elements == 0) { + // Check for being able to broadcast a single element. + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4f32, V1, + Mask, Subtarget, DAG)) + return Broadcast; + + // Use even/odd duplicate instructions for masks that match their pattern. 
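The MOVSLDUP/MOVSHDUP checks below compare the mask against a fixed pattern in which an undef (-1) element matches anything. A minimal sketch of that kind of comparison (matchesPattern is a made-up name and a simplification of the equivalence check used here):

// Minimal sketch, not part of the patch.
#include <cstdio>
#include <vector>

static bool matchesPattern(const std::vector<int> &Mask,
                           const std::vector<int> &Expected) {
  if (Mask.size() != Expected.size())
    return false;
  for (size_t i = 0; i < Mask.size(); ++i)
    if (Mask[i] >= 0 && Mask[i] != Expected[i])
      return false; // A concrete element must match; undef always matches.
  return true;
}

int main() {
  // [0, -1, 2, -1] still matches the MOVSLDUP pattern {0, 0, 2, 2}.
  std::printf("%s\n", matchesPattern({0, -1, 2, -1}, {0, 0, 2, 2})
                          ? "movsldup"
                          : "no match");
  return 0;
}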
+ if (Subtarget->hasSSE3()) { + if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2})) + return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1); + if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3})) + return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1); + } + + if (Subtarget->hasAVX()) { + // If we have AVX, we can use VPERMILPS which will allow folding a load + // into the shuffle. + return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1, + getV4X86ShuffleImm8ForMask(Mask, DL, DAG)); + } + + // Otherwise, use a straight shuffle of a single input vector. We pass the + // input vector to both operands to simulate this with a SHUFPS. + return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1, + getV4X86ShuffleImm8ForMask(Mask, DL, DAG)); + } + + // There are special ways we can lower some single-element blends. However, we + // have custom ways we can lower more complex single-element blends below that + // we defer to if both this and BLENDPS fail to match, so restrict this to + // when the V2 input is targeting element 0 of the mask -- that is the fast + // case here. + if (NumV2Elements == 1 && Mask[0] >= 4) + if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v4f32, V1, V2, + Mask, Subtarget, DAG)) + return V; + + if (Subtarget->hasSSE41()) { + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask, + Subtarget, DAG)) + return Blend; + + // Use INSERTPS if we can complete the shuffle efficiently. + if (SDValue V = lowerVectorShuffleAsInsertPS(Op, V1, V2, Mask, DAG)) + return V; + + if (!isSingleSHUFPSMask(Mask)) + if (SDValue BlendPerm = lowerVectorShuffleAsBlendAndPermute( + DL, MVT::v4f32, V1, V2, Mask, DAG)) + return BlendPerm; + } + + // Use dedicated unpack instructions for masks that match their pattern. + if (SDValue V = + lowerVectorShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG)) + return V; + + // Otherwise fall back to a SHUFPS lowering strategy. + return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG); +} + +/// \brief Lower 4-lane i32 vector shuffles. +/// +/// We try to handle these with integer-domain shuffles where we can, but for +/// blends we use the floating point domain blend instructions. +static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(Op.getSimpleValueType() == MVT::v4i32 && "Bad shuffle type!"); + assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); + + // Whenever we can lower this as a zext, that instruction is strictly faster + // than any alternative. It also allows us to fold memory operands into the + // shuffle in many cases. + if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2, + Mask, Subtarget, DAG)) + return ZExt; + + int NumV2Elements = + std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }); + + if (NumV2Elements == 0) { + // Check for being able to broadcast a single element. + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i32, V1, + Mask, Subtarget, DAG)) + return Broadcast; + + // Straight shuffle of a single input vector. For everything from SSE2 + // onward this has a single fast instruction with no scary immediates. 
+ // We coerce the shuffle pattern to be compatible with UNPCK instructions + // but we aren't actually going to use the UNPCK instruction because doing + // so prevents folding a load into this instruction or making a copy. + const int UnpackLoMask[] = {0, 0, 1, 1}; + const int UnpackHiMask[] = {2, 2, 3, 3}; + if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 1, 1})) + Mask = UnpackLoMask; + else if (isShuffleEquivalent(V1, V2, Mask, {2, 2, 3, 3})) + Mask = UnpackHiMask; + + return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1, + getV4X86ShuffleImm8ForMask(Mask, DL, DAG)); + } + + // Try to use shift instructions. + if (SDValue Shift = + lowerVectorShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, DAG)) + return Shift; + + // There are special ways we can lower some single-element blends. + if (NumV2Elements == 1) + if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v4i32, V1, V2, + Mask, Subtarget, DAG)) + return V; + + // We have different paths for blend lowering, but they all must use the + // *exact* same predicate. + bool IsBlendSupported = Subtarget->hasSSE41(); + if (IsBlendSupported) + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask, + Subtarget, DAG)) + return Blend; + + if (SDValue Masked = + lowerVectorShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask, DAG)) + return Masked; + + // Use dedicated unpack instructions for masks that match their pattern. + if (SDValue V = + lowerVectorShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG)) + return V; + + // Try to use byte rotation instructions. + // Its more profitable for pre-SSSE3 to use shuffles/unpacks. + if (Subtarget->hasSSSE3()) + if (SDValue Rotate = lowerVectorShuffleAsByteRotate( + DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG)) + return Rotate; + + // If we have direct support for blends, we should lower by decomposing into + // a permute. That will be faster than the domain cross. + if (IsBlendSupported) + return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2, + Mask, DAG); + + // Try to lower by permuting the inputs into an unpack instruction. + if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(DL, MVT::v4i32, V1, + V2, Mask, DAG)) + return Unpack; + + // We implement this with SHUFPS because it can blend from two vectors. + // Because we're going to eventually use SHUFPS, we use SHUFPS even to build + // up the inputs, bypassing domain shift penalties that we would encur if we + // directly used PSHUFD on Nehalem and older. For newer chips, this isn't + // relevant. + return DAG.getBitcast( + MVT::v4i32, + DAG.getVectorShuffle(MVT::v4f32, DL, DAG.getBitcast(MVT::v4f32, V1), + DAG.getBitcast(MVT::v4f32, V2), Mask)); +} + +/// \brief Lowering of single-input v8i16 shuffles is the cornerstone of SSE2 +/// shuffle lowering, and the most complex part. +/// +/// The lowering strategy is to try to form pairs of input lanes which are +/// targeted at the same half of the final vector, and then use a dword shuffle +/// to place them onto the right half, and finally unpack the paired lanes into +/// their final position. +/// +/// The exact breakdown of how to form these dword pairs and align them on the +/// correct sides is really tricky. See the comments within the function for +/// more of the details. +/// +/// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each +/// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to +/// this routine for it to work correctly. 
To shuffle a 256-bit or 512-bit i16 +/// vector, form the analogous 128-bit 8-element Mask. +static SDValue lowerV8I16GeneralSingleInputVectorShuffle( + SDLoc DL, MVT VT, SDValue V, MutableArrayRef<int> Mask, + const X86Subtarget *Subtarget, SelectionDAG &DAG) { + assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!"); + MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2); + + assert(Mask.size() == 8 && "Shuffle mask length doen't match!"); + MutableArrayRef<int> LoMask = Mask.slice(0, 4); + MutableArrayRef<int> HiMask = Mask.slice(4, 4); + + SmallVector<int, 4> LoInputs; + std::copy_if(LoMask.begin(), LoMask.end(), std::back_inserter(LoInputs), + [](int M) { return M >= 0; }); + std::sort(LoInputs.begin(), LoInputs.end()); + LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end()); + SmallVector<int, 4> HiInputs; + std::copy_if(HiMask.begin(), HiMask.end(), std::back_inserter(HiInputs), + [](int M) { return M >= 0; }); + std::sort(HiInputs.begin(), HiInputs.end()); + HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end()); + int NumLToL = + std::lower_bound(LoInputs.begin(), LoInputs.end(), 4) - LoInputs.begin(); + int NumHToL = LoInputs.size() - NumLToL; + int NumLToH = + std::lower_bound(HiInputs.begin(), HiInputs.end(), 4) - HiInputs.begin(); + int NumHToH = HiInputs.size() - NumLToH; + MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL); + MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH); + MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL); + MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH); + + // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all + // such inputs we can swap two of the dwords across the half mark and end up + // with <=2 inputs to each half in each half. Once there, we can fall through + // to the generic code below. For example: + // + // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h] + // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5] + // + // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half + // and an existing 2-into-2 on the other half. In this case we may have to + // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or + // 1-into-3 which could cause us to cycle endlessly fixing each side in turn. + // Fortunately, we don't have to handle anything but a 2-into-2 pattern + // because any other situation (including a 3-into-1 or 1-into-3 in the other + // half than the one we target for fixing) will be fixed when we re-enter this + // path. We will also combine away any sequence of PSHUFD instructions that + // result into a single instruction. Here is an example of the tricky case: + // + // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h] + // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3] + // + // This now has a 1-into-3 in the high half! Instead, we do two shuffles: + // + // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h] + // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6] + // + // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h] + // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6] + // + // The result is fine to be handled by the generic logic. 
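The two-step example above can be checked mechanically: a PSHUFD-style dword permutation moves whole pairs of words, so a word-level mask is remapped by sending word w to newSlot(w / 2) * 2 + w % 2. A minimal sketch that reproduces the remapped mask from the first example in the comment:

// Minimal sketch, not part of the patch.
#include <cstdio>
#include <vector>

int main() {
  const int DwordPerm[4] = {0, 2, 1, 3}; // Slot i receives old dword DwordPerm[i].
  int NewSlot[4];
  for (int i = 0; i < 4; ++i)
    NewSlot[DwordPerm[i]] = i;           // Where each old dword ends up.

  std::vector<int> Mask = {0, 1, 2, 7, 4, 5, 6, 3};
  for (int &M : Mask)
    M = NewSlot[M / 2] * 2 + M % 2;

  // Expected remapped mask: [0, 1, 4, 7, 2, 3, 6, 5].
  for (int M : Mask)
    std::printf("%d ", M);
  std::printf("\n");
  return 0;
}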
+ auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs, + ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs, + int AOffset, int BOffset) { + assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) && + "Must call this with A having 3 or 1 inputs from the A half."); + assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) && + "Must call this with B having 1 or 3 inputs from the B half."); + assert(AToAInputs.size() + BToAInputs.size() == 4 && + "Must call this with either 3:1 or 1:3 inputs (summing to 4)."); + + bool ThreeAInputs = AToAInputs.size() == 3; + + // Compute the index of dword with only one word among the three inputs in + // a half by taking the sum of the half with three inputs and subtracting + // the sum of the actual three inputs. The difference is the remaining + // slot. + int ADWord, BDWord; + int &TripleDWord = ThreeAInputs ? ADWord : BDWord; + int &OneInputDWord = ThreeAInputs ? BDWord : ADWord; + int TripleInputOffset = ThreeAInputs ? AOffset : BOffset; + ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs; + int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0]; + int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset); + int TripleNonInputIdx = + TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0); + TripleDWord = TripleNonInputIdx / 2; + + // We use xor with one to compute the adjacent DWord to whichever one the + // OneInput is in. + OneInputDWord = (OneInput / 2) ^ 1; + + // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA + // and BToA inputs. If there is also such a problem with the BToB and AToB + // inputs, we don't try to fix it necessarily -- we'll recurse and see it in + // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it + // is essential that we don't *create* a 3<-1 as then we might oscillate. + if (BToBInputs.size() == 2 && AToBInputs.size() == 2) { + // Compute how many inputs will be flipped by swapping these DWords. We + // need + // to balance this to ensure we don't form a 3-1 shuffle in the other + // half. + int NumFlippedAToBInputs = + std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) + + std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1); + int NumFlippedBToBInputs = + std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) + + std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1); + if ((NumFlippedAToBInputs == 1 && + (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) || + (NumFlippedBToBInputs == 1 && + (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) { + // We choose whether to fix the A half or B half based on whether that + // half has zero flipped inputs. At zero, we may not be able to fix it + // with that half. We also bias towards fixing the B half because that + // will more commonly be the high half, and we have to bias one way. + auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord, + ArrayRef<int> Inputs) { + int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot. + bool IsFixIdxInput = std::find(Inputs.begin(), Inputs.end(), + PinnedIdx ^ 1) != Inputs.end(); + // Determine whether the free index is in the flipped dword or the + // unflipped dword based on where the pinned index is. We use this bit + // in an xor to conditionally select the adjacent dword. 
+ int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord)); + bool IsFixFreeIdxInput = std::find(Inputs.begin(), Inputs.end(), + FixFreeIdx) != Inputs.end(); + if (IsFixIdxInput == IsFixFreeIdxInput) + FixFreeIdx += 1; + IsFixFreeIdxInput = std::find(Inputs.begin(), Inputs.end(), + FixFreeIdx) != Inputs.end(); + assert(IsFixIdxInput != IsFixFreeIdxInput && + "We need to be changing the number of flipped inputs!"); + int PSHUFHalfMask[] = {0, 1, 2, 3}; + std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]); + V = DAG.getNode(FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL, + MVT::v8i16, V, + getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG)); + + for (int &M : Mask) + if (M != -1 && M == FixIdx) + M = FixFreeIdx; + else if (M != -1 && M == FixFreeIdx) + M = FixIdx; + }; + if (NumFlippedBToBInputs != 0) { + int BPinnedIdx = + BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput; + FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs); + } else { + assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!"); + int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput; + FixFlippedInputs(APinnedIdx, ADWord, AToBInputs); + } + } + } + + int PSHUFDMask[] = {0, 1, 2, 3}; + PSHUFDMask[ADWord] = BDWord; + PSHUFDMask[BDWord] = ADWord; + V = DAG.getBitcast( + VT, + DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V), + getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG))); + + // Adjust the mask to match the new locations of A and B. + for (int &M : Mask) + if (M != -1 && M/2 == ADWord) + M = 2 * BDWord + M % 2; + else if (M != -1 && M/2 == BDWord) + M = 2 * ADWord + M % 2; + + // Recurse back into this routine to re-compute state now that this isn't + // a 3 and 1 problem. + return lowerV8I16GeneralSingleInputVectorShuffle(DL, VT, V, Mask, Subtarget, + DAG); + }; + if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3)) + return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4); + else if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3)) + return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0); + + // At this point there are at most two inputs to the low and high halves from + // each half. That means the inputs can always be grouped into dwords and + // those dwords can then be moved to the correct half with a dword shuffle. + // We use at most one low and one high word shuffle to collect these paired + // inputs into dwords, and finally a dword shuffle to place them. + int PSHUFLMask[4] = {-1, -1, -1, -1}; + int PSHUFHMask[4] = {-1, -1, -1, -1}; + int PSHUFDMask[4] = {-1, -1, -1, -1}; + + // First fix the masks for all the inputs that are staying in their + // original halves. This will then dictate the targets of the cross-half + // shuffles. + auto fixInPlaceInputs = + [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs, + MutableArrayRef<int> SourceHalfMask, + MutableArrayRef<int> HalfMask, int HalfOffset) { + if (InPlaceInputs.empty()) + return; + if (InPlaceInputs.size() == 1) { + SourceHalfMask[InPlaceInputs[0] - HalfOffset] = + InPlaceInputs[0] - HalfOffset; + PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2; + return; + } + if (IncomingInputs.empty()) { + // Just fix all of the in place inputs. 
+ for (int Input : InPlaceInputs) { + SourceHalfMask[Input - HalfOffset] = Input - HalfOffset; + PSHUFDMask[Input / 2] = Input / 2; + } + return; + } + + assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!"); + SourceHalfMask[InPlaceInputs[0] - HalfOffset] = + InPlaceInputs[0] - HalfOffset; + // Put the second input next to the first so that they are packed into + // a dword. We find the adjacent index by toggling the low bit. + int AdjIndex = InPlaceInputs[0] ^ 1; + SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset; + std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex); + PSHUFDMask[AdjIndex / 2] = AdjIndex / 2; + }; + fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0); + fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4); + + // Now gather the cross-half inputs and place them into a free dword of + // their target half. + // FIXME: This operation could almost certainly be simplified dramatically to + // look more like the 3-1 fixing operation. + auto moveInputsToRightHalf = [&PSHUFDMask]( + MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs, + MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask, + MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset, + int DestOffset) { + auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) { + return SourceHalfMask[Word] != -1 && SourceHalfMask[Word] != Word; + }; + auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask, + int Word) { + int LowWord = Word & ~1; + int HighWord = Word | 1; + return isWordClobbered(SourceHalfMask, LowWord) || + isWordClobbered(SourceHalfMask, HighWord); + }; + + if (IncomingInputs.empty()) + return; + + if (ExistingInputs.empty()) { + // Map any dwords with inputs from them into the right half. + for (int Input : IncomingInputs) { + // If the source half mask maps over the inputs, turn those into + // swaps and use the swapped lane. + if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) { + if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] == -1) { + SourceHalfMask[SourceHalfMask[Input - SourceOffset]] = + Input - SourceOffset; + // We have to swap the uses in our half mask in one sweep. + for (int &M : HalfMask) + if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset) + M = Input; + else if (M == Input) + M = SourceHalfMask[Input - SourceOffset] + SourceOffset; + } else { + assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] == + Input - SourceOffset && + "Previous placement doesn't match!"); + } + // Note that this correctly re-maps both when we do a swap and when + // we observe the other side of the swap above. We rely on that to + // avoid swapping the members of the input list directly. + Input = SourceHalfMask[Input - SourceOffset] + SourceOffset; + } + + // Map the input's dword into the correct half. + if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] == -1) + PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2; + else + assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] == + Input / 2 && + "Previous placement doesn't match!"); + } + + // And just directly shift any other-half mask elements to be same-half + // as we will have mirrored the dword containing the element into the + // same position within that half. 
+ for (int &M : HalfMask) + if (M >= SourceOffset && M < SourceOffset + 4) { + M = M - SourceOffset + DestOffset; + assert(M >= 0 && "This should never wrap below zero!"); + } + return; + } + + // Ensure we have the input in a viable dword of its current half. This + // is particularly tricky because the original position may be clobbered + // by inputs being moved and *staying* in that half. + if (IncomingInputs.size() == 1) { + if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) { + int InputFixed = std::find(std::begin(SourceHalfMask), + std::end(SourceHalfMask), -1) - + std::begin(SourceHalfMask) + SourceOffset; + SourceHalfMask[InputFixed - SourceOffset] = + IncomingInputs[0] - SourceOffset; + std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0], + InputFixed); + IncomingInputs[0] = InputFixed; + } + } else if (IncomingInputs.size() == 2) { + if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 || + isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) { + // We have two non-adjacent or clobbered inputs we need to extract from + // the source half. To do this, we need to map them into some adjacent + // dword slot in the source mask. + int InputsFixed[2] = {IncomingInputs[0] - SourceOffset, + IncomingInputs[1] - SourceOffset}; + + // If there is a free slot in the source half mask adjacent to one of + // the inputs, place the other input in it. We use (Index XOR 1) to + // compute an adjacent index. + if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) && + SourceHalfMask[InputsFixed[0] ^ 1] == -1) { + SourceHalfMask[InputsFixed[0]] = InputsFixed[0]; + SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1]; + InputsFixed[1] = InputsFixed[0] ^ 1; + } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) && + SourceHalfMask[InputsFixed[1] ^ 1] == -1) { + SourceHalfMask[InputsFixed[1]] = InputsFixed[1]; + SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0]; + InputsFixed[0] = InputsFixed[1] ^ 1; + } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] == -1 && + SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] == -1) { + // The two inputs are in the same DWord but it is clobbered and the + // adjacent DWord isn't used at all. Move both inputs to the free + // slot. + SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0]; + SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1]; + InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1); + InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1; + } else { + // The only way we hit this point is if there is no clobbering + // (because there are no off-half inputs to this half) and there is no + // free slot adjacent to one of the inputs. In this case, we have to + // swap an input with a non-input. + for (int i = 0; i < 4; ++i) + assert((SourceHalfMask[i] == -1 || SourceHalfMask[i] == i) && + "We can't handle any clobbers here!"); + assert(InputsFixed[1] != (InputsFixed[0] ^ 1) && + "Cannot have adjacent inputs here!"); + + SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1]; + SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1; + + // We also have to update the final source mask in this case because + // it may need to undo the above swap. + for (int &M : FinalSourceHalfMask) + if (M == (InputsFixed[0] ^ 1) + SourceOffset) + M = InputsFixed[1] + SourceOffset; + else if (M == InputsFixed[1] + SourceOffset) + M = (InputsFixed[0] ^ 1) + SourceOffset; + + InputsFixed[1] = InputsFixed[0] ^ 1; + } + + // Point everything at the fixed inputs. 
+ for (int &M : HalfMask) + if (M == IncomingInputs[0]) + M = InputsFixed[0] + SourceOffset; + else if (M == IncomingInputs[1]) + M = InputsFixed[1] + SourceOffset; + + IncomingInputs[0] = InputsFixed[0] + SourceOffset; + IncomingInputs[1] = InputsFixed[1] + SourceOffset; + } + } else { + llvm_unreachable("Unhandled input size!"); + } + + // Now hoist the DWord down to the right half. + int FreeDWord = (PSHUFDMask[DestOffset / 2] == -1 ? 0 : 1) + DestOffset / 2; + assert(PSHUFDMask[FreeDWord] == -1 && "DWord not free"); + PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2; + for (int &M : HalfMask) + for (int Input : IncomingInputs) + if (M == Input) + M = FreeDWord * 2 + Input % 2; + }; + moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask, + /*SourceOffset*/ 4, /*DestOffset*/ 0); + moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask, + /*SourceOffset*/ 0, /*DestOffset*/ 4); + + // Now enact all the shuffles we've computed to move the inputs into their + // target half. + if (!isNoopShuffleMask(PSHUFLMask)) + V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V, + getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG)); + if (!isNoopShuffleMask(PSHUFHMask)) + V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V, + getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG)); + if (!isNoopShuffleMask(PSHUFDMask)) + V = DAG.getBitcast( + VT, + DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V), + getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG))); + + // At this point, each half should contain all its inputs, and we can then + // just shuffle them into their final position. + assert(std::count_if(LoMask.begin(), LoMask.end(), + [](int M) { return M >= 4; }) == 0 && + "Failed to lift all the high half inputs to the low mask!"); + assert(std::count_if(HiMask.begin(), HiMask.end(), + [](int M) { return M >= 0 && M < 4; }) == 0 && + "Failed to lift all the low half inputs to the high mask!"); + + // Do a half shuffle for the low mask. + if (!isNoopShuffleMask(LoMask)) + V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V, + getV4X86ShuffleImm8ForMask(LoMask, DL, DAG)); + + // Do a half shuffle with the high mask after shifting its values down. + for (int &M : HiMask) + if (M >= 0) + M -= 4; + if (!isNoopShuffleMask(HiMask)) + V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V, + getV4X86ShuffleImm8ForMask(HiMask, DL, DAG)); + + return V; +} + +/// \brief Helper to form a PSHUFB-based shuffle+blend. +static SDValue lowerVectorShuffleAsPSHUFB(SDLoc DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef<int> Mask, + SelectionDAG &DAG, bool &V1InUse, + bool &V2InUse) { + SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); + SDValue V1Mask[16]; + SDValue V2Mask[16]; + V1InUse = false; + V2InUse = false; + + int Size = Mask.size(); + int Scale = 16 / Size; + for (int i = 0; i < 16; ++i) { + if (Mask[i / Scale] == -1) { + V1Mask[i] = V2Mask[i] = DAG.getUNDEF(MVT::i8); + } else { + const int ZeroMask = 0x80; + int V1Idx = Mask[i / Scale] < Size ? Mask[i / Scale] * Scale + i % Scale + : ZeroMask; + int V2Idx = Mask[i / Scale] < Size + ? 
ZeroMask + : (Mask[i / Scale] - Size) * Scale + i % Scale; + if (Zeroable[i / Scale]) + V1Idx = V2Idx = ZeroMask; + V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8); + V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8); + V1InUse |= (ZeroMask != V1Idx); + V2InUse |= (ZeroMask != V2Idx); + } + } + + if (V1InUse) + V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, + DAG.getBitcast(MVT::v16i8, V1), + DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V1Mask)); + if (V2InUse) + V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, + DAG.getBitcast(MVT::v16i8, V2), + DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V2Mask)); + + // If we need shuffled inputs from both, blend the two. + SDValue V; + if (V1InUse && V2InUse) + V = DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2); + else + V = V1InUse ? V1 : V2; + + // Cast the result back to the correct type. + return DAG.getBitcast(VT, V); +} + +/// \brief Generic lowering of 8-lane i16 shuffles. +/// +/// This handles both single-input shuffles and combined shuffle/blends with +/// two inputs. The single input shuffles are immediately delegated to +/// a dedicated lowering routine. +/// +/// The blends are lowered in one of three fundamental ways. If there are few +/// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle +/// of the input is significantly cheaper when lowered as an interleaving of +/// the two inputs, try to interleave them. Otherwise, blend the low and high +/// halves of the inputs separately (making them have relatively few inputs) +/// and then concatenate them. +static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(Op.getSimpleValueType() == MVT::v8i16 && "Bad shuffle type!"); + assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> OrigMask = SVOp->getMask(); + int MaskStorage[8] = {OrigMask[0], OrigMask[1], OrigMask[2], OrigMask[3], + OrigMask[4], OrigMask[5], OrigMask[6], OrigMask[7]}; + MutableArrayRef<int> Mask(MaskStorage); + + assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); + + // Whenever we can lower this as a zext, that instruction is strictly faster + // than any alternative. + if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend( + DL, MVT::v8i16, V1, V2, OrigMask, Subtarget, DAG)) + return ZExt; + + auto isV1 = [](int M) { return M >= 0 && M < 8; }; + (void)isV1; + auto isV2 = [](int M) { return M >= 8; }; + + int NumV2Inputs = std::count_if(Mask.begin(), Mask.end(), isV2); + + if (NumV2Inputs == 0) { + // Check for being able to broadcast a single element. + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i16, V1, + Mask, Subtarget, DAG)) + return Broadcast; + + // Try to use shift instructions. + if (SDValue Shift = + lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask, DAG)) + return Shift; + + // Use dedicated unpack instructions for masks that match their pattern. + if (SDValue V = + lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG)) + return V; + + // Try to use byte rotation instructions. 
+ if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i16, V1, V1, + Mask, Subtarget, DAG)) + return Rotate; + + return lowerV8I16GeneralSingleInputVectorShuffle(DL, MVT::v8i16, V1, Mask, + Subtarget, DAG); + } + + assert(std::any_of(Mask.begin(), Mask.end(), isV1) && + "All single-input shuffles should be canonicalized to be V1-input " + "shuffles."); + + // Try to use shift instructions. + if (SDValue Shift = + lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask, DAG)) + return Shift; + + // See if we can use SSE4A Extraction / Insertion. + if (Subtarget->hasSSE4A()) + if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask, DAG)) + return V; + + // There are special ways we can lower some single-element blends. + if (NumV2Inputs == 1) + if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v8i16, V1, V2, + Mask, Subtarget, DAG)) + return V; + + // We have different paths for blend lowering, but they all must use the + // *exact* same predicate. + bool IsBlendSupported = Subtarget->hasSSE41(); + if (IsBlendSupported) + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask, + Subtarget, DAG)) + return Blend; + + if (SDValue Masked = + lowerVectorShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask, DAG)) + return Masked; + + // Use dedicated unpack instructions for masks that match their pattern. + if (SDValue V = + lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG)) + return V; + + // Try to use byte rotation instructions. + if (SDValue Rotate = lowerVectorShuffleAsByteRotate( + DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG)) + return Rotate; + + if (SDValue BitBlend = + lowerVectorShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG)) + return BitBlend; + + if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1, + V2, Mask, DAG)) + return Unpack; + + // If we can't directly blend but can use PSHUFB, that will be better as it + // can both shuffle and set up the inefficient blend. + if (!IsBlendSupported && Subtarget->hasSSSE3()) { + bool V1InUse, V2InUse; + return lowerVectorShuffleAsPSHUFB(DL, MVT::v8i16, V1, V2, Mask, DAG, + V1InUse, V2InUse); + } + + // We can always bit-blend if we have to so the fallback strategy is to + // decompose into single-input permutes and blends. + return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2, + Mask, DAG); +} + +/// \brief Check whether a compaction lowering can be done by dropping even +/// elements and compute how many times even elements must be dropped. +/// +/// This handles shuffles which take every Nth element where N is a power of +/// two. Example shuffle masks: +/// +/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14 +/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 +/// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12 +/// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28 +/// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8 +/// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24 +/// +/// Any of these lanes can of course be undef. +/// +/// This routine only supports N <= 3. +/// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here +/// for larger N. +/// +/// \returns N above, or the number of times even elements must be dropped if +/// there is such a number. Otherwise returns zero. +static int canLowerByDroppingEvenElements(ArrayRef<int> Mask) { + // Figure out whether we're looping over two inputs or just one. 
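// --- Editor's worked example (not part of this patch) ----------------------
// For a two-input v16i8 shuffle the modulus is M = 32, so ViableForN[1]
// (N = 2) requires Mask[i] == (i << 2) & 31 for every defined lane:
//   i = 0 .. 7  ->  0, 4, 8, ..., 28
//   i = 8       ->  (8 << 2) & 31 = 0, wrapping back to the start,
// which is exactly the two-input "N = 2" pattern listed in the comment above.
// Undef lanes are skipped, so a partially-undef mask can remain viable for
// several values of N at once; the smallest surviving N is what gets returned.
// --- End editor's note ------------------------------------------------------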
+ bool IsSingleInput = isSingleInputShuffleMask(Mask); + + // The modulus for the shuffle vector entries is based on whether this is + // a single input or not. + int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2); + assert(isPowerOf2_32((uint32_t)ShuffleModulus) && + "We should only be called with masks with a power-of-2 size!"); + + uint64_t ModMask = (uint64_t)ShuffleModulus - 1; + + // We track whether the input is viable for all power-of-2 strides 2^1, 2^2, + // and 2^3 simultaneously. This is because we may have ambiguity with + // partially undef inputs. + bool ViableForN[3] = {true, true, true}; + + for (int i = 0, e = Mask.size(); i < e; ++i) { + // Ignore undef lanes, we'll optimistically collapse them to the pattern we + // want. + if (Mask[i] == -1) + continue; + + bool IsAnyViable = false; + for (unsigned j = 0; j != array_lengthof(ViableForN); ++j) + if (ViableForN[j]) { + uint64_t N = j + 1; + + // The shuffle mask must be equal to (i * 2^N) % M. + if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask)) + IsAnyViable = true; + else + ViableForN[j] = false; + } + // Early exit if we exhaust the possible powers of two. + if (!IsAnyViable) + break; + } + + for (unsigned j = 0; j != array_lengthof(ViableForN); ++j) + if (ViableForN[j]) + return j + 1; + + // Return 0 as there is no viable power of two. + return 0; +} + +/// \brief Generic lowering of v16i8 shuffles. +/// +/// This is a hybrid strategy to lower v16i8 vectors. It first attempts to +/// detect any complexity reducing interleaving. If that doesn't help, it uses +/// UNPCK to spread the i8 elements across two i16-element vectors, and uses +/// the existing lowering for v8i16 blends on each half, finally PACK-ing them +/// back together. +static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(Op.getSimpleValueType() == MVT::v16i8 && "Bad shuffle type!"); + assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"); + + // Try to use shift instructions. + if (SDValue Shift = + lowerVectorShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask, DAG)) + return Shift; + + // Try to use byte rotation instructions. + if (SDValue Rotate = lowerVectorShuffleAsByteRotate( + DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG)) + return Rotate; + + // Try to use a zext lowering. + if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend( + DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG)) + return ZExt; + + // See if we can use SSE4A Extraction / Insertion. + if (Subtarget->hasSSE4A()) + if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask, DAG)) + return V; + + int NumV2Elements = + std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 16; }); + + // For single-input shuffles, there are some nicer lowering tricks we can use. + if (NumV2Elements == 0) { + // Check for being able to broadcast a single element. + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i8, V1, + Mask, Subtarget, DAG)) + return Broadcast; + + // Check whether we can widen this to an i16 shuffle by duplicating bytes. + // Notably, this handles splat and partial-splat shuffles more efficiently. 
+ // However, it only makes sense if the pre-duplication shuffle simplifies + // things significantly. Currently, this means we need to be able to + // express the pre-duplication shuffle as an i16 shuffle. + // + // FIXME: We should check for other patterns which can be widened into an + // i16 shuffle as well. + auto canWidenViaDuplication = [](ArrayRef<int> Mask) { + for (int i = 0; i < 16; i += 2) + if (Mask[i] != -1 && Mask[i + 1] != -1 && Mask[i] != Mask[i + 1]) + return false; + + return true; + }; + auto tryToWidenViaDuplication = [&]() -> SDValue { + if (!canWidenViaDuplication(Mask)) + return SDValue(); + SmallVector<int, 4> LoInputs; + std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(LoInputs), + [](int M) { return M >= 0 && M < 8; }); + std::sort(LoInputs.begin(), LoInputs.end()); + LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), + LoInputs.end()); + SmallVector<int, 4> HiInputs; + std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(HiInputs), + [](int M) { return M >= 8; }); + std::sort(HiInputs.begin(), HiInputs.end()); + HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), + HiInputs.end()); + + bool TargetLo = LoInputs.size() >= HiInputs.size(); + ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs; + ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs; + + int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1}; + SmallDenseMap<int, int, 8> LaneMap; + for (int I : InPlaceInputs) { + PreDupI16Shuffle[I/2] = I/2; + LaneMap[I] = I; + } + int j = TargetLo ? 0 : 4, je = j + 4; + for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) { + // Check if j is already a shuffle of this input. This happens when + // there are two adjacent bytes after we move the low one. + if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) { + // If we haven't yet mapped the input, search for a slot into which + // we can map it. + while (j < je && PreDupI16Shuffle[j] != -1) + ++j; + + if (j == je) + // We can't place the inputs into a single half with a simple i16 shuffle, so bail. + return SDValue(); + + // Map this input with the i16 shuffle. + PreDupI16Shuffle[j] = MovingInputs[i] / 2; + } + + // Update the lane map based on the mapping we ended up with. + LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2; + } + V1 = DAG.getBitcast( + MVT::v16i8, + DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1), + DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle)); + + // Unpack the bytes to form the i16s that will be shuffled into place. + V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL, + MVT::v16i8, V1, V1); + + int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; + for (int i = 0; i < 16; ++i) + if (Mask[i] != -1) { + int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8); + assert(MappedMask < 8 && "Invalid v8 shuffle mask!"); + if (PostDupI16Shuffle[i / 2] == -1) + PostDupI16Shuffle[i / 2] = MappedMask; + else + assert(PostDupI16Shuffle[i / 2] == MappedMask && + "Conflicting entrties in the original shuffle!"); + } + return DAG.getBitcast( + MVT::v16i8, + DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1), + DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle)); + }; + if (SDValue V = tryToWidenViaDuplication()) + return V; + } + + if (SDValue Masked = + lowerVectorShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask, DAG)) + return Masked; + + // Use dedicated unpack instructions for masks that match their pattern. 
+ if (SDValue V = + lowerVectorShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG)) + return V; + + // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly + // with PSHUFB. It is important to do this before we attempt to generate any + // blends but after all of the single-input lowerings. If the single input + // lowerings can find an instruction sequence that is faster than a PSHUFB, we + // want to preserve that and we can DAG combine any longer sequences into + // a PSHUFB in the end. But once we start blending from multiple inputs, + // the complexity of DAG combining bad patterns back into PSHUFB is too high, + // and there are *very* few patterns that would actually be faster than the + // PSHUFB approach because of its ability to zero lanes. + // + // FIXME: The only exceptions to the above are blends which are exact + // interleavings with direct instructions supporting them. We currently don't + // handle those well here. + if (Subtarget->hasSSSE3()) { + bool V1InUse = false; + bool V2InUse = false; + + SDValue PSHUFB = lowerVectorShuffleAsPSHUFB(DL, MVT::v16i8, V1, V2, Mask, + DAG, V1InUse, V2InUse); + + // If both V1 and V2 are in use and we can use a direct blend or an unpack, + // do so. This avoids using them to handle blends-with-zero which is + // important as a single pshufb is significantly faster for that. + if (V1InUse && V2InUse) { + if (Subtarget->hasSSE41()) + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i8, V1, V2, + Mask, Subtarget, DAG)) + return Blend; + + // We can use an unpack to do the blending rather than an or in some + // cases. Even though the or may be (very minorly) more efficient, we + // preference this lowering because there are common cases where part of + // the complexity of the shuffles goes away when we do the final blend as + // an unpack. + // FIXME: It might be worth trying to detect if the unpack-feeding + // shuffles will both be pshufb, in which case we shouldn't bother with + // this. + if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack( + DL, MVT::v16i8, V1, V2, Mask, DAG)) + return Unpack; + } + + return PSHUFB; + } + + // There are special ways we can lower some single-element blends. + if (NumV2Elements == 1) + if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v16i8, V1, V2, + Mask, Subtarget, DAG)) + return V; + + if (SDValue BitBlend = + lowerVectorShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG)) + return BitBlend; + + // Check whether a compaction lowering can be done. This handles shuffles + // which take every Nth element for some even N. See the helper function for + // details. + // + // We special case these as they can be particularly efficiently handled with + // the PACKUSB instruction on x86 and they show up in common patterns of + // rearranging bytes to truncate wide elements. + if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask)) { + // NumEvenDrops is the power of two stride of the elements. Another way of + // thinking about it is that we need to drop the even elements this many + // times to get the original input. + bool IsSingleInput = isSingleInputShuffleMask(Mask); + + // First we need to zero all the dropped bytes. + assert(NumEvenDrops <= 3 && + "No support for dropping even elements more than 3 times."); + // We use the mask type to pick which bytes are preserved based on how many + // elements are dropped. 
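// --- Editor's worked example (not part of this patch) ----------------------
// For NumEvenDrops == 2 the clear mask is built as v4i32 0xFF, so only byte 0
// of every dword survives the AND. Packing then proceeds in rounds:
//   after AND : x0 0 0 0 x4 0 0 0 x8 0 0 0 ...   (x = original byte)
//   PACKUSWB  : each i16 lane holds 0x00xx, which saturates to xx, halving
//               the element spacing,
// and the second PACKUSWB round compacts the remaining lanes, leaving the
// bytes originally at indices 0, 4, 8, ... in the low lanes -- the
// "take every 4th element" mask this path was selected for.
// --- End editor's note ------------------------------------------------------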
+ MVT MaskVTs[] = { MVT::v8i16, MVT::v4i32, MVT::v2i64 }; + SDValue ByteClearMask = DAG.getBitcast( + MVT::v16i8, DAG.getConstant(0xFF, DL, MaskVTs[NumEvenDrops - 1])); + V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask); + if (!IsSingleInput) + V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask); + + // Now pack things back together. + V1 = DAG.getBitcast(MVT::v8i16, V1); + V2 = IsSingleInput ? V1 : DAG.getBitcast(MVT::v8i16, V2); + SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2); + for (int i = 1; i < NumEvenDrops; ++i) { + Result = DAG.getBitcast(MVT::v8i16, Result); + Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result); + } + + return Result; + } + + // Handle multi-input cases by blending single-input shuffles. + if (NumV2Elements > 0) + return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v16i8, V1, V2, + Mask, DAG); + + // The fallback path for single-input shuffles widens this into two v8i16 + // vectors with unpacks, shuffles those, and then pulls them back together + // with a pack. + SDValue V = V1; + + int LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; + int HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; + for (int i = 0; i < 16; ++i) + if (Mask[i] >= 0) + (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i]; + + SDValue Zero = getZeroVector(MVT::v8i16, Subtarget, DAG, DL); + + SDValue VLoHalf, VHiHalf; + // Check if any of the odd lanes in the v16i8 are used. If not, we can mask + // them out and avoid using UNPCK{L,H} to extract the elements of V as + // i16s. + if (std::none_of(std::begin(LoBlendMask), std::end(LoBlendMask), + [](int M) { return M >= 0 && M % 2 == 1; }) && + std::none_of(std::begin(HiBlendMask), std::end(HiBlendMask), + [](int M) { return M >= 0 && M % 2 == 1; })) { + // Use a mask to drop the high bytes. + VLoHalf = DAG.getBitcast(MVT::v8i16, V); + VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf, + DAG.getConstant(0x00FF, DL, MVT::v8i16)); + + // This will be a single vector shuffle instead of a blend so nuke VHiHalf. + VHiHalf = DAG.getUNDEF(MVT::v8i16); + + // Squash the masks to point directly into VLoHalf. + for (int &M : LoBlendMask) + if (M >= 0) + M /= 2; + for (int &M : HiBlendMask) + if (M >= 0) + M /= 2; + } else { + // Otherwise just unpack the low half of V into VLoHalf and the high half into + // VHiHalf so that we can blend them as i16s. + VLoHalf = DAG.getBitcast( + MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero)); + VHiHalf = DAG.getBitcast( + MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero)); + } + + SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask); + SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask); + + return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV); +} + +/// \brief Dispatching routine to lower various 128-bit x86 vector shuffles. +/// +/// This routine breaks down the specific type of 128-bit shuffle and +/// dispatches to the lowering routines accordingly. 
+static SDValue lower128BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2, + MVT VT, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + switch (VT.SimpleTy) { + case MVT::v2i64: + return lowerV2I64VectorShuffle(Op, V1, V2, Subtarget, DAG); + case MVT::v2f64: + return lowerV2F64VectorShuffle(Op, V1, V2, Subtarget, DAG); + case MVT::v4i32: + return lowerV4I32VectorShuffle(Op, V1, V2, Subtarget, DAG); + case MVT::v4f32: + return lowerV4F32VectorShuffle(Op, V1, V2, Subtarget, DAG); + case MVT::v8i16: + return lowerV8I16VectorShuffle(Op, V1, V2, Subtarget, DAG); + case MVT::v16i8: + return lowerV16I8VectorShuffle(Op, V1, V2, Subtarget, DAG); + + default: + llvm_unreachable("Unimplemented!"); + } +} + +/// \brief Helper function to test whether a shuffle mask could be +/// simplified by widening the elements being shuffled. +/// +/// Appends the mask for wider elements in WidenedMask if valid. Otherwise +/// leaves it in an unspecified state. +/// +/// NOTE: This must handle normal vector shuffle masks and *target* vector +/// shuffle masks. The latter have the special property of a '-2' representing +/// a zero-ed lane of a vector. +static bool canWidenShuffleElements(ArrayRef<int> Mask, + SmallVectorImpl<int> &WidenedMask) { + for (int i = 0, Size = Mask.size(); i < Size; i += 2) { + // If both elements are undef, its trivial. + if (Mask[i] == SM_SentinelUndef && Mask[i + 1] == SM_SentinelUndef) { + WidenedMask.push_back(SM_SentinelUndef); + continue; + } + + // Check for an undef mask and a mask value properly aligned to fit with + // a pair of values. If we find such a case, use the non-undef mask's value. + if (Mask[i] == SM_SentinelUndef && Mask[i + 1] >= 0 && Mask[i + 1] % 2 == 1) { + WidenedMask.push_back(Mask[i + 1] / 2); + continue; + } + if (Mask[i + 1] == SM_SentinelUndef && Mask[i] >= 0 && Mask[i] % 2 == 0) { + WidenedMask.push_back(Mask[i] / 2); + continue; + } + + // When zeroing, we need to spread the zeroing across both lanes to widen. + if (Mask[i] == SM_SentinelZero || Mask[i + 1] == SM_SentinelZero) { + if ((Mask[i] == SM_SentinelZero || Mask[i] == SM_SentinelUndef) && + (Mask[i + 1] == SM_SentinelZero || Mask[i + 1] == SM_SentinelUndef)) { + WidenedMask.push_back(SM_SentinelZero); + continue; + } + return false; + } + + // Finally check if the two mask values are adjacent and aligned with + // a pair. + if (Mask[i] != SM_SentinelUndef && Mask[i] % 2 == 0 && Mask[i] + 1 == Mask[i + 1]) { + WidenedMask.push_back(Mask[i] / 2); + continue; + } + + // Otherwise we can't safely widen the elements used in this shuffle. + return false; + } + assert(WidenedMask.size() == Mask.size() / 2 && + "Incorrect size of mask after widening the elements!"); + + return true; +} + +/// \brief Generic routine to split vector shuffle into half-sized shuffles. +/// +/// This routine just extracts two subvectors, shuffles them independently, and +/// then concatenates them back together. This should work effectively with all +/// AVX vector shuffle types. 
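// --- Editor's note (illustration, not part of this patch) -----------------
// A standalone sketch of the widening rule implemented by
// canWidenShuffleElements above, reduced to plain integers so the pairing
// logic can be exercised in isolation. The sentinels mirror SM_SentinelUndef
// (-1) and SM_SentinelZero (-2); the helper name widenMask is hypothetical.
#include <cassert>
#include <cstddef>
#include <vector>

namespace {
constexpr int Undef = -1;
constexpr int Zero = -2;

bool widenMask(const std::vector<int> &Mask, std::vector<int> &Widened) {
  for (std::size_t i = 0; i < Mask.size(); i += 2) {
    int Lo = Mask[i], Hi = Mask[i + 1];
    if (Lo == Undef && Hi == Undef) { Widened.push_back(Undef); continue; }
    if (Lo == Undef && Hi >= 0 && Hi % 2 == 1) { Widened.push_back(Hi / 2); continue; }
    if (Hi == Undef && Lo >= 0 && Lo % 2 == 0) { Widened.push_back(Lo / 2); continue; }
    if (Lo == Zero || Hi == Zero) {
      if ((Lo == Zero || Lo == Undef) && (Hi == Zero || Hi == Undef)) {
        Widened.push_back(Zero);
        continue;
      }
      return false; // A zero paired with a real element cannot be widened.
    }
    if (Lo >= 0 && Lo % 2 == 0 && Hi == Lo + 1) { Widened.push_back(Lo / 2); continue; }
    return false; // The pair does not form one aligned wide element.
  }
  return true;
}
} // namespace

int main() {
  std::vector<int> W;
  // A v4i64 mask <0, 1, 6, 7> widens to the v2 (128-bit element) mask <0, 3>.
  assert(widenMask({0, 1, 6, 7}, W) && (W == std::vector<int>{0, 3}));
  // <1, 0, 2, 3> cannot widen: the first pair straddles two wide elements.
  W.clear();
  assert(!widenMask({1, 0, 2, 3}, W));
  return 0;
}
// --- End editor's note -----------------------------------------------------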
+static SDValue splitAndLowerVectorShuffle(SDLoc DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef<int> Mask, + SelectionDAG &DAG) { + assert(VT.getSizeInBits() >= 256 && + "Only for 256-bit or wider vector shuffles!"); + assert(V1.getSimpleValueType() == VT && "Bad operand type!"); + assert(V2.getSimpleValueType() == VT && "Bad operand type!"); + + ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2); + ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2); + + int NumElements = VT.getVectorNumElements(); + int SplitNumElements = NumElements / 2; + MVT ScalarVT = VT.getVectorElementType(); + MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2); + + // Rather than splitting build-vectors, just build two narrower build + // vectors. This helps shuffling with splats and zeros. + auto SplitVector = [&](SDValue V) { + while (V.getOpcode() == ISD::BITCAST) + V = V->getOperand(0); + + MVT OrigVT = V.getSimpleValueType(); + int OrigNumElements = OrigVT.getVectorNumElements(); + int OrigSplitNumElements = OrigNumElements / 2; + MVT OrigScalarVT = OrigVT.getVectorElementType(); + MVT OrigSplitVT = MVT::getVectorVT(OrigScalarVT, OrigNumElements / 2); + + SDValue LoV, HiV; + + auto *BV = dyn_cast<BuildVectorSDNode>(V); + if (!BV) { + LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V, + DAG.getIntPtrConstant(0, DL)); + HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V, + DAG.getIntPtrConstant(OrigSplitNumElements, DL)); + } else { + + SmallVector<SDValue, 16> LoOps, HiOps; + for (int i = 0; i < OrigSplitNumElements; ++i) { + LoOps.push_back(BV->getOperand(i)); + HiOps.push_back(BV->getOperand(i + OrigSplitNumElements)); + } + LoV = DAG.getNode(ISD::BUILD_VECTOR, DL, OrigSplitVT, LoOps); + HiV = DAG.getNode(ISD::BUILD_VECTOR, DL, OrigSplitVT, HiOps); + } + return std::make_pair(DAG.getBitcast(SplitVT, LoV), + DAG.getBitcast(SplitVT, HiV)); + }; + + SDValue LoV1, HiV1, LoV2, HiV2; + std::tie(LoV1, HiV1) = SplitVector(V1); + std::tie(LoV2, HiV2) = SplitVector(V2); + + // Now create two 4-way blends of these half-width vectors. + auto HalfBlend = [&](ArrayRef<int> HalfMask) { + bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false; + SmallVector<int, 32> V1BlendMask, V2BlendMask, BlendMask; + for (int i = 0; i < SplitNumElements; ++i) { + int M = HalfMask[i]; + if (M >= NumElements) { + if (M >= NumElements + SplitNumElements) + UseHiV2 = true; + else + UseLoV2 = true; + V2BlendMask.push_back(M - NumElements); + V1BlendMask.push_back(-1); + BlendMask.push_back(SplitNumElements + i); + } else if (M >= 0) { + if (M >= SplitNumElements) + UseHiV1 = true; + else + UseLoV1 = true; + V2BlendMask.push_back(-1); + V1BlendMask.push_back(M); + BlendMask.push_back(i); + } else { + V2BlendMask.push_back(-1); + V1BlendMask.push_back(-1); + BlendMask.push_back(-1); + } + } + + // Because the lowering happens after all combining takes place, we need to + // manually combine these blend masks as much as possible so that we create + // a minimal number of high-level vector shuffle nodes. + + // First try just blending the halves of V1 or V2. 
+ if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2) + return DAG.getUNDEF(SplitVT); + if (!UseLoV2 && !UseHiV2) + return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask); + if (!UseLoV1 && !UseHiV1) + return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask); + + SDValue V1Blend, V2Blend; + if (UseLoV1 && UseHiV1) { + V1Blend = + DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask); + } else { + // We only use half of V1 so map the usage down into the final blend mask. + V1Blend = UseLoV1 ? LoV1 : HiV1; + for (int i = 0; i < SplitNumElements; ++i) + if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements) + BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements); + } + if (UseLoV2 && UseHiV2) { + V2Blend = + DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask); + } else { + // We only use half of V2 so map the usage down into the final blend mask. + V2Blend = UseLoV2 ? LoV2 : HiV2; + for (int i = 0; i < SplitNumElements; ++i) + if (BlendMask[i] >= SplitNumElements) + BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0); + } + return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask); + }; + SDValue Lo = HalfBlend(LoMask); + SDValue Hi = HalfBlend(HiMask); + return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi); +} + +/// \brief Either split a vector in halves or decompose the shuffles and the +/// blend. +/// +/// This is provided as a good fallback for many lowerings of non-single-input +/// shuffles with more than one 128-bit lane. In those cases, we want to select +/// between splitting the shuffle into 128-bit components and stitching those +/// back together vs. extracting the single-input shuffles and blending those +/// results. +static SDValue lowerVectorShuffleAsSplitOrBlend(SDLoc DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef<int> Mask, + SelectionDAG &DAG) { + assert(!isSingleInputShuffleMask(Mask) && "This routine must not be used to " + "lower single-input shuffles as it " + "could then recurse on itself."); + int Size = Mask.size(); + + // If this can be modeled as a broadcast of two elements followed by a blend, + // prefer that lowering. This is especially important because broadcasts can + // often fold with memory operands. + auto DoBothBroadcast = [&] { + int V1BroadcastIdx = -1, V2BroadcastIdx = -1; + for (int M : Mask) + if (M >= Size) { + if (V2BroadcastIdx == -1) + V2BroadcastIdx = M - Size; + else if (M - Size != V2BroadcastIdx) + return false; + } else if (M >= 0) { + if (V1BroadcastIdx == -1) + V1BroadcastIdx = M; + else if (M != V1BroadcastIdx) + return false; + } + return true; + }; + if (DoBothBroadcast()) + return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, + DAG); + + // If the inputs all stem from a single 128-bit lane of each input, then we + // split them rather than blending because the split will decompose to + // unusually few instructions. + int LaneCount = VT.getSizeInBits() / 128; + int LaneSize = Size / LaneCount; + SmallBitVector LaneInputs[2]; + LaneInputs[0].resize(LaneCount, false); + LaneInputs[1].resize(LaneCount, false); + for (int i = 0; i < Size; ++i) + if (Mask[i] >= 0) + LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true; + if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1) + return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG); + + // Otherwise, just fall back to decomposed shuffles and a blend. This requires + // that the decomposed single-input shuffles don't end up here. 
+ return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG); +} + +/// \brief Lower a vector shuffle crossing multiple 128-bit lanes as +/// a permutation and blend of those lanes. +/// +/// This essentially blends the out-of-lane inputs to each lane into the lane +/// from a permuted copy of the vector. This lowering strategy results in four +/// instructions in the worst case for a single-input cross lane shuffle which +/// is lower than any other fully general cross-lane shuffle strategy I'm aware +/// of. Special cases for each particular shuffle pattern should be handled +/// prior to trying this lowering. +static SDValue lowerVectorShuffleAsLanePermuteAndBlend(SDLoc DL, MVT VT, + SDValue V1, SDValue V2, + ArrayRef<int> Mask, + SelectionDAG &DAG) { + // FIXME: This should probably be generalized for 512-bit vectors as well. + assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!"); + int LaneSize = Mask.size() / 2; + + // If there are only inputs from one 128-bit lane, splitting will in fact be + // less expensive. The flags track whether the given lane contains an element + // that crosses to another lane. + bool LaneCrossing[2] = {false, false}; + for (int i = 0, Size = Mask.size(); i < Size; ++i) + if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize) + LaneCrossing[(Mask[i] % Size) / LaneSize] = true; + if (!LaneCrossing[0] || !LaneCrossing[1]) + return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG); + + if (isSingleInputShuffleMask(Mask)) { + SmallVector<int, 32> FlippedBlendMask; + for (int i = 0, Size = Mask.size(); i < Size; ++i) + FlippedBlendMask.push_back( + Mask[i] < 0 ? -1 : (((Mask[i] % Size) / LaneSize == i / LaneSize) + ? Mask[i] + : Mask[i] % LaneSize + + (i / LaneSize) * LaneSize + Size)); + + // Flip the vector, and blend the results which should now be in-lane. The + // VPERM2X128 mask uses the low 2 bits for the low source and bits 4 and + // 5 for the high source. The value 3 selects the high half of source 2 and + // the value 2 selects the low half of source 2. We only use source 2 to + // allow folding it into a memory operand. + unsigned PERMMask = 3 | 2 << 4; + SDValue Flipped = DAG.getNode(X86ISD::VPERM2X128, DL, VT, DAG.getUNDEF(VT), + V1, DAG.getConstant(PERMMask, DL, MVT::i8)); + return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask); + } + + // This now reduces to two single-input shuffles of V1 and V2 which at worst + // will be handled by the above logic and a blend of the results, much like + // other patterns in AVX. + return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG); +} + +/// \brief Handle lowering 2-lane 128-bit shuffles. +static SDValue lowerV2X128VectorShuffle(SDLoc DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef<int> Mask, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + // TODO: If minimizing size and one of the inputs is a zero vector and the + // the zero vector has only one use, we could use a VPERM2X128 to save the + // instruction bytes needed to explicitly generate the zero vector. + + // Blends are faster and handle all the non-lane-crossing cases. + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask, + Subtarget, DAG)) + return Blend; + + bool IsV1Zero = ISD::isBuildVectorAllZeros(V1.getNode()); + bool IsV2Zero = ISD::isBuildVectorAllZeros(V2.getNode()); + + // If either input operand is a zero vector, use VPERM2X128 because its mask + // allows us to replace the zero input with an implicit zero. 
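// --- Editor's worked example (not part of this patch) ----------------------
// Immediate construction below, on a v4f64 mask <2, 3, 4, 5> with V2 known to
// be all zeros: MaskLO = 2 and MaskHI = 4 give
//   PermMask = 2/2 | (4/2) << 4 = 0x21
// (low half <- V1's high lane, high half <- V2's low lane). Because V2 is the
// zero vector and MaskHI >= 4, the high nibble is replaced by the zero bit:
//   PermMask = (0x21 & 0x0f) | 0x80 = 0x81
// and V2 is dropped to undef, so the zeroed half comes for free from
// VPERM2X128 itself.
// --- End editor's note ------------------------------------------------------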
+ if (!IsV1Zero && !IsV2Zero) { + // Check for patterns which can be matched with a single insert of a 128-bit + // subvector. + bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1}); + if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) { + MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), + VT.getVectorNumElements() / 2); + SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1, + DAG.getIntPtrConstant(0, DL)); + SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, + OnlyUsesV1 ? V1 : V2, + DAG.getIntPtrConstant(0, DL)); + return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV); + } + } + + // Otherwise form a 128-bit permutation. After accounting for undefs, + // convert the 64-bit shuffle mask selection values into 128-bit + // selection bits by dividing the indexes by 2 and shifting into positions + // defined by a vperm2*128 instruction's immediate control byte. + + // The immediate permute control byte looks like this: + // [1:0] - select 128 bits from sources for low half of destination + // [2] - ignore + // [3] - zero low half of destination + // [5:4] - select 128 bits from sources for high half of destination + // [6] - ignore + // [7] - zero high half of destination + + int MaskLO = Mask[0]; + if (MaskLO == SM_SentinelUndef) + MaskLO = Mask[1] == SM_SentinelUndef ? 0 : Mask[1]; + + int MaskHI = Mask[2]; + if (MaskHI == SM_SentinelUndef) + MaskHI = Mask[3] == SM_SentinelUndef ? 0 : Mask[3]; + + unsigned PermMask = MaskLO / 2 | (MaskHI / 2) << 4; + + // If either input is a zero vector, replace it with an undef input. + // Shuffle mask values < 4 are selecting elements of V1. + // Shuffle mask values >= 4 are selecting elements of V2. + // Adjust each half of the permute mask by clearing the half that was + // selecting the zero vector and setting the zero mask bit. + if (IsV1Zero) { + V1 = DAG.getUNDEF(VT); + if (MaskLO < 4) + PermMask = (PermMask & 0xf0) | 0x08; + if (MaskHI < 4) + PermMask = (PermMask & 0x0f) | 0x80; + } + if (IsV2Zero) { + V2 = DAG.getUNDEF(VT); + if (MaskLO >= 4) + PermMask = (PermMask & 0xf0) | 0x08; + if (MaskHI >= 4) + PermMask = (PermMask & 0x0f) | 0x80; + } + + return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2, + DAG.getConstant(PermMask, DL, MVT::i8)); +} + +/// \brief Lower a vector shuffle by first fixing the 128-bit lanes and then +/// shuffling each lane. +/// +/// This will only succeed when the result of fixing the 128-bit lanes results +/// in a single-input non-lane-crossing shuffle with a repeating shuffle mask in +/// each 128-bit lanes. This handles many cases where we can quickly blend away +/// the lane crosses early and then use simpler shuffles within each lane. +/// +/// FIXME: It might be worthwhile at some point to support this without +/// requiring the 128-bit lane-relative shuffles to be repeating, but currently +/// in x86 only floating point has interesting non-repeating shuffles, and even +/// those are still *marginally* more expensive. +static SDValue lowerVectorShuffleByMerging128BitLanes( + SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, + const X86Subtarget *Subtarget, SelectionDAG &DAG) { + assert(!isSingleInputShuffleMask(Mask) && + "This is only useful with multiple inputs."); + + int Size = Mask.size(); + int LaneSize = 128 / VT.getScalarSizeInBits(); + int NumLanes = Size / LaneSize; + assert(NumLanes > 1 && "Only handles 256-bit and wider shuffles."); + + // See if we can build a hypothetical 128-bit lane-fixing shuffle mask. 
Also + // check whether the in-128-bit lane shuffles share a repeating pattern. + SmallVector<int, 4> Lanes; + Lanes.resize(NumLanes, -1); + SmallVector<int, 4> InLaneMask; + InLaneMask.resize(LaneSize, -1); + for (int i = 0; i < Size; ++i) { + if (Mask[i] < 0) + continue; + + int j = i / LaneSize; + + if (Lanes[j] < 0) { + // First entry we've seen for this lane. + Lanes[j] = Mask[i] / LaneSize; + } else if (Lanes[j] != Mask[i] / LaneSize) { + // This doesn't match the lane selected previously! + return SDValue(); + } + + // Check that within each lane we have a consistent shuffle mask. + int k = i % LaneSize; + if (InLaneMask[k] < 0) { + InLaneMask[k] = Mask[i] % LaneSize; + } else if (InLaneMask[k] != Mask[i] % LaneSize) { + // This doesn't fit a repeating in-lane mask. + return SDValue(); + } + } + + // First shuffle the lanes into place. + MVT LaneVT = MVT::getVectorVT(VT.isFloatingPoint() ? MVT::f64 : MVT::i64, + VT.getSizeInBits() / 64); + SmallVector<int, 8> LaneMask; + LaneMask.resize(NumLanes * 2, -1); + for (int i = 0; i < NumLanes; ++i) + if (Lanes[i] >= 0) { + LaneMask[2 * i + 0] = 2*Lanes[i] + 0; + LaneMask[2 * i + 1] = 2*Lanes[i] + 1; + } + + V1 = DAG.getBitcast(LaneVT, V1); + V2 = DAG.getBitcast(LaneVT, V2); + SDValue LaneShuffle = DAG.getVectorShuffle(LaneVT, DL, V1, V2, LaneMask); + + // Cast it back to the type we actually want. + LaneShuffle = DAG.getBitcast(VT, LaneShuffle); + + // Now do a simple shuffle that isn't lane crossing. + SmallVector<int, 8> NewMask; + NewMask.resize(Size, -1); + for (int i = 0; i < Size; ++i) + if (Mask[i] >= 0) + NewMask[i] = (i / LaneSize) * LaneSize + Mask[i] % LaneSize; + assert(!is128BitLaneCrossingShuffleMask(VT, NewMask) && + "Must not introduce lane crosses at this point!"); + + return DAG.getVectorShuffle(VT, DL, LaneShuffle, DAG.getUNDEF(VT), NewMask); +} + +/// Lower shuffles where an entire half of a 256-bit vector is UNDEF. +/// This allows for fast cases such as subvector extraction/insertion +/// or shuffling smaller vector types which can lower more efficiently. +static SDValue lowerVectorShuffleWithUndefHalf(SDLoc DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef<int> Mask, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + assert(VT.getSizeInBits() == 256 && "Expected 256-bit vector"); + + unsigned NumElts = VT.getVectorNumElements(); + unsigned HalfNumElts = NumElts / 2; + MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts); + + bool UndefLower = isUndefInRange(Mask, 0, HalfNumElts); + bool UndefUpper = isUndefInRange(Mask, HalfNumElts, HalfNumElts); + if (!UndefLower && !UndefUpper) + return SDValue(); + + // Upper half is undef and lower half is whole upper subvector. + // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u> + if (UndefUpper && + isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) { + SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1, + DAG.getIntPtrConstant(HalfNumElts, DL)); + return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi, + DAG.getIntPtrConstant(0, DL)); + } + + // Lower half is undef and upper half is whole lower subvector. + // e.g. 
vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1> + if (UndefLower && + isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) { + SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1, + DAG.getIntPtrConstant(0, DL)); + return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi, + DAG.getIntPtrConstant(HalfNumElts, DL)); + } + + // AVX2 supports efficient immediate 64-bit element cross-lane shuffles. + if (UndefLower && Subtarget->hasAVX2() && + (VT == MVT::v4f64 || VT == MVT::v4i64)) + return SDValue(); + + // If the shuffle only uses the lower halves of the input operands, + // then extract them and perform the 'half' shuffle at half width. + // e.g. vector_shuffle <X, X, X, X, u, u, u, u> or <X, X, u, u> + int HalfIdx1 = -1, HalfIdx2 = -1; + SmallVector<int, 8> HalfMask; + unsigned Offset = UndefLower ? HalfNumElts : 0; + for (unsigned i = 0; i != HalfNumElts; ++i) { + int M = Mask[i + Offset]; + if (M < 0) { + HalfMask.push_back(M); + continue; + } + + // Determine which of the 4 half vectors this element is from. + // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2. + int HalfIdx = M / HalfNumElts; + + // Only shuffle using the lower halves of the inputs. + // TODO: Investigate usefulness of shuffling with upper halves. + if (HalfIdx != 0 && HalfIdx != 2) + return SDValue(); + + // Determine the element index into its half vector source. + int HalfElt = M % HalfNumElts; + + // We can shuffle with up to 2 half vectors, set the new 'half' + // shuffle mask accordingly. + if (-1 == HalfIdx1 || HalfIdx1 == HalfIdx) { + HalfMask.push_back(HalfElt); + HalfIdx1 = HalfIdx; + continue; + } + if (-1 == HalfIdx2 || HalfIdx2 == HalfIdx) { + HalfMask.push_back(HalfElt + HalfNumElts); + HalfIdx2 = HalfIdx; + continue; + } + + // Too many half vectors referenced. + return SDValue(); + } + assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length"); + + auto GetHalfVector = [&](int HalfIdx) { + if (HalfIdx < 0) + return DAG.getUNDEF(HalfVT); + SDValue V = (HalfIdx < 2 ? V1 : V2); + HalfIdx = (HalfIdx % 2) * HalfNumElts; + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V, + DAG.getIntPtrConstant(HalfIdx, DL)); + }; + + SDValue Half1 = GetHalfVector(HalfIdx1); + SDValue Half2 = GetHalfVector(HalfIdx2); + SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask); + return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V, + DAG.getIntPtrConstant(Offset, DL)); +} + +/// \brief Test whether the specified input (0 or 1) is in-place blended by the +/// given mask. +/// +/// This returns true if the elements from a particular input are already in the +/// slot required by the given mask and require no permutation. +static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) { + assert((Input == 0 || Input == 1) && "Only two inputs to shuffles."); + int Size = Mask.size(); + for (int i = 0; i < Size; ++i) + if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i) + return false; + + return true; +} + +static SDValue lowerVectorShuffleWithSHUFPD(SDLoc DL, MVT VT, + ArrayRef<int> Mask, SDValue V1, + SDValue V2, SelectionDAG &DAG) { + + // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, .. + // Mask for V4F64; 0/1, 4/5, 2/3, 6/7.. 
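// --- Editor's worked example (not part of this patch) ----------------------
// For v4f64 the per-lane targets computed below are Val = (i & 6) + 4*(i & 1):
//   i = 0 -> {0,1},  i = 1 -> {4,5},  i = 2 -> {2,3},  i = 3 -> {6,7}
// so the mask <1, 5, 2, 7> is a direct SHUFPD pattern. Each immediate bit is
// Mask[i] % 2, giving Imm = 1 | 1<<1 | 0<<2 | 1<<3 = 0xB for this example.
// The commutable variant checks the same property with the V1/V2 roles
// swapped and emits SHUFP with the operands reversed.
// --- End editor's note ------------------------------------------------------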
+ assert(VT.getScalarSizeInBits() == 64 && "Unexpected data type for VSHUFPD"); + int NumElts = VT.getVectorNumElements(); + bool ShufpdMask = true; + bool CommutableMask = true; + unsigned Immediate = 0; + for (int i = 0; i < NumElts; ++i) { + if (Mask[i] < 0) + continue; + int Val = (i & 6) + NumElts * (i & 1); + int CommutVal = (i & 0xe) + NumElts * ((i & 1)^1); + if (Mask[i] < Val || Mask[i] > Val + 1) + ShufpdMask = false; + if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1) + CommutableMask = false; + Immediate |= (Mask[i] % 2) << i; + } + if (ShufpdMask) + return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2, + DAG.getConstant(Immediate, DL, MVT::i8)); + if (CommutableMask) + return DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1, + DAG.getConstant(Immediate, DL, MVT::i8)); + return SDValue(); +} + +/// \brief Handle lowering of 4-lane 64-bit floating point shuffles. +/// +/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2 +/// isn't available. +static SDValue lowerV4F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); + + SmallVector<int, 4> WidenedMask; + if (canWidenShuffleElements(Mask, WidenedMask)) + return lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask, Subtarget, + DAG); + + if (isSingleInputShuffleMask(Mask)) { + // Check for being able to broadcast a single element. + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4f64, V1, + Mask, Subtarget, DAG)) + return Broadcast; + + // Use low duplicate instructions for masks that match their pattern. + if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2})) + return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1); + + if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) { + // Non-half-crossing single input shuffles can be lowerid with an + // interleaved permutation. + unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) | + ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3); + return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1, + DAG.getConstant(VPERMILPMask, DL, MVT::i8)); + } + + // With AVX2 we have direct support for this permutation. + if (Subtarget->hasAVX2()) + return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1, + getV4X86ShuffleImm8ForMask(Mask, DL, DAG)); + + // Otherwise, fall back. + return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask, + DAG); + } + + // Use dedicated unpack instructions for masks that match their pattern. + if (SDValue V = + lowerVectorShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG)) + return V; + + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask, + Subtarget, DAG)) + return Blend; + + // Check if the blend happens to exactly fit that of SHUFPD. + if (SDValue Op = + lowerVectorShuffleWithSHUFPD(DL, MVT::v4f64, Mask, V1, V2, DAG)) + return Op; + + // Try to simplify this by merging 128-bit lanes to enable a lane-based + // shuffle. However, if we have AVX2 and either inputs are already in place, + // we will be able to shuffle even across lanes the other input in a single + // instruction so skip this pattern. 
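+  // For example, with the v4f64 mask <0, 7, 2, 6> input 0 is already in
+  // place, and with AVX2 the cross-lane movement of V2's elements can be done
+  // by the decomposed shuffle/blend below, so the 128-bit lane merge is
+  // skipped.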
+ if (!(Subtarget->hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) || + isShuffleMaskInputInPlace(1, Mask)))) + if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( + DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG)) + return Result; + + // If we have AVX2 then we always want to lower with a blend because an v4 we + // can fully permute the elements. + if (Subtarget->hasAVX2()) + return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2, + Mask, DAG); + + // Otherwise fall back on generic lowering. + return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, DAG); +} + +/// \brief Handle lowering of 4-lane 64-bit integer shuffles. +/// +/// This routine is only called when we have AVX2 and thus a reasonable +/// instruction set for v4i64 shuffling.. +static SDValue lowerV4I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); + assert(Subtarget->hasAVX2() && "We can only lower v4i64 with AVX2!"); + + SmallVector<int, 4> WidenedMask; + if (canWidenShuffleElements(Mask, WidenedMask)) + return lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask, Subtarget, + DAG); + + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask, + Subtarget, DAG)) + return Blend; + + // Check for being able to broadcast a single element. + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i64, V1, + Mask, Subtarget, DAG)) + return Broadcast; + + // When the shuffle is mirrored between the 128-bit lanes of the unit, we can + // use lower latency instructions that will operate on both 128-bit lanes. + SmallVector<int, 2> RepeatedMask; + if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) { + if (isSingleInputShuffleMask(Mask)) { + int PSHUFDMask[] = {-1, -1, -1, -1}; + for (int i = 0; i < 2; ++i) + if (RepeatedMask[i] >= 0) { + PSHUFDMask[2 * i] = 2 * RepeatedMask[i]; + PSHUFDMask[2 * i + 1] = 2 * RepeatedMask[i] + 1; + } + return DAG.getBitcast( + MVT::v4i64, + DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, + DAG.getBitcast(MVT::v8i32, V1), + getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG))); + } + } + + // AVX2 provides a direct instruction for permuting a single input across + // lanes. + if (isSingleInputShuffleMask(Mask)) + return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1, + getV4X86ShuffleImm8ForMask(Mask, DL, DAG)); + + // Try to use shift instructions. + if (SDValue Shift = + lowerVectorShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, DAG)) + return Shift; + + // Use dedicated unpack instructions for masks that match their pattern. + if (SDValue V = + lowerVectorShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG)) + return V; + + // Try to simplify this by merging 128-bit lanes to enable a lane-based + // shuffle. However, if we have AVX2 and either inputs are already in place, + // we will be able to shuffle even across lanes the other input in a single + // instruction so skip this pattern. 
+ if (!(Subtarget->hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) || + isShuffleMaskInputInPlace(1, Mask)))) + if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( + DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG)) + return Result; + + // Otherwise fall back on generic blend lowering. + return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2, + Mask, DAG); +} + +/// \brief Handle lowering of 8-lane 32-bit floating point shuffles. +/// +/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2 +/// isn't available. +static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); + + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask, + Subtarget, DAG)) + return Blend; + + // Check for being able to broadcast a single element. + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8f32, V1, + Mask, Subtarget, DAG)) + return Broadcast; + + // If the shuffle mask is repeated in each 128-bit lane, we have many more + // options to efficiently lower the shuffle. + SmallVector<int, 4> RepeatedMask; + if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) { + assert(RepeatedMask.size() == 4 && + "Repeated masks must be half the mask width!"); + + // Use even/odd duplicate instructions for masks that match their pattern. + if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2, 4, 4, 6, 6})) + return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1); + if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3, 5, 5, 7, 7})) + return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1); + + if (isSingleInputShuffleMask(Mask)) + return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1, + getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG)); + + // Use dedicated unpack instructions for masks that match their pattern. + if (SDValue V = + lowerVectorShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG)) + return V; + + // Otherwise, fall back to a SHUFPS sequence. Here it is important that we + // have already handled any direct blends. We also need to squash the + // repeated mask into a simulated v4f32 mask. + for (int i = 0; i < 4; ++i) + if (RepeatedMask[i] >= 8) + RepeatedMask[i] -= 4; + return lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG); + } + + // If we have a single input shuffle with different shuffle patterns in the + // two 128-bit lanes use the variable mask to VPERMILPS. + if (isSingleInputShuffleMask(Mask)) { + SDValue VPermMask[8]; + for (int i = 0; i < 8; ++i) + VPermMask[i] = Mask[i] < 0 ? DAG.getUNDEF(MVT::i32) + : DAG.getConstant(Mask[i], DL, MVT::i32); + if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask)) + return DAG.getNode( + X86ISD::VPERMILPV, DL, MVT::v8f32, V1, + DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask)); + + if (Subtarget->hasAVX2()) + return DAG.getNode( + X86ISD::VPERMV, DL, MVT::v8f32, + DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask), V1); + + // Otherwise, fall back. + return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask, + DAG); + } + + // Try to simplify this by merging 128-bit lanes to enable a lane-based + // shuffle. 
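+  // For example, the v8f32 mask <12, 13, 14, 15, 0, 1, 2, 3> selects a whole
+  // 128-bit lane per result lane (V2's high lane, then V1's low lane), so it
+  // becomes a v4f64 lane shuffle followed by a trivial in-lane shuffle.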
+ if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( + DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG)) + return Result; + + // If we have AVX2 then we always want to lower with a blend because at v8 we + // can fully permute the elements. + if (Subtarget->hasAVX2()) + return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2, + Mask, DAG); + + // Otherwise fall back on generic lowering. + return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, DAG); +} + +/// \brief Handle lowering of 8-lane 32-bit integer shuffles. +/// +/// This routine is only called when we have AVX2 and thus a reasonable +/// instruction set for v8i32 shuffling.. +static SDValue lowerV8I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); + assert(Subtarget->hasAVX2() && "We can only lower v8i32 with AVX2!"); + + // Whenever we can lower this as a zext, that instruction is strictly faster + // than any alternative. It also allows us to fold memory operands into the + // shuffle in many cases. + if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, + Mask, Subtarget, DAG)) + return ZExt; + + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask, + Subtarget, DAG)) + return Blend; + + // Check for being able to broadcast a single element. + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i32, V1, + Mask, Subtarget, DAG)) + return Broadcast; + + // If the shuffle mask is repeated in each 128-bit lane we can use more + // efficient instructions that mirror the shuffles across the two 128-bit + // lanes. + SmallVector<int, 4> RepeatedMask; + if (is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask)) { + assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!"); + if (isSingleInputShuffleMask(Mask)) + return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1, + getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG)); + + // Use dedicated unpack instructions for masks that match their pattern. + if (SDValue V = + lowerVectorShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG)) + return V; + } + + // Try to use shift instructions. + if (SDValue Shift = + lowerVectorShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, DAG)) + return Shift; + + if (SDValue Rotate = lowerVectorShuffleAsByteRotate( + DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG)) + return Rotate; + + // If the shuffle patterns aren't repeated but it is a single input, directly + // generate a cross-lane VPERMD instruction. + if (isSingleInputShuffleMask(Mask)) { + SDValue VPermMask[8]; + for (int i = 0; i < 8; ++i) + VPermMask[i] = Mask[i] < 0 ? DAG.getUNDEF(MVT::i32) + : DAG.getConstant(Mask[i], DL, MVT::i32); + return DAG.getNode( + X86ISD::VPERMV, DL, MVT::v8i32, + DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask), V1); + } + + // Try to simplify this by merging 128-bit lanes to enable a lane-based + // shuffle. + if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( + DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG)) + return Result; + + // Otherwise fall back on generic blend lowering. 
+ return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2, + Mask, DAG); +} + +/// \brief Handle lowering of 16-lane 16-bit integer shuffles. +/// +/// This routine is only called when we have AVX2 and thus a reasonable +/// instruction set for v16i16 shuffling.. +static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"); + assert(Subtarget->hasAVX2() && "We can only lower v16i16 with AVX2!"); + + // Whenever we can lower this as a zext, that instruction is strictly faster + // than any alternative. It also allows us to fold memory operands into the + // shuffle in many cases. + if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v16i16, V1, V2, + Mask, Subtarget, DAG)) + return ZExt; + + // Check for being able to broadcast a single element. + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i16, V1, + Mask, Subtarget, DAG)) + return Broadcast; + + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask, + Subtarget, DAG)) + return Blend; + + // Use dedicated unpack instructions for masks that match their pattern. + if (SDValue V = + lowerVectorShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG)) + return V; + + // Try to use shift instructions. + if (SDValue Shift = + lowerVectorShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask, DAG)) + return Shift; + + // Try to use byte rotation instructions. + if (SDValue Rotate = lowerVectorShuffleAsByteRotate( + DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG)) + return Rotate; + + if (isSingleInputShuffleMask(Mask)) { + // There are no generalized cross-lane shuffle operations available on i16 + // element types. + if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) + return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2, + Mask, DAG); + + SmallVector<int, 8> RepeatedMask; + if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) { + // As this is a single-input shuffle, the repeated mask should be + // a strictly valid v8i16 mask that we can pass through to the v8i16 + // lowering to handle even the v16 case. + return lowerV8I16GeneralSingleInputVectorShuffle( + DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG); + } + + SDValue PSHUFBMask[32]; + for (int i = 0; i < 16; ++i) { + if (Mask[i] == -1) { + PSHUFBMask[2 * i] = PSHUFBMask[2 * i + 1] = DAG.getUNDEF(MVT::i8); + continue; + } + + int M = i < 8 ? Mask[i] : Mask[i] - 8; + assert(M >= 0 && M < 8 && "Invalid single-input mask!"); + PSHUFBMask[2 * i] = DAG.getConstant(2 * M, DL, MVT::i8); + PSHUFBMask[2 * i + 1] = DAG.getConstant(2 * M + 1, DL, MVT::i8); + } + return DAG.getBitcast(MVT::v16i16, + DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8, + DAG.getBitcast(MVT::v32i8, V1), + DAG.getNode(ISD::BUILD_VECTOR, DL, + MVT::v32i8, PSHUFBMask))); + } + + // Try to simplify this by merging 128-bit lanes to enable a lane-based + // shuffle. + if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( + DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG)) + return Result; + + // Otherwise fall back on generic lowering. 
+ return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, DAG); +} + +/// \brief Handle lowering of 32-lane 8-bit integer shuffles. +/// +/// This routine is only called when we have AVX2 and thus a reasonable +/// instruction set for v32i8 shuffling.. +static SDValue lowerV32I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!"); + assert(Subtarget->hasAVX2() && "We can only lower v32i8 with AVX2!"); + + // Whenever we can lower this as a zext, that instruction is strictly faster + // than any alternative. It also allows us to fold memory operands into the + // shuffle in many cases. + if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2, + Mask, Subtarget, DAG)) + return ZExt; + + // Check for being able to broadcast a single element. + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v32i8, V1, + Mask, Subtarget, DAG)) + return Broadcast; + + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask, + Subtarget, DAG)) + return Blend; + + // Use dedicated unpack instructions for masks that match their pattern. + if (SDValue V = + lowerVectorShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG)) + return V; + + // Try to use shift instructions. + if (SDValue Shift = + lowerVectorShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask, DAG)) + return Shift; + + // Try to use byte rotation instructions. + if (SDValue Rotate = lowerVectorShuffleAsByteRotate( + DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG)) + return Rotate; + + if (isSingleInputShuffleMask(Mask)) { + // There are no generalized cross-lane shuffle operations available on i8 + // element types. + if (is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) + return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2, + Mask, DAG); + + SDValue PSHUFBMask[32]; + for (int i = 0; i < 32; ++i) + PSHUFBMask[i] = + Mask[i] < 0 + ? DAG.getUNDEF(MVT::i8) + : DAG.getConstant(Mask[i] < 16 ? Mask[i] : Mask[i] - 16, DL, + MVT::i8); + + return DAG.getNode( + X86ISD::PSHUFB, DL, MVT::v32i8, V1, + DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, PSHUFBMask)); + } + + // Try to simplify this by merging 128-bit lanes to enable a lane-based + // shuffle. + if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( + DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG)) + return Result; + + // Otherwise fall back on generic lowering. + return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, DAG); +} + +/// \brief High-level routine to lower various 256-bit x86 vector shuffles. +/// +/// This routine either breaks down the specific type of a 256-bit x86 vector +/// shuffle or splits it into two 128-bit shuffles and fuses the results back +/// together based on the available instructions. +static SDValue lower256BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2, + MVT VT, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + + // If we have a single input to the zero element, insert that into V1 if we + // can do so cheaply. 
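+  // That is, exactly one mask element refers to V2 and it is element 0, as in
+  // the v8i32 mask <8, 1, 2, 3, 4, 5, 6, 7>.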
+ int NumElts = VT.getVectorNumElements(); + int NumV2Elements = std::count_if(Mask.begin(), Mask.end(), [NumElts](int M) { + return M >= NumElts; + }); + + if (NumV2Elements == 1 && Mask[0] >= NumElts) + if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( + DL, VT, V1, V2, Mask, Subtarget, DAG)) + return Insertion; + + // Handle special cases where the lower or upper half is UNDEF. + if (SDValue V = + lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG)) + return V; + + // There is a really nice hard cut-over between AVX1 and AVX2 that means we + // can check for those subtargets here and avoid much of the subtarget + // querying in the per-vector-type lowering routines. With AVX1 we have + // essentially *zero* ability to manipulate a 256-bit vector with integer + // types. Since we'll use floating point types there eventually, just + // immediately cast everything to a float and operate entirely in that domain. + if (VT.isInteger() && !Subtarget->hasAVX2()) { + int ElementBits = VT.getScalarSizeInBits(); + if (ElementBits < 32) + // No floating point type available, decompose into 128-bit vectors. + return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG); + + MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits), + VT.getVectorNumElements()); + V1 = DAG.getBitcast(FpVT, V1); + V2 = DAG.getBitcast(FpVT, V2); + return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask)); + } + + switch (VT.SimpleTy) { + case MVT::v4f64: + return lowerV4F64VectorShuffle(Op, V1, V2, Subtarget, DAG); + case MVT::v4i64: + return lowerV4I64VectorShuffle(Op, V1, V2, Subtarget, DAG); + case MVT::v8f32: + return lowerV8F32VectorShuffle(Op, V1, V2, Subtarget, DAG); + case MVT::v8i32: + return lowerV8I32VectorShuffle(Op, V1, V2, Subtarget, DAG); + case MVT::v16i16: + return lowerV16I16VectorShuffle(Op, V1, V2, Subtarget, DAG); + case MVT::v32i8: + return lowerV32I8VectorShuffle(Op, V1, V2, Subtarget, DAG); + + default: + llvm_unreachable("Not a valid 256-bit x86 vector type!"); + } +} + +/// \brief Try to lower a vector shuffle as a 128-bit shuffles. +static SDValue lowerV4X128VectorShuffle(SDLoc DL, MVT VT, + ArrayRef<int> Mask, + SDValue V1, SDValue V2, + SelectionDAG &DAG) { + assert(VT.getScalarSizeInBits() == 64 && + "Unexpected element type size for 128bit shuffle."); + + // To handle 256 bit vector requires VLX and most probably + // function lowerV2X128VectorShuffle() is better solution. + assert(VT.is512BitVector() && "Unexpected vector size for 128bit shuffle."); + + SmallVector<int, 4> WidenedMask; + if (!canWidenShuffleElements(Mask, WidenedMask)) + return SDValue(); + + // Form a 128-bit permutation. + // Convert the 64-bit shuffle mask selection values into 128-bit selection + // bits defined by a vshuf64x2 instruction's immediate control byte. + unsigned PermMask = 0, Imm = 0; + unsigned ControlBitsNum = WidenedMask.size() / 2; + + for (int i = 0, Size = WidenedMask.size(); i < Size; ++i) { + if (WidenedMask[i] == SM_SentinelZero) + return SDValue(); + + // Use first element in place of undef mask. + Imm = (WidenedMask[i] == SM_SentinelUndef) ? 
0 : WidenedMask[i]; + PermMask |= (Imm % WidenedMask.size()) << (i * ControlBitsNum); + } + + return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2, + DAG.getConstant(PermMask, DL, MVT::i8)); +} + +static SDValue lowerVectorShuffleWithPERMV(SDLoc DL, MVT VT, + ArrayRef<int> Mask, SDValue V1, + SDValue V2, SelectionDAG &DAG) { + + assert(VT.getScalarSizeInBits() >= 16 && "Unexpected data type for PERMV"); + + MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits()); + MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements()); + + SDValue MaskNode = getConstVector(Mask, MaskVecVT, DAG, DL, true); + if (isSingleInputShuffleMask(Mask)) + return DAG.getNode(X86ISD::VPERMV, DL, VT, MaskNode, V1); + + return DAG.getNode(X86ISD::VPERMV3, DL, VT, V1, MaskNode, V2); +} + +/// \brief Handle lowering of 8-lane 64-bit floating point shuffles. +static SDValue lowerV8F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); + + if (SDValue Shuf128 = + lowerV4X128VectorShuffle(DL, MVT::v8f64, Mask, V1, V2, DAG)) + return Shuf128; + + if (SDValue Unpck = + lowerVectorShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG)) + return Unpck; + + return lowerVectorShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG); +} + +/// \brief Handle lowering of 16-lane 32-bit floating point shuffles. +static SDValue lowerV16F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"); + + if (SDValue Unpck = + lowerVectorShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG)) + return Unpck; + + return lowerVectorShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG); +} + +/// \brief Handle lowering of 8-lane 64-bit integer shuffles. +static SDValue lowerV8I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); + + if (SDValue Shuf128 = + lowerV4X128VectorShuffle(DL, MVT::v8i64, Mask, V1, V2, DAG)) + return Shuf128; + + if (SDValue Unpck = + lowerVectorShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG)) + return Unpck; + + return lowerVectorShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG); +} + +/// \brief Handle lowering of 16-lane 32-bit integer shuffles. 
+static SDValue lowerV16I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"); + + if (SDValue Unpck = + lowerVectorShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG)) + return Unpck; + + return lowerVectorShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG); +} + +/// \brief Handle lowering of 32-lane 16-bit integer shuffles. +static SDValue lowerV32I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!"); + assert(Subtarget->hasBWI() && "We can only lower v32i16 with AVX-512-BWI!"); + + return lowerVectorShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG); +} + +/// \brief Handle lowering of 64-lane 8-bit integer shuffles. +static SDValue lowerV64I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!"); + assert(Subtarget->hasBWI() && "We can only lower v64i8 with AVX-512-BWI!"); + + // FIXME: Implement direct support for this type! + return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG); +} + +/// \brief High-level routine to lower various 512-bit x86 vector shuffles. +/// +/// This routine either breaks down the specific type of a 512-bit x86 vector +/// shuffle or splits it into two 256-bit shuffles and fuses the results back +/// together based on the available instructions. +static SDValue lower512BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2, + MVT VT, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + assert(Subtarget->hasAVX512() && + "Cannot lower 512-bit vectors w/ basic ISA!"); + + // Check for being able to broadcast a single element. + if (SDValue Broadcast = + lowerVectorShuffleAsBroadcast(DL, VT, V1, Mask, Subtarget, DAG)) + return Broadcast; + + // Dispatch to each element type for lowering. If we don't have supprot for + // specific element type shuffles at 512 bits, immediately split them and + // lower them. Each lowering routine of a given type is allowed to assume that + // the requisite ISA extensions for that element type are available. 
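+  // Note that v32i16 and v64i8 only have dedicated lowering with BWI; without
+  // it they break out of the switch below and take the splitting fallback.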
+ switch (VT.SimpleTy) { + case MVT::v8f64: + return lowerV8F64VectorShuffle(Op, V1, V2, Subtarget, DAG); + case MVT::v16f32: + return lowerV16F32VectorShuffle(Op, V1, V2, Subtarget, DAG); + case MVT::v8i64: + return lowerV8I64VectorShuffle(Op, V1, V2, Subtarget, DAG); + case MVT::v16i32: + return lowerV16I32VectorShuffle(Op, V1, V2, Subtarget, DAG); + case MVT::v32i16: + if (Subtarget->hasBWI()) + return lowerV32I16VectorShuffle(Op, V1, V2, Subtarget, DAG); + break; + case MVT::v64i8: + if (Subtarget->hasBWI()) + return lowerV64I8VectorShuffle(Op, V1, V2, Subtarget, DAG); + break; + + default: + llvm_unreachable("Not a valid 512-bit x86 vector type!"); + } + + // Otherwise fall back on splitting. + return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG); +} + +// Lower vXi1 vector shuffles. +// There is no a dedicated instruction on AVX-512 that shuffles the masks. +// The only way to shuffle bits is to sign-extend the mask vector to SIMD +// vector, shuffle and then truncate it back. +static SDValue lower1BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2, + MVT VT, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + assert(Subtarget->hasAVX512() && + "Cannot lower 512-bit vectors w/o basic ISA!"); + MVT ExtVT; + switch (VT.SimpleTy) { + default: + llvm_unreachable("Expected a vector of i1 elements"); + case MVT::v2i1: + ExtVT = MVT::v2i64; + break; + case MVT::v4i1: + ExtVT = MVT::v4i32; + break; + case MVT::v8i1: + ExtVT = MVT::v8i64; // Take 512-bit type, more shuffles on KNL + break; + case MVT::v16i1: + ExtVT = MVT::v16i32; + break; + case MVT::v32i1: + ExtVT = MVT::v32i16; + break; + case MVT::v64i1: + ExtVT = MVT::v64i8; + break; + } + + if (ISD::isBuildVectorAllZeros(V1.getNode())) + V1 = getZeroVector(ExtVT, Subtarget, DAG, DL); + else if (ISD::isBuildVectorAllOnes(V1.getNode())) + V1 = getOnesVector(ExtVT, Subtarget, DAG, DL); + else + V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1); + + if (V2.isUndef()) + V2 = DAG.getUNDEF(ExtVT); + else if (ISD::isBuildVectorAllZeros(V2.getNode())) + V2 = getZeroVector(ExtVT, Subtarget, DAG, DL); + else if (ISD::isBuildVectorAllOnes(V2.getNode())) + V2 = getOnesVector(ExtVT, Subtarget, DAG, DL); + else + V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2); + return DAG.getNode(ISD::TRUNCATE, DL, VT, + DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask)); +} +/// \brief Top-level lowering for x86 vector shuffles. +/// +/// This handles decomposition, canonicalization, and lowering of all x86 +/// vector shuffles. Most of the specific lowering strategies are encapsulated +/// above in helper routines. The canonicalization attempts to widen shuffles +/// to involve fewer lanes of wider elements, consolidate symmetric patterns +/// s.t. only one of the two inputs needs to be tested, etc. 
+static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + SDValue V1 = Op.getOperand(0); + SDValue V2 = Op.getOperand(1); + MVT VT = Op.getSimpleValueType(); + int NumElements = VT.getVectorNumElements(); + SDLoc dl(Op); + bool Is1BitVector = (VT.getVectorElementType() == MVT::i1); + + assert((VT.getSizeInBits() != 64 || Is1BitVector) && + "Can't lower MMX shuffles"); + + bool V1IsUndef = V1.getOpcode() == ISD::UNDEF; + bool V2IsUndef = V2.getOpcode() == ISD::UNDEF; + if (V1IsUndef && V2IsUndef) + return DAG.getUNDEF(VT); + + // When we create a shuffle node we put the UNDEF node to second operand, + // but in some cases the first operand may be transformed to UNDEF. + // In this case we should just commute the node. + if (V1IsUndef) + return DAG.getCommutedVectorShuffle(*SVOp); + + // Check for non-undef masks pointing at an undef vector and make the masks + // undef as well. This makes it easier to match the shuffle based solely on + // the mask. + if (V2IsUndef) + for (int M : Mask) + if (M >= NumElements) { + SmallVector<int, 8> NewMask(Mask.begin(), Mask.end()); + for (int &M : NewMask) + if (M >= NumElements) + M = -1; + return DAG.getVectorShuffle(VT, dl, V1, V2, NewMask); + } + + // We actually see shuffles that are entirely re-arrangements of a set of + // zero inputs. This mostly happens while decomposing complex shuffles into + // simple ones. Directly lower these as a buildvector of zeros. + SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); + if (Zeroable.all()) + return getZeroVector(VT, Subtarget, DAG, dl); + + // Try to collapse shuffles into using a vector type with fewer elements but + // wider element types. We cap this to not form integers or floating point + // elements wider than 64 bits, but it might be interesting to form i128 + // integers to handle flipping the low and high halves of AVX 256-bit vectors. + SmallVector<int, 16> WidenedMask; + if (VT.getScalarSizeInBits() < 64 && !Is1BitVector && + canWidenShuffleElements(Mask, WidenedMask)) { + MVT NewEltVT = VT.isFloatingPoint() + ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2) + : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2); + MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2); + // Make sure that the new vector type is legal. For example, v2f64 isn't + // legal on SSE1. + if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) { + V1 = DAG.getBitcast(NewVT, V1); + V2 = DAG.getBitcast(NewVT, V2); + return DAG.getBitcast( + VT, DAG.getVectorShuffle(NewVT, dl, V1, V2, WidenedMask)); + } + } + + int NumV1Elements = 0, NumUndefElements = 0, NumV2Elements = 0; + for (int M : SVOp->getMask()) + if (M < 0) + ++NumUndefElements; + else if (M < NumElements) + ++NumV1Elements; + else + ++NumV2Elements; + + // Commute the shuffle as needed such that more elements come from V1 than + // V2. This allows us to match the shuffle pattern strictly on how many + // elements come from V1 without handling the symmetric cases. + if (NumV2Elements > NumV1Elements) + return DAG.getCommutedVectorShuffle(*SVOp); + + // When the number of V1 and V2 elements are the same, try to minimize the + // number of uses of V2 in the low half of the vector. When that is tied, + // ensure that the sum of indices for V1 is equal to or lower than the sum + // indices for V2. 
When those are equal, try to ensure that the number of odd + // indices for V1 is lower than the number of odd indices for V2. + if (NumV1Elements == NumV2Elements) { + int LowV1Elements = 0, LowV2Elements = 0; + for (int M : SVOp->getMask().slice(0, NumElements / 2)) + if (M >= NumElements) + ++LowV2Elements; + else if (M >= 0) + ++LowV1Elements; + if (LowV2Elements > LowV1Elements) { + return DAG.getCommutedVectorShuffle(*SVOp); + } else if (LowV2Elements == LowV1Elements) { + int SumV1Indices = 0, SumV2Indices = 0; + for (int i = 0, Size = SVOp->getMask().size(); i < Size; ++i) + if (SVOp->getMask()[i] >= NumElements) + SumV2Indices += i; + else if (SVOp->getMask()[i] >= 0) + SumV1Indices += i; + if (SumV2Indices < SumV1Indices) { + return DAG.getCommutedVectorShuffle(*SVOp); + } else if (SumV2Indices == SumV1Indices) { + int NumV1OddIndices = 0, NumV2OddIndices = 0; + for (int i = 0, Size = SVOp->getMask().size(); i < Size; ++i) + if (SVOp->getMask()[i] >= NumElements) + NumV2OddIndices += i % 2; + else if (SVOp->getMask()[i] >= 0) + NumV1OddIndices += i % 2; + if (NumV2OddIndices < NumV1OddIndices) + return DAG.getCommutedVectorShuffle(*SVOp); + } + } + } + + // For each vector width, delegate to a specialized lowering routine. + if (VT.is128BitVector()) + return lower128BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG); + + if (VT.is256BitVector()) + return lower256BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG); + + if (VT.is512BitVector()) + return lower512BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG); + + if (Is1BitVector) + return lower1BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG); + llvm_unreachable("Unimplemented!"); +} + +// This function assumes its argument is a BUILD_VECTOR of constants or +// undef SDNodes. i.e: ISD::isBuildVectorOfConstantSDNodes(BuildVector) is +// true. +static bool BUILD_VECTORtoBlendMask(BuildVectorSDNode *BuildVector, + unsigned &MaskValue) { + MaskValue = 0; + unsigned NumElems = BuildVector->getNumOperands(); + + // There are 2 lanes if (NumElems > 8), and 1 lane otherwise. + // We don't handle the >2 lanes case right now. + unsigned NumLanes = (NumElems - 1) / 8 + 1; + if (NumLanes > 2) + return false; + + unsigned NumElemsInLane = NumElems / NumLanes; + + // Blend for v16i16 should be symmetric for the both lanes. + for (unsigned i = 0; i < NumElemsInLane; ++i) { + SDValue EltCond = BuildVector->getOperand(i); + SDValue SndLaneEltCond = + (NumLanes == 2) ? BuildVector->getOperand(i + NumElemsInLane) : EltCond; + + int Lane1Cond = -1, Lane2Cond = -1; + if (isa<ConstantSDNode>(EltCond)) + Lane1Cond = !isNullConstant(EltCond); + if (isa<ConstantSDNode>(SndLaneEltCond)) + Lane2Cond = !isNullConstant(SndLaneEltCond); + + unsigned LaneMask = 0; + if (Lane1Cond == Lane2Cond || Lane2Cond < 0) + // Lane1Cond != 0, means we want the first argument. + // Lane1Cond == 0, means we want the second argument. + // The encoding of this argument is 0 for the first argument, 1 + // for the second. Therefore, invert the condition. + LaneMask = !Lane1Cond << i; + else if (Lane1Cond < 0) + LaneMask = !Lane2Cond << i; + else + return false; + + MaskValue |= LaneMask; + if (NumLanes == 2) + MaskValue |= LaneMask << NumElemsInLane; + } + return true; +} + +/// \brief Try to lower a VSELECT instruction to a vector shuffle. 
+static SDValue lowerVSELECTtoVectorShuffle(SDValue Op, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDValue Cond = Op.getOperand(0); + SDValue LHS = Op.getOperand(1); + SDValue RHS = Op.getOperand(2); + SDLoc dl(Op); + MVT VT = Op.getSimpleValueType(); + + if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) + return SDValue(); + auto *CondBV = cast<BuildVectorSDNode>(Cond); + + // Only non-legal VSELECTs reach this lowering, convert those into generic + // shuffles and re-use the shuffle lowering path for blends. + SmallVector<int, 32> Mask; + for (int i = 0, Size = VT.getVectorNumElements(); i < Size; ++i) { + SDValue CondElt = CondBV->getOperand(i); + Mask.push_back( + isa<ConstantSDNode>(CondElt) ? i + (isNullConstant(CondElt) ? Size : 0) + : -1); + } + return DAG.getVectorShuffle(VT, dl, LHS, RHS, Mask); +} + +SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const { + // A vselect where all conditions and data are constants can be optimized into + // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR(). + if (ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(0).getNode()) && + ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(1).getNode()) && + ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode())) + return SDValue(); + + // Try to lower this to a blend-style vector shuffle. This can handle all + // constant condition cases. + if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG)) + return BlendOp; + + // Variable blends are only legal from SSE4.1 onward. + if (!Subtarget->hasSSE41()) + return SDValue(); + + // Only some types will be legal on some subtargets. If we can emit a legal + // VSELECT-matching blend, return Op, and but if we need to expand, return + // a null value. + switch (Op.getSimpleValueType().SimpleTy) { + default: + // Most of the vector types have blends past SSE4.1. + return Op; + + case MVT::v32i8: + // The byte blends for AVX vectors were introduced only in AVX2. + if (Subtarget->hasAVX2()) + return Op; + + return SDValue(); + + case MVT::v8i16: + case MVT::v16i16: + // AVX-512 BWI and VLX features support VSELECT with i16 elements. + if (Subtarget->hasBWI() && Subtarget->hasVLX()) + return Op; + + // FIXME: We should custom lower this by fixing the condition and using i8 + // blends. + return SDValue(); + } +} + +static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) { + MVT VT = Op.getSimpleValueType(); + SDLoc dl(Op); + + if (!Op.getOperand(0).getSimpleValueType().is128BitVector()) + return SDValue(); + + if (VT.getSizeInBits() == 8) { + SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, + Op.getOperand(0), Op.getOperand(1)); + SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, + DAG.getValueType(VT)); + return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); + } + + if (VT.getSizeInBits() == 16) { + // If Idx is 0, it's cheaper to do a move instead of a pextrw. 
+ if (isNullConstant(Op.getOperand(1))) + return DAG.getNode( + ISD::TRUNCATE, dl, MVT::i16, + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, + DAG.getBitcast(MVT::v4i32, Op.getOperand(0)), + Op.getOperand(1))); + SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, + Op.getOperand(0), Op.getOperand(1)); + SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, + DAG.getValueType(VT)); + return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); + } + + if (VT == MVT::f32) { + // EXTRACTPS outputs to a GPR32 register which will require a movd to copy + // the result back to FR32 register. It's only worth matching if the + // result has a single use which is a store or a bitcast to i32. And in + // the case of a store, it's not worth it if the index is a constant 0, + // because a MOVSSmr can be used instead, which is smaller and faster. + if (!Op.hasOneUse()) + return SDValue(); + SDNode *User = *Op.getNode()->use_begin(); + if ((User->getOpcode() != ISD::STORE || + isNullConstant(Op.getOperand(1))) && + (User->getOpcode() != ISD::BITCAST || + User->getValueType(0) != MVT::i32)) + return SDValue(); + SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, + DAG.getBitcast(MVT::v4i32, Op.getOperand(0)), + Op.getOperand(1)); + return DAG.getBitcast(MVT::f32, Extract); + } + + if (VT == MVT::i32 || VT == MVT::i64) { + // ExtractPS/pextrq works with constant index. + if (isa<ConstantSDNode>(Op.getOperand(1))) + return Op; + } + return SDValue(); +} + +/// Extract one bit from mask vector, like v16i1 or v8i1. +/// AVX-512 feature. +SDValue +X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const { + SDValue Vec = Op.getOperand(0); + SDLoc dl(Vec); + MVT VecVT = Vec.getSimpleValueType(); + SDValue Idx = Op.getOperand(1); + MVT EltVT = Op.getSimpleValueType(); + + assert((EltVT == MVT::i1) && "Unexpected operands in ExtractBitFromMaskVector"); + assert((VecVT.getVectorNumElements() <= 16 || Subtarget->hasBWI()) && + "Unexpected vector type in ExtractBitFromMaskVector"); + + // variable index can't be handled in mask registers, + // extend vector to VR512 + if (!isa<ConstantSDNode>(Idx)) { + MVT ExtVT = (VecVT == MVT::v8i1 ? 
MVT::v8i64 : MVT::v16i32); + SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Vec); + SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, + ExtVT.getVectorElementType(), Ext, Idx); + return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt); + } + + unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); + const TargetRegisterClass* rc = getRegClassFor(VecVT); + if (!Subtarget->hasDQI() && (VecVT.getVectorNumElements() <= 8)) + rc = getRegClassFor(MVT::v16i1); + unsigned MaxSift = rc->getSize()*8 - 1; + Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec, + DAG.getConstant(MaxSift - IdxVal, dl, MVT::i8)); + Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec, + DAG.getConstant(MaxSift, dl, MVT::i8)); + return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i1, Vec, + DAG.getIntPtrConstant(0, dl)); +} + +SDValue +X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, + SelectionDAG &DAG) const { + SDLoc dl(Op); + SDValue Vec = Op.getOperand(0); + MVT VecVT = Vec.getSimpleValueType(); + SDValue Idx = Op.getOperand(1); + + if (Op.getSimpleValueType() == MVT::i1) + return ExtractBitFromMaskVector(Op, DAG); + + if (!isa<ConstantSDNode>(Idx)) { + if (VecVT.is512BitVector() || + (VecVT.is256BitVector() && Subtarget->hasInt256() && + VecVT.getVectorElementType().getSizeInBits() == 32)) { + + MVT MaskEltVT = + MVT::getIntegerVT(VecVT.getVectorElementType().getSizeInBits()); + MVT MaskVT = MVT::getVectorVT(MaskEltVT, VecVT.getSizeInBits() / + MaskEltVT.getSizeInBits()); + + Idx = DAG.getZExtOrTrunc(Idx, dl, MaskEltVT); + auto PtrVT = getPointerTy(DAG.getDataLayout()); + SDValue Mask = DAG.getNode(X86ISD::VINSERT, dl, MaskVT, + getZeroVector(MaskVT, Subtarget, DAG, dl), Idx, + DAG.getConstant(0, dl, PtrVT)); + SDValue Perm = DAG.getNode(X86ISD::VPERMV, dl, VecVT, Mask, Vec); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Perm, + DAG.getConstant(0, dl, PtrVT)); + } + return SDValue(); + } + + // If this is a 256-bit vector result, first extract the 128-bit vector and + // then extract the element from the 128-bit vector. + if (VecVT.is256BitVector() || VecVT.is512BitVector()) { + + unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); + // Get the 128-bit vector. + Vec = Extract128BitVector(Vec, IdxVal, DAG, dl); + MVT EltVT = VecVT.getVectorElementType(); + + unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits(); + assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2"); + + // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2 + // this can be done with a mask. + IdxVal &= ElemsPerChunk - 1; + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec, + DAG.getConstant(IdxVal, dl, MVT::i32)); + } + + assert(VecVT.is128BitVector() && "Unexpected vector length"); + + if (Subtarget->hasSSE41()) + if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG)) + return Res; + + MVT VT = Op.getSimpleValueType(); + // TODO: handle v16i8. + if (VT.getSizeInBits() == 16) { + SDValue Vec = Op.getOperand(0); + if (isNullConstant(Op.getOperand(1))) + return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, + DAG.getBitcast(MVT::v4i32, Vec), + Op.getOperand(1))); + // Transform it so it match pextrw which produces a 32-bit result. 
+ MVT EltVT = MVT::i32; + SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT, + Op.getOperand(0), Op.getOperand(1)); + SDValue Assert = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract, + DAG.getValueType(VT)); + return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); + } + + if (VT.getSizeInBits() == 32) { + unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); + if (Idx == 0) + return Op; + + // SHUFPS the element to the lowest double word, then movss. + int Mask[4] = { static_cast<int>(Idx), -1, -1, -1 }; + MVT VVT = Op.getOperand(0).getSimpleValueType(); + SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0), + DAG.getUNDEF(VVT), Mask); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, + DAG.getIntPtrConstant(0, dl)); + } + + if (VT.getSizeInBits() == 64) { + // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b + // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught + // to match extract_elt for f64. + if (isNullConstant(Op.getOperand(1))) + return Op; + + // UNPCKHPD the element to the lowest double word, then movsd. + // Note if the lower 64 bits of the result of the UNPCKHPD is then stored + // to a f64mem, the whole operation is folded into a single MOVHPDmr. + int Mask[2] = { 1, -1 }; + MVT VVT = Op.getOperand(0).getSimpleValueType(); + SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0), + DAG.getUNDEF(VVT), Mask); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, + DAG.getIntPtrConstant(0, dl)); + } + + return SDValue(); +} + +/// Insert one bit to mask vector, like v16i1 or v8i1. +/// AVX-512 feature. +SDValue +X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const { + SDLoc dl(Op); + SDValue Vec = Op.getOperand(0); + SDValue Elt = Op.getOperand(1); + SDValue Idx = Op.getOperand(2); + MVT VecVT = Vec.getSimpleValueType(); + + if (!isa<ConstantSDNode>(Idx)) { + // Non constant index. Extend source and destination, + // insert element and then truncate the result. + MVT ExtVecVT = (VecVT == MVT::v8i1 ? MVT::v8i64 : MVT::v16i32); + MVT ExtEltVT = (VecVT == MVT::v8i1 ? MVT::i64 : MVT::i32); + SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT, + DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVecVT, Vec), + DAG.getNode(ISD::ZERO_EXTEND, dl, ExtEltVT, Elt), Idx); + return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp); + } + + unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); + SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Elt); + if (IdxVal) + EltInVec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec, + DAG.getConstant(IdxVal, dl, MVT::i8)); + if (Vec.getOpcode() == ISD::UNDEF) + return EltInVec; + return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec); +} + +SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, + SelectionDAG &DAG) const { + MVT VT = Op.getSimpleValueType(); + MVT EltVT = VT.getVectorElementType(); + + if (EltVT == MVT::i1) + return InsertBitToMaskVector(Op, DAG); + + SDLoc dl(Op); + SDValue N0 = Op.getOperand(0); + SDValue N1 = Op.getOperand(1); + SDValue N2 = Op.getOperand(2); + if (!isa<ConstantSDNode>(N2)) + return SDValue(); + auto *N2C = cast<ConstantSDNode>(N2); + unsigned IdxVal = N2C->getZExtValue(); + + // If the vector is wider than 128 bits, extract the 128-bit subvector, insert + // into that, and then insert the subvector back into the result. 
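+  // For example, inserting into element 5 of a v8i32 extracts the 128-bit
+  // chunk holding elements 4..7, inserts at index 5 & 3 == 1 within it, and
+  // then writes the chunk back with Insert128BitVector.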
+ if (VT.is256BitVector() || VT.is512BitVector()) { + // With a 256-bit vector, we can insert into the zero element efficiently + // using a blend if we have AVX or AVX2 and the right data type. + if (VT.is256BitVector() && IdxVal == 0) { + // TODO: It is worthwhile to cast integer to floating point and back + // and incur a domain crossing penalty if that's what we'll end up + // doing anyway after extracting to a 128-bit vector. + if ((Subtarget->hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) || + (Subtarget->hasAVX2() && EltVT == MVT::i32)) { + SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1); + N2 = DAG.getIntPtrConstant(1, dl); + return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec, N2); + } + } + + // Get the desired 128-bit vector chunk. + SDValue V = Extract128BitVector(N0, IdxVal, DAG, dl); + + // Insert the element into the desired chunk. + unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits(); + assert(isPowerOf2_32(NumEltsIn128)); + // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo. + unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1); + + V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1, + DAG.getConstant(IdxIn128, dl, MVT::i32)); + + // Insert the changed part back into the bigger vector + return Insert128BitVector(N0, V, IdxVal, DAG, dl); + } + assert(VT.is128BitVector() && "Only 128-bit vector types should be left!"); + + if (Subtarget->hasSSE41()) { + if (EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) { + unsigned Opc; + if (VT == MVT::v8i16) { + Opc = X86ISD::PINSRW; + } else { + assert(VT == MVT::v16i8); + Opc = X86ISD::PINSRB; + } + + // Transform it so it match pinsr{b,w} which expects a GR32 as its second + // argument. + if (N1.getValueType() != MVT::i32) + N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); + if (N2.getValueType() != MVT::i32) + N2 = DAG.getIntPtrConstant(IdxVal, dl); + return DAG.getNode(Opc, dl, VT, N0, N1, N2); + } + + if (EltVT == MVT::f32) { + // Bits [7:6] of the constant are the source select. This will always be + // zero here. The DAG Combiner may combine an extract_elt index into + // these bits. For example (insert (extract, 3), 2) could be matched by + // putting the '3' into bits [7:6] of X86ISD::INSERTPS. + // Bits [5:4] of the constant are the destination select. This is the + // value of the incoming immediate. + // Bits [3:0] of the constant are the zero mask. The DAG Combiner may + // combine either bitwise AND or insert of float 0.0 to set these bits. + + bool MinSize = DAG.getMachineFunction().getFunction()->optForMinSize(); + if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) { + // If this is an insertion of 32-bits into the low 32-bits of + // a vector, we prefer to generate a blend with immediate rather + // than an insertps. Blends are simpler operations in hardware and so + // will always have equal or better performance than insertps. + // But if optimizing for size and there's a load folding opportunity, + // generate insertps because blendps does not have a 32-bit memory + // operand form. + N2 = DAG.getIntPtrConstant(1, dl); + N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1); + return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1, N2); + } + N2 = DAG.getIntPtrConstant(IdxVal << 4, dl); + // Create this as a scalar to vector.. + N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1); + return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2); + } + + if (EltVT == MVT::i32 || EltVT == MVT::i64) { + // PINSR* works with constant index. 
+ return Op; + } + } + + if (EltVT == MVT::i8) + return SDValue(); + + if (EltVT.getSizeInBits() == 16) { + // Transform it so it match pinsrw which expects a 16-bit value in a GR32 + // as its second argument. + if (N1.getValueType() != MVT::i32) + N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); + if (N2.getValueType() != MVT::i32) + N2 = DAG.getIntPtrConstant(IdxVal, dl); + return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2); + } + return SDValue(); +} + +static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) { + SDLoc dl(Op); + MVT OpVT = Op.getSimpleValueType(); + + // If this is a 256-bit vector result, first insert into a 128-bit + // vector and then insert into the 256-bit vector. + if (!OpVT.is128BitVector()) { + // Insert into a 128-bit vector. + unsigned SizeFactor = OpVT.getSizeInBits()/128; + MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(), + OpVT.getVectorNumElements() / SizeFactor); + + Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0)); + + // Insert the 128-bit vector. + return Insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl); + } + + if (OpVT == MVT::v1i64 && + Op.getOperand(0).getValueType() == MVT::i64) + return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0)); + + SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0)); + assert(OpVT.is128BitVector() && "Expected an SSE type!"); + return DAG.getBitcast( + OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt)); +} + +// Lower a node with an EXTRACT_SUBVECTOR opcode. This may result in +// a simple subregister reference or explicit instructions to grab +// upper bits of a vector. +static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc dl(Op); + SDValue In = Op.getOperand(0); + SDValue Idx = Op.getOperand(1); + unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); + MVT ResVT = Op.getSimpleValueType(); + MVT InVT = In.getSimpleValueType(); + + if (Subtarget->hasFp256()) { + if (ResVT.is128BitVector() && + (InVT.is256BitVector() || InVT.is512BitVector()) && + isa<ConstantSDNode>(Idx)) { + return Extract128BitVector(In, IdxVal, DAG, dl); + } + if (ResVT.is256BitVector() && InVT.is512BitVector() && + isa<ConstantSDNode>(Idx)) { + return Extract256BitVector(In, IdxVal, DAG, dl); + } + } + return SDValue(); +} + +// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a +// simple superregister reference or explicit instructions to insert +// the upper bits of a vector. 
+static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + if (!Subtarget->hasAVX()) + return SDValue(); + + SDLoc dl(Op); + SDValue Vec = Op.getOperand(0); + SDValue SubVec = Op.getOperand(1); + SDValue Idx = Op.getOperand(2); + + if (!isa<ConstantSDNode>(Idx)) + return SDValue(); + + unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); + MVT OpVT = Op.getSimpleValueType(); + MVT SubVecVT = SubVec.getSimpleValueType(); + + // Fold two 16-byte subvector loads into one 32-byte load: + // (insert_subvector (insert_subvector undef, (load addr), 0), + // (load addr + 16), Elts/2) + // --> load32 addr + if ((IdxVal == OpVT.getVectorNumElements() / 2) && + Vec.getOpcode() == ISD::INSERT_SUBVECTOR && + OpVT.is256BitVector() && SubVecVT.is128BitVector()) { + auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2)); + if (Idx2 && Idx2->getZExtValue() == 0) { + SDValue SubVec2 = Vec.getOperand(1); + // If needed, look through a bitcast to get to the load. + if (SubVec2.getNode() && SubVec2.getOpcode() == ISD::BITCAST) + SubVec2 = SubVec2.getOperand(0); + + if (auto *FirstLd = dyn_cast<LoadSDNode>(SubVec2)) { + bool Fast; + unsigned Alignment = FirstLd->getAlignment(); + unsigned AS = FirstLd->getAddressSpace(); + const X86TargetLowering *TLI = Subtarget->getTargetLowering(); + if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), + OpVT, AS, Alignment, &Fast) && Fast) { + SDValue Ops[] = { SubVec2, SubVec }; + if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, false)) + return Ld; + } + } + } + } + + if ((OpVT.is256BitVector() || OpVT.is512BitVector()) && + SubVecVT.is128BitVector()) + return Insert128BitVector(Vec, SubVec, IdxVal, DAG, dl); + + if (OpVT.is512BitVector() && SubVecVT.is256BitVector()) + return Insert256BitVector(Vec, SubVec, IdxVal, DAG, dl); + + if (OpVT.getVectorElementType() == MVT::i1) + return Insert1BitVector(Op, DAG); + + return SDValue(); +} + +// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as +// their target countpart wrapped in the X86ISD::Wrapper node. Suppose N is +// one of the above mentioned nodes. It has to be wrapped because otherwise +// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only +// be used to form addressing mode. These wrapped nodes will be selected +// into MOV32ri. +SDValue +X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const { + ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op); + + // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the + // global base reg. + unsigned char OpFlag = 0; + unsigned WrapperKind = X86ISD::Wrapper; + CodeModel::Model M = DAG.getTarget().getCodeModel(); + + if (Subtarget->isPICStyleRIPRel() && + (M == CodeModel::Small || M == CodeModel::Kernel)) + WrapperKind = X86ISD::WrapperRIP; + else if (Subtarget->isPICStyleGOT()) + OpFlag = X86II::MO_GOTOFF; + else if (Subtarget->isPICStyleStubPIC()) + OpFlag = X86II::MO_PIC_BASE_OFFSET; + + auto PtrVT = getPointerTy(DAG.getDataLayout()); + SDValue Result = DAG.getTargetConstantPool( + CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), OpFlag); + SDLoc DL(CP); + Result = DAG.getNode(WrapperKind, DL, PtrVT, Result); + // With PIC, the address is actually $g + Offset. 
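+  // Here "$g" denotes the PIC base materialized by X86ISD::GlobalBaseReg (the
+  // GOT address for @GOTOFF references, or the picbase label on Darwin), and
+  // Offset is the wrapped constant-pool address expressed relative to it.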
+ if (OpFlag) { + Result = + DAG.getNode(ISD::ADD, DL, PtrVT, + DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result); + } + + return Result; +} + +SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { + JumpTableSDNode *JT = cast<JumpTableSDNode>(Op); + + // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the + // global base reg. + unsigned char OpFlag = 0; + unsigned WrapperKind = X86ISD::Wrapper; + CodeModel::Model M = DAG.getTarget().getCodeModel(); + + if (Subtarget->isPICStyleRIPRel() && + (M == CodeModel::Small || M == CodeModel::Kernel)) + WrapperKind = X86ISD::WrapperRIP; + else if (Subtarget->isPICStyleGOT()) + OpFlag = X86II::MO_GOTOFF; + else if (Subtarget->isPICStyleStubPIC()) + OpFlag = X86II::MO_PIC_BASE_OFFSET; + + auto PtrVT = getPointerTy(DAG.getDataLayout()); + SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag); + SDLoc DL(JT); + Result = DAG.getNode(WrapperKind, DL, PtrVT, Result); + + // With PIC, the address is actually $g + Offset. + if (OpFlag) + Result = + DAG.getNode(ISD::ADD, DL, PtrVT, + DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result); + + return Result; +} + +SDValue +X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const { + const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol(); + + // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the + // global base reg. + unsigned char OpFlag = 0; + unsigned WrapperKind = X86ISD::Wrapper; + CodeModel::Model M = DAG.getTarget().getCodeModel(); + + if (Subtarget->isPICStyleRIPRel() && + (M == CodeModel::Small || M == CodeModel::Kernel)) { + if (Subtarget->isTargetDarwin() || Subtarget->isTargetELF()) + OpFlag = X86II::MO_GOTPCREL; + WrapperKind = X86ISD::WrapperRIP; + } else if (Subtarget->isPICStyleGOT()) { + OpFlag = X86II::MO_GOT; + } else if (Subtarget->isPICStyleStubPIC()) { + OpFlag = X86II::MO_DARWIN_NONLAZY_PIC_BASE; + } else if (Subtarget->isPICStyleStubNoDynamic()) { + OpFlag = X86II::MO_DARWIN_NONLAZY; + } + + auto PtrVT = getPointerTy(DAG.getDataLayout()); + SDValue Result = DAG.getTargetExternalSymbol(Sym, PtrVT, OpFlag); + + SDLoc DL(Op); + Result = DAG.getNode(WrapperKind, DL, PtrVT, Result); + + // With PIC, the address is actually $g + Offset. + if (DAG.getTarget().getRelocationModel() == Reloc::PIC_ && + !Subtarget->is64Bit()) { + Result = + DAG.getNode(ISD::ADD, DL, PtrVT, + DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result); + } + + // For symbols that require a load from a stub to get the address, emit the + // load. + if (isGlobalStubReference(OpFlag)) + Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result, + MachinePointerInfo::getGOT(DAG.getMachineFunction()), + false, false, false, 0); + + return Result; +} + +SDValue +X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { + // Create the TargetBlockAddressAddress node. 
+ unsigned char OpFlags = + Subtarget->ClassifyBlockAddressReference(); + CodeModel::Model M = DAG.getTarget().getCodeModel(); + const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress(); + int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset(); + SDLoc dl(Op); + auto PtrVT = getPointerTy(DAG.getDataLayout()); + SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags); + + if (Subtarget->isPICStyleRIPRel() && + (M == CodeModel::Small || M == CodeModel::Kernel)) + Result = DAG.getNode(X86ISD::WrapperRIP, dl, PtrVT, Result); + else + Result = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, Result); + + // With PIC, the address is actually $g + Offset. + if (isGlobalRelativeToPICBase(OpFlags)) { + Result = DAG.getNode(ISD::ADD, dl, PtrVT, + DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result); + } + + return Result; +} + +SDValue +X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, SDLoc dl, + int64_t Offset, SelectionDAG &DAG) const { + // Create the TargetGlobalAddress node, folding in the constant + // offset if it is legal. + unsigned char OpFlags = + Subtarget->ClassifyGlobalReference(GV, DAG.getTarget()); + CodeModel::Model M = DAG.getTarget().getCodeModel(); + auto PtrVT = getPointerTy(DAG.getDataLayout()); + SDValue Result; + if (OpFlags == X86II::MO_NO_FLAG && + X86::isOffsetSuitableForCodeModel(Offset, M)) { + // A direct static reference to a global. + Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, Offset); + Offset = 0; + } else { + Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, OpFlags); + } + + if (Subtarget->isPICStyleRIPRel() && + (M == CodeModel::Small || M == CodeModel::Kernel)) + Result = DAG.getNode(X86ISD::WrapperRIP, dl, PtrVT, Result); + else + Result = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, Result); + + // With PIC, the address is actually $g + Offset. + if (isGlobalRelativeToPICBase(OpFlags)) { + Result = DAG.getNode(ISD::ADD, dl, PtrVT, + DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result); + } + + // For globals that require a load from a stub to get the address, emit the + // load. + if (isGlobalStubReference(OpFlags)) + Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result, + MachinePointerInfo::getGOT(DAG.getMachineFunction()), + false, false, false, 0); + + // If there was a non-zero offset that we didn't fold, create an explicit + // addition for it. + if (Offset != 0) + Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result, + DAG.getConstant(Offset, dl, PtrVT)); + + return Result; +} + +SDValue +X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { + const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); + int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset(); + return LowerGlobalAddress(GV, SDLoc(Op), Offset, DAG); +} + +static SDValue +GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA, + SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg, + unsigned char OperandFlags, bool LocalDynamic = false) { + MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); + SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); + SDLoc dl(GA); + SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, + GA->getValueType(0), + GA->getOffset(), + OperandFlags); + + X86ISD::NodeType CallType = LocalDynamic ? 
X86ISD::TLSBASEADDR + : X86ISD::TLSADDR; + + if (InFlag) { + SDValue Ops[] = { Chain, TGA, *InFlag }; + Chain = DAG.getNode(CallType, dl, NodeTys, Ops); + } else { + SDValue Ops[] = { Chain, TGA }; + Chain = DAG.getNode(CallType, dl, NodeTys, Ops); + } + + // TLSADDR will be codegen'ed as call. Inform MFI that function has calls. + MFI->setAdjustsStack(true); + MFI->setHasCalls(true); + + SDValue Flag = Chain.getValue(1); + return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag); +} + +// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit +static SDValue +LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG, + const EVT PtrVT) { + SDValue InFlag; + SDLoc dl(GA); // ? function entry point might be better + SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX, + DAG.getNode(X86ISD::GlobalBaseReg, + SDLoc(), PtrVT), InFlag); + InFlag = Chain.getValue(1); + + return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD); +} + +// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit +static SDValue +LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG, + const EVT PtrVT) { + return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, + X86::RAX, X86II::MO_TLSGD); +} + +static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA, + SelectionDAG &DAG, + const EVT PtrVT, + bool is64Bit) { + SDLoc dl(GA); + + // Get the start address of the TLS block for this module. + X86MachineFunctionInfo* MFI = DAG.getMachineFunction() + .getInfo<X86MachineFunctionInfo>(); + MFI->incNumLocalDynamicTLSAccesses(); + + SDValue Base; + if (is64Bit) { + Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX, + X86II::MO_TLSLD, /*LocalDynamic=*/true); + } else { + SDValue InFlag; + SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX, + DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag); + InFlag = Chain.getValue(1); + Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, + X86II::MO_TLSLDM, /*LocalDynamic=*/true); + } + + // Note: the CleanupLocalDynamicTLSPass will remove redundant computations + // of Base. + + // Build x@dtpoff. + unsigned char OperandFlags = X86II::MO_DTPOFF; + unsigned WrapperKind = X86ISD::Wrapper; + SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, + GA->getValueType(0), + GA->getOffset(), OperandFlags); + SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA); + + // Add x@dtpoff with the base. + return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base); +} + +// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model. +static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, + const EVT PtrVT, TLSModel::Model model, + bool is64Bit, bool isPIC) { + SDLoc dl(GA); + + // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit). + Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(), + is64Bit ? 257 : 256)); + + SDValue ThreadPointer = + DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl), + MachinePointerInfo(Ptr), false, false, false, 0); + + unsigned char OperandFlags = 0; + // Most TLS accesses are not RIP relative, even on x86-64. One exception is + // initialexec. + unsigned WrapperKind = X86ISD::Wrapper; + if (model == TLSModel::LocalExec) { + OperandFlags = is64Bit ? 
X86II::MO_TPOFF : X86II::MO_NTPOFF; + } else if (model == TLSModel::InitialExec) { + if (is64Bit) { + OperandFlags = X86II::MO_GOTTPOFF; + WrapperKind = X86ISD::WrapperRIP; + } else { + OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF; + } + } else { + llvm_unreachable("Unexpected model"); + } + + // emit "addl x@ntpoff,%eax" (local exec) + // or "addl x@indntpoff,%eax" (initial exec) + // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic) + SDValue TGA = + DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0), + GA->getOffset(), OperandFlags); + SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA); + + if (model == TLSModel::InitialExec) { + if (isPIC && !is64Bit) { + Offset = DAG.getNode(ISD::ADD, dl, PtrVT, + DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), + Offset); + } + + Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset, + MachinePointerInfo::getGOT(DAG.getMachineFunction()), + false, false, false, 0); + } + + // The address of the thread local variable is the add of the thread + // pointer with the offset of the variable. + return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset); +} + +SDValue +X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { + + GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); + + // Cygwin uses emutls. + // FIXME: It may be EmulatedTLS-generic also for X86-Android. + if (Subtarget->isTargetWindowsCygwin()) + return LowerToTLSEmulatedModel(GA, DAG); + + const GlobalValue *GV = GA->getGlobal(); + auto PtrVT = getPointerTy(DAG.getDataLayout()); + + if (Subtarget->isTargetELF()) { + if (DAG.getTarget().Options.EmulatedTLS) + return LowerToTLSEmulatedModel(GA, DAG); + TLSModel::Model model = DAG.getTarget().getTLSModel(GV); + switch (model) { + case TLSModel::GeneralDynamic: + if (Subtarget->is64Bit()) + return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT); + return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT); + case TLSModel::LocalDynamic: + return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT, + Subtarget->is64Bit()); + case TLSModel::InitialExec: + case TLSModel::LocalExec: + return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget->is64Bit(), + DAG.getTarget().getRelocationModel() == + Reloc::PIC_); + } + llvm_unreachable("Unknown TLS model."); + } + + if (Subtarget->isTargetDarwin()) { + // Darwin only has one model of TLS. Lower to that. + unsigned char OpFlag = 0; + unsigned WrapperKind = Subtarget->isPICStyleRIPRel() ? + X86ISD::WrapperRIP : X86ISD::Wrapper; + + // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the + // global base reg. + bool PIC32 = (DAG.getTarget().getRelocationModel() == Reloc::PIC_) && + !Subtarget->is64Bit(); + if (PIC32) + OpFlag = X86II::MO_TLVP_PIC_BASE; + else + OpFlag = X86II::MO_TLVP; + SDLoc DL(Op); + SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL, + GA->getValueType(0), + GA->getOffset(), OpFlag); + SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result); + + // With PIC32, the address is actually $g + Offset. + if (PIC32) + Offset = DAG.getNode(ISD::ADD, DL, PtrVT, + DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), + Offset); + + // Lowering the machine isd will make sure everything is in the right + // location. + SDValue Chain = DAG.getEntryNode(); + SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue Args[] = { Chain, Offset }; + Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args); + + // TLSCALL will be codegen'ed as call. 
Inform MFI that function has calls. + MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); + MFI->setAdjustsStack(true); + + // And our return value (tls address) is in the standard call return value + // location. + unsigned Reg = Subtarget->is64Bit() ? X86::RAX : X86::EAX; + return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1)); + } + + if (Subtarget->isTargetKnownWindowsMSVC() || + Subtarget->isTargetWindowsGNU()) { + // Just use the implicit TLS architecture + // Need to generate someting similar to: + // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage + // ; from TEB + // mov ecx, dword [rel _tls_index]: Load index (from C runtime) + // mov rcx, qword [rdx+rcx*8] + // mov eax, .tls$:tlsvar + // [rax+rcx] contains the address + // Windows 64bit: gs:0x58 + // Windows 32bit: fs:__tls_array + + SDLoc dl(GA); + SDValue Chain = DAG.getEntryNode(); + + // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or + // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly + // use its literal value of 0x2C. + Value *Ptr = Constant::getNullValue(Subtarget->is64Bit() + ? Type::getInt8PtrTy(*DAG.getContext(), + 256) + : Type::getInt32PtrTy(*DAG.getContext(), + 257)); + + SDValue TlsArray = Subtarget->is64Bit() + ? DAG.getIntPtrConstant(0x58, dl) + : (Subtarget->isTargetWindowsGNU() + ? DAG.getIntPtrConstant(0x2C, dl) + : DAG.getExternalSymbol("_tls_array", PtrVT)); + + SDValue ThreadPointer = + DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr), false, + false, false, 0); + + SDValue res; + if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) { + res = ThreadPointer; + } else { + // Load the _tls_index variable + SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT); + if (Subtarget->is64Bit()) + IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX, + MachinePointerInfo(), MVT::i32, false, false, + false, 0); + else + IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo(), false, + false, false, 0); + + auto &DL = DAG.getDataLayout(); + SDValue Scale = + DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, PtrVT); + IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale); + + res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX); + } + + res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo(), false, false, + false, 0); + + // Get the offset of start of .tls section + SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, + GA->getValueType(0), + GA->getOffset(), X86II::MO_SECREL); + SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA); + + // The address of the thread local variable is the add of the thread + // pointer with the offset of the variable. + return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset); + } + + llvm_unreachable("TLS not implemented for this target."); +} + +/// LowerShiftParts - Lower SRA_PARTS and friends, which return two i32 values +/// and take a 2 x i32 value to shift plus a shift amount. +static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) { + assert(Op.getNumOperands() == 3 && "Not a double-shift!"); + MVT VT = Op.getSimpleValueType(); + unsigned VTBits = VT.getSizeInBits(); + SDLoc dl(Op); + bool isSRA = Op.getOpcode() == ISD::SRA_PARTS; + SDValue ShOpLo = Op.getOperand(0); + SDValue ShOpHi = Op.getOperand(1); + SDValue ShAmt = Op.getOperand(2); + // X86ISD::SHLD and X86ISD::SHRD have defined overflow behavior but the + // generic ISD nodes haven't. Insert an AND to be safe, it's optimized away + // during isel. 
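+  // In effect, for a 64-bit SRL_PARTS on a 32-bit target with parts (Lo, Hi)
+  // and shift amount s in [0, 64):
+  //   if ((s & 32) == 0) { Lo = shrd(Lo, Hi, s); Hi = Hi >> (s & 31); }
+  //   else               { Lo = Hi >> (s & 31);  Hi = 0; }
+  // SRA_PARTS fills with (Hi >> 31) instead of 0, and SHL_PARTS is the mirror
+  // image using shld. The CMOVs built below select between the two arms.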
+ SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt, + DAG.getConstant(VTBits - 1, dl, MVT::i8)); + SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi, + DAG.getConstant(VTBits - 1, dl, MVT::i8)) + : DAG.getConstant(0, dl, VT); + + SDValue Tmp2, Tmp3; + if (Op.getOpcode() == ISD::SHL_PARTS) { + Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt); + Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt); + } else { + Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt); + Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt); + } + + // If the shift amount is larger or equal than the width of a part we can't + // rely on the results of shld/shrd. Insert a test and select the appropriate + // values for large shift amounts. + SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt, + DAG.getConstant(VTBits, dl, MVT::i8)); + SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32, + AndNode, DAG.getConstant(0, dl, MVT::i8)); + + SDValue Hi, Lo; + SDValue CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8); + SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond }; + SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond }; + + if (Op.getOpcode() == ISD::SHL_PARTS) { + Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0); + Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1); + } else { + Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0); + Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1); + } + + SDValue Ops[2] = { Lo, Hi }; + return DAG.getMergeValues(Ops, dl); +} + +SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, + SelectionDAG &DAG) const { + SDValue Src = Op.getOperand(0); + MVT SrcVT = Src.getSimpleValueType(); + MVT VT = Op.getSimpleValueType(); + SDLoc dl(Op); + + if (SrcVT.isVector()) { + if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) { + return DAG.getNode(X86ISD::CVTDQ2PD, dl, VT, + DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src, + DAG.getUNDEF(SrcVT))); + } + if (SrcVT.getVectorElementType() == MVT::i1) { + MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements()); + return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), + DAG.getNode(ISD::SIGN_EXTEND, dl, IntegerVT, Src)); + } + return SDValue(); + } + + assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 && + "Unknown SINT_TO_FP to lower!"); + + // These are really Legal; return the operand so the caller accepts it as + // Legal. 
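+  // (CVTSI2SS/CVTSI2SD accept a signed i32 source directly, and a signed i64
+  // source via the REX.W form in 64-bit mode, so no expansion is needed.)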
+ if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType())) + return Op; + if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) && + Subtarget->is64Bit()) { + return Op; + } + + unsigned Size = SrcVT.getSizeInBits()/8; + MachineFunction &MF = DAG.getMachineFunction(); + auto PtrVT = getPointerTy(MF.getDataLayout()); + int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false); + SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT); + SDValue Chain = DAG.getStore( + DAG.getEntryNode(), dl, Op.getOperand(0), StackSlot, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI), false, + false, 0); + return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG); +} + +SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, + SDValue StackSlot, + SelectionDAG &DAG) const { + // Build the FILD + SDLoc DL(Op); + SDVTList Tys; + bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType()); + if (useSSE) + Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue); + else + Tys = DAG.getVTList(Op.getValueType(), MVT::Other); + + unsigned ByteSize = SrcVT.getSizeInBits()/8; + + FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot); + MachineMemOperand *MMO; + if (FI) { + int SSFI = FI->getIndex(); + MMO = DAG.getMachineFunction().getMachineMemOperand( + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI), + MachineMemOperand::MOLoad, ByteSize, ByteSize); + } else { + MMO = cast<LoadSDNode>(StackSlot)->getMemOperand(); + StackSlot = StackSlot.getOperand(1); + } + SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) }; + SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG : + X86ISD::FILD, DL, + Tys, Ops, SrcVT, MMO); + + if (useSSE) { + Chain = Result.getValue(1); + SDValue InFlag = Result.getValue(2); + + // FIXME: Currently the FST is flagged to the FILD_FLAG. This + // shouldn't be necessary except that RFP cannot be live across + // multiple blocks. When stackifier is fixed, they can be uncoupled. + MachineFunction &MF = DAG.getMachineFunction(); + unsigned SSFISize = Op.getValueType().getSizeInBits()/8; + int SSFI = MF.getFrameInfo()->CreateStackObject(SSFISize, SSFISize, false); + auto PtrVT = getPointerTy(MF.getDataLayout()); + SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT); + Tys = DAG.getVTList(MVT::Other); + SDValue Ops[] = { + Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag + }; + MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand( + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI), + MachineMemOperand::MOStore, SSFISize, SSFISize); + + Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, + Ops, Op.getValueType(), MMO); + Result = DAG.getLoad( + Op.getValueType(), DL, Chain, StackSlot, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI), + false, false, false, 0); + } + + return Result; +} + +// LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion. +SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, + SelectionDAG &DAG) const { + // This algorithm is not obvious. Here it is what we're trying to output: + /* + movq %rax, %xmm0 + punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U } + subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 } + #ifdef __SSE3__ + haddpd %xmm0, %xmm0 + #else + pshufd $0x4e, %xmm0, %xmm1 + addpd %xmm1, %xmm0 + #endif + */ + + SDLoc dl(Op); + LLVMContext *Context = DAG.getContext(); + + // Build some magic constants. 
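+  // CV0 supplies the high dwords of two doubles: 0x43300000 is the top half of
+  // 0x1.0p52 and 0x45300000 the top half of 0x1.0p84. After the punpckldq,
+  // lane 0 holds 2^52 + lo32(x) and lane 1 holds 2^84 + hi32(x) * 2^32, so the
+  // subpd strips the biases and the final add reassembles x. Roughly the
+  // scalar equivalent (assuming IEEE-754 doubles) is:
+  //   double lo = bit_cast<double>(0x4330000000000000ULL | (x & 0xffffffffu)) - 0x1.0p52;
+  //   double hi = bit_cast<double>(0x4530000000000000ULL | (x >> 32)) - 0x1.0p84;
+  //   return lo + hi;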
+ static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 }; + Constant *C0 = ConstantDataVector::get(*Context, CV0); + auto PtrVT = getPointerTy(DAG.getDataLayout()); + SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, 16); + + SmallVector<Constant*,2> CV1; + CV1.push_back( + ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble, + APInt(64, 0x4330000000000000ULL)))); + CV1.push_back( + ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble, + APInt(64, 0x4530000000000000ULL)))); + Constant *C1 = ConstantVector::get(CV1); + SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, 16); + + // Load the 64-bit value into an XMM register. + SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, + Op.getOperand(0)); + SDValue CLod0 = + DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), + false, false, false, 16); + SDValue Unpck1 = + getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0); + + SDValue CLod1 = + DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), + false, false, false, 16); + SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1); + // TODO: Are there any fast-math-flags to propagate here? + SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1); + SDValue Result; + + if (Subtarget->hasSSE3()) { + // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'. + Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub); + } else { + SDValue S2F = DAG.getBitcast(MVT::v4i32, Sub); + SDValue Shuffle = getTargetShuffleNode(X86ISD::PSHUFD, dl, MVT::v4i32, + S2F, 0x4E, DAG); + Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, + DAG.getBitcast(MVT::v2f64, Shuffle), Sub); + } + + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result, + DAG.getIntPtrConstant(0, dl)); +} + +// LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion. +SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op, + SelectionDAG &DAG) const { + SDLoc dl(Op); + // FP constant to bias correct the final result. + SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, + MVT::f64); + + // Load the 32-bit value into an XMM register. + SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, + Op.getOperand(0)); + + // Zero out the upper parts of the register. + Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG); + + Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, + DAG.getBitcast(MVT::v2f64, Load), + DAG.getIntPtrConstant(0, dl)); + + // Or the load with the bias. + SDValue Or = DAG.getNode( + ISD::OR, dl, MVT::v2i64, + DAG.getBitcast(MVT::v2i64, + DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Load)), + DAG.getBitcast(MVT::v2i64, + DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias))); + Or = + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, + DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl)); + + // Subtract the bias. + // TODO: Are there any fast-math-flags to propagate here? + SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias); + + // Handle final rounding. + MVT DestVT = Op.getSimpleValueType(); + + if (DestVT.bitsLT(MVT::f64)) + return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub, + DAG.getIntPtrConstant(0, dl)); + if (DestVT.bitsGT(MVT::f64)) + return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub); + + // Handle final rounding. 
+ return Sub; +} + +static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + // The algorithm is the following: + // #ifdef __SSE4_1__ + // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa); + // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16), + // (uint4) 0x53000000, 0xaa); + // #else + // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000; + // uint4 hi = (v >> 16) | (uint4) 0x53000000; + // #endif + // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f); + // return (float4) lo + fhi; + + // We shouldn't use it when unsafe-fp-math is enabled though: we might later + // reassociate the two FADDs, and if we do that, the algorithm fails + // spectacularly (PR24512). + // FIXME: If we ever have some kind of Machine FMF, this should be marked + // as non-fast and always be enabled. Why isn't SDAG FMF enough? Because + // there's also the MachineCombiner reassociations happening on Machine IR. + if (DAG.getTarget().Options.UnsafeFPMath) + return SDValue(); + + SDLoc DL(Op); + SDValue V = Op->getOperand(0); + MVT VecIntVT = V.getSimpleValueType(); + bool Is128 = VecIntVT == MVT::v4i32; + MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32; + // If we convert to something else than the supported type, e.g., to v4f64, + // abort early. + if (VecFloatVT != Op->getSimpleValueType(0)) + return SDValue(); + + unsigned NumElts = VecIntVT.getVectorNumElements(); + assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) && + "Unsupported custom type"); + assert(NumElts <= 8 && "The size of the constant array must be fixed"); + + // In the #idef/#else code, we have in common: + // - The vector of constants: + // -- 0x4b000000 + // -- 0x53000000 + // - A shift: + // -- v >> 16 + + // Create the splat vector for 0x4b000000. + SDValue CstLow = DAG.getConstant(0x4b000000, DL, MVT::i32); + SDValue CstLowArray[] = {CstLow, CstLow, CstLow, CstLow, + CstLow, CstLow, CstLow, CstLow}; + SDValue VecCstLow = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT, + makeArrayRef(&CstLowArray[0], NumElts)); + // Create the splat vector for 0x53000000. + SDValue CstHigh = DAG.getConstant(0x53000000, DL, MVT::i32); + SDValue CstHighArray[] = {CstHigh, CstHigh, CstHigh, CstHigh, + CstHigh, CstHigh, CstHigh, CstHigh}; + SDValue VecCstHigh = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT, + makeArrayRef(&CstHighArray[0], NumElts)); + + // Create the right shift. + SDValue CstShift = DAG.getConstant(16, DL, MVT::i32); + SDValue CstShiftArray[] = {CstShift, CstShift, CstShift, CstShift, + CstShift, CstShift, CstShift, CstShift}; + SDValue VecCstShift = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT, + makeArrayRef(&CstShiftArray[0], NumElts)); + SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift); + + SDValue Low, High; + if (Subtarget.hasSSE41()) { + MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16; + // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa); + SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow); + SDValue VecBitcast = DAG.getBitcast(VecI16VT, V); + // Low will be bitcasted right away, so do not bother bitcasting back to its + // original type. 
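+    // Viewing each i32 lane as two i16 halves, the 0xaa blend mask (0b10101010)
+    // takes the high halves from the 0x4b000000 splat and the low halves from
+    // v, i.e. (v & 0xffff) | 0x4b000000 per lane -- the same value the AND/OR
+    // sequence in the non-SSE4.1 branch computes. As floats, 0x4b000000 is
+    // 0x1.0p23 and 0x53000000 is 0x1.0p39, and the 0xD3000080 constant used
+    // below is -(0x1.0p39 + 0x1.0p23), so lo + (hi + CstFAdd) reconstructs v.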
+ Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast, + VecCstLowBitcast, DAG.getConstant(0xaa, DL, MVT::i32)); + // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16), + // (uint4) 0x53000000, 0xaa); + SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh); + SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift); + // High will be bitcasted right away, so do not bother bitcasting back to + // its original type. + High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast, + VecCstHighBitcast, DAG.getConstant(0xaa, DL, MVT::i32)); + } else { + SDValue CstMask = DAG.getConstant(0xffff, DL, MVT::i32); + SDValue VecCstMask = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT, CstMask, + CstMask, CstMask, CstMask); + // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000; + SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask); + Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow); + + // uint4 hi = (v >> 16) | (uint4) 0x53000000; + High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh); + } + + // Create the vector constant for -(0x1.0p39f + 0x1.0p23f). + SDValue CstFAdd = DAG.getConstantFP( + APFloat(APFloat::IEEEsingle, APInt(32, 0xD3000080)), DL, MVT::f32); + SDValue CstFAddArray[] = {CstFAdd, CstFAdd, CstFAdd, CstFAdd, + CstFAdd, CstFAdd, CstFAdd, CstFAdd}; + SDValue VecCstFAdd = DAG.getNode(ISD::BUILD_VECTOR, DL, VecFloatVT, + makeArrayRef(&CstFAddArray[0], NumElts)); + + // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f); + SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High); + // TODO: Are there any fast-math-flags to propagate here? + SDValue FHigh = + DAG.getNode(ISD::FADD, DL, VecFloatVT, HighBitcast, VecCstFAdd); + // return (float4) lo + fhi; + SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low); + return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh); +} + +SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op, + SelectionDAG &DAG) const { + SDValue N0 = Op.getOperand(0); + MVT SVT = N0.getSimpleValueType(); + SDLoc dl(Op); + + switch (SVT.SimpleTy) { + default: + llvm_unreachable("Custom UINT_TO_FP is not supported!"); + case MVT::v4i8: + case MVT::v4i16: + case MVT::v8i8: + case MVT::v8i16: { + MVT NVT = MVT::getVectorVT(MVT::i32, SVT.getVectorNumElements()); + return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), + DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N0)); + } + case MVT::v4i32: + case MVT::v8i32: + return lowerUINT_TO_FP_vXi32(Op, DAG, *Subtarget); + case MVT::v16i8: + case MVT::v16i16: + assert(Subtarget->hasAVX512()); + return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(), + DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, N0)); + } +} + +SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, + SelectionDAG &DAG) const { + SDValue N0 = Op.getOperand(0); + SDLoc dl(Op); + auto PtrVT = getPointerTy(DAG.getDataLayout()); + + if (Op.getSimpleValueType().isVector()) + return lowerUINT_TO_FP_vec(Op, DAG); + + // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't + // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform + // the optimization here. + if (DAG.SignBitIsZero(N0)) + return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0); + + MVT SrcVT = N0.getSimpleValueType(); + MVT DstVT = Op.getSimpleValueType(); + + if (Subtarget->hasAVX512() && isScalarFPTypeInSSEReg(DstVT) && + (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget->is64Bit()))) { + // Conversions from unsigned i32 to f32/f64 are legal, + // using VCVTUSI2SS/SD. 
Same for i64 in 64-bit mode. + return Op; + } + + if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64) + return LowerUINT_TO_FP_i64(Op, DAG); + if (SrcVT == MVT::i32 && X86ScalarSSEf64) + return LowerUINT_TO_FP_i32(Op, DAG); + if (Subtarget->is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32) + return SDValue(); + + // Make a 64-bit buffer, and use it to build an FILD. + SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64); + if (SrcVT == MVT::i32) { + SDValue WordOff = DAG.getConstant(4, dl, PtrVT); + SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl, PtrVT, StackSlot, WordOff); + SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), + StackSlot, MachinePointerInfo(), + false, false, 0); + SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32), + OffsetSlot, MachinePointerInfo(), + false, false, 0); + SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG); + return Fild; + } + + assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP"); + SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), + StackSlot, MachinePointerInfo(), + false, false, 0); + // For i64 source, we need to add the appropriate power of 2 if the input + // was negative. This is the same as the optimization in + // DAGTypeLegalizer::ExpandIntOp_UNIT_TO_FP, and for it to be safe here, + // we must be careful to do the computation in x87 extended precision, not + // in SSE. (The generic code can't know it's OK to do this, or how to.) + int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex(); + MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand( + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI), + MachineMemOperand::MOLoad, 8, 8); + + SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other); + SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) }; + SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, + MVT::i64, MMO); + + APInt FF(32, 0x5F800000ULL); + + // Check whether the sign bit is set. + SDValue SignSet = DAG.getSetCC( + dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64), + Op.getOperand(0), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT); + + // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits. + SDValue FudgePtr = DAG.getConstantPool( + ConstantInt::get(*DAG.getContext(), FF.zext(64)), PtrVT); + + // Get a pointer to FF if the sign bit was set, or to 0 otherwise. + SDValue Zero = DAG.getIntPtrConstant(0, dl); + SDValue Four = DAG.getIntPtrConstant(4, dl); + SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet, + Zero, Four); + FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset); + + // Load the value out, extending it from f32 to f80. + // FIXME: Avoid the extend by constructing the right constant pool? + SDValue Fudge = DAG.getExtLoad( + ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), FudgePtr, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32, + false, false, false, 4); + // Extend everything to 80 bits to force it to be done on x87. + // TODO: Are there any fast-math-flags to propagate here? + SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge); + return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, + DAG.getIntPtrConstant(0, dl)); +} + +// If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation +// is legal, or has an fp128 or f16 source (which needs to be promoted to f32), +// just return an <SDValue(), SDValue()> pair. 
+// Otherwise it is assumed to be a conversion from one of f32, f64 or f80 +// to i16, i32 or i64, and we lower it to a legal sequence. +// If lowered to the final integer result we return a <result, SDValue()> pair. +// Otherwise we lower it to a sequence ending with a FIST, return a +// <FIST, StackSlot> pair, and the caller is responsible for loading +// the final integer result from StackSlot. +std::pair<SDValue,SDValue> +X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, + bool IsSigned, bool IsReplace) const { + SDLoc DL(Op); + + EVT DstTy = Op.getValueType(); + EVT TheVT = Op.getOperand(0).getValueType(); + auto PtrVT = getPointerTy(DAG.getDataLayout()); + + if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) { + // f16 must be promoted before using the lowering in this routine. + // fp128 does not use this lowering. + return std::make_pair(SDValue(), SDValue()); + } + + // If using FIST to compute an unsigned i64, we'll need some fixup + // to handle values above the maximum signed i64. A FIST is always + // used for the 32-bit subtarget, but also for f80 on a 64-bit target. + bool UnsignedFixup = !IsSigned && + DstTy == MVT::i64 && + (!Subtarget->is64Bit() || + !isScalarFPTypeInSSEReg(TheVT)); + + if (!IsSigned && DstTy != MVT::i64 && !Subtarget->hasAVX512()) { + // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST. + // The low 32 bits of the fist result will have the correct uint32 result. + assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT"); + DstTy = MVT::i64; + } + + assert(DstTy.getSimpleVT() <= MVT::i64 && + DstTy.getSimpleVT() >= MVT::i16 && + "Unknown FP_TO_INT to lower!"); + + // These are really Legal. + if (DstTy == MVT::i32 && + isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) + return std::make_pair(SDValue(), SDValue()); + if (Subtarget->is64Bit() && + DstTy == MVT::i64 && + isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) + return std::make_pair(SDValue(), SDValue()); + + // We lower FP->int64 into FISTP64 followed by a load from a temporary + // stack slot. + MachineFunction &MF = DAG.getMachineFunction(); + unsigned MemSize = DstTy.getSizeInBits()/8; + int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); + SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT); + + unsigned Opc; + switch (DstTy.getSimpleVT().SimpleTy) { + default: llvm_unreachable("Invalid FP_TO_SINT to lower!"); + case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break; + case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break; + case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break; + } + + SDValue Chain = DAG.getEntryNode(); + SDValue Value = Op.getOperand(0); + SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment. + + if (UnsignedFixup) { + // + // Conversion to unsigned i64 is implemented with a select, + // depending on whether the source value fits in the range + // of a signed i64. Let Thresh be the FP equivalent of + // 0x8000000000000000ULL. + // + // Adjust i32 = (Value < Thresh) ? 0 : 0x80000000; + // FistSrc = (Value < Thresh) ? Value : (Value - Thresh); + // Fist-to-mem64 FistSrc + // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent + // to XOR'ing the high 32 bits with Adjust. + // + // Being a power of 2, Thresh is exactly representable in all FP formats. + // For X87 we'd like to use the smallest FP type for this constant, but + // for DAG type consistency we have to match the FP operand type. 
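+    // A scalar sketch of this fixup (Thresh is 2^63; 0x5f000000 is its bit
+    // pattern as a float):
+    //   uint64_t cvt(double v) {
+    //     if (v < 0x1.0p63) return (uint64_t)(int64_t)v;
+    //     return (uint64_t)(int64_t)(v - 0x1.0p63) ^ (1ULL << 63);
+    //   }
+    // The XOR of the top bit is what applying Adjust (0x80000000) to the high
+    // half of the 64-bit result implements.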
+ + APFloat Thresh(APFloat::IEEEsingle, APInt(32, 0x5f000000)); + LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK; + bool LosesInfo = false; + if (TheVT == MVT::f64) + // The rounding mode is irrelevant as the conversion should be exact. + Status = Thresh.convert(APFloat::IEEEdouble, APFloat::rmNearestTiesToEven, + &LosesInfo); + else if (TheVT == MVT::f80) + Status = Thresh.convert(APFloat::x87DoubleExtended, + APFloat::rmNearestTiesToEven, &LosesInfo); + + assert(Status == APFloat::opOK && !LosesInfo && + "FP conversion should have been exact"); + + SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT); + + SDValue Cmp = DAG.getSetCC(DL, + getSetCCResultType(DAG.getDataLayout(), + *DAG.getContext(), TheVT), + Value, ThreshVal, ISD::SETLT); + Adjust = DAG.getSelect(DL, MVT::i32, Cmp, + DAG.getConstant(0, DL, MVT::i32), + DAG.getConstant(0x80000000, DL, MVT::i32)); + SDValue Sub = DAG.getNode(ISD::FSUB, DL, TheVT, Value, ThreshVal); + Cmp = DAG.getSetCC(DL, getSetCCResultType(DAG.getDataLayout(), + *DAG.getContext(), TheVT), + Value, ThreshVal, ISD::SETLT); + Value = DAG.getSelect(DL, TheVT, Cmp, Value, Sub); + } + + // FIXME This causes a redundant load/store if the SSE-class value is already + // in memory, such as if it is on the callstack. + if (isScalarFPTypeInSSEReg(TheVT)) { + assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!"); + Chain = DAG.getStore(Chain, DL, Value, StackSlot, + MachinePointerInfo::getFixedStack(MF, SSFI), false, + false, 0); + SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other); + SDValue Ops[] = { + Chain, StackSlot, DAG.getValueType(TheVT) + }; + + MachineMemOperand *MMO = + MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI), + MachineMemOperand::MOLoad, MemSize, MemSize); + Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO); + Chain = Value.getValue(1); + SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); + StackSlot = DAG.getFrameIndex(SSFI, PtrVT); + } + + MachineMemOperand *MMO = + MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI), + MachineMemOperand::MOStore, MemSize, MemSize); + + if (UnsignedFixup) { + + // Insert the FIST, load its result as two i32's, + // and XOR the high i32 with Adjust. + + SDValue FistOps[] = { Chain, Value, StackSlot }; + SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other), + FistOps, DstTy, MMO); + + SDValue Low32 = DAG.getLoad(MVT::i32, DL, FIST, StackSlot, + MachinePointerInfo(), + false, false, false, 0); + SDValue HighAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackSlot, + DAG.getConstant(4, DL, PtrVT)); + + SDValue High32 = DAG.getLoad(MVT::i32, DL, FIST, HighAddr, + MachinePointerInfo(), + false, false, false, 0); + High32 = DAG.getNode(ISD::XOR, DL, MVT::i32, High32, Adjust); + + if (Subtarget->is64Bit()) { + // Join High32 and Low32 into a 64-bit result. + // (High32 << 32) | Low32 + Low32 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Low32); + High32 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, High32); + High32 = DAG.getNode(ISD::SHL, DL, MVT::i64, High32, + DAG.getConstant(32, DL, MVT::i8)); + SDValue Result = DAG.getNode(ISD::OR, DL, MVT::i64, High32, Low32); + return std::make_pair(Result, SDValue()); + } + + SDValue ResultOps[] = { Low32, High32 }; + + SDValue pair = IsReplace + ? 
DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, ResultOps) + : DAG.getMergeValues(ResultOps, DL); + return std::make_pair(pair, SDValue()); + } else { + // Build the FP_TO_INT*_IN_MEM + SDValue Ops[] = { Chain, Value, StackSlot }; + SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other), + Ops, DstTy, MMO); + return std::make_pair(FIST, StackSlot); + } +} + +static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + MVT VT = Op->getSimpleValueType(0); + SDValue In = Op->getOperand(0); + MVT InVT = In.getSimpleValueType(); + SDLoc dl(Op); + + if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1) + return DAG.getNode(ISD::ZERO_EXTEND, dl, VT, In); + + // Optimize vectors in AVX mode: + // + // v8i16 -> v8i32 + // Use vpunpcklwd for 4 lower elements v8i16 -> v4i32. + // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32. + // Concat upper and lower parts. + // + // v4i32 -> v4i64 + // Use vpunpckldq for 4 lower elements v4i32 -> v2i64. + // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64. + // Concat upper and lower parts. + // + + if (((VT != MVT::v16i16) || (InVT != MVT::v16i8)) && + ((VT != MVT::v8i32) || (InVT != MVT::v8i16)) && + ((VT != MVT::v4i64) || (InVT != MVT::v4i32))) + return SDValue(); + + if (Subtarget->hasInt256()) + return DAG.getNode(X86ISD::VZEXT, dl, VT, In); + + SDValue ZeroVec = getZeroVector(InVT, Subtarget, DAG, dl); + SDValue Undef = DAG.getUNDEF(InVT); + bool NeedZero = Op.getOpcode() == ISD::ZERO_EXTEND; + SDValue OpLo = getUnpackl(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef); + SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef); + + MVT HVT = MVT::getVectorVT(VT.getVectorElementType(), + VT.getVectorNumElements()/2); + + OpLo = DAG.getBitcast(HVT, OpLo); + OpHi = DAG.getBitcast(HVT, OpHi); + + return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi); +} + +static SDValue LowerZERO_EXTEND_AVX512(SDValue Op, + const X86Subtarget *Subtarget, SelectionDAG &DAG) { + MVT VT = Op->getSimpleValueType(0); + SDValue In = Op->getOperand(0); + MVT InVT = In.getSimpleValueType(); + SDLoc DL(Op); + unsigned int NumElts = VT.getVectorNumElements(); + if (NumElts != 8 && NumElts != 16 && !Subtarget->hasBWI()) + return SDValue(); + + if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1) + return DAG.getNode(X86ISD::VZEXT, DL, VT, In); + + assert(InVT.getVectorElementType() == MVT::i1); + MVT ExtVT = NumElts == 8 ? 
MVT::v8i64 : MVT::v16i32; + SDValue One = + DAG.getConstant(APInt(ExtVT.getScalarSizeInBits(), 1), DL, ExtVT); + SDValue Zero = + DAG.getConstant(APInt::getNullValue(ExtVT.getScalarSizeInBits()), DL, ExtVT); + + SDValue V = DAG.getNode(ISD::VSELECT, DL, ExtVT, In, One, Zero); + if (VT.is512BitVector()) + return V; + return DAG.getNode(X86ISD::VTRUNC, DL, VT, V); +} + +static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + if (Subtarget->hasFp256()) + if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget)) + return Res; + + return SDValue(); +} + +static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + MVT VT = Op.getSimpleValueType(); + SDValue In = Op.getOperand(0); + MVT SVT = In.getSimpleValueType(); + + if (VT.is512BitVector() || SVT.getVectorElementType() == MVT::i1) + return LowerZERO_EXTEND_AVX512(Op, Subtarget, DAG); + + if (Subtarget->hasFp256()) + if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget)) + return Res; + + assert(!VT.is256BitVector() || !SVT.is128BitVector() || + VT.getVectorNumElements() != SVT.getVectorNumElements()); + return SDValue(); +} + +static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + + SDLoc DL(Op); + MVT VT = Op.getSimpleValueType(); + SDValue In = Op.getOperand(0); + MVT InVT = In.getSimpleValueType(); + + assert(VT.getVectorElementType() == MVT::i1 && "Unexected vector type."); + + // Shift LSB to MSB and use VPMOVB2M - SKX. + unsigned ShiftInx = InVT.getScalarSizeInBits() - 1; + if ((InVT.is512BitVector() && InVT.getScalarSizeInBits() <= 16 && + Subtarget->hasBWI()) || // legal, will go to VPMOVB2M, VPMOVW2M + ((InVT.is256BitVector() || InVT.is128BitVector()) && + InVT.getScalarSizeInBits() <= 16 && Subtarget->hasBWI() && + Subtarget->hasVLX())) { // legal, will go to VPMOVB2M, VPMOVW2M + // Shift packed bytes not supported natively, bitcast to dword + MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16); + SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, ExtVT, + DAG.getBitcast(ExtVT, In), + DAG.getConstant(ShiftInx, DL, ExtVT)); + ShiftNode = DAG.getBitcast(InVT, ShiftNode); + return DAG.getNode(X86ISD::CVT2MASK, DL, VT, ShiftNode); + } + if ((InVT.is512BitVector() && InVT.getScalarSizeInBits() >= 32 && + Subtarget->hasDQI()) || // legal, will go to VPMOVD2M, VPMOVQ2M + ((InVT.is256BitVector() || InVT.is128BitVector()) && + InVT.getScalarSizeInBits() >= 32 && Subtarget->hasDQI() && + Subtarget->hasVLX())) { // legal, will go to VPMOVD2M, VPMOVQ2M + + SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, InVT, In, + DAG.getConstant(ShiftInx, DL, InVT)); + return DAG.getNode(X86ISD::CVT2MASK, DL, VT, ShiftNode); + } + + // Shift LSB to MSB, extend if necessary and use TESTM. + unsigned NumElts = InVT.getVectorNumElements(); + if (InVT.getSizeInBits() < 512 && + (InVT.getScalarType() == MVT::i8 || InVT.getScalarType() == MVT::i16 || + !Subtarget->hasVLX())) { + assert((NumElts == 8 || NumElts == 16) && "Unexected vector type."); + + // TESTD/Q should be used (if BW supported we use CVT2MASK above), + // so vector should be extended to packed dword/qword. 
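+    // Sign-extending preserves bit 0 of each element; the LSB is then shifted
+    // into the sign bit and TESTM (mask bit set where (a & b) != 0) produces a
+    // 1 exactly for the elements whose low bit was set.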
+ MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts); + In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In); + InVT = ExtVT; + ShiftInx = InVT.getScalarSizeInBits() - 1; + } + + SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, InVT, In, + DAG.getConstant(ShiftInx, DL, InVT)); + return DAG.getNode(X86ISD::TESTM, DL, VT, ShiftNode, ShiftNode); +} + +SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { + SDLoc DL(Op); + MVT VT = Op.getSimpleValueType(); + SDValue In = Op.getOperand(0); + MVT InVT = In.getSimpleValueType(); + + if (VT == MVT::i1) { + assert((InVT.isInteger() && (InVT.getSizeInBits() <= 64)) && + "Invalid scalar TRUNCATE operation"); + if (InVT.getSizeInBits() >= 32) + return SDValue(); + In = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, In); + return DAG.getNode(ISD::TRUNCATE, DL, VT, In); + } + assert(VT.getVectorNumElements() == InVT.getVectorNumElements() && + "Invalid TRUNCATE operation"); + + if (VT.getVectorElementType() == MVT::i1) + return LowerTruncateVecI1(Op, DAG, Subtarget); + + // vpmovqb/w/d, vpmovdb/w, vpmovwb + if (Subtarget->hasAVX512()) { + // word to byte only under BWI + if (InVT == MVT::v16i16 && !Subtarget->hasBWI()) // v16i16 -> v16i8 + return DAG.getNode(X86ISD::VTRUNC, DL, VT, + DAG.getNode(X86ISD::VSEXT, DL, MVT::v16i32, In)); + return DAG.getNode(X86ISD::VTRUNC, DL, VT, In); + } + if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) { + // On AVX2, v4i64 -> v4i32 becomes VPERMD. + if (Subtarget->hasInt256()) { + static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1}; + In = DAG.getBitcast(MVT::v8i32, In); + In = DAG.getVectorShuffle(MVT::v8i32, DL, In, DAG.getUNDEF(MVT::v8i32), + ShufMask); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In, + DAG.getIntPtrConstant(0, DL)); + } + + SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In, + DAG.getIntPtrConstant(0, DL)); + SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In, + DAG.getIntPtrConstant(2, DL)); + OpLo = DAG.getBitcast(MVT::v4i32, OpLo); + OpHi = DAG.getBitcast(MVT::v4i32, OpHi); + static const int ShufMask[] = {0, 2, 4, 6}; + return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask); + } + + if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) { + // On AVX2, v8i32 -> v8i16 becomed PSHUFB. 
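+    // The PSHUFB mask built below keeps bytes {0,1,4,5,8,9,12,13} of each
+    // 128-bit lane (the low 16 bits of every dword; index 0x80 writes a zero),
+    // leaving each lane's four truncated i16 values in its low 64 bits. The
+    // v4i64 shuffle {0, 2} that follows packs those two quadwords into the low
+    // 128 bits, which are then extracted and bitcast to v8i16.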
+ if (Subtarget->hasInt256()) { + In = DAG.getBitcast(MVT::v32i8, In); + + SmallVector<SDValue,32> pshufbMask; + for (unsigned i = 0; i < 2; ++i) { + pshufbMask.push_back(DAG.getConstant(0x0, DL, MVT::i8)); + pshufbMask.push_back(DAG.getConstant(0x1, DL, MVT::i8)); + pshufbMask.push_back(DAG.getConstant(0x4, DL, MVT::i8)); + pshufbMask.push_back(DAG.getConstant(0x5, DL, MVT::i8)); + pshufbMask.push_back(DAG.getConstant(0x8, DL, MVT::i8)); + pshufbMask.push_back(DAG.getConstant(0x9, DL, MVT::i8)); + pshufbMask.push_back(DAG.getConstant(0xc, DL, MVT::i8)); + pshufbMask.push_back(DAG.getConstant(0xd, DL, MVT::i8)); + for (unsigned j = 0; j < 8; ++j) + pshufbMask.push_back(DAG.getConstant(0x80, DL, MVT::i8)); + } + SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, pshufbMask); + In = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8, In, BV); + In = DAG.getBitcast(MVT::v4i64, In); + + static const int ShufMask[] = {0, 2, -1, -1}; + In = DAG.getVectorShuffle(MVT::v4i64, DL, In, DAG.getUNDEF(MVT::v4i64), + &ShufMask[0]); + In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In, + DAG.getIntPtrConstant(0, DL)); + return DAG.getBitcast(VT, In); + } + + SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In, + DAG.getIntPtrConstant(0, DL)); + + SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In, + DAG.getIntPtrConstant(4, DL)); + + OpLo = DAG.getBitcast(MVT::v16i8, OpLo); + OpHi = DAG.getBitcast(MVT::v16i8, OpHi); + + // The PSHUFB mask: + static const int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13, + -1, -1, -1, -1, -1, -1, -1, -1}; + + SDValue Undef = DAG.getUNDEF(MVT::v16i8); + OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, Undef, ShufMask1); + OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, Undef, ShufMask1); + + OpLo = DAG.getBitcast(MVT::v4i32, OpLo); + OpHi = DAG.getBitcast(MVT::v4i32, OpHi); + + // The MOVLHPS Mask: + static const int ShufMask2[] = {0, 1, 4, 5}; + SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2); + return DAG.getBitcast(MVT::v8i16, res); + } + + // Handle truncation of V256 to V128 using shuffles. + if (!VT.is128BitVector() || !InVT.is256BitVector()) + return SDValue(); + + assert(Subtarget->hasFp256() && "256-bit vector without AVX!"); + + unsigned NumElems = VT.getVectorNumElements(); + MVT NVT = MVT::getVectorVT(VT.getVectorElementType(), NumElems * 2); + + SmallVector<int, 16> MaskVec(NumElems * 2, -1); + // Prepare truncation shuffle mask + for (unsigned i = 0; i != NumElems; ++i) + MaskVec[i] = i * 2; + SDValue V = DAG.getVectorShuffle(NVT, DL, DAG.getBitcast(NVT, In), + DAG.getUNDEF(NVT), &MaskVec[0]); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V, + DAG.getIntPtrConstant(0, DL)); +} + +SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, + SelectionDAG &DAG) const { + assert(!Op.getSimpleValueType().isVector()); + + std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, + /*IsSigned=*/ true, /*IsReplace=*/ false); + SDValue FIST = Vals.first, StackSlot = Vals.second; + // If FP_TO_INTHelper failed, the node is actually supposed to be Legal. + if (!FIST.getNode()) + return Op; + + if (StackSlot.getNode()) + // Load the result. + return DAG.getLoad(Op.getValueType(), SDLoc(Op), + FIST, StackSlot, MachinePointerInfo(), + false, false, false, 0); + + // The node is the result. 
+ return FIST; +} + +SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op, + SelectionDAG &DAG) const { + std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, + /*IsSigned=*/ false, /*IsReplace=*/ false); + SDValue FIST = Vals.first, StackSlot = Vals.second; + // If FP_TO_INTHelper failed, the node is actually supposed to be Legal. + if (!FIST.getNode()) + return Op; + + if (StackSlot.getNode()) + // Load the result. + return DAG.getLoad(Op.getValueType(), SDLoc(Op), + FIST, StackSlot, MachinePointerInfo(), + false, false, false, 0); + + // The node is the result. + return FIST; +} + +static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) { + SDLoc DL(Op); + MVT VT = Op.getSimpleValueType(); + SDValue In = Op.getOperand(0); + MVT SVT = In.getSimpleValueType(); + + assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!"); + + return DAG.getNode(X86ISD::VFPEXT, DL, VT, + DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, + In, DAG.getUNDEF(SVT))); +} + +/// The only differences between FABS and FNEG are the mask and the logic op. +/// FNEG also has a folding opportunity for FNEG(FABS(x)). +static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) { + assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) && + "Wrong opcode for lowering FABS or FNEG."); + + bool IsFABS = (Op.getOpcode() == ISD::FABS); + + // If this is a FABS and it has an FNEG user, bail out to fold the combination + // into an FNABS. We'll lower the FABS after that if it is still in use. + if (IsFABS) + for (SDNode *User : Op->uses()) + if (User->getOpcode() == ISD::FNEG) + return Op; + + SDLoc dl(Op); + MVT VT = Op.getSimpleValueType(); + + bool IsF128 = (VT == MVT::f128); + + // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to + // decide if we should generate a 16-byte constant mask when we only need 4 or + // 8 bytes for the scalar case. + + MVT LogicVT; + MVT EltVT; + unsigned NumElts; + + if (VT.isVector()) { + LogicVT = VT; + EltVT = VT.getVectorElementType(); + NumElts = VT.getVectorNumElements(); + } else if (IsF128) { + // SSE instructions are used for optimized f128 logical operations. + LogicVT = MVT::f128; + EltVT = VT; + NumElts = 1; + } else { + // There are no scalar bitwise logical SSE/AVX instructions, so we + // generate a 16-byte vector constant and logic op even for the scalar case. + // Using a 16-byte mask allows folding the load of the mask with + // the logic op, so it can save (~4 bytes) on code size. + LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32; + EltVT = VT; + NumElts = (VT == MVT::f64) ? 2 : 4; + } + + unsigned EltBits = EltVT.getSizeInBits(); + LLVMContext *Context = DAG.getContext(); + // For FABS, mask is 0x7f...; for FNEG, mask is 0x80... + APInt MaskElt = + IsFABS ? APInt::getSignedMaxValue(EltBits) : APInt::getSignBit(EltBits); + Constant *C = ConstantInt::get(*Context, MaskElt); + C = ConstantVector::getSplat(NumElts, C); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout())); + unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment(); + SDValue Mask = + DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), + false, false, false, Alignment); + + SDValue Op0 = Op.getOperand(0); + bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS); + unsigned LogicOp = + IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR; + SDValue Operand = IsFNABS ? 
Op0.getOperand(0) : Op0; + + if (VT.isVector() || IsF128) + return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask); + + // For the scalar case extend to a 128-bit vector, perform the logic op, + // and extract the scalar result back out. + Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand); + SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode, + DAG.getIntPtrConstant(0, dl)); +} + +static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + LLVMContext *Context = DAG.getContext(); + SDValue Op0 = Op.getOperand(0); + SDValue Op1 = Op.getOperand(1); + SDLoc dl(Op); + MVT VT = Op.getSimpleValueType(); + MVT SrcVT = Op1.getSimpleValueType(); + bool IsF128 = (VT == MVT::f128); + + // If second operand is smaller, extend it first. + if (SrcVT.bitsLT(VT)) { + Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1); + SrcVT = VT; + } + // And if it is bigger, shrink it first. + if (SrcVT.bitsGT(VT)) { + Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1, dl)); + SrcVT = VT; + } + + // At this point the operands and the result should have the same + // type, and that won't be f80 since that is not custom lowered. + assert((VT == MVT::f64 || VT == MVT::f32 || IsF128) && + "Unexpected type in LowerFCOPYSIGN"); + + const fltSemantics &Sem = + VT == MVT::f64 ? APFloat::IEEEdouble : + (IsF128 ? APFloat::IEEEquad : APFloat::IEEEsingle); + const unsigned SizeInBits = VT.getSizeInBits(); + + SmallVector<Constant *, 4> CV( + VT == MVT::f64 ? 2 : (IsF128 ? 1 : 4), + ConstantFP::get(*Context, APFloat(Sem, APInt(SizeInBits, 0)))); + + // First, clear all bits but the sign bit from the second operand (sign). + CV[0] = ConstantFP::get(*Context, + APFloat(Sem, APInt::getHighBitsSet(SizeInBits, 1))); + Constant *C = ConstantVector::get(CV); + auto PtrVT = TLI.getPointerTy(DAG.getDataLayout()); + SDValue CPIdx = DAG.getConstantPool(C, PtrVT, 16); + + // Perform all logic operations as 16-byte vectors because there are no + // scalar FP logic instructions in SSE. This allows load folding of the + // constants into the logic instructions. + MVT LogicVT = (VT == MVT::f64) ? MVT::v2f64 : (IsF128 ? MVT::f128 : MVT::v4f32); + SDValue Mask1 = + DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), + false, false, false, 16); + if (!IsF128) + Op1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Op1); + SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Op1, Mask1); + + // Next, clear the sign bit from the first operand (magnitude). + // If it's a constant, we can clear it here. + if (ConstantFPSDNode *Op0CN = dyn_cast<ConstantFPSDNode>(Op0)) { + APFloat APF = Op0CN->getValueAPF(); + // If the magnitude is a positive zero, the sign bit alone is enough. + if (APF.isPosZero()) + return IsF128 ? SignBit : + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcVT, SignBit, + DAG.getIntPtrConstant(0, dl)); + APF.clearSign(); + CV[0] = ConstantFP::get(*Context, APF); + } else { + CV[0] = ConstantFP::get( + *Context, + APFloat(Sem, APInt::getLowBitsSet(SizeInBits, SizeInBits - 1))); + } + C = ConstantVector::get(CV); + CPIdx = DAG.getConstantPool(C, PtrVT, 16); + SDValue Val = + DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), + false, false, false, 16); + // If the magnitude operand wasn't a constant, we need to AND out the sign. 
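+ // A rough scalar example for f32: copysign(x, y) is formed here as
+ //   (x & 0x7fffffff) | (y & 0x80000000),
+ // where Mask1 above supplied the 0x80000000 sign mask applied to Op1 and
+ // Val holds the 0x7fffffff magnitude mask (or the already-cleared constant).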
+ if (!isa<ConstantFPSDNode>(Op0)) { + if (!IsF128) + Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Op0); + Val = DAG.getNode(X86ISD::FAND, dl, LogicVT, Op0, Val); + } + // OR the magnitude value with the sign bit. + Val = DAG.getNode(X86ISD::FOR, dl, LogicVT, Val, SignBit); + return IsF128 ? Val : + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcVT, Val, + DAG.getIntPtrConstant(0, dl)); +} + +static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) { + SDValue N0 = Op.getOperand(0); + SDLoc dl(Op); + MVT VT = Op.getSimpleValueType(); + + // Lower ISD::FGETSIGN to (AND (X86ISD::FGETSIGNx86 ...) 1). + SDValue xFGETSIGN = DAG.getNode(X86ISD::FGETSIGNx86, dl, VT, N0, + DAG.getConstant(1, dl, VT)); + return DAG.getNode(ISD::AND, dl, VT, xFGETSIGN, DAG.getConstant(1, dl, VT)); +} + +// Check whether an OR'd tree is PTEST-able. +static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree."); + + if (!Subtarget->hasSSE41()) + return SDValue(); + + if (!Op->hasOneUse()) + return SDValue(); + + SDNode *N = Op.getNode(); + SDLoc DL(N); + + SmallVector<SDValue, 8> Opnds; + DenseMap<SDValue, unsigned> VecInMap; + SmallVector<SDValue, 8> VecIns; + EVT VT = MVT::Other; + + // Recognize a special case where a vector is casted into wide integer to + // test all 0s. + Opnds.push_back(N->getOperand(0)); + Opnds.push_back(N->getOperand(1)); + + for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) { + SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot; + // BFS traverse all OR'd operands. + if (I->getOpcode() == ISD::OR) { + Opnds.push_back(I->getOperand(0)); + Opnds.push_back(I->getOperand(1)); + // Re-evaluate the number of nodes to be traversed. + e += 2; // 2 more nodes (LHS and RHS) are pushed. + continue; + } + + // Quit if a non-EXTRACT_VECTOR_ELT + if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT) + return SDValue(); + + // Quit if without a constant index. + SDValue Idx = I->getOperand(1); + if (!isa<ConstantSDNode>(Idx)) + return SDValue(); + + SDValue ExtractedFromVec = I->getOperand(0); + DenseMap<SDValue, unsigned>::iterator M = VecInMap.find(ExtractedFromVec); + if (M == VecInMap.end()) { + VT = ExtractedFromVec.getValueType(); + // Quit if not 128/256-bit vector. + if (!VT.is128BitVector() && !VT.is256BitVector()) + return SDValue(); + // Quit if not the same type. + if (VecInMap.begin() != VecInMap.end() && + VT != VecInMap.begin()->first.getValueType()) + return SDValue(); + M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first; + VecIns.push_back(ExtractedFromVec); + } + M->second |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue(); + } + + assert((VT.is128BitVector() || VT.is256BitVector()) && + "Not extracted from 128-/256-bit vector."); + + unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U; + + for (DenseMap<SDValue, unsigned>::const_iterator + I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) { + // Quit if not all elements are used. + if (I->second != FullMask) + return SDValue(); + } + + MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64; + + // Cast all vectors into TestVT for PTEST. + for (unsigned i = 0, e = VecIns.size(); i < e; ++i) + VecIns[i] = DAG.getBitcast(TestVT, VecIns[i]); + + // If more than one full vectors are evaluated, OR them first before PTEST. 
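+ // Illustrative reduction, assuming four extracted source vectors A, B, C, D:
+ // the loop below appends (A|B), then (C|D), then ((A|B)|(C|D)); the final
+ // PTEST of that last value against itself sets ZF only if every element of
+ // every source vector is zero.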
+ for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) { + // Each iteration will OR 2 nodes and append the result until there is only + // 1 node left, i.e. the final OR'd value of all vectors. + SDValue LHS = VecIns[Slot]; + SDValue RHS = VecIns[Slot + 1]; + VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS)); + } + + return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, + VecIns.back(), VecIns.back()); +} + +/// \brief return true if \c Op has a use that doesn't just read flags. +static bool hasNonFlagsUse(SDValue Op) { + for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE; + ++UI) { + SDNode *User = *UI; + unsigned UOpNo = UI.getOperandNo(); + if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) { + // Look pass truncate. + UOpNo = User->use_begin().getOperandNo(); + User = *User->use_begin(); + } + + if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC && + !(User->getOpcode() == ISD::SELECT && UOpNo == 0)) + return true; + } + return false; +} + +/// Emit nodes that will be selected as "test Op0,Op0", or something +/// equivalent. +SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl, + SelectionDAG &DAG) const { + if (Op.getValueType() == MVT::i1) { + SDValue ExtOp = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, Op); + return DAG.getNode(X86ISD::CMP, dl, MVT::i32, ExtOp, + DAG.getConstant(0, dl, MVT::i8)); + } + // CF and OF aren't always set the way we want. Determine which + // of these we need. + bool NeedCF = false; + bool NeedOF = false; + switch (X86CC) { + default: break; + case X86::COND_A: case X86::COND_AE: + case X86::COND_B: case X86::COND_BE: + NeedCF = true; + break; + case X86::COND_G: case X86::COND_GE: + case X86::COND_L: case X86::COND_LE: + case X86::COND_O: case X86::COND_NO: { + // Check if we really need to set the + // Overflow flag. If NoSignedWrap is present + // that is not actually needed. + switch (Op->getOpcode()) { + case ISD::ADD: + case ISD::SUB: + case ISD::MUL: + case ISD::SHL: { + const auto *BinNode = cast<BinaryWithFlagsSDNode>(Op.getNode()); + if (BinNode->Flags.hasNoSignedWrap()) + break; + } + default: + NeedOF = true; + break; + } + break; + } + } + // See if we can use the EFLAGS value from the operand instead of + // doing a separate TEST. TEST always sets OF and CF to 0, so unless + // we prove that the arithmetic won't overflow, we can't use OF or CF. + if (Op.getResNo() != 0 || NeedOF || NeedCF) { + // Emit a CMP with 0, which is the TEST pattern. + //if (Op.getValueType() == MVT::i1) + // return DAG.getNode(X86ISD::CMP, dl, MVT::i1, Op, + // DAG.getConstant(0, MVT::i1)); + return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, + DAG.getConstant(0, dl, Op.getValueType())); + } + unsigned Opcode = 0; + unsigned NumOperands = 0; + + // Truncate operations may prevent the merge of the SETCC instruction + // and the arithmetic instruction before it. Attempt to truncate the operands + // of the arithmetic instruction and use a reduced bit-width instruction. + bool NeedTruncation = false; + SDValue ArithOp = Op; + if (Op->getOpcode() == ISD::TRUNCATE && Op->hasOneUse()) { + SDValue Arith = Op->getOperand(0); + // Both the trunc and the arithmetic op need to have one user each. 
+ if (Arith->hasOneUse()) + switch (Arith.getOpcode()) { + default: break; + case ISD::ADD: + case ISD::SUB: + case ISD::AND: + case ISD::OR: + case ISD::XOR: { + NeedTruncation = true; + ArithOp = Arith; + } + } + } + + // NOTICE: In the code below we use ArithOp to hold the arithmetic operation + // which may be the result of a CAST. We use the variable 'Op', which is the + // non-casted variable when we check for possible users. + switch (ArithOp.getOpcode()) { + case ISD::ADD: + // Due to an isel shortcoming, be conservative if this add is likely to be + // selected as part of a load-modify-store instruction. When the root node + // in a match is a store, isel doesn't know how to remap non-chain non-flag + // uses of other nodes in the match, such as the ADD in this case. This + // leads to the ADD being left around and reselected, with the result being + // two adds in the output. Alas, even if none our users are stores, that + // doesn't prove we're O.K. Ergo, if we have any parents that aren't + // CopyToReg or SETCC, eschew INC/DEC. A better fix seems to require + // climbing the DAG back to the root, and it doesn't seem to be worth the + // effort. + for (SDNode::use_iterator UI = Op.getNode()->use_begin(), + UE = Op.getNode()->use_end(); UI != UE; ++UI) + if (UI->getOpcode() != ISD::CopyToReg && + UI->getOpcode() != ISD::SETCC && + UI->getOpcode() != ISD::STORE) + goto default_case; + + if (ConstantSDNode *C = + dyn_cast<ConstantSDNode>(ArithOp.getNode()->getOperand(1))) { + // An add of one will be selected as an INC. + if (C->isOne() && !Subtarget->slowIncDec()) { + Opcode = X86ISD::INC; + NumOperands = 1; + break; + } + + // An add of negative one (subtract of one) will be selected as a DEC. + if (C->isAllOnesValue() && !Subtarget->slowIncDec()) { + Opcode = X86ISD::DEC; + NumOperands = 1; + break; + } + } + + // Otherwise use a regular EFLAGS-setting add. + Opcode = X86ISD::ADD; + NumOperands = 2; + break; + case ISD::SHL: + case ISD::SRL: + // If we have a constant logical shift that's only used in a comparison + // against zero turn it into an equivalent AND. This allows turning it into + // a TEST instruction later. + if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) && Op->hasOneUse() && + isa<ConstantSDNode>(Op->getOperand(1)) && !hasNonFlagsUse(Op)) { + EVT VT = Op.getValueType(); + unsigned BitWidth = VT.getSizeInBits(); + unsigned ShAmt = Op->getConstantOperandVal(1); + if (ShAmt >= BitWidth) // Avoid undefined shifts. + break; + APInt Mask = ArithOp.getOpcode() == ISD::SRL + ? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt) + : APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt); + if (!Mask.isSignedIntN(32)) // Avoid large immediates. + break; + SDValue New = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0), + DAG.getConstant(Mask, dl, VT)); + DAG.ReplaceAllUsesWith(Op, New); + Op = New; + } + break; + + case ISD::AND: + // If the primary and result isn't used, don't bother using X86ISD::AND, + // because a TEST instruction will be better. + if (!hasNonFlagsUse(Op)) + break; + // FALL THROUGH + case ISD::SUB: + case ISD::OR: + case ISD::XOR: + // Due to the ISEL shortcoming noted above, be conservative if this op is + // likely to be selected as part of a load-modify-store instruction. + for (SDNode::use_iterator UI = Op.getNode()->use_begin(), + UE = Op.getNode()->use_end(); UI != UE; ++UI) + if (UI->getOpcode() == ISD::STORE) + goto default_case; + + // Otherwise use a regular EFLAGS-setting instruction. 
+ switch (ArithOp.getOpcode()) { + default: llvm_unreachable("unexpected operator!"); + case ISD::SUB: Opcode = X86ISD::SUB; break; + case ISD::XOR: Opcode = X86ISD::XOR; break; + case ISD::AND: Opcode = X86ISD::AND; break; + case ISD::OR: { + if (!NeedTruncation && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) { + SDValue EFLAGS = LowerVectorAllZeroTest(Op, Subtarget, DAG); + if (EFLAGS.getNode()) + return EFLAGS; + } + Opcode = X86ISD::OR; + break; + } + } + + NumOperands = 2; + break; + case X86ISD::ADD: + case X86ISD::SUB: + case X86ISD::INC: + case X86ISD::DEC: + case X86ISD::OR: + case X86ISD::XOR: + case X86ISD::AND: + return SDValue(Op.getNode(), 1); + default: + default_case: + break; + } + + // If we found that truncation is beneficial, perform the truncation and + // update 'Op'. + if (NeedTruncation) { + EVT VT = Op.getValueType(); + SDValue WideVal = Op->getOperand(0); + EVT WideVT = WideVal.getValueType(); + unsigned ConvertedOp = 0; + // Use a target machine opcode to prevent further DAGCombine + // optimizations that may separate the arithmetic operations + // from the setcc node. + switch (WideVal.getOpcode()) { + default: break; + case ISD::ADD: ConvertedOp = X86ISD::ADD; break; + case ISD::SUB: ConvertedOp = X86ISD::SUB; break; + case ISD::AND: ConvertedOp = X86ISD::AND; break; + case ISD::OR: ConvertedOp = X86ISD::OR; break; + case ISD::XOR: ConvertedOp = X86ISD::XOR; break; + } + + if (ConvertedOp) { + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) { + SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0)); + SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1)); + Op = DAG.getNode(ConvertedOp, dl, VT, V0, V1); + } + } + } + + if (Opcode == 0) + // Emit a CMP with 0, which is the TEST pattern. + return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, + DAG.getConstant(0, dl, Op.getValueType())); + + SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); + SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands); + + SDValue New = DAG.getNode(Opcode, dl, VTs, Ops); + DAG.ReplaceAllUsesWith(Op, New); + return SDValue(New.getNode(), 1); +} + +/// Emit nodes that will be selected as "cmp Op0,Op1", or something +/// equivalent. +SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, + SDLoc dl, SelectionDAG &DAG) const { + if (isNullConstant(Op1)) + return EmitTest(Op0, X86CC, dl, DAG); + + assert(!(isa<ConstantSDNode>(Op1) && Op0.getValueType() == MVT::i1) && + "Unexpected comparison operation for MVT::i1 operands"); + + if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 || + Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) { + // Do the comparison at i32 if it's smaller, besides the Atom case. + // This avoids subregister aliasing issues. Keep the smaller reference + // if we're optimizing for size, however, as that'll allow better folding + // of memory operations. + if (Op0.getValueType() != MVT::i32 && Op0.getValueType() != MVT::i64 && + !DAG.getMachineFunction().getFunction()->optForMinSize() && + !Subtarget->isAtom()) { + unsigned ExtendOp = + isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND; + Op0 = DAG.getNode(ExtendOp, dl, MVT::i32, Op0); + Op1 = DAG.getNode(ExtendOp, dl, MVT::i32, Op1); + } + // Use SUB instead of CMP to enable CSE between SUB and CMP. 
+ SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32); + SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, + Op0, Op1); + return SDValue(Sub.getNode(), 1); + } + return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1); +} + +/// Convert a comparison if required by the subtarget. +SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp, + SelectionDAG &DAG) const { + // If the subtarget does not support the FUCOMI instruction, floating-point + // comparisons have to be converted. + if (Subtarget->hasCMov() || + Cmp.getOpcode() != X86ISD::CMP || + !Cmp.getOperand(0).getValueType().isFloatingPoint() || + !Cmp.getOperand(1).getValueType().isFloatingPoint()) + return Cmp; + + // The instruction selector will select an FUCOM instruction instead of + // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence + // build an SDNode sequence that transfers the result from FPSW into EFLAGS: + // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8)))) + SDLoc dl(Cmp); + SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp); + SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW); + SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW, + DAG.getConstant(8, dl, MVT::i8)); + SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl); + + // Some 64-bit targets lack SAHF support, but they do support FCOMI. + assert(Subtarget->hasLAHFSAHF() && "Target doesn't support SAHF or FCOMI?"); + return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl); +} + +/// The minimum architected relative accuracy is 2^-12. We need one +/// Newton-Raphson step to have a good float result (24 bits of precision). +SDValue X86TargetLowering::getRsqrtEstimate(SDValue Op, + DAGCombinerInfo &DCI, + unsigned &RefinementSteps, + bool &UseOneConstNR) const { + EVT VT = Op.getValueType(); + const char *RecipOp; + + // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps. + // TODO: Add support for AVX512 (v16f32). + // It is likely not profitable to do this for f64 because a double-precision + // rsqrt estimate with refinement on x86 prior to FMA requires at least 16 + // instructions: convert to single, rsqrtss, convert back to double, refine + // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA + // along with FMA, this could be a throughput win. + if (VT == MVT::f32 && Subtarget->hasSSE1()) + RecipOp = "sqrtf"; + else if ((VT == MVT::v4f32 && Subtarget->hasSSE1()) || + (VT == MVT::v8f32 && Subtarget->hasAVX())) + RecipOp = "vec-sqrtf"; + else + return SDValue(); + + TargetRecip Recips = DCI.DAG.getTarget().Options.Reciprocals; + if (!Recips.isEnabled(RecipOp)) + return SDValue(); + + RefinementSteps = Recips.getRefinementSteps(RecipOp); + UseOneConstNR = false; + return DCI.DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op); +} + +/// The minimum architected relative accuracy is 2^-12. We need one +/// Newton-Raphson step to have a good float result (24 bits of precision). +SDValue X86TargetLowering::getRecipEstimate(SDValue Op, + DAGCombinerInfo &DCI, + unsigned &RefinementSteps) const { + EVT VT = Op.getValueType(); + const char *RecipOp; + + // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps. + // TODO: Add support for AVX512 (v16f32). + // It is likely not profitable to do this for f64 because a double-precision + // reciprocal estimate with refinement on x86 prior to FMA requires + // 15 instructions: convert to single, rcpss, convert back to double, refine + // (3 steps = 12 insts). 
If an 'rcpsd' variant was added to the ISA + // along with FMA, this could be a throughput win. + if (VT == MVT::f32 && Subtarget->hasSSE1()) + RecipOp = "divf"; + else if ((VT == MVT::v4f32 && Subtarget->hasSSE1()) || + (VT == MVT::v8f32 && Subtarget->hasAVX())) + RecipOp = "vec-divf"; + else + return SDValue(); + + TargetRecip Recips = DCI.DAG.getTarget().Options.Reciprocals; + if (!Recips.isEnabled(RecipOp)) + return SDValue(); + + RefinementSteps = Recips.getRefinementSteps(RecipOp); + return DCI.DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op); +} + +/// If we have at least two divisions that use the same divisor, convert to +/// multplication by a reciprocal. This may need to be adjusted for a given +/// CPU if a division's cost is not at least twice the cost of a multiplication. +/// This is because we still need one division to calculate the reciprocal and +/// then we need two multiplies by that reciprocal as replacements for the +/// original divisions. +unsigned X86TargetLowering::combineRepeatedFPDivisors() const { + return 2; +} + +/// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT node +/// if it's possible. +SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC, + SDLoc dl, SelectionDAG &DAG) const { + SDValue Op0 = And.getOperand(0); + SDValue Op1 = And.getOperand(1); + if (Op0.getOpcode() == ISD::TRUNCATE) + Op0 = Op0.getOperand(0); + if (Op1.getOpcode() == ISD::TRUNCATE) + Op1 = Op1.getOperand(0); + + SDValue LHS, RHS; + if (Op1.getOpcode() == ISD::SHL) + std::swap(Op0, Op1); + if (Op0.getOpcode() == ISD::SHL) { + if (isOneConstant(Op0.getOperand(0))) { + // If we looked past a truncate, check that it's only truncating away + // known zeros. + unsigned BitWidth = Op0.getValueSizeInBits(); + unsigned AndBitWidth = And.getValueSizeInBits(); + if (BitWidth > AndBitWidth) { + APInt Zeros, Ones; + DAG.computeKnownBits(Op0, Zeros, Ones); + if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth) + return SDValue(); + } + LHS = Op1; + RHS = Op0.getOperand(1); + } + } else if (Op1.getOpcode() == ISD::Constant) { + ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1); + uint64_t AndRHSVal = AndRHS->getZExtValue(); + SDValue AndLHS = Op0; + + if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) { + LHS = AndLHS.getOperand(0); + RHS = AndLHS.getOperand(1); + } + + // Use BT if the immediate can't be encoded in a TEST instruction. + if (!isUInt<32>(AndRHSVal) && isPowerOf2_64(AndRHSVal)) { + LHS = AndLHS; + RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl, LHS.getValueType()); + } + } + + if (LHS.getNode()) { + // If LHS is i8, promote it to i32 with any_extend. There is no i8 BT + // instruction. Since the shift amount is in-range-or-undefined, we know + // that doing a bittest on the i32 value is ok. We extend to i32 because + // the encoding for the i16 version is larger than the i32 version. + // Also promote i16 to i32 for performance / code size reason. + if (LHS.getValueType() == MVT::i8 || + LHS.getValueType() == MVT::i16) + LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS); + + // If the operand types disagree, extend the shift amount to match. Since + // BT ignores high bits (like shifts) we can use anyextend. + if (LHS.getValueType() != RHS.getValueType()) + RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS); + + SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS); + X86::CondCode Cond = CC == ISD::SETEQ ? 
X86::COND_AE : X86::COND_B; + return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, + DAG.getConstant(Cond, dl, MVT::i8), BT); + } + + return SDValue(); +} + +/// \brief - Turns an ISD::CondCode into a value suitable for SSE floating point +/// mask CMPs. +static int translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0, + SDValue &Op1) { + unsigned SSECC; + bool Swap = false; + + // SSE Condition code mapping: + // 0 - EQ + // 1 - LT + // 2 - LE + // 3 - UNORD + // 4 - NEQ + // 5 - NLT + // 6 - NLE + // 7 - ORD + switch (SetCCOpcode) { + default: llvm_unreachable("Unexpected SETCC condition"); + case ISD::SETOEQ: + case ISD::SETEQ: SSECC = 0; break; + case ISD::SETOGT: + case ISD::SETGT: Swap = true; // Fallthrough + case ISD::SETLT: + case ISD::SETOLT: SSECC = 1; break; + case ISD::SETOGE: + case ISD::SETGE: Swap = true; // Fallthrough + case ISD::SETLE: + case ISD::SETOLE: SSECC = 2; break; + case ISD::SETUO: SSECC = 3; break; + case ISD::SETUNE: + case ISD::SETNE: SSECC = 4; break; + case ISD::SETULE: Swap = true; // Fallthrough + case ISD::SETUGE: SSECC = 5; break; + case ISD::SETULT: Swap = true; // Fallthrough + case ISD::SETUGT: SSECC = 6; break; + case ISD::SETO: SSECC = 7; break; + case ISD::SETUEQ: + case ISD::SETONE: SSECC = 8; break; + } + if (Swap) + std::swap(Op0, Op1); + + return SSECC; +} + +// Lower256IntVSETCC - Break a VSETCC 256-bit integer VSETCC into two new 128 +// ones, and then concatenate the result back. +static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) { + MVT VT = Op.getSimpleValueType(); + + assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC && + "Unsupported value type for operation"); + + unsigned NumElems = VT.getVectorNumElements(); + SDLoc dl(Op); + SDValue CC = Op.getOperand(2); + + // Extract the LHS vectors + SDValue LHS = Op.getOperand(0); + SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl); + SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl); + + // Extract the RHS vectors + SDValue RHS = Op.getOperand(1); + SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl); + SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl); + + // Issue the operation on the smaller types and concatenate the result back + MVT EltVT = VT.getVectorElementType(); + MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); + return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, + DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC), + DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC)); +} + +static SDValue LowerBoolVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) { + SDValue Op0 = Op.getOperand(0); + SDValue Op1 = Op.getOperand(1); + SDValue CC = Op.getOperand(2); + MVT VT = Op.getSimpleValueType(); + SDLoc dl(Op); + + assert(Op0.getSimpleValueType().getVectorElementType() == MVT::i1 && + "Unexpected type for boolean compare operation"); + ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); + SDValue NotOp0 = DAG.getNode(ISD::XOR, dl, VT, Op0, + DAG.getConstant(-1, dl, VT)); + SDValue NotOp1 = DAG.getNode(ISD::XOR, dl, VT, Op1, + DAG.getConstant(-1, dl, VT)); + switch (SetCCOpcode) { + default: llvm_unreachable("Unexpected SETCC condition"); + case ISD::SETEQ: + // (x == y) -> ~(x ^ y) + return DAG.getNode(ISD::XOR, dl, VT, + DAG.getNode(ISD::XOR, dl, VT, Op0, Op1), + DAG.getConstant(-1, dl, VT)); + case ISD::SETNE: + // (x != y) -> (x ^ y) + return DAG.getNode(ISD::XOR, dl, VT, Op0, Op1); + case ISD::SETUGT: + case ISD::SETGT: + // (x > y) -> (x & ~y) + return DAG.getNode(ISD::AND, dl, VT, Op0, NotOp1); + case ISD::SETULT: + case 
ISD::SETLT: + // (x < y) -> (~x & y) + return DAG.getNode(ISD::AND, dl, VT, NotOp0, Op1); + case ISD::SETULE: + case ISD::SETLE: + // (x <= y) -> (~x | y) + return DAG.getNode(ISD::OR, dl, VT, NotOp0, Op1); + case ISD::SETUGE: + case ISD::SETGE: + // (x >=y) -> (x | ~y) + return DAG.getNode(ISD::OR, dl, VT, Op0, NotOp1); + } +} + +static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + SDValue Op0 = Op.getOperand(0); + SDValue Op1 = Op.getOperand(1); + SDValue CC = Op.getOperand(2); + MVT VT = Op.getSimpleValueType(); + SDLoc dl(Op); + + assert(Op0.getSimpleValueType().getVectorElementType().getSizeInBits() >= 8 && + Op.getSimpleValueType().getVectorElementType() == MVT::i1 && + "Cannot set masked compare for this operation"); + + ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); + unsigned Opc = 0; + bool Unsigned = false; + bool Swap = false; + unsigned SSECC; + switch (SetCCOpcode) { + default: llvm_unreachable("Unexpected SETCC condition"); + case ISD::SETNE: SSECC = 4; break; + case ISD::SETEQ: Opc = X86ISD::PCMPEQM; break; + case ISD::SETUGT: SSECC = 6; Unsigned = true; break; + case ISD::SETLT: Swap = true; //fall-through + case ISD::SETGT: Opc = X86ISD::PCMPGTM; break; + case ISD::SETULT: SSECC = 1; Unsigned = true; break; + case ISD::SETUGE: SSECC = 5; Unsigned = true; break; //NLT + case ISD::SETGE: Swap = true; SSECC = 2; break; // LE + swap + case ISD::SETULE: Unsigned = true; //fall-through + case ISD::SETLE: SSECC = 2; break; + } + + if (Swap) + std::swap(Op0, Op1); + if (Opc) + return DAG.getNode(Opc, dl, VT, Op0, Op1); + Opc = Unsigned ? X86ISD::CMPMU: X86ISD::CMPM; + return DAG.getNode(Opc, dl, VT, Op0, Op1, + DAG.getConstant(SSECC, dl, MVT::i8)); +} + +/// \brief Try to turn a VSETULT into a VSETULE by modifying its second +/// operand \p Op1. If non-trivial (for example because it's not constant) +/// return an empty value. +static SDValue ChangeVSETULTtoVSETULE(SDLoc dl, SDValue Op1, SelectionDAG &DAG) +{ + BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1.getNode()); + if (!BV) + return SDValue(); + + MVT VT = Op1.getSimpleValueType(); + MVT EVT = VT.getVectorElementType(); + unsigned n = VT.getVectorNumElements(); + SmallVector<SDValue, 8> ULTOp1; + + for (unsigned i = 0; i < n; ++i) { + ConstantSDNode *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i)); + if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EVT) + return SDValue(); + + // Avoid underflow. + APInt Val = Elt->getAPIntValue(); + if (Val == 0) + return SDValue(); + + ULTOp1.push_back(DAG.getConstant(Val - 1, dl, EVT)); + } + + return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, ULTOp1); +} + +static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDValue Op0 = Op.getOperand(0); + SDValue Op1 = Op.getOperand(1); + SDValue CC = Op.getOperand(2); + MVT VT = Op.getSimpleValueType(); + ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); + bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint(); + SDLoc dl(Op); + + if (isFP) { +#ifndef NDEBUG + MVT EltVT = Op0.getSimpleValueType().getVectorElementType(); + assert(EltVT == MVT::f32 || EltVT == MVT::f64); +#endif + + unsigned SSECC = translateX86FSETCC(SetCCOpcode, Op0, Op1); + unsigned Opc = X86ISD::CMPP; + if (Subtarget->hasAVX512() && VT.getVectorElementType() == MVT::i1) { + assert(VT.getVectorNumElements() <= 16); + Opc = X86ISD::CMPM; + } + // In the two special cases we can't handle, emit two comparisons. 
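+ // Sketch of the two cases handled below: SETUEQ (unordered-or-equal) is
+ // emitted as CMPP(UNORD = 3) OR CMPP(EQ = 0), and SETONE
+ // (ordered-and-not-equal) as CMPP(ORD = 7) AND CMPP(NEQ = 4).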
+ if (SSECC == 8) { + unsigned CC0, CC1; + unsigned CombineOpc; + if (SetCCOpcode == ISD::SETUEQ) { + CC0 = 3; CC1 = 0; CombineOpc = ISD::OR; + } else { + assert(SetCCOpcode == ISD::SETONE); + CC0 = 7; CC1 = 4; CombineOpc = ISD::AND; + } + + SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1, + DAG.getConstant(CC0, dl, MVT::i8)); + SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1, + DAG.getConstant(CC1, dl, MVT::i8)); + return DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1); + } + // Handle all other FP comparisons here. + return DAG.getNode(Opc, dl, VT, Op0, Op1, + DAG.getConstant(SSECC, dl, MVT::i8)); + } + + MVT VTOp0 = Op0.getSimpleValueType(); + assert(VTOp0 == Op1.getSimpleValueType() && + "Expected operands with same type!"); + assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() && + "Invalid number of packed elements for source and destination!"); + + if (VT.is128BitVector() && VTOp0.is256BitVector()) { + // On non-AVX512 targets, a vector of MVT::i1 is promoted by the type + // legalizer to a wider vector type. In the case of 'vsetcc' nodes, the + // legalizer firstly checks if the first operand in input to the setcc has + // a legal type. If so, then it promotes the return type to that same type. + // Otherwise, the return type is promoted to the 'next legal type' which, + // for a vector of MVT::i1 is always a 128-bit integer vector type. + // + // We reach this code only if the following two conditions are met: + // 1. Both return type and operand type have been promoted to wider types + // by the type legalizer. + // 2. The original operand type has been promoted to a 256-bit vector. + // + // Note that condition 2. only applies for AVX targets. + SDValue NewOp = DAG.getSetCC(dl, VTOp0, Op0, Op1, SetCCOpcode); + return DAG.getZExtOrTrunc(NewOp, dl, VT); + } + + // The non-AVX512 code below works under the assumption that source and + // destination types are the same. + assert((Subtarget->hasAVX512() || (VT == VTOp0)) && + "Value types for source and destination must be the same!"); + + // Break 256-bit integer vector compare into smaller ones. + if (VT.is256BitVector() && !Subtarget->hasInt256()) + return Lower256IntVSETCC(Op, DAG); + + MVT OpVT = Op1.getSimpleValueType(); + if (OpVT.getVectorElementType() == MVT::i1) + return LowerBoolVSETCC_AVX512(Op, DAG); + + bool MaskResult = (VT.getVectorElementType() == MVT::i1); + if (Subtarget->hasAVX512()) { + if (Op1.getSimpleValueType().is512BitVector() || + (Subtarget->hasBWI() && Subtarget->hasVLX()) || + (MaskResult && OpVT.getVectorElementType().getSizeInBits() >= 32)) + return LowerIntVSETCC_AVX512(Op, DAG, Subtarget); + + // In AVX-512 architecture setcc returns mask with i1 elements, + // But there is no compare instruction for i8 and i16 elements in KNL. + // We are not talking about 512-bit operands in this case, these + // types are illegal. + if (MaskResult && + (OpVT.getVectorElementType().getSizeInBits() < 32 && + OpVT.getVectorElementType().getSizeInBits() >= 8)) + return DAG.getNode(ISD::TRUNCATE, dl, VT, + DAG.getNode(ISD::SETCC, dl, OpVT, Op0, Op1, CC)); + } + + // Lower using XOP integer comparisons. + if ((VT == MVT::v16i8 || VT == MVT::v8i16 || + VT == MVT::v4i32 || VT == MVT::v2i64) && Subtarget->hasXOP()) { + // Translate compare code to XOP PCOM compare mode. 
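+ // For example, a signed (setgt a, b) maps to compare mode 0x02 (GT), while
+ // the unsigned predicates select the VPCOMU form of the instruction below.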
+ unsigned CmpMode = 0; + switch (SetCCOpcode) { + default: llvm_unreachable("Unexpected SETCC condition"); + case ISD::SETULT: + case ISD::SETLT: CmpMode = 0x00; break; + case ISD::SETULE: + case ISD::SETLE: CmpMode = 0x01; break; + case ISD::SETUGT: + case ISD::SETGT: CmpMode = 0x02; break; + case ISD::SETUGE: + case ISD::SETGE: CmpMode = 0x03; break; + case ISD::SETEQ: CmpMode = 0x04; break; + case ISD::SETNE: CmpMode = 0x05; break; + } + + // Are we comparing unsigned or signed integers? + unsigned Opc = ISD::isUnsignedIntSetCC(SetCCOpcode) + ? X86ISD::VPCOMU : X86ISD::VPCOM; + + return DAG.getNode(Opc, dl, VT, Op0, Op1, + DAG.getConstant(CmpMode, dl, MVT::i8)); + } + + // We are handling one of the integer comparisons here. Since SSE only has + // GT and EQ comparisons for integer, swapping operands and multiple + // operations may be required for some comparisons. + unsigned Opc; + bool Swap = false, Invert = false, FlipSigns = false, MinMax = false; + bool Subus = false; + + switch (SetCCOpcode) { + default: llvm_unreachable("Unexpected SETCC condition"); + case ISD::SETNE: Invert = true; + case ISD::SETEQ: Opc = X86ISD::PCMPEQ; break; + case ISD::SETLT: Swap = true; + case ISD::SETGT: Opc = X86ISD::PCMPGT; break; + case ISD::SETGE: Swap = true; + case ISD::SETLE: Opc = X86ISD::PCMPGT; + Invert = true; break; + case ISD::SETULT: Swap = true; + case ISD::SETUGT: Opc = X86ISD::PCMPGT; + FlipSigns = true; break; + case ISD::SETUGE: Swap = true; + case ISD::SETULE: Opc = X86ISD::PCMPGT; + FlipSigns = true; Invert = true; break; + } + + // Special case: Use min/max operations for SETULE/SETUGE + MVT VET = VT.getVectorElementType(); + bool hasMinMax = + (Subtarget->hasSSE41() && (VET >= MVT::i8 && VET <= MVT::i32)) + || (Subtarget->hasSSE2() && (VET == MVT::i8)); + + if (hasMinMax) { + switch (SetCCOpcode) { + default: break; + case ISD::SETULE: Opc = ISD::UMIN; MinMax = true; break; + case ISD::SETUGE: Opc = ISD::UMAX; MinMax = true; break; + } + + if (MinMax) { Swap = false; Invert = false; FlipSigns = false; } + } + + bool hasSubus = Subtarget->hasSSE2() && (VET == MVT::i8 || VET == MVT::i16); + if (!MinMax && hasSubus) { + // As another special case, use PSUBUS[BW] when it's profitable. E.g. for + // Op0 u<= Op1: + // t = psubus Op0, Op1 + // pcmpeq t, <0..0> + switch (SetCCOpcode) { + default: break; + case ISD::SETULT: { + // If the comparison is against a constant we can turn this into a + // setule. With psubus, setule does not require a swap. This is + // beneficial because the constant in the register is no longer + // destructed as the destination so it can be hoisted out of a loop. + // Only do this pre-AVX since vpcmp* is no longer destructive. + if (Subtarget->hasAVX()) + break; + SDValue ULEOp1 = ChangeVSETULTtoVSETULE(dl, Op1, DAG); + if (ULEOp1.getNode()) { + Op1 = ULEOp1; + Subus = true; Invert = false; Swap = false; + } + break; + } + // Psubus is better than flip-sign because it requires no inversion. + case ISD::SETUGE: Subus = true; Invert = false; Swap = true; break; + case ISD::SETULE: Subus = true; Invert = false; Swap = false; break; + } + + if (Subus) { + Opc = X86ISD::SUBUS; + FlipSigns = false; + } + } + + if (Swap) + std::swap(Op0, Op1); + + // Check that the operation in question is available (most are plain SSE2, + // but PCMPGTQ and PCMPEQQ have different requirements). 
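+ // When PCMPGTQ is unavailable, the 64-bit signed compare below is emulated
+ // with 32-bit operations using, roughly,
+ //   a > b  <=>  (aHi > bHi) | ((aHi == bHi) & (aLo >u bLo)),
+ // where the low halves (and, for an originally unsigned compare, the high
+ // halves too) have their sign bits flipped first so that PCMPGTD orders
+ // them correctly.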
+ if (VT == MVT::v2i64) { + if (Opc == X86ISD::PCMPGT && !Subtarget->hasSSE42()) { + assert(Subtarget->hasSSE2() && "Don't know how to lower!"); + + // First cast everything to the right type. + Op0 = DAG.getBitcast(MVT::v4i32, Op0); + Op1 = DAG.getBitcast(MVT::v4i32, Op1); + + // Since SSE has no unsigned integer comparisons, we need to flip the sign + // bits of the inputs before performing those operations. The lower + // compare is always unsigned. + SDValue SB; + if (FlipSigns) { + SB = DAG.getConstant(0x80000000U, dl, MVT::v4i32); + } else { + SDValue Sign = DAG.getConstant(0x80000000U, dl, MVT::i32); + SDValue Zero = DAG.getConstant(0x00000000U, dl, MVT::i32); + SB = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, + Sign, Zero, Sign, Zero); + } + Op0 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op0, SB); + Op1 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op1, SB); + + // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2)) + SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1); + SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1); + + // Create masks for only the low parts/high parts of the 64 bit integers. + static const int MaskHi[] = { 1, 1, 3, 3 }; + static const int MaskLo[] = { 0, 0, 2, 2 }; + SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi); + SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo); + SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi); + + SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo); + Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi); + + if (Invert) + Result = DAG.getNOT(dl, Result, MVT::v4i32); + + return DAG.getBitcast(VT, Result); + } + + if (Opc == X86ISD::PCMPEQ && !Subtarget->hasSSE41()) { + // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with + // pcmpeqd + pshufd + pand. + assert(Subtarget->hasSSE2() && !FlipSigns && "Don't know how to lower!"); + + // First cast everything to the right type. + Op0 = DAG.getBitcast(MVT::v4i32, Op0); + Op1 = DAG.getBitcast(MVT::v4i32, Op1); + + // Do the compare. + SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1); + + // Make sure the lower and upper halves are both all-ones. + static const int Mask[] = { 1, 0, 3, 2 }; + SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask); + Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf); + + if (Invert) + Result = DAG.getNOT(dl, Result, MVT::v4i32); + + return DAG.getBitcast(VT, Result); + } + } + + // Since SSE has no unsigned integer comparisons, we need to flip the sign + // bits of the inputs before performing those operations. + if (FlipSigns) { + MVT EltVT = VT.getVectorElementType(); + SDValue SB = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), dl, + VT); + Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SB); + Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SB); + } + + SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1); + + // If the logical-not of the result is required, perform that now. 
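+ // (If a min/max or PSUBUS form was chosen above, the equality checks below
+ // complete it: x <=u y  <=>  umin(x, y) == x, analogously umax for >=u, and
+ // x <=u y  <=>  psubus(x, y) == 0.)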
+ if (Invert) + Result = DAG.getNOT(dl, Result, VT); + + if (MinMax) + Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result); + + if (Subus) + Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result, + getZeroVector(VT, Subtarget, DAG, dl)); + + return Result; +} + +SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { + + MVT VT = Op.getSimpleValueType(); + + if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG); + + assert(((!Subtarget->hasAVX512() && VT == MVT::i8) || (VT == MVT::i1)) + && "SetCC type must be 8-bit or 1-bit integer"); + SDValue Op0 = Op.getOperand(0); + SDValue Op1 = Op.getOperand(1); + SDLoc dl(Op); + ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); + + // Optimize to BT if possible. + // Lower (X & (1 << N)) == 0 to BT(X, N). + // Lower ((X >>u N) & 1) != 0 to BT(X, N). + // Lower ((X >>s N) & 1) != 0 to BT(X, N). + if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && + isNullConstant(Op1) && + (CC == ISD::SETEQ || CC == ISD::SETNE)) { + if (SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG)) { + if (VT == MVT::i1) + return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewSetCC); + return NewSetCC; + } + } + + // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of + // these. + if ((isOneConstant(Op1) || isNullConstant(Op1)) && + (CC == ISD::SETEQ || CC == ISD::SETNE)) { + + // If the input is a setcc, then reuse the input setcc or use a new one with + // the inverted condition. + if (Op0.getOpcode() == X86ISD::SETCC) { + X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0); + bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1); + if (!Invert) + return Op0; + + CCode = X86::GetOppositeBranchCondition(CCode); + SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, + DAG.getConstant(CCode, dl, MVT::i8), + Op0.getOperand(1)); + if (VT == MVT::i1) + return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC); + return SetCC; + } + } + if ((Op0.getValueType() == MVT::i1) && isOneConstant(Op1) && + (CC == ISD::SETEQ || CC == ISD::SETNE)) { + + ISD::CondCode NewCC = ISD::getSetCCInverse(CC, true); + return DAG.getSetCC(dl, VT, Op0, DAG.getConstant(0, dl, MVT::i1), NewCC); + } + + bool isFP = Op1.getSimpleValueType().isFloatingPoint(); + unsigned X86CC = TranslateX86CC(CC, dl, isFP, Op0, Op1, DAG); + if (X86CC == X86::COND_INVALID) + return SDValue(); + + SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, dl, DAG); + EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG); + SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, + DAG.getConstant(X86CC, dl, MVT::i8), EFLAGS); + if (VT == MVT::i1) + return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC); + return SetCC; +} + +SDValue X86TargetLowering::LowerSETCCE(SDValue Op, SelectionDAG &DAG) const { + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + SDValue Carry = Op.getOperand(2); + SDValue Cond = Op.getOperand(3); + SDLoc DL(Op); + + assert(LHS.getSimpleValueType().isInteger() && "SETCCE is integer only."); + X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get()); + + assert(Carry.getOpcode() != ISD::CARRY_FALSE); + SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32); + SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry); + return DAG.getNode(X86ISD::SETCC, DL, Op.getValueType(), + DAG.getConstant(CC, DL, MVT::i8), Cmp.getValue(1)); +} + +// isX86LogicalCmp - Return true if opcode is a X86 logical comparison. 
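+// That is, these are nodes whose value (or extra flags result) is EFLAGS, so
+// a SETCC, CMOV or branch can consume it directly instead of emitting a fresh
+// CMP against zero.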
+static bool isX86LogicalCmp(SDValue Op) { + unsigned Opc = Op.getNode()->getOpcode(); + if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI || + Opc == X86ISD::SAHF) + return true; + if (Op.getResNo() == 1 && + (Opc == X86ISD::ADD || + Opc == X86ISD::SUB || + Opc == X86ISD::ADC || + Opc == X86ISD::SBB || + Opc == X86ISD::SMUL || + Opc == X86ISD::UMUL || + Opc == X86ISD::INC || + Opc == X86ISD::DEC || + Opc == X86ISD::OR || + Opc == X86ISD::XOR || + Opc == X86ISD::AND)) + return true; + + if (Op.getResNo() == 2 && Opc == X86ISD::UMUL) + return true; + + return false; +} + +static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) { + if (V.getOpcode() != ISD::TRUNCATE) + return false; + + SDValue VOp0 = V.getOperand(0); + unsigned InBits = VOp0.getValueSizeInBits(); + unsigned Bits = V.getValueSizeInBits(); + return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits)); +} + +SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { + bool addTest = true; + SDValue Cond = Op.getOperand(0); + SDValue Op1 = Op.getOperand(1); + SDValue Op2 = Op.getOperand(2); + SDLoc DL(Op); + MVT VT = Op1.getSimpleValueType(); + SDValue CC; + + // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops + // are available or VBLENDV if AVX is available. + // Otherwise FP cmovs get lowered into a less efficient branch sequence later. + if (Cond.getOpcode() == ISD::SETCC && + ((Subtarget->hasSSE2() && (VT == MVT::f32 || VT == MVT::f64)) || + (Subtarget->hasSSE1() && VT == MVT::f32)) && + VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) { + SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1); + int SSECC = translateX86FSETCC( + cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1); + + if (SSECC != 8) { + if (Subtarget->hasAVX512()) { + SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, MVT::i1, CondOp0, CondOp1, + DAG.getConstant(SSECC, DL, MVT::i8)); + return DAG.getNode(X86ISD::SELECT, DL, VT, Cmp, Op1, Op2); + } + + SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1, + DAG.getConstant(SSECC, DL, MVT::i8)); + + // If we have AVX, we can use a variable vector select (VBLENDV) instead + // of 3 logic instructions for size savings and potentially speed. + // Unfortunately, there is no scalar form of VBLENDV. + + // If either operand is a constant, don't try this. We can expect to + // optimize away at least one of the logic instructions later in that + // case, so that sequence would be faster than a variable blend. + + // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly + // uses XMM0 as the selection register. That may need just as many + // instructions as the AND/ANDN/OR sequence due to register moves, so + // don't bother. + + if (Subtarget->hasAVX() && + !isa<ConstantFPSDNode>(Op1) && !isa<ConstantFPSDNode>(Op2)) { + + // Convert to vectors, do a VSELECT, and convert back to scalar. + // All of the conversions should be optimized away. + + MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64; + SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1); + SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2); + SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp); + + MVT VCmpVT = VT == MVT::f32 ? 
MVT::v4i32 : MVT::v2i64; + VCmp = DAG.getBitcast(VCmpVT, VCmp); + + SDValue VSel = DAG.getNode(ISD::VSELECT, DL, VecVT, VCmp, VOp1, VOp2); + + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, + VSel, DAG.getIntPtrConstant(0, DL)); + } + SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2); + SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1); + return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And); + } + } + + if (VT.isVector() && VT.getVectorElementType() == MVT::i1) { + SDValue Op1Scalar; + if (ISD::isBuildVectorOfConstantSDNodes(Op1.getNode())) + Op1Scalar = ConvertI1VectorToInteger(Op1, DAG); + else if (Op1.getOpcode() == ISD::BITCAST && Op1.getOperand(0)) + Op1Scalar = Op1.getOperand(0); + SDValue Op2Scalar; + if (ISD::isBuildVectorOfConstantSDNodes(Op2.getNode())) + Op2Scalar = ConvertI1VectorToInteger(Op2, DAG); + else if (Op2.getOpcode() == ISD::BITCAST && Op2.getOperand(0)) + Op2Scalar = Op2.getOperand(0); + if (Op1Scalar.getNode() && Op2Scalar.getNode()) { + SDValue newSelect = DAG.getNode(ISD::SELECT, DL, + Op1Scalar.getValueType(), + Cond, Op1Scalar, Op2Scalar); + if (newSelect.getValueSizeInBits() == VT.getSizeInBits()) + return DAG.getBitcast(VT, newSelect); + SDValue ExtVec = DAG.getBitcast(MVT::v8i1, newSelect); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, ExtVec, + DAG.getIntPtrConstant(0, DL)); + } + } + + if (VT == MVT::v4i1 || VT == MVT::v2i1) { + SDValue zeroConst = DAG.getIntPtrConstant(0, DL); + Op1 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1, + DAG.getUNDEF(MVT::v8i1), Op1, zeroConst); + Op2 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1, + DAG.getUNDEF(MVT::v8i1), Op2, zeroConst); + SDValue newSelect = DAG.getNode(ISD::SELECT, DL, MVT::v8i1, + Cond, Op1, Op2); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, newSelect, zeroConst); + } + + if (Cond.getOpcode() == ISD::SETCC) { + SDValue NewCond = LowerSETCC(Cond, DAG); + if (NewCond.getNode()) + Cond = NewCond; + } + + // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y + // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y + // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y + // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y + if (Cond.getOpcode() == X86ISD::SETCC && + Cond.getOperand(1).getOpcode() == X86ISD::CMP && + isNullConstant(Cond.getOperand(1).getOperand(1))) { + SDValue Cmp = Cond.getOperand(1); + + unsigned CondCode =cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue(); + + if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) && + (CondCode == X86::COND_E || CondCode == X86::COND_NE)) { + SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2; + + SDValue CmpOp0 = Cmp.getOperand(0); + // Apply further optimizations for special cases + // (select (x != 0), -1, 0) -> neg & sbb + // (select (x == 0), 0, -1) -> neg & sbb + if (isNullConstant(Y) && + (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) { + SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32); + SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs, + DAG.getConstant(0, DL, + CmpOp0.getValueType()), + CmpOp0); + SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), + DAG.getConstant(X86::COND_B, DL, MVT::i8), + SDValue(Neg.getNode(), 1)); + return Res; + } + + Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, + CmpOp0, DAG.getConstant(1, DL, CmpOp0.getValueType())); + Cmp = ConvertCmpIfNecessary(Cmp, DAG); + + SDValue Res = // Res = 0 or -1. 
+ DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), + DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp); + + if (isAllOnesConstant(Op1) != (CondCode == X86::COND_E)) + Res = DAG.getNOT(DL, Res, Res.getValueType()); + + if (!isNullConstant(Op2)) + Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y); + return Res; + } + } + + // Look past (and (setcc_carry (cmp ...)), 1). + if (Cond.getOpcode() == ISD::AND && + Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY && + isOneConstant(Cond.getOperand(1))) + Cond = Cond.getOperand(0); + + // If condition flag is set by a X86ISD::CMP, then use it as the condition + // setting operand in place of the X86ISD::SETCC. + unsigned CondOpcode = Cond.getOpcode(); + if (CondOpcode == X86ISD::SETCC || + CondOpcode == X86ISD::SETCC_CARRY) { + CC = Cond.getOperand(0); + + SDValue Cmp = Cond.getOperand(1); + unsigned Opc = Cmp.getOpcode(); + MVT VT = Op.getSimpleValueType(); + + bool IllegalFPCMov = false; + if (VT.isFloatingPoint() && !VT.isVector() && + !isScalarFPTypeInSSEReg(VT)) // FPStack? + IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue()); + + if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) || + Opc == X86ISD::BT) { // FIXME + Cond = Cmp; + addTest = false; + } + } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO || + CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO || + ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) && + Cond.getOperand(0).getValueType() != MVT::i8)) { + SDValue LHS = Cond.getOperand(0); + SDValue RHS = Cond.getOperand(1); + unsigned X86Opcode; + unsigned X86Cond; + SDVTList VTs; + switch (CondOpcode) { + case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break; + case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break; + case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break; + case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break; + case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break; + case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break; + default: llvm_unreachable("unexpected overflowing operator"); + } + if (CondOpcode == ISD::UMULO) + VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(), + MVT::i32); + else + VTs = DAG.getVTList(LHS.getValueType(), MVT::i32); + + SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS); + + if (CondOpcode == ISD::UMULO) + Cond = X86Op.getValue(2); + else + Cond = X86Op.getValue(1); + + CC = DAG.getConstant(X86Cond, DL, MVT::i8); + addTest = false; + } + + if (addTest) { + // Look past the truncate if the high bits are known zero. + if (isTruncWithZeroHighBitsInput(Cond, DAG)) + Cond = Cond.getOperand(0); + + // We know the result of AND is compared against zero. Try to match + // it to BT. + if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { + if (SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG)) { + CC = NewSetCC.getOperand(0); + Cond = NewSetCC.getOperand(1); + addTest = false; + } + } + } + + if (addTest) { + CC = DAG.getConstant(X86::COND_NE, DL, MVT::i8); + Cond = EmitTest(Cond, X86::COND_NE, DL, DAG); + } + + // a < b ? -1 : 0 -> RES = ~setcc_carry + // a < b ? 0 : -1 -> RES = setcc_carry + // a >= b ? -1 : 0 -> RES = setcc_carry + // a >= b ? 
0 : -1 -> RES = ~setcc_carry + if (Cond.getOpcode() == X86ISD::SUB) { + Cond = ConvertCmpIfNecessary(Cond, DAG); + unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue(); + + if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) && + (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) && + (isNullConstant(Op1) || isNullConstant(Op2))) { + SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), + DAG.getConstant(X86::COND_B, DL, MVT::i8), + Cond); + if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B)) + return DAG.getNOT(DL, Res, Res.getValueType()); + return Res; + } + } + + // X86 doesn't have an i8 cmov. If both operands are the result of a truncate + // widen the cmov and push the truncate through. This avoids introducing a new + // branch during isel and doesn't add any extensions. + if (Op.getValueType() == MVT::i8 && + Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) { + SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0); + if (T1.getValueType() == T2.getValueType() && + // Blacklist CopyFromReg to avoid partial register stalls. + T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){ + SDVTList VTs = DAG.getVTList(T1.getValueType(), MVT::Glue); + SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VTs, T2, T1, CC, Cond); + return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov); + } + } + + // X86ISD::CMOV means set the result (which is operand 1) to the RHS if + // condition is true. + SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue); + SDValue Ops[] = { Op2, Op1, CC, Cond }; + return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops); +} + +static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + MVT VT = Op->getSimpleValueType(0); + SDValue In = Op->getOperand(0); + MVT InVT = In.getSimpleValueType(); + MVT VTElt = VT.getVectorElementType(); + MVT InVTElt = InVT.getVectorElementType(); + SDLoc dl(Op); + + // SKX processor + if ((InVTElt == MVT::i1) && + (((Subtarget->hasBWI() && Subtarget->hasVLX() && + VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() <= 16)) || + + ((Subtarget->hasBWI() && VT.is512BitVector() && + VTElt.getSizeInBits() <= 16)) || + + ((Subtarget->hasDQI() && Subtarget->hasVLX() && + VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() >= 32)) || + + ((Subtarget->hasDQI() && VT.is512BitVector() && + VTElt.getSizeInBits() >= 32)))) + return DAG.getNode(X86ISD::VSEXT, dl, VT, In); + + unsigned int NumElts = VT.getVectorNumElements(); + + if (NumElts != 8 && NumElts != 16 && !Subtarget->hasBWI()) + return SDValue(); + + if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1) { + if (In.getOpcode() == X86ISD::VSEXT || In.getOpcode() == X86ISD::VZEXT) + return DAG.getNode(In.getOpcode(), dl, VT, In.getOperand(0)); + return DAG.getNode(X86ISD::VSEXT, dl, VT, In); + } + + assert (InVT.getVectorElementType() == MVT::i1 && "Unexpected vector type"); + MVT ExtVT = NumElts == 8 ? 
MVT::v8i64 : MVT::v16i32; + SDValue NegOne = + DAG.getConstant(APInt::getAllOnesValue(ExtVT.getScalarSizeInBits()), dl, + ExtVT); + SDValue Zero = + DAG.getConstant(APInt::getNullValue(ExtVT.getScalarSizeInBits()), dl, ExtVT); + + SDValue V = DAG.getNode(ISD::VSELECT, dl, ExtVT, In, NegOne, Zero); + if (VT.is512BitVector()) + return V; + return DAG.getNode(X86ISD::VTRUNC, dl, VT, V); +} + +static SDValue LowerSIGN_EXTEND_VECTOR_INREG(SDValue Op, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDValue In = Op->getOperand(0); + MVT VT = Op->getSimpleValueType(0); + MVT InVT = In.getSimpleValueType(); + assert(VT.getSizeInBits() == InVT.getSizeInBits()); + + MVT InSVT = InVT.getVectorElementType(); + assert(VT.getVectorElementType().getSizeInBits() > InSVT.getSizeInBits()); + + if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16) + return SDValue(); + if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8) + return SDValue(); + + SDLoc dl(Op); + + // SSE41 targets can use the pmovsx* instructions directly. + if (Subtarget->hasSSE41()) + return DAG.getNode(X86ISD::VSEXT, dl, VT, In); + + // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI. + SDValue Curr = In; + MVT CurrVT = InVT; + + // As SRAI is only available on i16/i32 types, we expand only up to i32 + // and handle i64 separately. + while (CurrVT != VT && CurrVT.getVectorElementType() != MVT::i32) { + Curr = DAG.getNode(X86ISD::UNPCKL, dl, CurrVT, DAG.getUNDEF(CurrVT), Curr); + MVT CurrSVT = MVT::getIntegerVT(CurrVT.getScalarSizeInBits() * 2); + CurrVT = MVT::getVectorVT(CurrSVT, CurrVT.getVectorNumElements() / 2); + Curr = DAG.getBitcast(CurrVT, Curr); + } + + SDValue SignExt = Curr; + if (CurrVT != InVT) { + unsigned SignExtShift = + CurrVT.getVectorElementType().getSizeInBits() - InSVT.getSizeInBits(); + SignExt = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr, + DAG.getConstant(SignExtShift, dl, MVT::i8)); + } + + if (CurrVT == VT) + return SignExt; + + if (VT == MVT::v2i64 && CurrVT == MVT::v4i32) { + SDValue Sign = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr, + DAG.getConstant(31, dl, MVT::i8)); + SDValue Ext = DAG.getVectorShuffle(CurrVT, dl, SignExt, Sign, {0, 4, 1, 5}); + return DAG.getBitcast(VT, Ext); + } + + return SDValue(); +} + +static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + MVT VT = Op->getSimpleValueType(0); + SDValue In = Op->getOperand(0); + MVT InVT = In.getSimpleValueType(); + SDLoc dl(Op); + + if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1) + return LowerSIGN_EXTEND_AVX512(Op, Subtarget, DAG); + + if ((VT != MVT::v4i64 || InVT != MVT::v4i32) && + (VT != MVT::v8i32 || InVT != MVT::v8i16) && + (VT != MVT::v16i16 || InVT != MVT::v16i8)) + return SDValue(); + + if (Subtarget->hasInt256()) + return DAG.getNode(X86ISD::VSEXT, dl, VT, In); + + // Optimize vectors in AVX mode + // Sign extend v8i16 to v8i32 and + // v4i32 to v4i64 + // + // Divide input vector into two parts + // for v4i32 the shuffle mask will be { 0, 1, -1, -1} {2, 3, -1, -1} + // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32 + // concat the vectors to original VT + + unsigned NumElems = InVT.getVectorNumElements(); + SDValue Undef = DAG.getUNDEF(InVT); + + SmallVector<int,8> ShufMask1(NumElems, -1); + for (unsigned i = 0; i != NumElems/2; ++i) + ShufMask1[i] = i; + + SDValue OpLo = DAG.getVectorShuffle(InVT, dl, In, Undef, &ShufMask1[0]); + + SmallVector<int,8> ShufMask2(NumElems, -1); + for (unsigned i = 0; i != 
NumElems/2; ++i) + ShufMask2[i] = i + NumElems/2; + + SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, &ShufMask2[0]); + + MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), + VT.getVectorNumElements()/2); + + OpLo = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpLo); + OpHi = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpHi); + + return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi); +} + +// Lower vector extended loads using a shuffle. If SSSE3 is not available we +// may emit an illegal shuffle but the expansion is still better than scalar +// code. We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise +// we'll emit a shuffle and a arithmetic shift. +// FIXME: Is the expansion actually better than scalar code? It doesn't seem so. +// TODO: It is possible to support ZExt by zeroing the undef values during +// the shuffle phase or after the shuffle. +static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + MVT RegVT = Op.getSimpleValueType(); + assert(RegVT.isVector() && "We only custom lower vector sext loads."); + assert(RegVT.isInteger() && + "We only custom lower integer vector sext loads."); + + // Nothing useful we can do without SSE2 shuffles. + assert(Subtarget->hasSSE2() && "We only custom lower sext loads with SSE2."); + + LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode()); + SDLoc dl(Ld); + EVT MemVT = Ld->getMemoryVT(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + unsigned RegSz = RegVT.getSizeInBits(); + + ISD::LoadExtType Ext = Ld->getExtensionType(); + + assert((Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD) + && "Only anyext and sext are currently implemented."); + assert(MemVT != RegVT && "Cannot extend to the same type"); + assert(MemVT.isVector() && "Must load a vector from memory"); + + unsigned NumElems = RegVT.getVectorNumElements(); + unsigned MemSz = MemVT.getSizeInBits(); + assert(RegSz > MemSz && "Register size must be greater than the mem size"); + + if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget->hasInt256()) { + // The only way in which we have a legal 256-bit vector result but not the + // integer 256-bit operations needed to directly lower a sextload is if we + // have AVX1 but not AVX2. In that case, we can always emit a sextload to + // a 128-bit vector and a normal sign_extend to 256-bits that should get + // correctly legalized. We do this late to allow the canonical form of + // sextload to persist throughout the rest of the DAG combiner -- it wants + // to fold together any extensions it can, and so will fuse a sign_extend + // of an sextload into a sextload targeting a wider value. + SDValue Load; + if (MemSz == 128) { + // Just switch this to a normal load. + assert(TLI.isTypeLegal(MemVT) && "If the memory type is a 128-bit type, " + "it must be a legal 128-bit vector " + "type!"); + Load = DAG.getLoad(MemVT, dl, Ld->getChain(), Ld->getBasePtr(), + Ld->getPointerInfo(), Ld->isVolatile(), Ld->isNonTemporal(), + Ld->isInvariant(), Ld->getAlignment()); + } else { + assert(MemSz < 128 && + "Can't extend a type wider than 128 bits to a 256 bit vector!"); + // Do an sext load to a 128-bit vector type. We want to use the same + // number of elements, but elements half as wide. This will end up being + // recursively lowered by this routine, but will succeed as we definitely + // have all the necessary features if we're using AVX1. 
+ EVT HalfEltVT = + EVT::getIntegerVT(*DAG.getContext(), RegVT.getScalarSizeInBits() / 2); + EVT HalfVecVT = EVT::getVectorVT(*DAG.getContext(), HalfEltVT, NumElems); + Load = + DAG.getExtLoad(Ext, dl, HalfVecVT, Ld->getChain(), Ld->getBasePtr(), + Ld->getPointerInfo(), MemVT, Ld->isVolatile(), + Ld->isNonTemporal(), Ld->isInvariant(), + Ld->getAlignment()); + } + + // Replace chain users with the new chain. + assert(Load->getNumValues() == 2 && "Loads must carry a chain!"); + DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1)); + + // Finally, do a normal sign-extend to the desired register. + return DAG.getSExtOrTrunc(Load, dl, RegVT); + } + + // All sizes must be a power of two. + assert(isPowerOf2_32(RegSz * MemSz * NumElems) && + "Non-power-of-two elements are not custom lowered!"); + + // Attempt to load the original value using scalar loads. + // Find the largest scalar type that divides the total loaded size. + MVT SclrLoadTy = MVT::i8; + for (MVT Tp : MVT::integer_valuetypes()) { + if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) { + SclrLoadTy = Tp; + } + } + + // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64. + if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 && + (64 <= MemSz)) + SclrLoadTy = MVT::f64; + + // Calculate the number of scalar loads that we need to perform + // in order to load our vector from memory. + unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits(); + + assert((Ext != ISD::SEXTLOAD || NumLoads == 1) && + "Can only lower sext loads with a single scalar load!"); + + unsigned loadRegZize = RegSz; + if (Ext == ISD::SEXTLOAD && RegSz >= 256) + loadRegZize = 128; + + // Represent our vector as a sequence of elements which are the + // largest scalar that we can load. + EVT LoadUnitVecVT = EVT::getVectorVT( + *DAG.getContext(), SclrLoadTy, loadRegZize / SclrLoadTy.getSizeInBits()); + + // Represent the data using the same element type that is stored in + // memory. In practice, we ''widen'' MemVT. + EVT WideVecVT = + EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), + loadRegZize / MemVT.getScalarSizeInBits()); + + assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() && + "Invalid vector type"); + + // We can't shuffle using an illegal type. + assert(TLI.isTypeLegal(WideVecVT) && + "We only lower types that form legal widened vector types"); + + SmallVector<SDValue, 8> Chains; + SDValue Ptr = Ld->getBasePtr(); + SDValue Increment = DAG.getConstant(SclrLoadTy.getSizeInBits() / 8, dl, + TLI.getPointerTy(DAG.getDataLayout())); + SDValue Res = DAG.getUNDEF(LoadUnitVecVT); + + for (unsigned i = 0; i < NumLoads; ++i) { + // Perform a single load. + SDValue ScalarLoad = + DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(), + Ld->isVolatile(), Ld->isNonTemporal(), Ld->isInvariant(), + Ld->getAlignment()); + Chains.push_back(ScalarLoad.getValue(1)); + // Create the first element type using SCALAR_TO_VECTOR in order to avoid + // another round of DAGCombining. + if (i == 0) + Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad); + else + Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res, + ScalarLoad, DAG.getIntPtrConstant(i, dl)); + + Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment); + } + + SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains); + + // Bitcast the loaded value to a vector of the original element type, in + // the size of the target vector type. 
+ SDValue SlicedVec = DAG.getBitcast(WideVecVT, Res); + unsigned SizeRatio = RegSz / MemSz; + + if (Ext == ISD::SEXTLOAD) { + // If we have SSE4.1, we can directly emit a VSEXT node. + if (Subtarget->hasSSE41()) { + SDValue Sext = DAG.getNode(X86ISD::VSEXT, dl, RegVT, SlicedVec); + DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF); + return Sext; + } + + // Otherwise we'll use SIGN_EXTEND_VECTOR_INREG to sign extend the lowest + // lanes. + assert(TLI.isOperationLegalOrCustom(ISD::SIGN_EXTEND_VECTOR_INREG, RegVT) && + "We can't implement a sext load without SIGN_EXTEND_VECTOR_INREG!"); + + SDValue Shuff = DAG.getSignExtendVectorInReg(SlicedVec, dl, RegVT); + DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF); + return Shuff; + } + + // Redistribute the loaded elements into the different locations. + SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1); + for (unsigned i = 0; i != NumElems; ++i) + ShuffleVec[i * SizeRatio] = i; + + SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec, + DAG.getUNDEF(WideVecVT), &ShuffleVec[0]); + + // Bitcast to the requested type. + Shuff = DAG.getBitcast(RegVT, Shuff); + DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF); + return Shuff; +} + +// isAndOrOfSingleUseSetCCs - Return true if node is an ISD::AND or +// ISD::OR of two X86ISD::SETCC nodes each of which has no other use apart +// from the AND / OR. +static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) { + Opc = Op.getOpcode(); + if (Opc != ISD::OR && Opc != ISD::AND) + return false; + return (Op.getOperand(0).getOpcode() == X86ISD::SETCC && + Op.getOperand(0).hasOneUse() && + Op.getOperand(1).getOpcode() == X86ISD::SETCC && + Op.getOperand(1).hasOneUse()); +} + +// isXor1OfSetCC - Return true if node is an ISD::XOR of a X86ISD::SETCC and +// 1 and that the SETCC node has a single use. +static bool isXor1OfSetCC(SDValue Op) { + if (Op.getOpcode() != ISD::XOR) + return false; + if (isOneConstant(Op.getOperand(1))) + return Op.getOperand(0).getOpcode() == X86ISD::SETCC && + Op.getOperand(0).hasOneUse(); + return false; +} + +SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { + bool addTest = true; + SDValue Chain = Op.getOperand(0); + SDValue Cond = Op.getOperand(1); + SDValue Dest = Op.getOperand(2); + SDLoc dl(Op); + SDValue CC; + bool Inverted = false; + + if (Cond.getOpcode() == ISD::SETCC) { + // Check for setcc([su]{add,sub,mul}o == 0). + if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ && + isNullConstant(Cond.getOperand(1)) && + Cond.getOperand(0).getResNo() == 1 && + (Cond.getOperand(0).getOpcode() == ISD::SADDO || + Cond.getOperand(0).getOpcode() == ISD::UADDO || + Cond.getOperand(0).getOpcode() == ISD::SSUBO || + Cond.getOperand(0).getOpcode() == ISD::USUBO || + Cond.getOperand(0).getOpcode() == ISD::SMULO || + Cond.getOperand(0).getOpcode() == ISD::UMULO)) { + Inverted = true; + Cond = Cond.getOperand(0); + } else { + SDValue NewCond = LowerSETCC(Cond, DAG); + if (NewCond.getNode()) + Cond = NewCond; + } + } +#if 0 + // FIXME: LowerXALUO doesn't handle these!! + else if (Cond.getOpcode() == X86ISD::ADD || + Cond.getOpcode() == X86ISD::SUB || + Cond.getOpcode() == X86ISD::SMUL || + Cond.getOpcode() == X86ISD::UMUL) + Cond = LowerXALUO(Cond, DAG); +#endif + + // Look pass (and (setcc_carry (cmp ...)), 1). 
+ if (Cond.getOpcode() == ISD::AND && + Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY && + isOneConstant(Cond.getOperand(1))) + Cond = Cond.getOperand(0); + + // If condition flag is set by a X86ISD::CMP, then use it as the condition + // setting operand in place of the X86ISD::SETCC. + unsigned CondOpcode = Cond.getOpcode(); + if (CondOpcode == X86ISD::SETCC || + CondOpcode == X86ISD::SETCC_CARRY) { + CC = Cond.getOperand(0); + + SDValue Cmp = Cond.getOperand(1); + unsigned Opc = Cmp.getOpcode(); + // FIXME: WHY THE SPECIAL CASING OF LogicalCmp?? + if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) { + Cond = Cmp; + addTest = false; + } else { + switch (cast<ConstantSDNode>(CC)->getZExtValue()) { + default: break; + case X86::COND_O: + case X86::COND_B: + // These can only come from an arithmetic instruction with overflow, + // e.g. SADDO, UADDO. + Cond = Cond.getNode()->getOperand(1); + addTest = false; + break; + } + } + } + CondOpcode = Cond.getOpcode(); + if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO || + CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO || + ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) && + Cond.getOperand(0).getValueType() != MVT::i8)) { + SDValue LHS = Cond.getOperand(0); + SDValue RHS = Cond.getOperand(1); + unsigned X86Opcode; + unsigned X86Cond; + SDVTList VTs; + // Keep this in sync with LowerXALUO, otherwise we might create redundant + // instructions that can't be removed afterwards (i.e. X86ISD::ADD and + // X86ISD::INC). + switch (CondOpcode) { + case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break; + case ISD::SADDO: + if (isOneConstant(RHS)) { + X86Opcode = X86ISD::INC; X86Cond = X86::COND_O; + break; + } + X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break; + case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break; + case ISD::SSUBO: + if (isOneConstant(RHS)) { + X86Opcode = X86ISD::DEC; X86Cond = X86::COND_O; + break; + } + X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break; + case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break; + case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break; + default: llvm_unreachable("unexpected overflowing operator"); + } + if (Inverted) + X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond); + if (CondOpcode == ISD::UMULO) + VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(), + MVT::i32); + else + VTs = DAG.getVTList(LHS.getValueType(), MVT::i32); + + SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS); + + if (CondOpcode == ISD::UMULO) + Cond = X86Op.getValue(2); + else + Cond = X86Op.getValue(1); + + CC = DAG.getConstant(X86Cond, dl, MVT::i8); + addTest = false; + } else { + unsigned CondOpc; + if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) { + SDValue Cmp = Cond.getOperand(0).getOperand(1); + if (CondOpc == ISD::OR) { + // Also, recognize the pattern generated by an FCMP_UNE. We can emit + // two branches instead of an explicit OR instruction with a + // separate test. + if (Cmp == Cond.getOperand(1).getOperand(1) && + isX86LogicalCmp(Cmp)) { + CC = Cond.getOperand(0).getOperand(0); + Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), + Chain, Dest, CC, Cmp); + CC = Cond.getOperand(1).getOperand(0); + Cond = Cmp; + addTest = false; + } + } else { // ISD::AND + // Also, recognize the pattern generated by an FCMP_OEQ. We can emit + // two branches instead of an explicit AND instruction with a + // separate test. 
However, we only do this if this block doesn't + // have a fall-through edge, because this requires an explicit + // jmp when the condition is false. + if (Cmp == Cond.getOperand(1).getOperand(1) && + isX86LogicalCmp(Cmp) && + Op.getNode()->hasOneUse()) { + X86::CondCode CCode = + (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); + CCode = X86::GetOppositeBranchCondition(CCode); + CC = DAG.getConstant(CCode, dl, MVT::i8); + SDNode *User = *Op.getNode()->use_begin(); + // Look for an unconditional branch following this conditional branch. + // We need this because we need to reverse the successors in order + // to implement FCMP_OEQ. + if (User->getOpcode() == ISD::BR) { + SDValue FalseBB = User->getOperand(1); + SDNode *NewBR = + DAG.UpdateNodeOperands(User, User->getOperand(0), Dest); + assert(NewBR == User); + (void)NewBR; + Dest = FalseBB; + + Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), + Chain, Dest, CC, Cmp); + X86::CondCode CCode = + (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0); + CCode = X86::GetOppositeBranchCondition(CCode); + CC = DAG.getConstant(CCode, dl, MVT::i8); + Cond = Cmp; + addTest = false; + } + } + } + } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) { + // Recognize for xorb (setcc), 1 patterns. The xor inverts the condition. + // It should be transformed during dag combiner except when the condition + // is set by a arithmetics with overflow node. + X86::CondCode CCode = + (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); + CCode = X86::GetOppositeBranchCondition(CCode); + CC = DAG.getConstant(CCode, dl, MVT::i8); + Cond = Cond.getOperand(0).getOperand(1); + addTest = false; + } else if (Cond.getOpcode() == ISD::SETCC && + cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) { + // For FCMP_OEQ, we can emit + // two branches instead of an explicit AND instruction with a + // separate test. However, we only do this if this block doesn't + // have a fall-through edge, because this requires an explicit + // jmp when the condition is false. + if (Op.getNode()->hasOneUse()) { + SDNode *User = *Op.getNode()->use_begin(); + // Look for an unconditional branch following this conditional branch. + // We need this because we need to reverse the successors in order + // to implement FCMP_OEQ. + if (User->getOpcode() == ISD::BR) { + SDValue FalseBB = User->getOperand(1); + SDNode *NewBR = + DAG.UpdateNodeOperands(User, User->getOperand(0), Dest); + assert(NewBR == User); + (void)NewBR; + Dest = FalseBB; + + SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32, + Cond.getOperand(0), Cond.getOperand(1)); + Cmp = ConvertCmpIfNecessary(Cmp, DAG); + CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8); + Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), + Chain, Dest, CC, Cmp); + CC = DAG.getConstant(X86::COND_P, dl, MVT::i8); + Cond = Cmp; + addTest = false; + } + } + } else if (Cond.getOpcode() == ISD::SETCC && + cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) { + // For FCMP_UNE, we can emit + // two branches instead of an explicit AND instruction with a + // separate test. However, we only do this if this block doesn't + // have a fall-through edge, because this requires an explicit + // jmp when the condition is false. + if (Op.getNode()->hasOneUse()) { + SDNode *User = *Op.getNode()->use_begin(); + // Look for an unconditional branch following this conditional branch. + // We need this because we need to reverse the successors in order + // to implement FCMP_UNE. 
+ if (User->getOpcode() == ISD::BR) { + SDValue FalseBB = User->getOperand(1); + SDNode *NewBR = + DAG.UpdateNodeOperands(User, User->getOperand(0), Dest); + assert(NewBR == User); + (void)NewBR; + + SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32, + Cond.getOperand(0), Cond.getOperand(1)); + Cmp = ConvertCmpIfNecessary(Cmp, DAG); + CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8); + Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), + Chain, Dest, CC, Cmp); + CC = DAG.getConstant(X86::COND_NP, dl, MVT::i8); + Cond = Cmp; + addTest = false; + Dest = FalseBB; + } + } + } + } + + if (addTest) { + // Look pass the truncate if the high bits are known zero. + if (isTruncWithZeroHighBitsInput(Cond, DAG)) + Cond = Cond.getOperand(0); + + // We know the result of AND is compared against zero. Try to match + // it to BT. + if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { + if (SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG)) { + CC = NewSetCC.getOperand(0); + Cond = NewSetCC.getOperand(1); + addTest = false; + } + } + } + + if (addTest) { + X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE; + CC = DAG.getConstant(X86Cond, dl, MVT::i8); + Cond = EmitTest(Cond, X86Cond, dl, DAG); + } + Cond = ConvertCmpIfNecessary(Cond, DAG); + return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), + Chain, Dest, CC, Cond); +} + +// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets. +// Calls to _alloca are needed to probe the stack when allocating more than 4k +// bytes in one go. Touching the stack at 4K increments is necessary to ensure +// that the guard pages used by the OS virtual memory manager are allocated in +// correct sequence. +SDValue +X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, + SelectionDAG &DAG) const { + MachineFunction &MF = DAG.getMachineFunction(); + bool SplitStack = MF.shouldSplitStack(); + bool Lower = (Subtarget->isOSWindows() && !Subtarget->isTargetMachO()) || + SplitStack; + SDLoc dl(Op); + + // Get the inputs. + SDNode *Node = Op.getNode(); + SDValue Chain = Op.getOperand(0); + SDValue Size = Op.getOperand(1); + unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); + EVT VT = Node->getValueType(0); + + // Chain the dynamic stack allocation so that it doesn't modify the stack + // pointer when other instructions are using the stack. 
+ Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, dl, true), dl); + + bool Is64Bit = Subtarget->is64Bit(); + MVT SPTy = getPointerTy(DAG.getDataLayout()); + + SDValue Result; + if (!Lower) { + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore(); + assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and" + " not tell us which reg is the stack pointer!"); + EVT VT = Node->getValueType(0); + SDValue Tmp3 = Node->getOperand(2); + + SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT); + Chain = SP.getValue(1); + unsigned Align = cast<ConstantSDNode>(Tmp3)->getZExtValue(); + const TargetFrameLowering &TFI = *Subtarget->getFrameLowering(); + unsigned StackAlign = TFI.getStackAlignment(); + Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value + if (Align > StackAlign) + Result = DAG.getNode(ISD::AND, dl, VT, Result, + DAG.getConstant(-(uint64_t)Align, dl, VT)); + Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain + } else if (SplitStack) { + MachineRegisterInfo &MRI = MF.getRegInfo(); + + if (Is64Bit) { + // The 64 bit implementation of segmented stacks needs to clobber both r10 + // r11. This makes it impossible to use it along with nested parameters. + const Function *F = MF.getFunction(); + + for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end(); + I != E; ++I) + if (I->hasNestAttr()) + report_fatal_error("Cannot use segmented stacks with functions that " + "have nested arguments."); + } + + const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy); + unsigned Vreg = MRI.createVirtualRegister(AddrRegClass); + Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size); + Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain, + DAG.getRegister(Vreg, SPTy)); + } else { + SDValue Flag; + const unsigned Reg = (Subtarget->isTarget64BitLP64() ? X86::RAX : X86::EAX); + + Chain = DAG.getCopyToReg(Chain, dl, Reg, Size, Flag); + Flag = Chain.getValue(1); + SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); + + Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag); + + const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); + unsigned SPReg = RegInfo->getStackRegister(); + SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy); + Chain = SP.getValue(1); + + if (Align) { + SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0), + DAG.getConstant(-(uint64_t)Align, dl, VT)); + Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP); + } + + Result = SP; + } + + Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true), + DAG.getIntPtrConstant(0, dl, true), SDValue(), dl); + + SDValue Ops[2] = {Result, Chain}; + return DAG.getMergeValues(Ops, dl); +} + +SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { + MachineFunction &MF = DAG.getMachineFunction(); + auto PtrVT = getPointerTy(MF.getDataLayout()); + X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); + + const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); + SDLoc DL(Op); + + if (!Subtarget->is64Bit() || + Subtarget->isCallingConvWin64(MF.getFunction()->getCallingConv())) { + // vastart just stores the address of the VarArgsFrameIndex slot into the + // memory location argument. 
+ SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); + return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1), + MachinePointerInfo(SV), false, false, 0); + } + + // __va_list_tag: + // gp_offset (0 - 6 * 8) + // fp_offset (48 - 48 + 8 * 16) + // overflow_arg_area (point to parameters coming in memory). + // reg_save_area + SmallVector<SDValue, 8> MemOps; + SDValue FIN = Op.getOperand(1); + // Store gp_offset + SDValue Store = DAG.getStore(Op.getOperand(0), DL, + DAG.getConstant(FuncInfo->getVarArgsGPOffset(), + DL, MVT::i32), + FIN, MachinePointerInfo(SV), false, false, 0); + MemOps.push_back(Store); + + // Store fp_offset + FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL)); + Store = DAG.getStore(Op.getOperand(0), DL, + DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, + MVT::i32), + FIN, MachinePointerInfo(SV, 4), false, false, 0); + MemOps.push_back(Store); + + // Store ptr to overflow_arg_area + FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL)); + SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); + Store = DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, + MachinePointerInfo(SV, 8), + false, false, 0); + MemOps.push_back(Store); + + // Store ptr to reg_save_area. + FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant( + Subtarget->isTarget64BitLP64() ? 8 : 4, DL)); + SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT); + Store = DAG.getStore(Op.getOperand(0), DL, RSFIN, FIN, MachinePointerInfo( + SV, Subtarget->isTarget64BitLP64() ? 16 : 12), false, false, 0); + MemOps.push_back(Store); + return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps); +} + +SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { + assert(Subtarget->is64Bit() && + "LowerVAARG only handles 64-bit va_arg!"); + assert(Op.getNode()->getNumOperands() == 4); + + MachineFunction &MF = DAG.getMachineFunction(); + if (Subtarget->isCallingConvWin64(MF.getFunction()->getCallingConv())) + // The Win64 ABI uses char* instead of a structure. + return DAG.expandVAArg(Op.getNode()); + + SDValue Chain = Op.getOperand(0); + SDValue SrcPtr = Op.getOperand(1); + const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); + unsigned Align = Op.getConstantOperandVal(3); + SDLoc dl(Op); + + EVT ArgVT = Op.getNode()->getValueType(0); + Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); + uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy); + uint8_t ArgMode; + + // Decide which area this value should be read from. + // TODO: Implement the AMD64 ABI in its entirety. This simple + // selection mechanism works only for the basic types. + if (ArgVT == MVT::f80) { + llvm_unreachable("va_arg for f80 not yet implemented"); + } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) { + ArgMode = 2; // Argument passed in XMM register. Use fp_offset. + } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) { + ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset. + } else { + llvm_unreachable("Unhandled argument type in LowerVAARG"); + } + + if (ArgMode == 2) { + // Sanity Check: Make sure using fp_offset makes sense. 
+ assert(!Subtarget->useSoftFloat() && + !(MF.getFunction()->hasFnAttribute(Attribute::NoImplicitFloat)) && + Subtarget->hasSSE1()); + } + + // Insert VAARG_64 node into the DAG + // VAARG_64 returns two values: Variable Argument Address, Chain + SDValue InstOps[] = {Chain, SrcPtr, DAG.getConstant(ArgSize, dl, MVT::i32), + DAG.getConstant(ArgMode, dl, MVT::i8), + DAG.getConstant(Align, dl, MVT::i32)}; + SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other); + SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl, + VTs, InstOps, MVT::i64, + MachinePointerInfo(SV), + /*Align=*/0, + /*Volatile=*/false, + /*ReadMem=*/true, + /*WriteMem=*/true); + Chain = VAARG.getValue(1); + + // Load the next argument and return it + return DAG.getLoad(ArgVT, dl, + Chain, + VAARG, + MachinePointerInfo(), + false, false, false, 0); +} + +static SDValue LowerVACOPY(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows, + // where a va_list is still an i8*. + assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!"); + if (Subtarget->isCallingConvWin64( + DAG.getMachineFunction().getFunction()->getCallingConv())) + // Probably a Win64 va_copy. + return DAG.expandVACopy(Op.getNode()); + + SDValue Chain = Op.getOperand(0); + SDValue DstPtr = Op.getOperand(1); + SDValue SrcPtr = Op.getOperand(2); + const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue(); + const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); + SDLoc DL(Op); + + return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr, + DAG.getIntPtrConstant(24, DL), 8, /*isVolatile*/false, + false, false, + MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV)); +} + +// getTargetVShiftByConstNode - Handle vector element shifts where the shift +// amount is a constant. Takes immediate version of shift as input. +static SDValue getTargetVShiftByConstNode(unsigned Opc, SDLoc dl, MVT VT, + SDValue SrcOp, uint64_t ShiftAmt, + SelectionDAG &DAG) { + MVT ElementType = VT.getVectorElementType(); + + // Fold this packed shift into its first operand if ShiftAmt is 0. + if (ShiftAmt == 0) + return SrcOp; + + // Check for ShiftAmt >= element width + if (ShiftAmt >= ElementType.getSizeInBits()) { + if (Opc == X86ISD::VSRAI) + ShiftAmt = ElementType.getSizeInBits() - 1; + else + return DAG.getConstant(0, dl, VT); + } + + assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI) + && "Unknown target vector shift-by-constant node"); + + // Fold this packed vector shift into a build vector if SrcOp is a + // vector of Constants or UNDEFs, and SrcOp valuetype is the same as VT. 
+ if (VT == SrcOp.getSimpleValueType() && + ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) { + SmallVector<SDValue, 8> Elts; + unsigned NumElts = SrcOp->getNumOperands(); + ConstantSDNode *ND; + + switch(Opc) { + default: llvm_unreachable(nullptr); + case X86ISD::VSHLI: + for (unsigned i=0; i!=NumElts; ++i) { + SDValue CurrentOp = SrcOp->getOperand(i); + if (CurrentOp->getOpcode() == ISD::UNDEF) { + Elts.push_back(CurrentOp); + continue; + } + ND = cast<ConstantSDNode>(CurrentOp); + const APInt &C = ND->getAPIntValue(); + Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), dl, ElementType)); + } + break; + case X86ISD::VSRLI: + for (unsigned i=0; i!=NumElts; ++i) { + SDValue CurrentOp = SrcOp->getOperand(i); + if (CurrentOp->getOpcode() == ISD::UNDEF) { + Elts.push_back(CurrentOp); + continue; + } + ND = cast<ConstantSDNode>(CurrentOp); + const APInt &C = ND->getAPIntValue(); + Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), dl, ElementType)); + } + break; + case X86ISD::VSRAI: + for (unsigned i=0; i!=NumElts; ++i) { + SDValue CurrentOp = SrcOp->getOperand(i); + if (CurrentOp->getOpcode() == ISD::UNDEF) { + Elts.push_back(CurrentOp); + continue; + } + ND = cast<ConstantSDNode>(CurrentOp); + const APInt &C = ND->getAPIntValue(); + Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), dl, ElementType)); + } + break; + } + + return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Elts); + } + + return DAG.getNode(Opc, dl, VT, SrcOp, + DAG.getConstant(ShiftAmt, dl, MVT::i8)); +} + +// getTargetVShiftNode - Handle vector element shifts where the shift amount +// may or may not be a constant. Takes immediate version of shift as input. +static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, MVT VT, + SDValue SrcOp, SDValue ShAmt, + SelectionDAG &DAG) { + MVT SVT = ShAmt.getSimpleValueType(); + assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!"); + + // Catch shift-by-constant. + if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt)) + return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp, + CShAmt->getZExtValue(), DAG); + + // Change opcode to non-immediate version + switch (Opc) { + default: llvm_unreachable("Unknown target vector shift node"); + case X86ISD::VSHLI: Opc = X86ISD::VSHL; break; + case X86ISD::VSRLI: Opc = X86ISD::VSRL; break; + case X86ISD::VSRAI: Opc = X86ISD::VSRA; break; + } + + const X86Subtarget &Subtarget = + static_cast<const X86Subtarget &>(DAG.getSubtarget()); + if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::ZERO_EXTEND && + ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) { + // Let the shuffle legalizer expand this shift amount node. + SDValue Op0 = ShAmt.getOperand(0); + Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(Op0), MVT::v8i16, Op0); + ShAmt = getShuffleVectorZeroOrUndef(Op0, 0, true, &Subtarget, DAG); + } else { + // Need to build a vector containing shift amount. + // SSE/AVX packed shifts only use the lower 64-bit of the shift count. + SmallVector<SDValue, 4> ShOps; + ShOps.push_back(ShAmt); + if (SVT == MVT::i32) { + ShOps.push_back(DAG.getConstant(0, dl, SVT)); + ShOps.push_back(DAG.getUNDEF(SVT)); + } + ShOps.push_back(DAG.getUNDEF(SVT)); + + MVT BVT = SVT == MVT::i32 ? MVT::v4i32 : MVT::v2i64; + ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, BVT, ShOps); + } + + // The return type has to be a 128-bit type with the same element + // type as the input type. 
+ MVT EltVT = VT.getVectorElementType(); + MVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits()); + + ShAmt = DAG.getBitcast(ShVT, ShAmt); + return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt); +} + +/// \brief Return Mask with the necessary casting or extending +/// for \p Mask according to \p MaskVT when lowering masking intrinsics +static SDValue getMaskNode(SDValue Mask, MVT MaskVT, + const X86Subtarget *Subtarget, + SelectionDAG &DAG, SDLoc dl) { + + if (MaskVT.bitsGT(Mask.getSimpleValueType())) { + // Mask should be extended + Mask = DAG.getNode(ISD::ANY_EXTEND, dl, + MVT::getIntegerVT(MaskVT.getSizeInBits()), Mask); + } + + if (Mask.getSimpleValueType() == MVT::i64 && Subtarget->is32Bit()) { + if (MaskVT == MVT::v64i1) { + assert(Subtarget->hasBWI() && "Expected AVX512BW target!"); + // In case 32bit mode, bitcast i64 is illegal, extend/split it. + SDValue Lo, Hi; + Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask, + DAG.getConstant(0, dl, MVT::i32)); + Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask, + DAG.getConstant(1, dl, MVT::i32)); + + Lo = DAG.getBitcast(MVT::v32i1, Lo); + Hi = DAG.getBitcast(MVT::v32i1, Hi); + + return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi); + } else { + // MaskVT require < 64bit. Truncate mask (should succeed in any case), + // and bitcast. + MVT TruncVT = MVT::getIntegerVT(MaskVT.getSizeInBits()); + return DAG.getBitcast(MaskVT, + DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Mask)); + } + + } else { + MVT BitcastVT = MVT::getVectorVT(MVT::i1, + Mask.getSimpleValueType().getSizeInBits()); + // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements + // are extracted by EXTRACT_SUBVECTOR. + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, + DAG.getBitcast(BitcastVT, Mask), + DAG.getIntPtrConstant(0, dl)); + } +} + +/// \brief Return (and \p Op, \p Mask) for compare instructions or +/// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the +/// necessary casting or extending for \p Mask when lowering masking intrinsics +static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask, + SDValue PreservedSrc, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + MVT VT = Op.getSimpleValueType(); + MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); + unsigned OpcodeSelect = ISD::VSELECT; + SDLoc dl(Op); + + if (isAllOnesConstant(Mask)) + return Op; + + SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); + + switch (Op.getOpcode()) { + default: break; + case X86ISD::PCMPEQM: + case X86ISD::PCMPGTM: + case X86ISD::CMPM: + case X86ISD::CMPMU: + return DAG.getNode(ISD::AND, dl, VT, Op, VMask); + case X86ISD::VFPCLASS: + case X86ISD::VFPCLASSS: + return DAG.getNode(ISD::OR, dl, VT, Op, VMask); + case X86ISD::VTRUNC: + case X86ISD::VTRUNCS: + case X86ISD::VTRUNCUS: + // We can't use ISD::VSELECT here because it is not always "Legal" + // for the destination type. For example vpmovqb require only AVX512 + // and vselect that can operate on byte element type require BWI + OpcodeSelect = X86ISD::SELECT; + break; + } + if (PreservedSrc.getOpcode() == ISD::UNDEF) + PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl); + return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc); +} + +/// \brief Creates an SDNode for a predicated scalar operation. +/// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc). +/// The mask is coming as MVT::i8 and it should be truncated +/// to MVT::i1 while lowering masking intrinsics. 
+/// The main difference between ScalarMaskingNode and VectorMaskingNode is using +/// "X86select" instead of "vselect". We just can't create the "vselect" node +/// for a scalar instruction. +static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask, + SDValue PreservedSrc, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + if (isAllOnesConstant(Mask)) + return Op; + + MVT VT = Op.getSimpleValueType(); + SDLoc dl(Op); + // The mask should be of type MVT::i1 + SDValue IMask = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Mask); + + if (Op.getOpcode() == X86ISD::FSETCC) + return DAG.getNode(ISD::AND, dl, VT, Op, IMask); + if (Op.getOpcode() == X86ISD::VFPCLASS || + Op.getOpcode() == X86ISD::VFPCLASSS) + return DAG.getNode(ISD::OR, dl, VT, Op, IMask); + + if (PreservedSrc.getOpcode() == ISD::UNDEF) + PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl); + return DAG.getNode(X86ISD::SELECT, dl, VT, IMask, Op, PreservedSrc); +} + +static int getSEHRegistrationNodeSize(const Function *Fn) { + if (!Fn->hasPersonalityFn()) + report_fatal_error( + "querying registration node size for function without personality"); + // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See + // WinEHStatePass for the full struct definition. + switch (classifyEHPersonality(Fn->getPersonalityFn())) { + case EHPersonality::MSVC_X86SEH: return 24; + case EHPersonality::MSVC_CXX: return 16; + default: break; + } + report_fatal_error( + "can only recover FP for 32-bit MSVC EH personality functions"); +} + +/// When the MSVC runtime transfers control to us, either to an outlined +/// function or when returning to a parent frame after catching an exception, we +/// recover the parent frame pointer by doing arithmetic on the incoming EBP. +/// Here's the math: +/// RegNodeBase = EntryEBP - RegNodeSize +/// ParentFP = RegNodeBase - ParentFrameOffset +/// Subtracting RegNodeSize takes us to the offset of the registration node, and +/// subtracting the offset (negative on x86) takes us back to the parent FP. +static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn, + SDValue EntryEBP) { + MachineFunction &MF = DAG.getMachineFunction(); + SDLoc dl; + + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout()); + + // It's possible that the parent function no longer has a personality function + // if the exceptional code was optimized away, in which case we just return + // the incoming EBP. + if (!Fn->hasPersonalityFn()) + return EntryEBP; + + // Get an MCSymbol that will ultimately resolve to the frame offset of the EH + // registration, or the .set_setframe offset. + MCSymbol *OffsetSym = + MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol( + GlobalValue::getRealLinkageName(Fn->getName())); + SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT); + SDValue ParentFrameOffset = + DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal); + + // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after + // prologue to RBP in the parent function. 
+ const X86Subtarget &Subtarget = + static_cast<const X86Subtarget &>(DAG.getSubtarget()); + if (Subtarget.is64Bit()) + return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset); + + int RegNodeSize = getSEHRegistrationNodeSize(Fn); + // RegNodeBase = EntryEBP - RegNodeSize + // ParentFP = RegNodeBase - ParentFrameOffset + SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP, + DAG.getConstant(RegNodeSize, dl, PtrVT)); + return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset); +} + +static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc dl(Op); + unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); + MVT VT = Op.getSimpleValueType(); + const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo); + if (IntrData) { + switch(IntrData->Type) { + case INTR_TYPE_1OP: + return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1)); + case INTR_TYPE_2OP: + return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1), + Op.getOperand(2)); + case INTR_TYPE_2OP_IMM8: + return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1), + DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(2))); + case INTR_TYPE_3OP: + return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1), + Op.getOperand(2), Op.getOperand(3)); + case INTR_TYPE_4OP: + return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1), + Op.getOperand(2), Op.getOperand(3), Op.getOperand(4)); + case INTR_TYPE_1OP_MASK_RM: { + SDValue Src = Op.getOperand(1); + SDValue PassThru = Op.getOperand(2); + SDValue Mask = Op.getOperand(3); + SDValue RoundingMode; + // We allways add rounding mode to the Node. + // If the rounding mode is not specified, we add the + // "current direction" mode. + if (Op.getNumOperands() == 4) + RoundingMode = + DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32); + else + RoundingMode = Op.getOperand(4); + unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; + if (IntrWithRoundingModeOpcode != 0) + if (cast<ConstantSDNode>(RoundingMode)->getZExtValue() != + X86::STATIC_ROUNDING::CUR_DIRECTION) + return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, + dl, Op.getValueType(), Src, RoundingMode), + Mask, PassThru, Subtarget, DAG); + return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src, + RoundingMode), + Mask, PassThru, Subtarget, DAG); + } + case INTR_TYPE_1OP_MASK: { + SDValue Src = Op.getOperand(1); + SDValue PassThru = Op.getOperand(2); + SDValue Mask = Op.getOperand(3); + // We add rounding mode to the Node when + // - RM Opcode is specified and + // - RM is not "current direction". 
+ unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; + if (IntrWithRoundingModeOpcode != 0) { + SDValue Rnd = Op.getOperand(4); + unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue(); + if (Round != X86::STATIC_ROUNDING::CUR_DIRECTION) { + return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, + dl, Op.getValueType(), + Src, Rnd), + Mask, PassThru, Subtarget, DAG); + } + } + return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src), + Mask, PassThru, Subtarget, DAG); + } + case INTR_TYPE_SCALAR_MASK: { + SDValue Src1 = Op.getOperand(1); + SDValue Src2 = Op.getOperand(2); + SDValue passThru = Op.getOperand(3); + SDValue Mask = Op.getOperand(4); + return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2), + Mask, passThru, Subtarget, DAG); + } + case INTR_TYPE_SCALAR_MASK_RM: { + SDValue Src1 = Op.getOperand(1); + SDValue Src2 = Op.getOperand(2); + SDValue Src0 = Op.getOperand(3); + SDValue Mask = Op.getOperand(4); + // There are 2 kinds of intrinsics in this group: + // (1) With suppress-all-exceptions (sae) or rounding mode- 6 operands + // (2) With rounding mode and sae - 7 operands. + if (Op.getNumOperands() == 6) { + SDValue Sae = Op.getOperand(5); + unsigned Opc = IntrData->Opc1 ? IntrData->Opc1 : IntrData->Opc0; + return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, + Sae), + Mask, Src0, Subtarget, DAG); + } + assert(Op.getNumOperands() == 7 && "Unexpected intrinsic form"); + SDValue RoundingMode = Op.getOperand(5); + SDValue Sae = Op.getOperand(6); + return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, + RoundingMode, Sae), + Mask, Src0, Subtarget, DAG); + } + case INTR_TYPE_2OP_MASK: + case INTR_TYPE_2OP_IMM8_MASK: { + SDValue Src1 = Op.getOperand(1); + SDValue Src2 = Op.getOperand(2); + SDValue PassThru = Op.getOperand(3); + SDValue Mask = Op.getOperand(4); + + if (IntrData->Type == INTR_TYPE_2OP_IMM8_MASK) + Src2 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src2); + + // We specify 2 possible opcodes for intrinsics with rounding modes. + // First, we check if the intrinsic may have non-default rounding mode, + // (IntrData->Opc1 != 0), then we check the rounding mode operand. + unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; + if (IntrWithRoundingModeOpcode != 0) { + SDValue Rnd = Op.getOperand(5); + unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue(); + if (Round != X86::STATIC_ROUNDING::CUR_DIRECTION) { + return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, + dl, Op.getValueType(), + Src1, Src2, Rnd), + Mask, PassThru, Subtarget, DAG); + } + } + // TODO: Intrinsics should have fast-math-flags to propagate. + return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,Src1,Src2), + Mask, PassThru, Subtarget, DAG); + } + case INTR_TYPE_2OP_MASK_RM: { + SDValue Src1 = Op.getOperand(1); + SDValue Src2 = Op.getOperand(2); + SDValue PassThru = Op.getOperand(3); + SDValue Mask = Op.getOperand(4); + // We specify 2 possible modes for intrinsics, with/without rounding + // modes. + // First, we check if the intrinsic have rounding mode (6 operands), + // if not, we set rounding mode to "current". 
+ SDValue Rnd; + if (Op.getNumOperands() == 6) + Rnd = Op.getOperand(5); + else + Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32); + return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, + Src1, Src2, Rnd), + Mask, PassThru, Subtarget, DAG); + } + case INTR_TYPE_3OP_SCALAR_MASK_RM: { + SDValue Src1 = Op.getOperand(1); + SDValue Src2 = Op.getOperand(2); + SDValue Src3 = Op.getOperand(3); + SDValue PassThru = Op.getOperand(4); + SDValue Mask = Op.getOperand(5); + SDValue Sae = Op.getOperand(6); + + return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, + Src2, Src3, Sae), + Mask, PassThru, Subtarget, DAG); + } + case INTR_TYPE_3OP_MASK_RM: { + SDValue Src1 = Op.getOperand(1); + SDValue Src2 = Op.getOperand(2); + SDValue Imm = Op.getOperand(3); + SDValue PassThru = Op.getOperand(4); + SDValue Mask = Op.getOperand(5); + // We specify 2 possible modes for intrinsics, with/without rounding + // modes. + // First, we check if the intrinsic have rounding mode (7 operands), + // if not, we set rounding mode to "current". + SDValue Rnd; + if (Op.getNumOperands() == 7) + Rnd = Op.getOperand(6); + else + Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32); + return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, + Src1, Src2, Imm, Rnd), + Mask, PassThru, Subtarget, DAG); + } + case INTR_TYPE_3OP_IMM8_MASK: + case INTR_TYPE_3OP_MASK: + case INSERT_SUBVEC: { + SDValue Src1 = Op.getOperand(1); + SDValue Src2 = Op.getOperand(2); + SDValue Src3 = Op.getOperand(3); + SDValue PassThru = Op.getOperand(4); + SDValue Mask = Op.getOperand(5); + + if (IntrData->Type == INTR_TYPE_3OP_IMM8_MASK) + Src3 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src3); + else if (IntrData->Type == INSERT_SUBVEC) { + // imm should be adapted to ISD::INSERT_SUBVECTOR behavior + assert(isa<ConstantSDNode>(Src3) && "Expected a ConstantSDNode here!"); + unsigned Imm = cast<ConstantSDNode>(Src3)->getZExtValue(); + Imm *= Src2.getSimpleValueType().getVectorNumElements(); + Src3 = DAG.getTargetConstant(Imm, dl, MVT::i32); + } + + // We specify 2 possible opcodes for intrinsics with rounding modes. + // First, we check if the intrinsic may have non-default rounding mode, + // (IntrData->Opc1 != 0), then we check the rounding mode operand. 
+ unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; + if (IntrWithRoundingModeOpcode != 0) { + SDValue Rnd = Op.getOperand(6); + unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue(); + if (Round != X86::STATIC_ROUNDING::CUR_DIRECTION) { + return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, + dl, Op.getValueType(), + Src1, Src2, Src3, Rnd), + Mask, PassThru, Subtarget, DAG); + } + } + return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, + Src1, Src2, Src3), + Mask, PassThru, Subtarget, DAG); + } + case VPERM_3OP_MASKZ: + case VPERM_3OP_MASK:{ + // Src2 is the PassThru + SDValue Src1 = Op.getOperand(1); + SDValue Src2 = Op.getOperand(2); + SDValue Src3 = Op.getOperand(3); + SDValue Mask = Op.getOperand(4); + MVT VT = Op.getSimpleValueType(); + SDValue PassThru = SDValue(); + + // set PassThru element + if (IntrData->Type == VPERM_3OP_MASKZ) + PassThru = getZeroVector(VT, Subtarget, DAG, dl); + else + PassThru = DAG.getBitcast(VT, Src2); + + // Swap Src1 and Src2 in the node creation + return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, + dl, Op.getValueType(), + Src2, Src1, Src3), + Mask, PassThru, Subtarget, DAG); + } + case FMA_OP_MASK3: + case FMA_OP_MASKZ: + case FMA_OP_MASK: { + SDValue Src1 = Op.getOperand(1); + SDValue Src2 = Op.getOperand(2); + SDValue Src3 = Op.getOperand(3); + SDValue Mask = Op.getOperand(4); + MVT VT = Op.getSimpleValueType(); + SDValue PassThru = SDValue(); + + // set PassThru element + if (IntrData->Type == FMA_OP_MASKZ) + PassThru = getZeroVector(VT, Subtarget, DAG, dl); + else if (IntrData->Type == FMA_OP_MASK3) + PassThru = Src3; + else + PassThru = Src1; + + // We specify 2 possible opcodes for intrinsics with rounding modes. + // First, we check if the intrinsic may have non-default rounding mode, + // (IntrData->Opc1 != 0), then we check the rounding mode operand. + unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; + if (IntrWithRoundingModeOpcode != 0) { + SDValue Rnd = Op.getOperand(5); + if (cast<ConstantSDNode>(Rnd)->getZExtValue() != + X86::STATIC_ROUNDING::CUR_DIRECTION) + return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, + dl, Op.getValueType(), + Src1, Src2, Src3, Rnd), + Mask, PassThru, Subtarget, DAG); + } + return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, + dl, Op.getValueType(), + Src1, Src2, Src3), + Mask, PassThru, Subtarget, DAG); + } + case TERLOG_OP_MASK: + case TERLOG_OP_MASKZ: { + SDValue Src1 = Op.getOperand(1); + SDValue Src2 = Op.getOperand(2); + SDValue Src3 = Op.getOperand(3); + SDValue Src4 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(4)); + SDValue Mask = Op.getOperand(5); + MVT VT = Op.getSimpleValueType(); + SDValue PassThru = Src1; + // Set PassThru element. 
+ if (IntrData->Type == TERLOG_OP_MASKZ) + PassThru = getZeroVector(VT, Subtarget, DAG, dl); + + return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, + Src1, Src2, Src3, Src4), + Mask, PassThru, Subtarget, DAG); + } + case FPCLASS: { + // FPclass intrinsics with mask + SDValue Src1 = Op.getOperand(1); + MVT VT = Src1.getSimpleValueType(); + MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); + SDValue Imm = Op.getOperand(2); + SDValue Mask = Op.getOperand(3); + MVT BitcastVT = MVT::getVectorVT(MVT::i1, + Mask.getSimpleValueType().getSizeInBits()); + SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Imm); + SDValue FPclassMask = getVectorMaskingNode(FPclass, Mask, + DAG.getTargetConstant(0, dl, MaskVT), + Subtarget, DAG); + SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT, + DAG.getUNDEF(BitcastVT), FPclassMask, + DAG.getIntPtrConstant(0, dl)); + return DAG.getBitcast(Op.getValueType(), Res); + } + case FPCLASSS: { + SDValue Src1 = Op.getOperand(1); + SDValue Imm = Op.getOperand(2); + SDValue Mask = Op.getOperand(3); + SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::i1, Src1, Imm); + SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, + DAG.getTargetConstant(0, dl, MVT::i1), Subtarget, DAG); + return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i8, FPclassMask); + } + case CMP_MASK: + case CMP_MASK_CC: { + // Comparison intrinsics with masks. + // Example of transformation: + // (i8 (int_x86_avx512_mask_pcmpeq_q_128 + // (v2i64 %a), (v2i64 %b), (i8 %mask))) -> + // (i8 (bitcast + // (v8i1 (insert_subvector undef, + // (v2i1 (and (PCMPEQM %a, %b), + // (extract_subvector + // (v8i1 (bitcast %mask)), 0))), 0)))) + MVT VT = Op.getOperand(1).getSimpleValueType(); + MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); + SDValue Mask = Op.getOperand((IntrData->Type == CMP_MASK_CC) ? 4 : 3); + MVT BitcastVT = MVT::getVectorVT(MVT::i1, + Mask.getSimpleValueType().getSizeInBits()); + SDValue Cmp; + if (IntrData->Type == CMP_MASK_CC) { + SDValue CC = Op.getOperand(3); + CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CC); + // We specify 2 possible opcodes for intrinsics with rounding modes. + // First, we check if the intrinsic may have non-default rounding mode, + // (IntrData->Opc1 != 0), then we check the rounding mode operand. 
+ if (IntrData->Opc1 != 0) { + SDValue Rnd = Op.getOperand(5); + if (cast<ConstantSDNode>(Rnd)->getZExtValue() != + X86::STATIC_ROUNDING::CUR_DIRECTION) + Cmp = DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1), + Op.getOperand(2), CC, Rnd); + } + //default rounding mode + if(!Cmp.getNode()) + Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1), + Op.getOperand(2), CC); + + } else { + assert(IntrData->Type == CMP_MASK && "Unexpected intrinsic type!"); + Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1), + Op.getOperand(2)); + } + SDValue CmpMask = getVectorMaskingNode(Cmp, Mask, + DAG.getTargetConstant(0, dl, + MaskVT), + Subtarget, DAG); + SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT, + DAG.getUNDEF(BitcastVT), CmpMask, + DAG.getIntPtrConstant(0, dl)); + return DAG.getBitcast(Op.getValueType(), Res); + } + case CMP_MASK_SCALAR_CC: { + SDValue Src1 = Op.getOperand(1); + SDValue Src2 = Op.getOperand(2); + SDValue CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(3)); + SDValue Mask = Op.getOperand(4); + + SDValue Cmp; + if (IntrData->Opc1 != 0) { + SDValue Rnd = Op.getOperand(5); + if (cast<ConstantSDNode>(Rnd)->getZExtValue() != + X86::STATIC_ROUNDING::CUR_DIRECTION) + Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::i1, Src1, Src2, CC, Rnd); + } + //default rounding mode + if(!Cmp.getNode()) + Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::i1, Src1, Src2, CC); + + SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, + DAG.getTargetConstant(0, dl, + MVT::i1), + Subtarget, DAG); + + return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i8, + DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, CmpMask), + DAG.getValueType(MVT::i1)); + } + case COMI: { // Comparison intrinsics + ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1; + SDValue LHS = Op.getOperand(1); + SDValue RHS = Op.getOperand(2); + unsigned X86CC = TranslateX86CC(CC, dl, true, LHS, RHS, DAG); + assert(X86CC != X86::COND_INVALID && "Unexpected illegal condition!"); + SDValue Cond = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS); + SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, + DAG.getConstant(X86CC, dl, MVT::i8), Cond); + return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); + } + case COMI_RM: { // Comparison intrinsics with Sae + SDValue LHS = Op.getOperand(1); + SDValue RHS = Op.getOperand(2); + SDValue CC = Op.getOperand(3); + SDValue Sae = Op.getOperand(4); + auto ComiType = TranslateX86ConstCondToX86CC(CC); + // choose between ordered and unordered (comi/ucomi) + unsigned comiOp = std::get<0>(ComiType) ? 
IntrData->Opc0 : IntrData->Opc1; + SDValue Cond; + if (cast<ConstantSDNode>(Sae)->getZExtValue() != + X86::STATIC_ROUNDING::CUR_DIRECTION) + Cond = DAG.getNode(comiOp, dl, MVT::i32, LHS, RHS, Sae); + else + Cond = DAG.getNode(comiOp, dl, MVT::i32, LHS, RHS); + SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, + DAG.getConstant(std::get<1>(ComiType), dl, MVT::i8), Cond); + return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); + } + case VSHIFT: + return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(), + Op.getOperand(1), Op.getOperand(2), DAG); + case VSHIFT_MASK: + return getVectorMaskingNode(getTargetVShiftNode(IntrData->Opc0, dl, + Op.getSimpleValueType(), + Op.getOperand(1), + Op.getOperand(2), DAG), + Op.getOperand(4), Op.getOperand(3), Subtarget, + DAG); + case COMPRESS_EXPAND_IN_REG: { + SDValue Mask = Op.getOperand(3); + SDValue DataToCompress = Op.getOperand(1); + SDValue PassThru = Op.getOperand(2); + if (isAllOnesConstant(Mask)) // return data as is + return Op.getOperand(1); + + return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, + DataToCompress), + Mask, PassThru, Subtarget, DAG); + } + case BROADCASTM: { + SDValue Mask = Op.getOperand(1); + MVT MaskVT = MVT::getVectorVT(MVT::i1, + Mask.getSimpleValueType().getSizeInBits()); + Mask = DAG.getBitcast(MaskVT, Mask); + return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Mask); + } + case BLEND: { + SDValue Mask = Op.getOperand(3); + MVT VT = Op.getSimpleValueType(); + MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); + SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); + return DAG.getNode(IntrData->Opc0, dl, VT, VMask, Op.getOperand(1), + Op.getOperand(2)); + } + case KUNPCK: { + MVT VT = Op.getSimpleValueType(); + MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits()/2); + + SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl); + SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl); + // Arguments should be swapped. 
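+    // (Per the ISA, kunpckbw/wd/dq place the second source in the low half of
+    // the result and the first source in the high half, which is what the
+    // swapped operand order below models - illustrative note.)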
+ SDValue Res = DAG.getNode(IntrData->Opc0, dl, + MVT::getVectorVT(MVT::i1, VT.getSizeInBits()), + Src2, Src1); + return DAG.getBitcast(VT, Res); + } + case CONVERT_TO_MASK: { + MVT SrcVT = Op.getOperand(1).getSimpleValueType(); + MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements()); + MVT BitcastVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits()); + + SDValue CvtMask = DAG.getNode(IntrData->Opc0, dl, MaskVT, + Op.getOperand(1)); + SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT, + DAG.getUNDEF(BitcastVT), CvtMask, + DAG.getIntPtrConstant(0, dl)); + return DAG.getBitcast(Op.getValueType(), Res); + } + case CONVERT_MASK_TO_VEC: { + SDValue Mask = Op.getOperand(1); + MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); + SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); + return DAG.getNode(IntrData->Opc0, dl, VT, VMask); + } + case BRCST_SUBVEC_TO_VEC: { + SDValue Src = Op.getOperand(1); + SDValue Passthru = Op.getOperand(2); + SDValue Mask = Op.getOperand(3); + EVT resVT = Passthru.getValueType(); + SDValue subVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, resVT, + DAG.getUNDEF(resVT), Src, + DAG.getIntPtrConstant(0, dl)); + SDValue immVal; + if (Src.getSimpleValueType().is256BitVector() && resVT.is512BitVector()) + immVal = DAG.getConstant(0x44, dl, MVT::i8); + else + immVal = DAG.getConstant(0, dl, MVT::i8); + return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, + subVec, subVec, immVal), + Mask, Passthru, Subtarget, DAG); + } + default: + break; + } + } + + switch (IntNo) { + default: return SDValue(); // Don't custom lower most intrinsics. + + case Intrinsic::x86_avx2_permd: + case Intrinsic::x86_avx2_permps: + // Operands intentionally swapped. Mask is last operand to intrinsic, + // but second operand for node/instruction. + return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(), + Op.getOperand(2), Op.getOperand(1)); + + // ptest and testp intrinsics. The intrinsic these come from are designed to + // return an integer value, not just an instruction so lower it to the ptest + // or testp pattern and a setcc for the result. 
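+  // For example (illustrative): ptest sets ZF = ((a & b) == 0) and
+  // CF = ((b & ~a) == 0), so _mm_testz_si128(a, b) becomes
+  //   X86ISD::PTEST a, b  ->  X86ISD::SETCC(COND_E)  ->  zero-extend to i32
+  // and the testc/testnzc variants simply pick COND_B / COND_A instead.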
+ case Intrinsic::x86_sse41_ptestz: + case Intrinsic::x86_sse41_ptestc: + case Intrinsic::x86_sse41_ptestnzc: + case Intrinsic::x86_avx_ptestz_256: + case Intrinsic::x86_avx_ptestc_256: + case Intrinsic::x86_avx_ptestnzc_256: + case Intrinsic::x86_avx_vtestz_ps: + case Intrinsic::x86_avx_vtestc_ps: + case Intrinsic::x86_avx_vtestnzc_ps: + case Intrinsic::x86_avx_vtestz_pd: + case Intrinsic::x86_avx_vtestc_pd: + case Intrinsic::x86_avx_vtestnzc_pd: + case Intrinsic::x86_avx_vtestz_ps_256: + case Intrinsic::x86_avx_vtestc_ps_256: + case Intrinsic::x86_avx_vtestnzc_ps_256: + case Intrinsic::x86_avx_vtestz_pd_256: + case Intrinsic::x86_avx_vtestc_pd_256: + case Intrinsic::x86_avx_vtestnzc_pd_256: { + bool IsTestPacked = false; + unsigned X86CC; + switch (IntNo) { + default: llvm_unreachable("Bad fallthrough in Intrinsic lowering."); + case Intrinsic::x86_avx_vtestz_ps: + case Intrinsic::x86_avx_vtestz_pd: + case Intrinsic::x86_avx_vtestz_ps_256: + case Intrinsic::x86_avx_vtestz_pd_256: + IsTestPacked = true; // Fallthrough + case Intrinsic::x86_sse41_ptestz: + case Intrinsic::x86_avx_ptestz_256: + // ZF = 1 + X86CC = X86::COND_E; + break; + case Intrinsic::x86_avx_vtestc_ps: + case Intrinsic::x86_avx_vtestc_pd: + case Intrinsic::x86_avx_vtestc_ps_256: + case Intrinsic::x86_avx_vtestc_pd_256: + IsTestPacked = true; // Fallthrough + case Intrinsic::x86_sse41_ptestc: + case Intrinsic::x86_avx_ptestc_256: + // CF = 1 + X86CC = X86::COND_B; + break; + case Intrinsic::x86_avx_vtestnzc_ps: + case Intrinsic::x86_avx_vtestnzc_pd: + case Intrinsic::x86_avx_vtestnzc_ps_256: + case Intrinsic::x86_avx_vtestnzc_pd_256: + IsTestPacked = true; // Fallthrough + case Intrinsic::x86_sse41_ptestnzc: + case Intrinsic::x86_avx_ptestnzc_256: + // ZF and CF = 0 + X86CC = X86::COND_A; + break; + } + + SDValue LHS = Op.getOperand(1); + SDValue RHS = Op.getOperand(2); + unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST; + SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS); + SDValue CC = DAG.getConstant(X86CC, dl, MVT::i8); + SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test); + return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); + } + case Intrinsic::x86_avx512_kortestz_w: + case Intrinsic::x86_avx512_kortestc_w: { + unsigned X86CC = (IntNo == Intrinsic::x86_avx512_kortestz_w)? X86::COND_E: X86::COND_B; + SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1)); + SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2)); + SDValue CC = DAG.getConstant(X86CC, dl, MVT::i8); + SDValue Test = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS); + SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i1, CC, Test); + return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); + } + + case Intrinsic::x86_sse42_pcmpistria128: + case Intrinsic::x86_sse42_pcmpestria128: + case Intrinsic::x86_sse42_pcmpistric128: + case Intrinsic::x86_sse42_pcmpestric128: + case Intrinsic::x86_sse42_pcmpistrio128: + case Intrinsic::x86_sse42_pcmpestrio128: + case Intrinsic::x86_sse42_pcmpistris128: + case Intrinsic::x86_sse42_pcmpestris128: + case Intrinsic::x86_sse42_pcmpistriz128: + case Intrinsic::x86_sse42_pcmpestriz128: { + unsigned Opcode; + unsigned X86CC; + switch (IntNo) { + default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 
+ case Intrinsic::x86_sse42_pcmpistria128: + Opcode = X86ISD::PCMPISTRI; + X86CC = X86::COND_A; + break; + case Intrinsic::x86_sse42_pcmpestria128: + Opcode = X86ISD::PCMPESTRI; + X86CC = X86::COND_A; + break; + case Intrinsic::x86_sse42_pcmpistric128: + Opcode = X86ISD::PCMPISTRI; + X86CC = X86::COND_B; + break; + case Intrinsic::x86_sse42_pcmpestric128: + Opcode = X86ISD::PCMPESTRI; + X86CC = X86::COND_B; + break; + case Intrinsic::x86_sse42_pcmpistrio128: + Opcode = X86ISD::PCMPISTRI; + X86CC = X86::COND_O; + break; + case Intrinsic::x86_sse42_pcmpestrio128: + Opcode = X86ISD::PCMPESTRI; + X86CC = X86::COND_O; + break; + case Intrinsic::x86_sse42_pcmpistris128: + Opcode = X86ISD::PCMPISTRI; + X86CC = X86::COND_S; + break; + case Intrinsic::x86_sse42_pcmpestris128: + Opcode = X86ISD::PCMPESTRI; + X86CC = X86::COND_S; + break; + case Intrinsic::x86_sse42_pcmpistriz128: + Opcode = X86ISD::PCMPISTRI; + X86CC = X86::COND_E; + break; + case Intrinsic::x86_sse42_pcmpestriz128: + Opcode = X86ISD::PCMPESTRI; + X86CC = X86::COND_E; + break; + } + SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end()); + SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); + SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps); + SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, + DAG.getConstant(X86CC, dl, MVT::i8), + SDValue(PCMP.getNode(), 1)); + return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); + } + + case Intrinsic::x86_sse42_pcmpistri128: + case Intrinsic::x86_sse42_pcmpestri128: { + unsigned Opcode; + if (IntNo == Intrinsic::x86_sse42_pcmpistri128) + Opcode = X86ISD::PCMPISTRI; + else + Opcode = X86ISD::PCMPESTRI; + + SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end()); + SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); + return DAG.getNode(Opcode, dl, VTs, NewOps); + } + + case Intrinsic::x86_seh_lsda: { + // Compute the symbol for the LSDA. We know it'll get emitted later. + MachineFunction &MF = DAG.getMachineFunction(); + SDValue Op1 = Op.getOperand(1); + auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal()); + MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol( + GlobalValue::getRealLinkageName(Fn->getName())); + + // Generate a simple absolute symbol reference. This intrinsic is only + // supported on 32-bit Windows, which isn't PIC. + SDValue Result = DAG.getMCSymbol(LSDASym, VT); + return DAG.getNode(X86ISD::Wrapper, dl, VT, Result); + } + + case Intrinsic::x86_seh_recoverfp: { + SDValue FnOp = Op.getOperand(1); + SDValue IncomingFPOp = Op.getOperand(2); + GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp); + auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr); + if (!Fn) + report_fatal_error( + "llvm.x86.seh.recoverfp must take a function as the first argument"); + return recoverFramePointer(DAG, Fn, IncomingFPOp); + } + + case Intrinsic::localaddress: { + // Returns one of the stack, base, or frame pointer registers, depending on + // which is used to reference local variables. + MachineFunction &MF = DAG.getMachineFunction(); + const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); + unsigned Reg; + if (RegInfo->hasBasePointer(MF)) + Reg = RegInfo->getBaseRegister(); + else // This function handles the SP or FP case. 
+ Reg = RegInfo->getPtrSizedFrameRegister(MF); + return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT); + } + } +} + +static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, + SDValue Src, SDValue Mask, SDValue Base, + SDValue Index, SDValue ScaleOp, SDValue Chain, + const X86Subtarget * Subtarget) { + SDLoc dl(Op); + auto *C = cast<ConstantSDNode>(ScaleOp); + SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); + MVT MaskVT = MVT::getVectorVT(MVT::i1, + Index.getSimpleValueType().getVectorNumElements()); + SDValue MaskInReg; + ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask); + if (MaskC) + MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), dl, MaskVT); + else { + MVT BitcastVT = MVT::getVectorVT(MVT::i1, + Mask.getSimpleValueType().getSizeInBits()); + + // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements + // are extracted by EXTRACT_SUBVECTOR. + MaskInReg = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, + DAG.getBitcast(BitcastVT, Mask), + DAG.getIntPtrConstant(0, dl)); + } + SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other); + SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32); + SDValue Segment = DAG.getRegister(0, MVT::i32); + if (Src.getOpcode() == ISD::UNDEF) + Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl); + SDValue Ops[] = {Src, MaskInReg, Base, Scale, Index, Disp, Segment, Chain}; + SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops); + SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) }; + return DAG.getMergeValues(RetOps, dl); +} + +static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, + SDValue Src, SDValue Mask, SDValue Base, + SDValue Index, SDValue ScaleOp, SDValue Chain) { + SDLoc dl(Op); + auto *C = cast<ConstantSDNode>(ScaleOp); + SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); + SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32); + SDValue Segment = DAG.getRegister(0, MVT::i32); + MVT MaskVT = MVT::getVectorVT(MVT::i1, + Index.getSimpleValueType().getVectorNumElements()); + SDValue MaskInReg; + ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask); + if (MaskC) + MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), dl, MaskVT); + else { + MVT BitcastVT = MVT::getVectorVT(MVT::i1, + Mask.getSimpleValueType().getSizeInBits()); + + // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements + // are extracted by EXTRACT_SUBVECTOR. 
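+    // Illustrative example: an i8 mask guarding a v2i64 scatter is first
+    // bitcast to v8i1 and only its low two lanes are kept, i.e.
+    //   v2i1 = extract_subvector (v8i1 (bitcast i8 %mask)), 0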
+ MaskInReg = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, + DAG.getBitcast(BitcastVT, Mask), + DAG.getIntPtrConstant(0, dl)); + } + SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other); + SDValue Ops[] = {Base, Scale, Index, Disp, Segment, MaskInReg, Src, Chain}; + SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops); + return SDValue(Res, 1); +} + +static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, + SDValue Mask, SDValue Base, SDValue Index, + SDValue ScaleOp, SDValue Chain) { + SDLoc dl(Op); + auto *C = cast<ConstantSDNode>(ScaleOp); + SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); + SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32); + SDValue Segment = DAG.getRegister(0, MVT::i32); + MVT MaskVT = + MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements()); + SDValue MaskInReg; + ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask); + if (MaskC) + MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), dl, MaskVT); + else + MaskInReg = DAG.getBitcast(MaskVT, Mask); + //SDVTList VTs = DAG.getVTList(MVT::Other); + SDValue Ops[] = {MaskInReg, Base, Scale, Index, Disp, Segment, Chain}; + SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops); + return SDValue(Res, 0); +} + +// getReadPerformanceCounter - Handles the lowering of builtin intrinsics that +// read performance monitor counters (x86_rdpmc). +static void getReadPerformanceCounter(SDNode *N, SDLoc DL, + SelectionDAG &DAG, const X86Subtarget *Subtarget, + SmallVectorImpl<SDValue> &Results) { + assert(N->getNumOperands() == 3 && "Unexpected number of operands!"); + SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue LO, HI; + + // The ECX register is used to select the index of the performance counter + // to read. + SDValue Chain = DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX, + N->getOperand(2)); + SDValue rd = DAG.getNode(X86ISD::RDPMC_DAG, DL, Tys, Chain); + + // Reads the content of a 64-bit performance counter and returns it in the + // registers EDX:EAX. + if (Subtarget->is64Bit()) { + LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1)); + HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64, + LO.getValue(2)); + } else { + LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1)); + HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32, + LO.getValue(2)); + } + Chain = HI.getValue(1); + + if (Subtarget->is64Bit()) { + // The EAX register is loaded with the low-order 32 bits. The EDX register + // is loaded with the supported high-order bits of the counter. + SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI, + DAG.getConstant(32, DL, MVT::i8)); + Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp)); + Results.push_back(Chain); + return; + } + + // Use a buildpair to merge the two 32-bit values into a 64-bit one. + SDValue Ops[] = { LO, HI }; + SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops); + Results.push_back(Pair); + Results.push_back(Chain); +} + +// getReadTimeStampCounter - Handles the lowering of builtin intrinsics that +// read the time stamp counter (x86_rdtsc and x86_rdtscp). This function is +// also used to custom lower READCYCLECOUNTER nodes. 
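+// Roughly, the value assembled below is (illustrative C sketch; lo/hi stand
+// for the EAX/EDX copies made in this function):
+//   uint32_t lo, hi;
+//   uint64_t tsc = ((uint64_t)hi << 32) | lo;
+// On 64-bit targets this is the SHL/OR pair emitted below; on 32-bit targets
+// the two halves are combined with an ISD::BUILD_PAIR instead.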
+static void getReadTimeStampCounter(SDNode *N, SDLoc DL, unsigned Opcode, + SelectionDAG &DAG, const X86Subtarget *Subtarget, + SmallVectorImpl<SDValue> &Results) { + SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue rd = DAG.getNode(Opcode, DL, Tys, N->getOperand(0)); + SDValue LO, HI; + + // The processor's time-stamp counter (a 64-bit MSR) is stored into the + // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR + // and the EAX register is loaded with the low-order 32 bits. + if (Subtarget->is64Bit()) { + LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1)); + HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64, + LO.getValue(2)); + } else { + LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1)); + HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32, + LO.getValue(2)); + } + SDValue Chain = HI.getValue(1); + + if (Opcode == X86ISD::RDTSCP_DAG) { + assert(N->getNumOperands() == 3 && "Unexpected number of operands!"); + + // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into + // the ECX register. Add 'ecx' explicitly to the chain. + SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32, + HI.getValue(2)); + // Explicitly store the content of ECX at the location passed in input + // to the 'rdtscp' intrinsic. + Chain = DAG.getStore(ecx.getValue(1), DL, ecx, N->getOperand(2), + MachinePointerInfo(), false, false, 0); + } + + if (Subtarget->is64Bit()) { + // The EDX register is loaded with the high-order 32 bits of the MSR, and + // the EAX register is loaded with the low-order 32 bits. + SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI, + DAG.getConstant(32, DL, MVT::i8)); + Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp)); + Results.push_back(Chain); + return; + } + + // Use a buildpair to merge the two 32-bit values into a 64-bit one. + SDValue Ops[] = { LO, HI }; + SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops); + Results.push_back(Pair); + Results.push_back(Chain); +} + +static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SmallVector<SDValue, 2> Results; + SDLoc DL(Op); + getReadTimeStampCounter(Op.getNode(), DL, X86ISD::RDTSC_DAG, DAG, Subtarget, + Results); + return DAG.getMergeValues(Results, DL); +} + +static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) { + MachineFunction &MF = DAG.getMachineFunction(); + SDValue Chain = Op.getOperand(0); + SDValue RegNode = Op.getOperand(2); + WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo(); + if (!EHInfo) + report_fatal_error("EH registrations only live in functions using WinEH"); + + // Cast the operand to an alloca, and remember the frame index. + auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode); + if (!FINode) + report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca"); + EHInfo->EHRegNodeFrameIndex = FINode->getIndex(); + + // Return the chain operand without making any DAG nodes. 
+ return Chain; +} + +/// \brief Lower intrinsics for TRUNCATE_TO_MEM case +/// return truncate Store/MaskedStore Node +static SDValue LowerINTRINSIC_TRUNCATE_TO_MEM(const SDValue & Op, + SelectionDAG &DAG, + MVT ElementType) { + SDLoc dl(Op); + SDValue Mask = Op.getOperand(4); + SDValue DataToTruncate = Op.getOperand(3); + SDValue Addr = Op.getOperand(2); + SDValue Chain = Op.getOperand(0); + + MVT VT = DataToTruncate.getSimpleValueType(); + MVT SVT = MVT::getVectorVT(ElementType, VT.getVectorNumElements()); + + if (isAllOnesConstant(Mask)) // return just a truncate store + return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, + MachinePointerInfo(), SVT, false, false, + SVT.getScalarSizeInBits()/8); + + MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); + MVT BitcastVT = MVT::getVectorVT(MVT::i1, + Mask.getSimpleValueType().getSizeInBits()); + // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements + // are extracted by EXTRACT_SUBVECTOR. + SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, + DAG.getBitcast(BitcastVT, Mask), + DAG.getIntPtrConstant(0, dl)); + + MachineMemOperand *MMO = DAG.getMachineFunction(). + getMachineMemOperand(MachinePointerInfo(), + MachineMemOperand::MOStore, SVT.getStoreSize(), + SVT.getScalarSizeInBits()/8); + + return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, + VMask, SVT, MMO, true); +} + +static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); + + const IntrinsicData* IntrData = getIntrinsicWithChain(IntNo); + if (!IntrData) { + if (IntNo == llvm::Intrinsic::x86_seh_ehregnode) + return MarkEHRegistrationNode(Op, DAG); + if (IntNo == llvm::Intrinsic::x86_flags_read_u32 || + IntNo == llvm::Intrinsic::x86_flags_read_u64 || + IntNo == llvm::Intrinsic::x86_flags_write_u32 || + IntNo == llvm::Intrinsic::x86_flags_write_u64) { + // We need a frame pointer because this will get lowered to a PUSH/POP + // sequence. + MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); + MFI->setHasOpaqueSPAdjustment(true); + // Don't do anything here, we will expand these intrinsics out later + // during ExpandISelPseudos in EmitInstrWithCustomInserter. + return SDValue(); + } + return SDValue(); + } + + SDLoc dl(Op); + switch(IntrData->Type) { + default: llvm_unreachable("Unknown Intrinsic Type"); + case RDSEED: + case RDRAND: { + // Emit the node with the right value type. + SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Glue, MVT::Other); + SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0)); + + // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1. + // Otherwise return the value from Rand, which is always 0, casted to i32. + SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)), + DAG.getConstant(1, dl, Op->getValueType(1)), + DAG.getConstant(X86::COND_B, dl, MVT::i32), + SDValue(Result.getNode(), 1) }; + SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, + DAG.getVTList(Op->getValueType(1), MVT::Glue), + Ops); + + // Return { result, isValid, chain }. 
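+    // Illustrative C-level model of the { result, isValid } pair (using the
+    // real _rdrand32_step intrinsic as an analogy):
+    //   unsigned v; int ok = _rdrand32_step(&v);   // ok corresponds to CF
+    // isValid is 1 when CF was set, otherwise the (all-zero) raw result.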
+ return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid, + SDValue(Result.getNode(), 2)); + } + case GATHER: { + //gather(v1, mask, index, base, scale); + SDValue Chain = Op.getOperand(0); + SDValue Src = Op.getOperand(2); + SDValue Base = Op.getOperand(3); + SDValue Index = Op.getOperand(4); + SDValue Mask = Op.getOperand(5); + SDValue Scale = Op.getOperand(6); + return getGatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale, + Chain, Subtarget); + } + case SCATTER: { + //scatter(base, mask, index, v1, scale); + SDValue Chain = Op.getOperand(0); + SDValue Base = Op.getOperand(2); + SDValue Mask = Op.getOperand(3); + SDValue Index = Op.getOperand(4); + SDValue Src = Op.getOperand(5); + SDValue Scale = Op.getOperand(6); + return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, + Scale, Chain); + } + case PREFETCH: { + SDValue Hint = Op.getOperand(6); + unsigned HintVal = cast<ConstantSDNode>(Hint)->getZExtValue(); + assert(HintVal < 2 && "Wrong prefetch hint in intrinsic: should be 0 or 1"); + unsigned Opcode = (HintVal ? IntrData->Opc1 : IntrData->Opc0); + SDValue Chain = Op.getOperand(0); + SDValue Mask = Op.getOperand(2); + SDValue Index = Op.getOperand(3); + SDValue Base = Op.getOperand(4); + SDValue Scale = Op.getOperand(5); + return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain); + } + // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP). + case RDTSC: { + SmallVector<SDValue, 2> Results; + getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget, + Results); + return DAG.getMergeValues(Results, dl); + } + // Read Performance Monitoring Counters. + case RDPMC: { + SmallVector<SDValue, 2> Results; + getReadPerformanceCounter(Op.getNode(), dl, DAG, Subtarget, Results); + return DAG.getMergeValues(Results, dl); + } + // XTEST intrinsics. 
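+    // (_xtest() reports whether the thread is currently inside an RTM/HLE
+    // transaction; the lowering below is XTEST -> SETCC(COND_NE) ->
+    // zero-extend, i.e. "ZF clear" means "in a transaction" - illustrative.)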
+ case XTEST: { + SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other); + SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0)); + SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, + DAG.getConstant(X86::COND_NE, dl, MVT::i8), + InTrans); + SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC); + return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), + Ret, SDValue(InTrans.getNode(), 1)); + } + // ADC/ADCX/SBB + case ADX: { + SmallVector<SDValue, 2> Results; + SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::Other); + SDVTList VTs = DAG.getVTList(Op.getOperand(3)->getValueType(0), MVT::Other); + SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(2), + DAG.getConstant(-1, dl, MVT::i8)); + SDValue Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(3), + Op.getOperand(4), GenCF.getValue(1)); + SDValue Store = DAG.getStore(Op.getOperand(0), dl, Res.getValue(0), + Op.getOperand(5), MachinePointerInfo(), + false, false, 0); + SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, + DAG.getConstant(X86::COND_B, dl, MVT::i8), + Res.getValue(1)); + Results.push_back(SetCC); + Results.push_back(Store); + return DAG.getMergeValues(Results, dl); + } + case COMPRESS_TO_MEM: { + SDLoc dl(Op); + SDValue Mask = Op.getOperand(4); + SDValue DataToCompress = Op.getOperand(3); + SDValue Addr = Op.getOperand(2); + SDValue Chain = Op.getOperand(0); + + MVT VT = DataToCompress.getSimpleValueType(); + if (isAllOnesConstant(Mask)) // return just a store + return DAG.getStore(Chain, dl, DataToCompress, Addr, + MachinePointerInfo(), false, false, + VT.getScalarSizeInBits()/8); + + SDValue Compressed = + getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress), + Mask, DAG.getUNDEF(VT), Subtarget, DAG); + return DAG.getStore(Chain, dl, Compressed, Addr, + MachinePointerInfo(), false, false, + VT.getScalarSizeInBits()/8); + } + case TRUNCATE_TO_MEM_VI8: + return LowerINTRINSIC_TRUNCATE_TO_MEM(Op, DAG, MVT::i8); + case TRUNCATE_TO_MEM_VI16: + return LowerINTRINSIC_TRUNCATE_TO_MEM(Op, DAG, MVT::i16); + case TRUNCATE_TO_MEM_VI32: + return LowerINTRINSIC_TRUNCATE_TO_MEM(Op, DAG, MVT::i32); + case EXPAND_FROM_MEM: { + SDLoc dl(Op); + SDValue Mask = Op.getOperand(4); + SDValue PassThru = Op.getOperand(3); + SDValue Addr = Op.getOperand(2); + SDValue Chain = Op.getOperand(0); + MVT VT = Op.getSimpleValueType(); + + if (isAllOnesConstant(Mask)) // return just a load + return DAG.getLoad(VT, dl, Chain, Addr, MachinePointerInfo(), false, false, + false, VT.getScalarSizeInBits()/8); + + SDValue DataToExpand = DAG.getLoad(VT, dl, Chain, Addr, MachinePointerInfo(), + false, false, false, + VT.getScalarSizeInBits()/8); + + SDValue Results[] = { + getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, DataToExpand), + Mask, PassThru, Subtarget, DAG), Chain}; + return DAG.getMergeValues(Results, dl); + } + } +} + +SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, + SelectionDAG &DAG) const { + MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); + MFI->setReturnAddressIsTaken(true); + + if (verifyReturnAddressArgumentIsConstant(Op, DAG)) + return SDValue(); + + unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); + SDLoc dl(Op); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); + + if (Depth > 0) { + SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); + const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); + SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT); + return 
DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), + DAG.getNode(ISD::ADD, dl, PtrVT, + FrameAddr, Offset), + MachinePointerInfo(), false, false, false, 0); + } + + // Just load the return address. + SDValue RetAddrFI = getReturnAddressFrameIndex(DAG); + return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), + RetAddrFI, MachinePointerInfo(), false, false, false, 0); +} + +SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { + MachineFunction &MF = DAG.getMachineFunction(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); + const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); + EVT VT = Op.getValueType(); + + MFI->setFrameAddressIsTaken(true); + + if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) { + // Depth > 0 makes no sense on targets which use Windows unwind codes. It + // is not possible to crawl up the stack without looking at the unwind codes + // simultaneously. + int FrameAddrIndex = FuncInfo->getFAIndex(); + if (!FrameAddrIndex) { + // Set up a frame object for the return address. + unsigned SlotSize = RegInfo->getSlotSize(); + FrameAddrIndex = MF.getFrameInfo()->CreateFixedObject( + SlotSize, /*Offset=*/0, /*IsImmutable=*/false); + FuncInfo->setFAIndex(FrameAddrIndex); + } + return DAG.getFrameIndex(FrameAddrIndex, VT); + } + + unsigned FrameReg = + RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction()); + SDLoc dl(Op); // FIXME probably not meaningful + unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); + assert(((FrameReg == X86::RBP && VT == MVT::i64) || + (FrameReg == X86::EBP && VT == MVT::i32)) && + "Invalid Frame Register!"); + SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT); + while (Depth--) + FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, + MachinePointerInfo(), + false, false, false, 0); + return FrameAddr; +} + +// FIXME? Maybe this could be a TableGen attribute on some registers and +// this table could be generated automatically from RegInfo. +unsigned X86TargetLowering::getRegisterByName(const char* RegName, EVT VT, + SelectionDAG &DAG) const { + const TargetFrameLowering &TFI = *Subtarget->getFrameLowering(); + const MachineFunction &MF = DAG.getMachineFunction(); + + unsigned Reg = StringSwitch<unsigned>(RegName) + .Case("esp", X86::ESP) + .Case("rsp", X86::RSP) + .Case("ebp", X86::EBP) + .Case("rbp", X86::RBP) + .Default(0); + + if (Reg == X86::EBP || Reg == X86::RBP) { + if (!TFI.hasFP(MF)) + report_fatal_error("register " + StringRef(RegName) + + " is allocatable: function has no frame pointer"); +#ifndef NDEBUG + else { + const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); + unsigned FrameReg = + RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction()); + assert((FrameReg == X86::EBP || FrameReg == X86::RBP) && + "Invalid Frame Register!"); + } +#endif + } + + if (Reg) + return Reg; + + report_fatal_error("Invalid register name global variable"); +} + +SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op, + SelectionDAG &DAG) const { + const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); + return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op)); +} + +unsigned X86TargetLowering::getExceptionPointerRegister( + const Constant *PersonalityFn) const { + if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR) + return Subtarget->isTarget64BitLP64() ? X86::RDX : X86::EDX; + + return Subtarget->isTarget64BitLP64() ? 
X86::RAX : X86::EAX; +} + +unsigned X86TargetLowering::getExceptionSelectorRegister( + const Constant *PersonalityFn) const { + // Funclet personalities don't use selectors (the runtime does the selection). + assert(!isFuncletEHPersonality(classifyEHPersonality(PersonalityFn))); + return Subtarget->isTarget64BitLP64() ? X86::RDX : X86::EDX; +} + +SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const { + SDValue Chain = Op.getOperand(0); + SDValue Offset = Op.getOperand(1); + SDValue Handler = Op.getOperand(2); + SDLoc dl (Op); + + EVT PtrVT = getPointerTy(DAG.getDataLayout()); + const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); + unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction()); + assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) || + (FrameReg == X86::EBP && PtrVT == MVT::i32)) && + "Invalid Frame Register!"); + SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT); + unsigned StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX; + + SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame, + DAG.getIntPtrConstant(RegInfo->getSlotSize(), + dl)); + StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset); + Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo(), + false, false, 0); + Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr); + + return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain, + DAG.getRegister(StoreAddrReg, PtrVT)); +} + +SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL, + DAG.getVTList(MVT::i32, MVT::Other), + Op.getOperand(0), Op.getOperand(1)); +} + +SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other, + Op.getOperand(0), Op.getOperand(1)); +} + +static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) { + return Op.getOperand(0); +} + +SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, + SelectionDAG &DAG) const { + SDValue Root = Op.getOperand(0); + SDValue Trmp = Op.getOperand(1); // trampoline + SDValue FPtr = Op.getOperand(2); // nested function + SDValue Nest = Op.getOperand(3); // 'nest' parameter value + SDLoc dl (Op); + + const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); + const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo(); + + if (Subtarget->is64Bit()) { + SDValue OutChains[6]; + + // Large code-model. + const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode. + const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode. + + const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7; + const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7; + + const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix + + // Load the pointer to the nested function into R11. + unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11 + SDValue Addr = Trmp; + OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16), + Addr, MachinePointerInfo(TrmpAddr), + false, false, 0); + + Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, + DAG.getConstant(2, dl, MVT::i64)); + OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr, + MachinePointerInfo(TrmpAddr, 2), + false, false, 2); + + // Load the 'nest' parameter value into R10. 
+ // R10 is specified in X86CallingConv.td + OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10 + Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, + DAG.getConstant(10, dl, MVT::i64)); + OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16), + Addr, MachinePointerInfo(TrmpAddr, 10), + false, false, 0); + + Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, + DAG.getConstant(12, dl, MVT::i64)); + OutChains[3] = DAG.getStore(Root, dl, Nest, Addr, + MachinePointerInfo(TrmpAddr, 12), + false, false, 2); + + // Jump to the nested function. + OpCode = (JMP64r << 8) | REX_WB; // jmpq *... + Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, + DAG.getConstant(20, dl, MVT::i64)); + OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16), + Addr, MachinePointerInfo(TrmpAddr, 20), + false, false, 0); + + unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11 + Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, + DAG.getConstant(22, dl, MVT::i64)); + OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8), + Addr, MachinePointerInfo(TrmpAddr, 22), + false, false, 0); + + return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains); + } else { + const Function *Func = + cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue()); + CallingConv::ID CC = Func->getCallingConv(); + unsigned NestReg; + + switch (CC) { + default: + llvm_unreachable("Unsupported calling convention"); + case CallingConv::C: + case CallingConv::X86_StdCall: { + // Pass 'nest' parameter in ECX. + // Must be kept in sync with X86CallingConv.td + NestReg = X86::ECX; + + // Check that ECX wasn't needed by an 'inreg' parameter. + FunctionType *FTy = Func->getFunctionType(); + const AttributeSet &Attrs = Func->getAttributes(); + + if (!Attrs.isEmpty() && !Func->isVarArg()) { + unsigned InRegCount = 0; + unsigned Idx = 1; + + for (FunctionType::param_iterator I = FTy->param_begin(), + E = FTy->param_end(); I != E; ++I, ++Idx) + if (Attrs.hasAttribute(Idx, Attribute::InReg)) { + auto &DL = DAG.getDataLayout(); + // FIXME: should only count parameters that are lowered to integers. + InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32; + } + + if (InRegCount > 2) { + report_fatal_error("Nest register in use - reduce number of inreg" + " parameters!"); + } + } + break; + } + case CallingConv::X86_FastCall: + case CallingConv::X86_ThisCall: + case CallingConv::Fast: + // Pass 'nest' parameter in EAX. + // Must be kept in sync with X86CallingConv.td + NestReg = X86::EAX; + break; + } + + SDValue OutChains[4]; + SDValue Addr, Disp; + + Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, + DAG.getConstant(10, dl, MVT::i32)); + Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr); + + // This is storing the opcode for MOV32ri. + const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte. + const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7; + OutChains[0] = DAG.getStore(Root, dl, + DAG.getConstant(MOV32ri|N86Reg, dl, MVT::i8), + Trmp, MachinePointerInfo(TrmpAddr), + false, false, 0); + + Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, + DAG.getConstant(1, dl, MVT::i32)); + OutChains[1] = DAG.getStore(Root, dl, Nest, Addr, + MachinePointerInfo(TrmpAddr, 1), + false, false, 1); + + const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode. 
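+    // Resulting 10-byte trampoline, by offset into Trmp (illustrative summary
+    // of the stores below):
+    //   +0        0xB8 + reg     mov $Nest, %ecx/%eax
+    //   +1..+4    Nest
+    //   +5        0xE9           jmp rel32
+    //   +6..+9    FPtr - (Trmp + 10)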
+ Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, + DAG.getConstant(5, dl, MVT::i32)); + OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8), + Addr, MachinePointerInfo(TrmpAddr, 5), + false, false, 1); + + Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, + DAG.getConstant(6, dl, MVT::i32)); + OutChains[3] = DAG.getStore(Root, dl, Disp, Addr, + MachinePointerInfo(TrmpAddr, 6), + false, false, 1); + + return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains); + } +} + +SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, + SelectionDAG &DAG) const { + /* + The rounding mode is in bits 11:10 of FPSR, and has the following + settings: + 00 Round to nearest + 01 Round to -inf + 10 Round to +inf + 11 Round to 0 + + FLT_ROUNDS, on the other hand, expects the following: + -1 Undefined + 0 Round to 0 + 1 Round to nearest + 2 Round to +inf + 3 Round to -inf + + To perform the conversion, we do: + (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3) + */ + + MachineFunction &MF = DAG.getMachineFunction(); + const TargetFrameLowering &TFI = *Subtarget->getFrameLowering(); + unsigned StackAlignment = TFI.getStackAlignment(); + MVT VT = Op.getSimpleValueType(); + SDLoc DL(Op); + + // Save FP Control Word to stack slot + int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false); + SDValue StackSlot = + DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout())); + + MachineMemOperand *MMO = + MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI), + MachineMemOperand::MOStore, 2, 2); + + SDValue Ops[] = { DAG.getEntryNode(), StackSlot }; + SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL, + DAG.getVTList(MVT::Other), + Ops, MVT::i16, MMO); + + // Load FP Control Word from stack slot + SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, + MachinePointerInfo(), false, false, false, 0); + + // Transform as necessary + SDValue CWD1 = + DAG.getNode(ISD::SRL, DL, MVT::i16, + DAG.getNode(ISD::AND, DL, MVT::i16, + CWD, DAG.getConstant(0x800, DL, MVT::i16)), + DAG.getConstant(11, DL, MVT::i8)); + SDValue CWD2 = + DAG.getNode(ISD::SRL, DL, MVT::i16, + DAG.getNode(ISD::AND, DL, MVT::i16, + CWD, DAG.getConstant(0x400, DL, MVT::i16)), + DAG.getConstant(9, DL, MVT::i8)); + + SDValue RetVal = + DAG.getNode(ISD::AND, DL, MVT::i16, + DAG.getNode(ISD::ADD, DL, MVT::i16, + DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2), + DAG.getConstant(1, DL, MVT::i16)), + DAG.getConstant(3, DL, MVT::i16)); + + return DAG.getNode((VT.getSizeInBits() < 16 ? + ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal); +} + +/// \brief Lower a vector CTLZ using native supported vector CTLZ instruction. +// +// 1. i32/i64 128/256-bit vector (native support require VLX) are expended +// to 512-bit vector. +// 2. i8/i16 vector implemented using dword LZCNT vector instruction +// ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal, +// split the vector, perform operation on it's Lo a Hi part and +// concatenate the results. +static SDValue LowerVectorCTLZ_AVX512(SDValue Op, SelectionDAG &DAG) { + SDLoc dl(Op); + MVT VT = Op.getSimpleValueType(); + MVT EltVT = VT.getVectorElementType(); + unsigned NumElems = VT.getVectorNumElements(); + + if (EltVT == MVT::i64 || EltVT == MVT::i32) { + // Extend to 512 bit vector. 
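+    // e.g. a v4i32 ctlz is widened as: insert the v4i32 into an undef v16i32,
+    // run the 512-bit CTLZ (vplzcntd) on it, then extract the low v4i32 of
+    // the result - illustrative of the sequence built below.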
+ assert((VT.is256BitVector() || VT.is128BitVector()) && + "Unsupported value type for operation"); + + MVT NewVT = MVT::getVectorVT(EltVT, 512 / VT.getScalarSizeInBits()); + SDValue Vec512 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NewVT, + DAG.getUNDEF(NewVT), + Op.getOperand(0), + DAG.getIntPtrConstant(0, dl)); + SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Vec512); + + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, CtlzNode, + DAG.getIntPtrConstant(0, dl)); + } + + assert((EltVT == MVT::i8 || EltVT == MVT::i16) && + "Unsupported element type"); + + if (16 < NumElems) { + // Split vector, it's Lo and Hi parts will be handled in next iteration. + SDValue Lo, Hi; + std::tie(Lo, Hi) = DAG.SplitVector(Op.getOperand(0), dl); + MVT OutVT = MVT::getVectorVT(EltVT, NumElems/2); + + Lo = DAG.getNode(Op.getOpcode(), dl, OutVT, Lo); + Hi = DAG.getNode(Op.getOpcode(), dl, OutVT, Hi); + + return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi); + } + + MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems); + + assert((NewVT.is256BitVector() || NewVT.is512BitVector()) && + "Unsupported value type for operation"); + + // Use native supported vector instruction vplzcntd. + Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0)); + SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op); + SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode); + SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT); + + return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta); +} + +static SDValue LowerCTLZ(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + MVT VT = Op.getSimpleValueType(); + MVT OpVT = VT; + unsigned NumBits = VT.getSizeInBits(); + SDLoc dl(Op); + + if (VT.isVector() && Subtarget->hasAVX512()) + return LowerVectorCTLZ_AVX512(Op, DAG); + + Op = Op.getOperand(0); + if (VT == MVT::i8) { + // Zero extend to i32 since there is not an i8 bsr. + OpVT = MVT::i32; + Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op); + } + + // Issue a bsr (scan bits in reverse) which also sets EFLAGS. + SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); + Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op); + + // If src is zero (i.e. bsr sets ZF), returns NumBits. + SDValue Ops[] = { + Op, + DAG.getConstant(NumBits + NumBits - 1, dl, OpVT), + DAG.getConstant(X86::COND_E, dl, MVT::i8), + Op.getValue(1) + }; + Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops); + + // Finally xor with NumBits-1. + Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, + DAG.getConstant(NumBits - 1, dl, OpVT)); + + if (VT == MVT::i8) + Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); + return Op; +} + +static SDValue LowerCTLZ_ZERO_UNDEF(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + MVT VT = Op.getSimpleValueType(); + EVT OpVT = VT; + unsigned NumBits = VT.getSizeInBits(); + SDLoc dl(Op); + + Op = Op.getOperand(0); + if (VT == MVT::i8) { + // Zero extend to i32 since there is not an i8 bsr. + OpVT = MVT::i32; + Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op); + } + + // Issue a bsr (scan bits in reverse). + SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); + Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op); + + // And xor with NumBits-1. 
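+  // (BSR yields the bit index i of the most significant set bit, and for a
+  // power-of-two NumBits the subtraction (NumBits - 1) - i never borrows, so
+  // it equals (NumBits - 1) ^ i; e.g. for i32 with the MSB at bit 28,
+  // 31 ^ 28 = 3 leading zeros.)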
+ Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, + DAG.getConstant(NumBits - 1, dl, OpVT)); + + if (VT == MVT::i8) + Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); + return Op; +} + +static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) { + MVT VT = Op.getSimpleValueType(); + unsigned NumBits = VT.getScalarSizeInBits(); + SDLoc dl(Op); + + if (VT.isVector()) { + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + + SDValue N0 = Op.getOperand(0); + SDValue Zero = DAG.getConstant(0, dl, VT); + + // lsb(x) = (x & -x) + SDValue LSB = DAG.getNode(ISD::AND, dl, VT, N0, + DAG.getNode(ISD::SUB, dl, VT, Zero, N0)); + + // cttz_undef(x) = (width - 1) - ctlz(lsb) + if (Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF && + TLI.isOperationLegal(ISD::CTLZ, VT)) { + SDValue WidthMinusOne = DAG.getConstant(NumBits - 1, dl, VT); + return DAG.getNode(ISD::SUB, dl, VT, WidthMinusOne, + DAG.getNode(ISD::CTLZ, dl, VT, LSB)); + } + + // cttz(x) = ctpop(lsb - 1) + SDValue One = DAG.getConstant(1, dl, VT); + return DAG.getNode(ISD::CTPOP, dl, VT, + DAG.getNode(ISD::SUB, dl, VT, LSB, One)); + } + + assert(Op.getOpcode() == ISD::CTTZ && + "Only scalar CTTZ requires custom lowering"); + + // Issue a bsf (scan bits forward) which also sets EFLAGS. + SDVTList VTs = DAG.getVTList(VT, MVT::i32); + Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op.getOperand(0)); + + // If src is zero (i.e. bsf sets ZF), returns NumBits. + SDValue Ops[] = { + Op, + DAG.getConstant(NumBits, dl, VT), + DAG.getConstant(X86::COND_E, dl, MVT::i8), + Op.getValue(1) + }; + return DAG.getNode(X86ISD::CMOV, dl, VT, Ops); +} + +// Lower256IntArith - Break a 256-bit integer operation into two new 128-bit +// ones, and then concatenate the result back. +static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) { + MVT VT = Op.getSimpleValueType(); + + assert(VT.is256BitVector() && VT.isInteger() && + "Unsupported value type for operation"); + + unsigned NumElems = VT.getVectorNumElements(); + SDLoc dl(Op); + + // Extract the LHS vectors + SDValue LHS = Op.getOperand(0); + SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl); + SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl); + + // Extract the RHS vectors + SDValue RHS = Op.getOperand(1); + SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl); + SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl); + + MVT EltVT = VT.getVectorElementType(); + MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); + + return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, + DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1), + DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2)); +} + +static SDValue LowerADD(SDValue Op, SelectionDAG &DAG) { + if (Op.getValueType() == MVT::i1) + return DAG.getNode(ISD::XOR, SDLoc(Op), Op.getValueType(), + Op.getOperand(0), Op.getOperand(1)); + assert(Op.getSimpleValueType().is256BitVector() && + Op.getSimpleValueType().isInteger() && + "Only handle AVX 256-bit vector integer operation"); + return Lower256IntArith(Op, DAG); +} + +static SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) { + if (Op.getValueType() == MVT::i1) + return DAG.getNode(ISD::XOR, SDLoc(Op), Op.getValueType(), + Op.getOperand(0), Op.getOperand(1)); + assert(Op.getSimpleValueType().is256BitVector() && + Op.getSimpleValueType().isInteger() && + "Only handle AVX 256-bit vector integer operation"); + return Lower256IntArith(Op, DAG); +} + +static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) { + assert(Op.getSimpleValueType().is256BitVector() && + Op.getSimpleValueType().isInteger() && + "Only handle AVX 256-bit 
vector integer operation"); + return Lower256IntArith(Op, DAG); +} + +static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc dl(Op); + MVT VT = Op.getSimpleValueType(); + + if (VT == MVT::i1) + return DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), Op.getOperand(1)); + + // Decompose 256-bit ops into smaller 128-bit ops. + if (VT.is256BitVector() && !Subtarget->hasInt256()) + return Lower256IntArith(Op, DAG); + + SDValue A = Op.getOperand(0); + SDValue B = Op.getOperand(1); + + // Lower v16i8/v32i8 mul as promotion to v8i16/v16i16 vector + // pairs, multiply and truncate. + if (VT == MVT::v16i8 || VT == MVT::v32i8) { + if (Subtarget->hasInt256()) { + if (VT == MVT::v32i8) { + MVT SubVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() / 2); + SDValue Lo = DAG.getIntPtrConstant(0, dl); + SDValue Hi = DAG.getIntPtrConstant(VT.getVectorNumElements() / 2, dl); + SDValue ALo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, A, Lo); + SDValue BLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, B, Lo); + SDValue AHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, A, Hi); + SDValue BHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, B, Hi); + return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, + DAG.getNode(ISD::MUL, dl, SubVT, ALo, BLo), + DAG.getNode(ISD::MUL, dl, SubVT, AHi, BHi)); + } + + MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements()); + return DAG.getNode( + ISD::TRUNCATE, dl, VT, + DAG.getNode(ISD::MUL, dl, ExVT, + DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, A), + DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, B))); + } + + assert(VT == MVT::v16i8 && + "Pre-AVX2 support only supports v16i8 multiplication"); + MVT ExVT = MVT::v8i16; + + // Extract the lo parts and sign extend to i16 + SDValue ALo, BLo; + if (Subtarget->hasSSE41()) { + ALo = DAG.getNode(X86ISD::VSEXT, dl, ExVT, A); + BLo = DAG.getNode(X86ISD::VSEXT, dl, ExVT, B); + } else { + const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3, + -1, 4, -1, 5, -1, 6, -1, 7}; + ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask); + BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask); + ALo = DAG.getBitcast(ExVT, ALo); + BLo = DAG.getBitcast(ExVT, BLo); + ALo = DAG.getNode(ISD::SRA, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT)); + BLo = DAG.getNode(ISD::SRA, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT)); + } + + // Extract the hi parts and sign extend to i16 + SDValue AHi, BHi; + if (Subtarget->hasSSE41()) { + const int ShufMask[] = {8, 9, 10, 11, 12, 13, 14, 15, + -1, -1, -1, -1, -1, -1, -1, -1}; + AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask); + BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask); + AHi = DAG.getNode(X86ISD::VSEXT, dl, ExVT, AHi); + BHi = DAG.getNode(X86ISD::VSEXT, dl, ExVT, BHi); + } else { + const int ShufMask[] = {-1, 8, -1, 9, -1, 10, -1, 11, + -1, 12, -1, 13, -1, 14, -1, 15}; + AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask); + BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask); + AHi = DAG.getBitcast(ExVT, AHi); + BHi = DAG.getBitcast(ExVT, BHi); + AHi = DAG.getNode(ISD::SRA, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT)); + BHi = DAG.getNode(ISD::SRA, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT)); + } + + // Multiply, mask the lower 8bits of the lo/hi results and pack + SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo); + SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi); + RLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, DAG.getConstant(255, dl, ExVT)); + RHi = DAG.getNode(ISD::AND, dl, ExVT, RHi, DAG.getConstant(255, dl, ExVT)); + return 
DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi); + } + + // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle. + if (VT == MVT::v4i32) { + assert(Subtarget->hasSSE2() && !Subtarget->hasSSE41() && + "Should not custom lower when pmuldq is available!"); + + // Extract the odd parts. + static const int UnpackMask[] = { 1, -1, 3, -1 }; + SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask); + SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask); + + // Multiply the even parts. + SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, A, B); + // Now multiply odd parts. + SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, Aodds, Bodds); + + Evens = DAG.getBitcast(VT, Evens); + Odds = DAG.getBitcast(VT, Odds); + + // Merge the two vectors back together with a shuffle. This expands into 2 + // shuffles. + static const int ShufMask[] = { 0, 4, 2, 6 }; + return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask); + } + + assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) && + "Only know how to lower V2I64/V4I64/V8I64 multiply"); + + // Ahi = psrlqi(a, 32); + // Bhi = psrlqi(b, 32); + // + // AloBlo = pmuludq(a, b); + // AloBhi = pmuludq(a, Bhi); + // AhiBlo = pmuludq(Ahi, b); + + // AloBhi = psllqi(AloBhi, 32); + // AhiBlo = psllqi(AhiBlo, 32); + // return AloBlo + AloBhi + AhiBlo; + + SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG); + SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG); + + SDValue AhiBlo = Ahi; + SDValue AloBhi = Bhi; + // Bit cast to 32-bit vectors for MULUDQ + MVT MulVT = (VT == MVT::v2i64) ? MVT::v4i32 : + (VT == MVT::v4i64) ? MVT::v8i32 : MVT::v16i32; + A = DAG.getBitcast(MulVT, A); + B = DAG.getBitcast(MulVT, B); + Ahi = DAG.getBitcast(MulVT, Ahi); + Bhi = DAG.getBitcast(MulVT, Bhi); + + SDValue AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B); + // After shifting right const values the result may be all-zero. 
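+  // (The decomposition above is 64-bit schoolbook multiplication mod 2^64:
+  // with a = Ahi*2^32 + Alo and b = Bhi*2^32 + Blo,
+  //   a*b mod 2^64 = Alo*Blo + ((Alo*Bhi + Ahi*Blo) << 32),
+  // since the Ahi*Bhi term is shifted out entirely - illustrative note.)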
+ if (!ISD::isBuildVectorAllZeros(Ahi.getNode())) { + AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B); + AhiBlo = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AhiBlo, 32, DAG); + } + if (!ISD::isBuildVectorAllZeros(Bhi.getNode())) { + AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi); + AloBhi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AloBhi, 32, DAG); + } + + SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi); + return DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo); +} + +SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const { + assert(Subtarget->isTargetWin64() && "Unexpected target"); + EVT VT = Op.getValueType(); + assert(VT.isInteger() && VT.getSizeInBits() == 128 && + "Unexpected return type for lowering"); + + RTLIB::Libcall LC; + bool isSigned; + switch (Op->getOpcode()) { + default: llvm_unreachable("Unexpected request for libcall!"); + case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break; + case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break; + case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break; + case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break; + case ISD::SDIVREM: isSigned = true; LC = RTLIB::SDIVREM_I128; break; + case ISD::UDIVREM: isSigned = false; LC = RTLIB::UDIVREM_I128; break; + } + + SDLoc dl(Op); + SDValue InChain = DAG.getEntryNode(); + + TargetLowering::ArgListTy Args; + TargetLowering::ArgListEntry Entry; + for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) { + EVT ArgVT = Op->getOperand(i).getValueType(); + assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 && + "Unexpected argument type for lowering"); + SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16); + Entry.Node = StackPtr; + InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MachinePointerInfo(), + false, false, 16); + Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); + Entry.Ty = PointerType::get(ArgTy,0); + Entry.isSExt = false; + Entry.isZExt = false; + Args.push_back(Entry); + } + + SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC), + getPointerTy(DAG.getDataLayout())); + + TargetLowering::CallLoweringInfo CLI(DAG); + CLI.setDebugLoc(dl).setChain(InChain) + .setCallee(getLibcallCallingConv(LC), + static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), + Callee, std::move(Args), 0) + .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned); + + std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI); + return DAG.getBitcast(VT, CallInfo.first); +} + +static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1); + MVT VT = Op0.getSimpleValueType(); + SDLoc dl(Op); + + assert((VT == MVT::v4i32 && Subtarget->hasSSE2()) || + (VT == MVT::v8i32 && Subtarget->hasInt256())); + + // PMULxD operations multiply each even value (starting at 0) of LHS with + // the related value of RHS and produce a widen result. + // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h> + // => <2 x i64> <ae|cg> + // + // In other word, to have all the results, we need to perform two PMULxD: + // 1. one with the even values. + // 2. one with the odd values. + // To achieve #2, with need to place the odd values at an even position. 
+ // + // Place the odd value at an even position (basically, shift all values 1 + // step to the left): + const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1}; + // <a|b|c|d> => <b|undef|d|undef> + SDValue Odd0 = DAG.getVectorShuffle(VT, dl, Op0, Op0, Mask); + // <e|f|g|h> => <f|undef|h|undef> + SDValue Odd1 = DAG.getVectorShuffle(VT, dl, Op1, Op1, Mask); + + // Emit two multiplies, one for the lower 2 ints and one for the higher 2 + // ints. + MVT MulVT = VT == MVT::v4i32 ? MVT::v2i64 : MVT::v4i64; + bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI; + unsigned Opcode = + (!IsSigned || !Subtarget->hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ; + // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h> + // => <2 x i64> <ae|cg> + SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Op0, Op1)); + // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef> + // => <2 x i64> <bf|dh> + SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Odd0, Odd1)); + + // Shuffle it back into the right order. + SDValue Highs, Lows; + if (VT == MVT::v8i32) { + const int HighMask[] = {1, 9, 3, 11, 5, 13, 7, 15}; + Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask); + const int LowMask[] = {0, 8, 2, 10, 4, 12, 6, 14}; + Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask); + } else { + const int HighMask[] = {1, 5, 3, 7}; + Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask); + const int LowMask[] = {0, 4, 2, 6}; + Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask); + } + + // If we have a signed multiply but no PMULDQ fix up the high parts of a + // unsigned multiply. + if (IsSigned && !Subtarget->hasSSE41()) { + SDValue ShAmt = DAG.getConstant( + 31, dl, + DAG.getTargetLoweringInfo().getShiftAmountTy(VT, DAG.getDataLayout())); + SDValue T1 = DAG.getNode(ISD::AND, dl, VT, + DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1); + SDValue T2 = DAG.getNode(ISD::AND, dl, VT, + DAG.getNode(ISD::SRA, dl, VT, Op1, ShAmt), Op0); + + SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2); + Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup); + } + + // The first result of MUL_LOHI is actually the low value, followed by the + // high value. + SDValue Ops[] = {Lows, Highs}; + return DAG.getMergeValues(Ops, dl); +} + +// Return true if the required (according to Opcode) shift-imm form is natively +// supported by the Subtarget +static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget *Subtarget, + unsigned Opcode) { + if (VT.getScalarSizeInBits() < 16) + return false; + + if (VT.is512BitVector() && + (VT.getScalarSizeInBits() > 16 || Subtarget->hasBWI())) + return true; + + bool LShift = VT.is128BitVector() || + (VT.is256BitVector() && Subtarget->hasInt256()); + + bool AShift = LShift && (Subtarget->hasVLX() || + (VT != MVT::v2i64 && VT != MVT::v4i64)); + return (Opcode == ISD::SRA) ? AShift : LShift; +} + +// The shift amount is a variable, but it is the same for all vector lanes. +// These instructions are defined together with shift-immediate. 
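+// For example, PSLLW/PSLLD/PSLLQ with an XMM count operand take a single
+// count from the low 64 bits of that operand and apply it to every lane,
+// so legality here matches the shift-by-immediate case.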
+static +bool SupportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget *Subtarget, + unsigned Opcode) { + return SupportedVectorShiftWithImm(VT, Subtarget, Opcode); +} + +// Return true if the required (according to Opcode) variable-shift form is +// natively supported by the Subtarget +static bool SupportedVectorVarShift(MVT VT, const X86Subtarget *Subtarget, + unsigned Opcode) { + + if (!Subtarget->hasInt256() || VT.getScalarSizeInBits() < 16) + return false; + + // vXi16 supported only on AVX-512, BWI + if (VT.getScalarSizeInBits() == 16 && !Subtarget->hasBWI()) + return false; + + if (VT.is512BitVector() || Subtarget->hasVLX()) + return true; + + bool LShift = VT.is128BitVector() || VT.is256BitVector(); + bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64; + return (Opcode == ISD::SRA) ? AShift : LShift; +} + +static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + MVT VT = Op.getSimpleValueType(); + SDLoc dl(Op); + SDValue R = Op.getOperand(0); + SDValue Amt = Op.getOperand(1); + + unsigned X86Opc = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI : + (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI; + + auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) { + assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type"); + MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2); + SDValue Ex = DAG.getBitcast(ExVT, R); + + if (ShiftAmt >= 32) { + // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32. + SDValue Upper = + getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG); + SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, + ShiftAmt - 32, DAG); + if (VT == MVT::v2i64) + Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3}); + if (VT == MVT::v4i64) + Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, + {9, 1, 11, 3, 13, 5, 15, 7}); + } else { + // SRA upper i32, SHL whole i64 and select lower i32. + SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, + ShiftAmt, DAG); + SDValue Lower = + getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG); + Lower = DAG.getBitcast(ExVT, Lower); + if (VT == MVT::v2i64) + Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3}); + if (VT == MVT::v4i64) + Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, + {8, 1, 10, 3, 12, 5, 14, 7}); + } + return DAG.getBitcast(VT, Ex); + }; + + // Optimize shl/srl/sra with constant shift amount. + if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) { + if (auto *ShiftConst = BVAmt->getConstantSplatNode()) { + uint64_t ShiftAmt = ShiftConst->getZExtValue(); + + if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode())) + return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG); + + // i64 SRA needs to be performed as partial shifts. 
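+ // SSE/AVX2 have no 64-bit arithmetic right shift, so ArithmeticShiftRight64
+ // above assembles one from 32-bit pieces: a VSRAI supplies the sign-filled
+ // upper halves and a 64-bit logical shift (or a second VSRAI for counts of
+ // 32 or more) supplies the rest, merged with a shuffle.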
+ if ((VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64)) && + Op.getOpcode() == ISD::SRA && !Subtarget->hasXOP()) + return ArithmeticShiftRight64(ShiftAmt); + + if (VT == MVT::v16i8 || + (Subtarget->hasInt256() && VT == MVT::v32i8) || + VT == MVT::v64i8) { + unsigned NumElts = VT.getVectorNumElements(); + MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2); + + // Simple i8 add case + if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) + return DAG.getNode(ISD::ADD, dl, VT, R, R); + + // ashr(R, 7) === cmp_slt(R, 0) + if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) { + SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl); + return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R); + } + + // XOP can shift v16i8 directly instead of as shift v8i16 + mask. + if (VT == MVT::v16i8 && Subtarget->hasXOP()) + return SDValue(); + + if (Op.getOpcode() == ISD::SHL) { + // Make a large shift. + SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT, + R, ShiftAmt, DAG); + SHL = DAG.getBitcast(VT, SHL); + // Zero out the rightmost bits. + return DAG.getNode(ISD::AND, dl, VT, SHL, + DAG.getConstant(uint8_t(-1U << ShiftAmt), dl, VT)); + } + if (Op.getOpcode() == ISD::SRL) { + // Make a large shift. + SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT, + R, ShiftAmt, DAG); + SRL = DAG.getBitcast(VT, SRL); + // Zero out the leftmost bits. + return DAG.getNode(ISD::AND, dl, VT, SRL, + DAG.getConstant(uint8_t(-1U) >> ShiftAmt, dl, VT)); + } + if (Op.getOpcode() == ISD::SRA) { + // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask) + SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt); + + SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT); + Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask); + Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask); + return Res; + } + llvm_unreachable("Unknown shift opcode."); + } + } + } + + // Special case in 32-bit mode, where i64 is expanded into high and low parts. + if (!Subtarget->is64Bit() && !Subtarget->hasXOP() && + (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64))) { + + // Peek through any splat that was introduced for i64 shift vectorization. + int SplatIndex = -1; + if (ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt.getNode())) + if (SVN->isSplat()) { + SplatIndex = SVN->getSplatIndex(); + Amt = Amt.getOperand(0); + assert(SplatIndex < (int)VT.getVectorNumElements() && + "Splat shuffle referencing second operand"); + } + + if (Amt.getOpcode() != ISD::BITCAST || + Amt.getOperand(0).getOpcode() != ISD::BUILD_VECTOR) + return SDValue(); + + Amt = Amt.getOperand(0); + unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() / + VT.getVectorNumElements(); + unsigned RatioInLog2 = Log2_32_Ceil(Ratio); + uint64_t ShiftAmt = 0; + unsigned BaseOp = (SplatIndex < 0 ? 0 : SplatIndex * Ratio); + for (unsigned i = 0; i != Ratio; ++i) { + ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + BaseOp)); + if (!C) + return SDValue(); + // 6 == Log2(64) + ShiftAmt |= C->getZExtValue() << (i * (1 << (6 - RatioInLog2))); + } + + // Check remaining shift amounts (if not a splat). 
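+ // (In 32-bit mode each 64-bit count was legalized into 32-bit pieces, so a
+ // full count is reassembled here from Ratio consecutive elements - shifts
+ // of 0 and 32 bits when Ratio is 2; the loop below then checks that every
+ // remaining group encodes the same count.)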
+ if (SplatIndex < 0) { + for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) { + uint64_t ShAmt = 0; + for (unsigned j = 0; j != Ratio; ++j) { + ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + j)); + if (!C) + return SDValue(); + // 6 == Log2(64) + ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2))); + } + if (ShAmt != ShiftAmt) + return SDValue(); + } + } + + if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode())) + return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG); + + if (Op.getOpcode() == ISD::SRA) + return ArithmeticShiftRight64(ShiftAmt); + } + + return SDValue(); +} + +static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG, + const X86Subtarget* Subtarget) { + MVT VT = Op.getSimpleValueType(); + SDLoc dl(Op); + SDValue R = Op.getOperand(0); + SDValue Amt = Op.getOperand(1); + + unsigned X86OpcI = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI : + (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI; + + unsigned X86OpcV = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHL : + (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRL : X86ISD::VSRA; + + if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode())) { + SDValue BaseShAmt; + MVT EltVT = VT.getVectorElementType(); + + if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Amt)) { + // Check if this build_vector node is doing a splat. + // If so, then set BaseShAmt equal to the splat value. + BaseShAmt = BV->getSplatValue(); + if (BaseShAmt && BaseShAmt.getOpcode() == ISD::UNDEF) + BaseShAmt = SDValue(); + } else { + if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR) + Amt = Amt.getOperand(0); + + ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt); + if (SVN && SVN->isSplat()) { + unsigned SplatIdx = (unsigned)SVN->getSplatIndex(); + SDValue InVec = Amt.getOperand(0); + if (InVec.getOpcode() == ISD::BUILD_VECTOR) { + assert((SplatIdx < InVec.getSimpleValueType().getVectorNumElements()) && + "Unexpected shuffle index found!"); + BaseShAmt = InVec.getOperand(SplatIdx); + } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) { + if (ConstantSDNode *C = + dyn_cast<ConstantSDNode>(InVec.getOperand(2))) { + if (C->getZExtValue() == SplatIdx) + BaseShAmt = InVec.getOperand(1); + } + } + + if (!BaseShAmt) + // Avoid introducing an extract element from a shuffle. + BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InVec, + DAG.getIntPtrConstant(SplatIdx, dl)); + } + } + + if (BaseShAmt.getNode()) { + assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!"); + if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32)) + BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt); + else if (EltVT.bitsLT(MVT::i32)) + BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt); + + return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, DAG); + } + } + + // Special case in 32-bit mode, where i64 is expanded into high and low parts. 
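+ // If every 64-bit group of the underlying v4i32 build_vector matches the
+ // first one, the original bitcast operand can be fed unchanged to the
+ // VSHL/VSRL/VSRA forms, which take a single count from the low 64 bits of
+ // a vector operand.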
+ if (!Subtarget->is64Bit() && VT == MVT::v2i64 && + Amt.getOpcode() == ISD::BITCAST && + Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) { + Amt = Amt.getOperand(0); + unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() / + VT.getVectorNumElements(); + std::vector<SDValue> Vals(Ratio); + for (unsigned i = 0; i != Ratio; ++i) + Vals[i] = Amt.getOperand(i); + for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) { + for (unsigned j = 0; j != Ratio; ++j) + if (Vals[j] != Amt.getOperand(i + j)) + return SDValue(); + } + + if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode())) + return DAG.getNode(X86OpcV, dl, VT, R, Op.getOperand(1)); + } + return SDValue(); +} + +static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, + SelectionDAG &DAG) { + MVT VT = Op.getSimpleValueType(); + SDLoc dl(Op); + SDValue R = Op.getOperand(0); + SDValue Amt = Op.getOperand(1); + + assert(VT.isVector() && "Custom lowering only for vector shifts!"); + assert(Subtarget->hasSSE2() && "Only custom lower when we have SSE2!"); + + if (SDValue V = LowerScalarImmediateShift(Op, DAG, Subtarget)) + return V; + + if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget)) + return V; + + if (SupportedVectorVarShift(VT, Subtarget, Op.getOpcode())) + return Op; + + // XOP has 128-bit variable logical/arithmetic shifts. + // +ve/-ve Amt = shift left/right. + if (Subtarget->hasXOP() && + (VT == MVT::v2i64 || VT == MVT::v4i32 || + VT == MVT::v8i16 || VT == MVT::v16i8)) { + if (Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SRA) { + SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl); + Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt); + } + if (Op.getOpcode() == ISD::SHL || Op.getOpcode() == ISD::SRL) + return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt); + if (Op.getOpcode() == ISD::SRA) + return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt); + } + + // 2i64 vector logical shifts can efficiently avoid scalarization - do the + // shifts per-lane and then shuffle the partial results back together. + if (VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) { + // Splat the shift amounts so the scalar shifts above will catch it. + SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0}); + SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1}); + SDValue R0 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt0); + SDValue R1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt1); + return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3}); + } + + // i64 vector arithmetic shift can be emulated with the transform: + // M = lshr(SIGN_BIT, Amt) + // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M) + if ((VT == MVT::v2i64 || (VT == MVT::v4i64 && Subtarget->hasInt256())) && + Op.getOpcode() == ISD::SRA) { + SDValue S = DAG.getConstant(APInt::getSignBit(64), dl, VT); + SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt); + R = DAG.getNode(ISD::SRL, dl, VT, R, Amt); + R = DAG.getNode(ISD::XOR, dl, VT, R, M); + R = DAG.getNode(ISD::SUB, dl, VT, R, M); + return R; + } + + // If possible, lower this packed shift into a vector multiply instead of + // expanding it into a sequence of scalar shifts. + // Do this only if the vector shift count is a constant build_vector. 
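+ // For example, (shl <4 x i32> %x, <1, 2, 3, 4>) becomes
+ // (mul %x, <2, 4, 8, 16>), which can be selected as a single
+ // PMULLW/PMULLD instead of four scalar shifts plus inserts and extracts.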
+ if (Op.getOpcode() == ISD::SHL && + (VT == MVT::v8i16 || VT == MVT::v4i32 || + (Subtarget->hasInt256() && VT == MVT::v16i16)) && + ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) { + SmallVector<SDValue, 8> Elts; + MVT SVT = VT.getVectorElementType(); + unsigned SVTBits = SVT.getSizeInBits(); + APInt One(SVTBits, 1); + unsigned NumElems = VT.getVectorNumElements(); + + for (unsigned i=0; i !=NumElems; ++i) { + SDValue Op = Amt->getOperand(i); + if (Op->getOpcode() == ISD::UNDEF) { + Elts.push_back(Op); + continue; + } + + ConstantSDNode *ND = cast<ConstantSDNode>(Op); + APInt C(SVTBits, ND->getAPIntValue().getZExtValue()); + uint64_t ShAmt = C.getZExtValue(); + if (ShAmt >= SVTBits) { + Elts.push_back(DAG.getUNDEF(SVT)); + continue; + } + Elts.push_back(DAG.getConstant(One.shl(ShAmt), dl, SVT)); + } + SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Elts); + return DAG.getNode(ISD::MUL, dl, VT, R, BV); + } + + // Lower SHL with variable shift amount. + if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) { + Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT)); + + Op = DAG.getNode(ISD::ADD, dl, VT, Op, + DAG.getConstant(0x3f800000U, dl, VT)); + Op = DAG.getBitcast(MVT::v4f32, Op); + Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op); + return DAG.getNode(ISD::MUL, dl, VT, Op, R); + } + + // If possible, lower this shift as a sequence of two shifts by + // constant plus a MOVSS/MOVSD instead of scalarizing it. + // Example: + // (v4i32 (srl A, (build_vector < X, Y, Y, Y>))) + // + // Could be rewritten as: + // (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>))) + // + // The advantage is that the two shifts from the example would be + // lowered as X86ISD::VSRLI nodes. This would be cheaper than scalarizing + // the vector shift into four scalar shifts plus four pairs of vector + // insert/extract. + if ((VT == MVT::v8i16 || VT == MVT::v4i32) && + ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) { + unsigned TargetOpcode = X86ISD::MOVSS; + bool CanBeSimplified; + // The splat value for the first packed shift (the 'X' from the example). + SDValue Amt1 = Amt->getOperand(0); + // The splat value for the second packed shift (the 'Y' from the example). + SDValue Amt2 = (VT == MVT::v4i32) ? Amt->getOperand(1) : + Amt->getOperand(2); + + // See if it is possible to replace this node with a sequence of + // two shifts followed by a MOVSS/MOVSD + if (VT == MVT::v4i32) { + // Check if it is legal to use a MOVSS. + CanBeSimplified = Amt2 == Amt->getOperand(2) && + Amt2 == Amt->getOperand(3); + if (!CanBeSimplified) { + // Otherwise, check if we can still simplify this node using a MOVSD. + CanBeSimplified = Amt1 == Amt->getOperand(1) && + Amt->getOperand(2) == Amt->getOperand(3); + TargetOpcode = X86ISD::MOVSD; + Amt2 = Amt->getOperand(2); + } + } else { + // Do similar checks for the case where the machine value type + // is MVT::v8i16. + CanBeSimplified = Amt1 == Amt->getOperand(1); + for (unsigned i=3; i != 8 && CanBeSimplified; ++i) + CanBeSimplified = Amt2 == Amt->getOperand(i); + + if (!CanBeSimplified) { + TargetOpcode = X86ISD::MOVSD; + CanBeSimplified = true; + Amt2 = Amt->getOperand(4); + for (unsigned i=0; i != 4 && CanBeSimplified; ++i) + CanBeSimplified = Amt1 == Amt->getOperand(i); + for (unsigned j=4; j != 8 && CanBeSimplified; ++j) + CanBeSimplified = Amt2 == Amt->getOperand(j); + } + } + + if (CanBeSimplified && isa<ConstantSDNode>(Amt1) && + isa<ConstantSDNode>(Amt2)) { + // Replace this node with two shifts followed by a MOVSS/MOVSD. 
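+ // MOVSS replaces only the lowest 32-bit element, which covers the
+ // <X,Y,Y,Y> pattern; MOVSD replaces the low 64 bits (two i32 elements),
+ // covering <X,X,Y,Y> - hence the bitcast to v2i64 in that case.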
+ MVT CastVT = MVT::v4i32; + SDValue Splat1 = + DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), dl, VT); + SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1); + SDValue Splat2 = + DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), dl, VT); + SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2); + if (TargetOpcode == X86ISD::MOVSD) + CastVT = MVT::v2i64; + SDValue BitCast1 = DAG.getBitcast(CastVT, Shift1); + SDValue BitCast2 = DAG.getBitcast(CastVT, Shift2); + SDValue Result = getTargetShuffleNode(TargetOpcode, dl, CastVT, BitCast2, + BitCast1, DAG); + return DAG.getBitcast(VT, Result); + } + } + + // v4i32 Non Uniform Shifts. + // If the shift amount is constant we can shift each lane using the SSE2 + // immediate shifts, else we need to zero-extend each lane to the lower i64 + // and shift using the SSE2 variable shifts. + // The separate results can then be blended together. + if (VT == MVT::v4i32) { + unsigned Opc = Op.getOpcode(); + SDValue Amt0, Amt1, Amt2, Amt3; + if (ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) { + Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0}); + Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1}); + Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2}); + Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3}); + } else { + // ISD::SHL is handled above but we include it here for completeness. + switch (Opc) { + default: + llvm_unreachable("Unknown target vector shift node"); + case ISD::SHL: + Opc = X86ISD::VSHL; + break; + case ISD::SRL: + Opc = X86ISD::VSRL; + break; + case ISD::SRA: + Opc = X86ISD::VSRA; + break; + } + // The SSE2 shifts use the lower i64 as the same shift amount for + // all lanes and the upper i64 is ignored. These shuffle masks + // optimally zero-extend each lanes on SSE2/SSE41/AVX targets. + SDValue Z = getZeroVector(VT, Subtarget, DAG, dl); + Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1}); + Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1}); + Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1}); + Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1}); + } + + SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0); + SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1); + SDValue R2 = DAG.getNode(Opc, dl, VT, R, Amt2); + SDValue R3 = DAG.getNode(Opc, dl, VT, R, Amt3); + SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1}); + SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7}); + return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7}); + } + + if (VT == MVT::v16i8 || + (VT == MVT::v32i8 && Subtarget->hasInt256() && !Subtarget->hasXOP())) { + MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2); + unsigned ShiftOpcode = Op->getOpcode(); + + auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) { + // On SSE41 targets we make use of the fact that VSELECT lowers + // to PBLENDVB which selects bytes based just on the sign bit. + if (Subtarget->hasSSE41()) { + V0 = DAG.getBitcast(VT, V0); + V1 = DAG.getBitcast(VT, V1); + Sel = DAG.getBitcast(VT, Sel); + return DAG.getBitcast(SelVT, + DAG.getNode(ISD::VSELECT, dl, VT, Sel, V0, V1)); + } + // On pre-SSE41 targets we test for the sign bit by comparing to + // zero - a negative value will set all bits of the lanes to true + // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering. 
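+ // (That test is PCMPGT with the zero vector as the left operand: it is
+ // true exactly in the lanes whose sign bit is set, yielding an
+ // all-ones/all-zeros mask per lane.)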
+ SDValue Z = getZeroVector(SelVT, Subtarget, DAG, dl); + SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel); + return DAG.getNode(ISD::VSELECT, dl, SelVT, C, V0, V1); + }; + + // Turn 'a' into a mask suitable for VSELECT: a = a << 5; + // We can safely do this using i16 shifts as we're only interested in + // the 3 lower bits of each byte. + Amt = DAG.getBitcast(ExtVT, Amt); + Amt = DAG.getNode(ISD::SHL, dl, ExtVT, Amt, DAG.getConstant(5, dl, ExtVT)); + Amt = DAG.getBitcast(VT, Amt); + + if (Op->getOpcode() == ISD::SHL || Op->getOpcode() == ISD::SRL) { + // r = VSELECT(r, shift(r, 4), a); + SDValue M = + DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT)); + R = SignBitSelect(VT, Amt, M, R); + + // a += a + Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt); + + // r = VSELECT(r, shift(r, 2), a); + M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT)); + R = SignBitSelect(VT, Amt, M, R); + + // a += a + Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt); + + // return VSELECT(r, shift(r, 1), a); + M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT)); + R = SignBitSelect(VT, Amt, M, R); + return R; + } + + if (Op->getOpcode() == ISD::SRA) { + // For SRA we need to unpack each byte to the higher byte of a i16 vector + // so we can correctly sign extend. We don't care what happens to the + // lower byte. + SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), Amt); + SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), Amt); + SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), R); + SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), R); + ALo = DAG.getBitcast(ExtVT, ALo); + AHi = DAG.getBitcast(ExtVT, AHi); + RLo = DAG.getBitcast(ExtVT, RLo); + RHi = DAG.getBitcast(ExtVT, RHi); + + // r = VSELECT(r, shift(r, 4), a); + SDValue MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo, + DAG.getConstant(4, dl, ExtVT)); + SDValue MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi, + DAG.getConstant(4, dl, ExtVT)); + RLo = SignBitSelect(ExtVT, ALo, MLo, RLo); + RHi = SignBitSelect(ExtVT, AHi, MHi, RHi); + + // a += a + ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo); + AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi); + + // r = VSELECT(r, shift(r, 2), a); + MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo, + DAG.getConstant(2, dl, ExtVT)); + MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi, + DAG.getConstant(2, dl, ExtVT)); + RLo = SignBitSelect(ExtVT, ALo, MLo, RLo); + RHi = SignBitSelect(ExtVT, AHi, MHi, RHi); + + // a += a + ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo); + AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi); + + // r = VSELECT(r, shift(r, 1), a); + MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo, + DAG.getConstant(1, dl, ExtVT)); + MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi, + DAG.getConstant(1, dl, ExtVT)); + RLo = SignBitSelect(ExtVT, ALo, MLo, RLo); + RHi = SignBitSelect(ExtVT, AHi, MHi, RHi); + + // Logical shift the result back to the lower byte, leaving a zero upper + // byte + // meaning that we can safely pack with PACKUSWB. + RLo = + DAG.getNode(ISD::SRL, dl, ExtVT, RLo, DAG.getConstant(8, dl, ExtVT)); + RHi = + DAG.getNode(ISD::SRL, dl, ExtVT, RHi, DAG.getConstant(8, dl, ExtVT)); + return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi); + } + } + + // It's worth extending once and using the v8i32 shifts for 16-bit types, but + // the extra overheads to get from v16i8 to v8i32 make the existing SSE + // solution better. 
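+ // The value is sign- or zero-extended to v8i32 to match the shift kind,
+ // while the amount only needs its low bits and can use ANY_EXTEND; the
+ // wide result is then truncated back to v8i16.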
+ if (Subtarget->hasInt256() && VT == MVT::v8i16) { + MVT ExtVT = MVT::v8i32; + unsigned ExtOpc = + Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; + R = DAG.getNode(ExtOpc, dl, ExtVT, R); + Amt = DAG.getNode(ISD::ANY_EXTEND, dl, ExtVT, Amt); + return DAG.getNode(ISD::TRUNCATE, dl, VT, + DAG.getNode(Op.getOpcode(), dl, ExtVT, R, Amt)); + } + + if (Subtarget->hasInt256() && !Subtarget->hasXOP() && VT == MVT::v16i16) { + MVT ExtVT = MVT::v8i32; + SDValue Z = getZeroVector(VT, Subtarget, DAG, dl); + SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Amt, Z); + SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Amt, Z); + SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, R, R); + SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, R, R); + ALo = DAG.getBitcast(ExtVT, ALo); + AHi = DAG.getBitcast(ExtVT, AHi); + RLo = DAG.getBitcast(ExtVT, RLo); + RHi = DAG.getBitcast(ExtVT, RHi); + SDValue Lo = DAG.getNode(Op.getOpcode(), dl, ExtVT, RLo, ALo); + SDValue Hi = DAG.getNode(Op.getOpcode(), dl, ExtVT, RHi, AHi); + Lo = DAG.getNode(ISD::SRL, dl, ExtVT, Lo, DAG.getConstant(16, dl, ExtVT)); + Hi = DAG.getNode(ISD::SRL, dl, ExtVT, Hi, DAG.getConstant(16, dl, ExtVT)); + return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi); + } + + if (VT == MVT::v8i16) { + unsigned ShiftOpcode = Op->getOpcode(); + + auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) { + // On SSE41 targets we make use of the fact that VSELECT lowers + // to PBLENDVB which selects bytes based just on the sign bit. + if (Subtarget->hasSSE41()) { + MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2); + V0 = DAG.getBitcast(ExtVT, V0); + V1 = DAG.getBitcast(ExtVT, V1); + Sel = DAG.getBitcast(ExtVT, Sel); + return DAG.getBitcast( + VT, DAG.getNode(ISD::VSELECT, dl, ExtVT, Sel, V0, V1)); + } + // On pre-SSE41 targets we splat the sign bit - a negative value will + // set all bits of the lanes to true and VSELECT uses that in + // its OR(AND(V0,C),AND(V1,~C)) lowering. + SDValue C = + DAG.getNode(ISD::SRA, dl, VT, Sel, DAG.getConstant(15, dl, VT)); + return DAG.getNode(ISD::VSELECT, dl, VT, C, V0, V1); + }; + + // Turn 'a' into a mask suitable for VSELECT: a = a << 12; + if (Subtarget->hasSSE41()) { + // On SSE41 targets we need to replicate the shift mask in both + // bytes for PBLENDVB. + Amt = DAG.getNode( + ISD::OR, dl, VT, + DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(4, dl, VT)), + DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT))); + } else { + Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT)); + } + + // r = VSELECT(r, shift(r, 8), a); + SDValue M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(8, dl, VT)); + R = SignBitSelect(Amt, M, R); + + // a += a + Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt); + + // r = VSELECT(r, shift(r, 4), a); + M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT)); + R = SignBitSelect(Amt, M, R); + + // a += a + Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt); + + // r = VSELECT(r, shift(r, 2), a); + M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT)); + R = SignBitSelect(Amt, M, R); + + // a += a + Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt); + + // return VSELECT(r, shift(r, 1), a); + M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT)); + R = SignBitSelect(Amt, M, R); + return R; + } + + // Decompose 256-bit shifts into smaller 128-bit shifts. 
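+ // AVX1 provides no 256-bit integer shift instructions, so any remaining
+ // 256-bit case is split in half: both the value and a non-constant amount
+ // are extracted as 128-bit vectors, shifted, and concatenated back.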
+ if (VT.is256BitVector()) { + unsigned NumElems = VT.getVectorNumElements(); + MVT EltVT = VT.getVectorElementType(); + MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); + + // Extract the two vectors + SDValue V1 = Extract128BitVector(R, 0, DAG, dl); + SDValue V2 = Extract128BitVector(R, NumElems/2, DAG, dl); + + // Recreate the shift amount vectors + SDValue Amt1, Amt2; + if (Amt.getOpcode() == ISD::BUILD_VECTOR) { + // Constant shift amount + SmallVector<SDValue, 8> Ops(Amt->op_begin(), Amt->op_begin() + NumElems); + ArrayRef<SDValue> Amt1Csts = makeArrayRef(Ops).slice(0, NumElems / 2); + ArrayRef<SDValue> Amt2Csts = makeArrayRef(Ops).slice(NumElems / 2); + + Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Amt1Csts); + Amt2 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Amt2Csts); + } else { + // Variable shift amount + Amt1 = Extract128BitVector(Amt, 0, DAG, dl); + Amt2 = Extract128BitVector(Amt, NumElems/2, DAG, dl); + } + + // Issue new vector shifts for the smaller types + V1 = DAG.getNode(Op.getOpcode(), dl, NewVT, V1, Amt1); + V2 = DAG.getNode(Op.getOpcode(), dl, NewVT, V2, Amt2); + + // Concatenate the result back + return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, V1, V2); + } + + return SDValue(); +} + +static SDValue LowerRotate(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + MVT VT = Op.getSimpleValueType(); + SDLoc DL(Op); + SDValue R = Op.getOperand(0); + SDValue Amt = Op.getOperand(1); + + assert(VT.isVector() && "Custom lowering only for vector rotates!"); + assert(Subtarget->hasXOP() && "XOP support required for vector rotates!"); + assert((Op.getOpcode() == ISD::ROTL) && "Only ROTL supported"); + + // XOP has 128-bit vector variable + immediate rotates. + // +ve/-ve Amt = rotate left/right. + + // Split 256-bit integers. + if (VT.is256BitVector()) + return Lower256IntArith(Op, DAG); + + assert(VT.is128BitVector() && "Only rotate 128-bit vectors!"); + + // Attempt to rotate by immediate. + if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) { + if (auto *RotateConst = BVAmt->getConstantSplatNode()) { + uint64_t RotateAmt = RotateConst->getAPIntValue().getZExtValue(); + assert(RotateAmt < VT.getScalarSizeInBits() && "Rotation out of range"); + return DAG.getNode(X86ISD::VPROTI, DL, VT, R, + DAG.getConstant(RotateAmt, DL, MVT::i8)); + } + } + + // Use general rotate by variable (per-element). + return DAG.getNode(X86ISD::VPROT, DL, VT, R, Amt); +} + +static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { + // Lower the "add/sub/mul with overflow" instruction into a regular ins plus + // a "setcc" instruction that checks the overflow flag. The "brcond" lowering + // looks for this combo and may remove the "setcc" instruction if the "setcc" + // has only one use. + SDNode *N = Op.getNode(); + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + unsigned BaseOp = 0; + unsigned Cond = 0; + SDLoc DL(Op); + switch (Op.getOpcode()) { + default: llvm_unreachable("Unknown ovf instruction!"); + case ISD::SADDO: + // A subtract of one will be selected as a INC. Note that INC doesn't + // set CF, so we can't do this for UADDO. + if (isOneConstant(RHS)) { + BaseOp = X86ISD::INC; + Cond = X86::COND_O; + break; + } + BaseOp = X86ISD::ADD; + Cond = X86::COND_O; + break; + case ISD::UADDO: + BaseOp = X86ISD::ADD; + Cond = X86::COND_B; + break; + case ISD::SSUBO: + // A subtract of one will be selected as a DEC. Note that DEC doesn't + // set CF, so we can't do this for USUBO. 
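+ // (INC and DEC never modify CF on x86, which is why only the overflow
+ // conditions, not the carry conditions, can use them.)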
+ if (isOneConstant(RHS)) { + BaseOp = X86ISD::DEC; + Cond = X86::COND_O; + break; + } + BaseOp = X86ISD::SUB; + Cond = X86::COND_O; + break; + case ISD::USUBO: + BaseOp = X86ISD::SUB; + Cond = X86::COND_B; + break; + case ISD::SMULO: + BaseOp = N->getValueType(0) == MVT::i8 ? X86ISD::SMUL8 : X86ISD::SMUL; + Cond = X86::COND_O; + break; + case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs + if (N->getValueType(0) == MVT::i8) { + BaseOp = X86ISD::UMUL8; + Cond = X86::COND_O; + break; + } + SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0), + MVT::i32); + SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS); + + SDValue SetCC = + DAG.getNode(X86ISD::SETCC, DL, MVT::i8, + DAG.getConstant(X86::COND_O, DL, MVT::i32), + SDValue(Sum.getNode(), 2)); + + return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC); + } + } + + // Also sets EFLAGS. + SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32); + SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS); + + SDValue SetCC = + DAG.getNode(X86ISD::SETCC, DL, N->getValueType(1), + DAG.getConstant(Cond, DL, MVT::i32), + SDValue(Sum.getNode(), 1)); + + return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC); +} + +/// Returns true if the operand type is exactly twice the native width, and +/// the corresponding cmpxchg8b or cmpxchg16b instruction is available. +/// Used to know whether to use cmpxchg8/16b when expanding atomic operations +/// (otherwise we leave them alone to become __sync_fetch_and_... calls). +bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const { + unsigned OpWidth = MemType->getPrimitiveSizeInBits(); + + if (OpWidth == 64) + return !Subtarget->is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b + else if (OpWidth == 128) + return Subtarget->hasCmpxchg16b(); + else + return false; +} + +bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { + return needsCmpXchgNb(SI->getValueOperand()->getType()); +} + +// Note: this turns large loads into lock cmpxchg8b/16b. +// FIXME: On 32 bits x86, fild/movq might be faster than lock cmpxchg8b. +TargetLowering::AtomicExpansionKind +X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { + auto PTy = cast<PointerType>(LI->getPointerOperand()->getType()); + return needsCmpXchgNb(PTy->getElementType()) ? AtomicExpansionKind::CmpXChg + : AtomicExpansionKind::None; +} + +TargetLowering::AtomicExpansionKind +X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { + unsigned NativeWidth = Subtarget->is64Bit() ? 64 : 32; + Type *MemType = AI->getType(); + + // If the operand is too big, we must see if cmpxchg8/16b is available + // and default to library calls otherwise. + if (MemType->getPrimitiveSizeInBits() > NativeWidth) { + return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg + : AtomicExpansionKind::None; + } + + AtomicRMWInst::BinOp Op = AI->getOperation(); + switch (Op) { + default: + llvm_unreachable("Unknown atomic operation"); + case AtomicRMWInst::Xchg: + case AtomicRMWInst::Add: + case AtomicRMWInst::Sub: + // It's better to use xadd, xsub or xchg for these in all cases. + return AtomicExpansionKind::None; + case AtomicRMWInst::Or: + case AtomicRMWInst::And: + case AtomicRMWInst::Xor: + // If the atomicrmw's result isn't actually used, we can just add a "lock" + // prefix to a normal instruction for these operations. + return !AI->use_empty() ? 
AtomicExpansionKind::CmpXChg + : AtomicExpansionKind::None; + case AtomicRMWInst::Nand: + case AtomicRMWInst::Max: + case AtomicRMWInst::Min: + case AtomicRMWInst::UMax: + case AtomicRMWInst::UMin: + // These always require a non-trivial set of data operations on x86. We must + // use a cmpxchg loop. + return AtomicExpansionKind::CmpXChg; + } +} + +static bool hasMFENCE(const X86Subtarget& Subtarget) { + // Use mfence if we have SSE2 or we're on x86-64 (even if we asked for + // no-sse2). There isn't any reason to disable it if the target processor + // supports it. + return Subtarget.hasSSE2() || Subtarget.is64Bit(); +} + +LoadInst * +X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const { + unsigned NativeWidth = Subtarget->is64Bit() ? 64 : 32; + Type *MemType = AI->getType(); + // Accesses larger than the native width are turned into cmpxchg/libcalls, so + // there is no benefit in turning such RMWs into loads, and it is actually + // harmful as it introduces a mfence. + if (MemType->getPrimitiveSizeInBits() > NativeWidth) + return nullptr; + + auto Builder = IRBuilder<>(AI); + Module *M = Builder.GetInsertBlock()->getParent()->getParent(); + auto SynchScope = AI->getSynchScope(); + // We must restrict the ordering to avoid generating loads with Release or + // ReleaseAcquire orderings. + auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering()); + auto Ptr = AI->getPointerOperand(); + + // Before the load we need a fence. Here is an example lifted from + // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence + // is required: + // Thread 0: + // x.store(1, relaxed); + // r1 = y.fetch_add(0, release); + // Thread 1: + // y.fetch_add(42, acquire); + // r2 = x.load(relaxed); + // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is + // lowered to just a load without a fence. A mfence flushes the store buffer, + // making the optimization clearly correct. + // FIXME: it is required if isAtLeastRelease(Order) but it is not clear + // otherwise, we might be able to be more aggressive on relaxed idempotent + // rmw. In practice, they do not look useful, so we don't try to be + // especially clever. + if (SynchScope == SingleThread) + // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at + // the IR level, so we must wrap it in an intrinsic. + return nullptr; + + if (!hasMFENCE(*Subtarget)) + // FIXME: it might make sense to use a locked operation here but on a + // different cache-line to prevent cache-line bouncing. In practice it + // is probably a small win, and x86 processors without mfence are rare + // enough that we do not bother. + return nullptr; + + Function *MFence = + llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence); + Builder.CreateCall(MFence, {}); + + // Finally we can emit the atomic load. 
+ LoadInst *Loaded = Builder.CreateAlignedLoad(Ptr, + AI->getType()->getPrimitiveSizeInBits()); + Loaded->setAtomic(Order, SynchScope); + AI->replaceAllUsesWith(Loaded); + AI->eraseFromParent(); + return Loaded; +} + +static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc dl(Op); + AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>( + cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue()); + SynchronizationScope FenceScope = static_cast<SynchronizationScope>( + cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue()); + + // The only fence that needs an instruction is a sequentially-consistent + // cross-thread fence. + if (FenceOrdering == SequentiallyConsistent && FenceScope == CrossThread) { + if (hasMFENCE(*Subtarget)) + return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0)); + + SDValue Chain = Op.getOperand(0); + SDValue Zero = DAG.getConstant(0, dl, MVT::i32); + SDValue Ops[] = { + DAG.getRegister(X86::ESP, MVT::i32), // Base + DAG.getTargetConstant(1, dl, MVT::i8), // Scale + DAG.getRegister(0, MVT::i32), // Index + DAG.getTargetConstant(0, dl, MVT::i32), // Disp + DAG.getRegister(0, MVT::i32), // Segment. + Zero, + Chain + }; + SDNode *Res = DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops); + return SDValue(Res, 0); + } + + // MEMBARRIER is a compiler barrier; it codegens to a no-op. + return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0)); +} + +static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + MVT T = Op.getSimpleValueType(); + SDLoc DL(Op); + unsigned Reg = 0; + unsigned size = 0; + switch(T.SimpleTy) { + default: llvm_unreachable("Invalid value type!"); + case MVT::i8: Reg = X86::AL; size = 1; break; + case MVT::i16: Reg = X86::AX; size = 2; break; + case MVT::i32: Reg = X86::EAX; size = 4; break; + case MVT::i64: + assert(Subtarget->is64Bit() && "Node not type legal!"); + Reg = X86::RAX; size = 8; + break; + } + SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg, + Op.getOperand(2), SDValue()); + SDValue Ops[] = { cpIn.getValue(0), + Op.getOperand(1), + Op.getOperand(3), + DAG.getTargetConstant(size, DL, MVT::i8), + cpIn.getValue(1) }; + SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); + MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand(); + SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys, + Ops, T, MMO); + + SDValue cpOut = + DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1)); + SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS, + MVT::i32, cpOut.getValue(2)); + SDValue Success = DAG.getNode(X86ISD::SETCC, DL, Op->getValueType(1), + DAG.getConstant(X86::COND_E, DL, MVT::i8), + EFLAGS); + + DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), cpOut); + DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success); + DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), EFLAGS.getValue(1)); + return SDValue(); +} + +static SDValue LowerBITCAST(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + MVT SrcVT = Op.getOperand(0).getSimpleValueType(); + MVT DstVT = Op.getSimpleValueType(); + + if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) { + assert(Subtarget->hasSSE2() && "Requires at least SSE2!"); + if (DstVT != MVT::f64) + // This conversion needs to be expanded. 
+ return SDValue(); + + SDValue InVec = Op->getOperand(0); + SDLoc dl(Op); + unsigned NumElts = SrcVT.getVectorNumElements(); + MVT SVT = SrcVT.getVectorElementType(); + + // Widen the vector in input in the case of MVT::v2i32. + // Example: from MVT::v2i32 to MVT::v4i32. + SmallVector<SDValue, 16> Elts; + for (unsigned i = 0, e = NumElts; i != e; ++i) + Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, InVec, + DAG.getIntPtrConstant(i, dl))); + + // Explicitly mark the extra elements as Undef. + Elts.append(NumElts, DAG.getUNDEF(SVT)); + + EVT NewVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2); + SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Elts); + SDValue ToV2F64 = DAG.getBitcast(MVT::v2f64, BV); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, ToV2F64, + DAG.getIntPtrConstant(0, dl)); + } + + assert(Subtarget->is64Bit() && !Subtarget->hasSSE2() && + Subtarget->hasMMX() && "Unexpected custom BITCAST"); + assert((DstVT == MVT::i64 || + (DstVT.isVector() && DstVT.getSizeInBits()==64)) && + "Unexpected custom BITCAST"); + // i64 <=> MMX conversions are Legal. + if (SrcVT==MVT::i64 && DstVT.isVector()) + return Op; + if (DstVT==MVT::i64 && SrcVT.isVector()) + return Op; + // MMX <=> MMX conversions are Legal. + if (SrcVT.isVector() && DstVT.isVector()) + return Op; + // All other conversions need to be expanded. + return SDValue(); +} + +/// Compute the horizontal sum of bytes in V for the elements of VT. +/// +/// Requires V to be a byte vector and VT to be an integer vector type with +/// wider elements than V's type. The width of the elements of VT determines +/// how many bytes of V are summed horizontally to produce each element of the +/// result. +static SDValue LowerHorizontalByteSum(SDValue V, MVT VT, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(V); + MVT ByteVecVT = V.getSimpleValueType(); + MVT EltVT = VT.getVectorElementType(); + int NumElts = VT.getVectorNumElements(); + assert(ByteVecVT.getVectorElementType() == MVT::i8 && + "Expected value to have byte element type."); + assert(EltVT != MVT::i8 && + "Horizontal byte sum only makes sense for wider elements!"); + unsigned VecSize = VT.getSizeInBits(); + assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!"); + + // PSADBW instruction horizontally add all bytes and leave the result in i64 + // chunks, thus directly computes the pop count for v2i64 and v4i64. + if (EltVT == MVT::i64) { + SDValue Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL); + MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64); + V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros); + return DAG.getBitcast(VT, V); + } + + if (EltVT == MVT::i32) { + // We unpack the low half and high half into i32s interleaved with zeros so + // that we can use PSADBW to horizontally sum them. The most useful part of + // this is that it lines up the results of two PSADBW instructions to be + // two v2i64 vectors which concatenated are the 4 population counts. We can + // then use PACKUSWB to shrink and concatenate them into a v4i32 again. + SDValue Zeros = getZeroVector(VT, Subtarget, DAG, DL); + SDValue Low = DAG.getNode(X86ISD::UNPCKL, DL, VT, V, Zeros); + SDValue High = DAG.getNode(X86ISD::UNPCKH, DL, VT, V, Zeros); + + // Do the horizontal sums into two v2i64s. 
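+ // After the interleave each 64-bit chunk holds the four bytes of one
+ // original i32 element plus four zero bytes, so PSADBW against an all-zero
+ // vector leaves that element's byte sum in the corresponding i64 lane.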
+ Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL); + MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64); + Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, + DAG.getBitcast(ByteVecVT, Low), Zeros); + High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, + DAG.getBitcast(ByteVecVT, High), Zeros); + + // Merge them together. + MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16); + V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT, + DAG.getBitcast(ShortVecVT, Low), + DAG.getBitcast(ShortVecVT, High)); + + return DAG.getBitcast(VT, V); + } + + // The only element type left is i16. + assert(EltVT == MVT::i16 && "Unknown how to handle type"); + + // To obtain pop count for each i16 element starting from the pop count for + // i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s + // right by 8. It is important to shift as i16s as i8 vector shift isn't + // directly supported. + SmallVector<SDValue, 16> Shifters(NumElts, DAG.getConstant(8, DL, EltVT)); + SDValue Shifter = DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Shifters); + SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), Shifter); + V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl), + DAG.getBitcast(ByteVecVT, V)); + return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), Shifter); +} + +static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, SDLoc DL, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + MVT VT = Op.getSimpleValueType(); + MVT EltVT = VT.getVectorElementType(); + unsigned VecSize = VT.getSizeInBits(); + + // Implement a lookup table in register by using an algorithm based on: + // http://wm.ite.pl/articles/sse-popcount.html + // + // The general idea is that every lower byte nibble in the input vector is an + // index into a in-register pre-computed pop count table. We then split up the + // input vector in two new ones: (1) a vector with only the shifted-right + // higher nibbles for each byte and (2) a vector with the lower nibbles (and + // masked out higher ones) for each byte. PSHUB is used separately with both + // to index the in-register table. Next, both are added and the result is a + // i8 vector where each element contains the pop count for input byte. + // + // To obtain the pop count for elements != i8, we follow up with the same + // approach and use additional tricks as described below. + // + const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2, + /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3, + /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3, + /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4}; + + int NumByteElts = VecSize / 8; + MVT ByteVecVT = MVT::getVectorVT(MVT::i8, NumByteElts); + SDValue In = DAG.getBitcast(ByteVecVT, Op); + SmallVector<SDValue, 16> LUTVec; + for (int i = 0; i < NumByteElts; ++i) + LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8)); + SDValue InRegLUT = DAG.getNode(ISD::BUILD_VECTOR, DL, ByteVecVT, LUTVec); + SmallVector<SDValue, 16> Mask0F(NumByteElts, + DAG.getConstant(0x0F, DL, MVT::i8)); + SDValue M0F = DAG.getNode(ISD::BUILD_VECTOR, DL, ByteVecVT, Mask0F); + + // High nibbles + SmallVector<SDValue, 16> Four(NumByteElts, DAG.getConstant(4, DL, MVT::i8)); + SDValue FourV = DAG.getNode(ISD::BUILD_VECTOR, DL, ByteVecVT, Four); + SDValue HighNibbles = DAG.getNode(ISD::SRL, DL, ByteVecVT, In, FourV); + + // Low nibbles + SDValue LowNibbles = DAG.getNode(ISD::AND, DL, ByteVecVT, In, M0F); + + // The input vector is used as the shuffle mask that index elements into the + // LUT. 
After counting low and high nibbles, add the vector to obtain the + // final pop count per i8 element. + SDValue HighPopCnt = + DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, HighNibbles); + SDValue LowPopCnt = + DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, LowNibbles); + SDValue PopCnt = DAG.getNode(ISD::ADD, DL, ByteVecVT, HighPopCnt, LowPopCnt); + + if (EltVT == MVT::i8) + return PopCnt; + + return LowerHorizontalByteSum(PopCnt, VT, Subtarget, DAG); +} + +static SDValue LowerVectorCTPOPBitmath(SDValue Op, SDLoc DL, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + MVT VT = Op.getSimpleValueType(); + assert(VT.is128BitVector() && + "Only 128-bit vector bitmath lowering supported."); + + int VecSize = VT.getSizeInBits(); + MVT EltVT = VT.getVectorElementType(); + int Len = EltVT.getSizeInBits(); + + // This is the vectorized version of the "best" algorithm from + // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel + // with a minor tweak to use a series of adds + shifts instead of vector + // multiplications. Implemented for all integer vector types. We only use + // this when we don't have SSSE3 which allows a LUT-based lowering that is + // much faster, even faster than using native popcnt instructions. + + auto GetShift = [&](unsigned OpCode, SDValue V, int Shifter) { + MVT VT = V.getSimpleValueType(); + SmallVector<SDValue, 32> Shifters( + VT.getVectorNumElements(), + DAG.getConstant(Shifter, DL, VT.getVectorElementType())); + return DAG.getNode(OpCode, DL, VT, V, + DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Shifters)); + }; + auto GetMask = [&](SDValue V, APInt Mask) { + MVT VT = V.getSimpleValueType(); + SmallVector<SDValue, 32> Masks( + VT.getVectorNumElements(), + DAG.getConstant(Mask, DL, VT.getVectorElementType())); + return DAG.getNode(ISD::AND, DL, VT, V, + DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Masks)); + }; + + // We don't want to incur the implicit masks required to SRL vNi8 vectors on + // x86, so set the SRL type to have elements at least i16 wide. This is + // correct because all of our SRLs are followed immediately by a mask anyways + // that handles any bits that sneak into the high bits of the byte elements. + MVT SrlVT = Len > 8 ? VT : MVT::getVectorVT(MVT::i16, VecSize / 16); + + SDValue V = Op; + + // v = v - ((v >> 1) & 0x55555555...) + SDValue Srl = + DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 1)); + SDValue And = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x55))); + V = DAG.getNode(ISD::SUB, DL, VT, V, And); + + // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...) + SDValue AndLHS = GetMask(V, APInt::getSplat(Len, APInt(8, 0x33))); + Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 2)); + SDValue AndRHS = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x33))); + V = DAG.getNode(ISD::ADD, DL, VT, AndLHS, AndRHS); + + // v = (v + (v >> 4)) & 0x0F0F0F0F... + Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 4)); + SDValue Add = DAG.getNode(ISD::ADD, DL, VT, V, Srl); + V = GetMask(Add, APInt::getSplat(Len, APInt(8, 0x0F))); + + // At this point, V contains the byte-wise population count, and we are + // merely doing a horizontal sum if necessary to get the wider element + // counts. 
+ if (EltVT == MVT::i8) + return V; + + return LowerHorizontalByteSum( + DAG.getBitcast(MVT::getVectorVT(MVT::i8, VecSize / 8), V), VT, Subtarget, + DAG); +} + +static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + MVT VT = Op.getSimpleValueType(); + // FIXME: Need to add AVX-512 support here! + assert((VT.is256BitVector() || VT.is128BitVector()) && + "Unknown CTPOP type to handle"); + SDLoc DL(Op.getNode()); + SDValue Op0 = Op.getOperand(0); + + if (!Subtarget->hasSSSE3()) { + // We can't use the fast LUT approach, so fall back on vectorized bitmath. + assert(VT.is128BitVector() && "Only 128-bit vectors supported in SSE!"); + return LowerVectorCTPOPBitmath(Op0, DL, Subtarget, DAG); + } + + if (VT.is256BitVector() && !Subtarget->hasInt256()) { + unsigned NumElems = VT.getVectorNumElements(); + + // Extract each 128-bit vector, compute pop count and concat the result. + SDValue LHS = Extract128BitVector(Op0, 0, DAG, DL); + SDValue RHS = Extract128BitVector(Op0, NumElems/2, DAG, DL); + + return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, + LowerVectorCTPOPInRegLUT(LHS, DL, Subtarget, DAG), + LowerVectorCTPOPInRegLUT(RHS, DL, Subtarget, DAG)); + } + + return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG); +} + +static SDValue LowerCTPOP(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + assert(Op.getSimpleValueType().isVector() && + "We only do custom lowering for vector population count."); + return LowerVectorCTPOP(Op, Subtarget, DAG); +} + +static SDValue LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) { + SDNode *Node = Op.getNode(); + SDLoc dl(Node); + EVT T = Node->getValueType(0); + SDValue negOp = DAG.getNode(ISD::SUB, dl, T, + DAG.getConstant(0, dl, T), Node->getOperand(2)); + return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl, + cast<AtomicSDNode>(Node)->getMemoryVT(), + Node->getOperand(0), + Node->getOperand(1), negOp, + cast<AtomicSDNode>(Node)->getMemOperand(), + cast<AtomicSDNode>(Node)->getOrdering(), + cast<AtomicSDNode>(Node)->getSynchScope()); +} + +static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) { + SDNode *Node = Op.getNode(); + SDLoc dl(Node); + EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT(); + + // Convert seq_cst store -> xchg + // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b) + // FIXME: On 32-bit, store -> fist or movq would be more efficient + // (The only way to get a 16-byte store is cmpxchg16b) + // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment. + if (cast<AtomicSDNode>(Node)->getOrdering() == SequentiallyConsistent || + !DAG.getTargetLoweringInfo().isTypeLegal(VT)) { + SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl, + cast<AtomicSDNode>(Node)->getMemoryVT(), + Node->getOperand(0), + Node->getOperand(1), Node->getOperand(2), + cast<AtomicSDNode>(Node)->getMemOperand(), + cast<AtomicSDNode>(Node)->getOrdering(), + cast<AtomicSDNode>(Node)->getSynchScope()); + return Swap.getValue(1); + } + // Other atomic stores have a simple pattern. + return Op; +} + +static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) { + MVT VT = Op.getNode()->getSimpleValueType(0); + + // Let legalize expand this if it isn't a legal type yet. 
+ if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) + return SDValue(); + + SDVTList VTs = DAG.getVTList(VT, MVT::i32); + + unsigned Opc; + bool ExtraOp = false; + switch (Op.getOpcode()) { + default: llvm_unreachable("Invalid code"); + case ISD::ADDC: Opc = X86ISD::ADD; break; + case ISD::ADDE: Opc = X86ISD::ADC; ExtraOp = true; break; + case ISD::SUBC: Opc = X86ISD::SUB; break; + case ISD::SUBE: Opc = X86ISD::SBB; ExtraOp = true; break; + } + + if (!ExtraOp) + return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), + Op.getOperand(1)); + return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), + Op.getOperand(1), Op.getOperand(2)); +} + +static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + assert(Subtarget->isTargetDarwin() && Subtarget->is64Bit()); + + // For MacOSX, we want to call an alternative entry point: __sincos_stret, + // which returns the values as { float, float } (in XMM0) or + // { double, double } (which is returned in XMM0, XMM1). + SDLoc dl(Op); + SDValue Arg = Op.getOperand(0); + EVT ArgVT = Arg.getValueType(); + Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); + + TargetLowering::ArgListTy Args; + TargetLowering::ArgListEntry Entry; + + Entry.Node = Arg; + Entry.Ty = ArgTy; + Entry.isSExt = false; + Entry.isZExt = false; + Args.push_back(Entry); + + bool isF64 = ArgVT == MVT::f64; + // Only optimize x86_64 for now. i386 is a bit messy. For f32, + // the small struct {f32, f32} is returned in (eax, edx). For f64, + // the results are returned via SRet in memory. + const char *LibcallName = isF64 ? "__sincos_stret" : "__sincosf_stret"; + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + SDValue Callee = + DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout())); + + Type *RetTy = isF64 + ? (Type*)StructType::get(ArgTy, ArgTy, nullptr) + : (Type*)VectorType::get(ArgTy, 4); + + TargetLowering::CallLoweringInfo CLI(DAG); + CLI.setDebugLoc(dl).setChain(DAG.getEntryNode()) + .setCallee(CallingConv::C, RetTy, Callee, std::move(Args), 0); + + std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI); + + if (isF64) + // Returned in xmm0 and xmm1. + return CallResult.first; + + // Returned in bits 0:31 and 32:64 xmm0. + SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, + CallResult.first, DAG.getIntPtrConstant(0, dl)); + SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, + CallResult.first, DAG.getIntPtrConstant(1, dl)); + SDVTList Tys = DAG.getVTList(ArgVT, ArgVT); + return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal); +} + +/// Widen a vector input to a vector of NVT. The +/// input vector must have the same element type as NVT. +static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG, + bool FillWithZeroes = false) { + // Check if InOp already has the right width. 
+ MVT InVT = InOp.getSimpleValueType(); + if (InVT == NVT) + return InOp; + + if (InOp.isUndef()) + return DAG.getUNDEF(NVT); + + assert(InVT.getVectorElementType() == NVT.getVectorElementType() && + "input and widen element type must match"); + + unsigned InNumElts = InVT.getVectorNumElements(); + unsigned WidenNumElts = NVT.getVectorNumElements(); + assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 && + "Unexpected request for vector widening"); + + EVT EltVT = NVT.getVectorElementType(); + + SDLoc dl(InOp); + if (InOp.getOpcode() == ISD::CONCAT_VECTORS && + InOp.getNumOperands() == 2) { + SDValue N1 = InOp.getOperand(1); + if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) || + N1.isUndef()) { + InOp = InOp.getOperand(0); + InVT = InOp.getSimpleValueType(); + InNumElts = InVT.getVectorNumElements(); + } + } + if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) || + ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) { + SmallVector<SDValue, 16> Ops; + for (unsigned i = 0; i < InNumElts; ++i) + Ops.push_back(InOp.getOperand(i)); + + SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) : + DAG.getUNDEF(EltVT); + for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i) + Ops.push_back(FillVal); + return DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, Ops); + } + SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) : + DAG.getUNDEF(NVT); + return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal, + InOp, DAG.getIntPtrConstant(0, dl)); +} + +static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + assert(Subtarget->hasAVX512() && + "MGATHER/MSCATTER are supported on AVX-512 arch only"); + + // X86 scatter kills mask register, so its type should be added to + // the list of return values. + // If the "scatter" has 2 return values, it is already handled. + if (Op.getNode()->getNumValues() == 2) + return Op; + + MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode()); + SDValue Src = N->getValue(); + MVT VT = Src.getSimpleValueType(); + assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op"); + SDLoc dl(Op); + + SDValue NewScatter; + SDValue Index = N->getIndex(); + SDValue Mask = N->getMask(); + SDValue Chain = N->getChain(); + SDValue BasePtr = N->getBasePtr(); + MVT MemVT = N->getMemoryVT().getSimpleVT(); + MVT IndexVT = Index.getSimpleValueType(); + MVT MaskVT = Mask.getSimpleValueType(); + + if (MemVT.getScalarSizeInBits() < VT.getScalarSizeInBits()) { + // The v2i32 value was promoted to v2i64. + // Now we "redo" the type legalizer's work and widen the original + // v2i32 value to v4i32. The original v2i32 is retrieved from v2i64 + // with a shuffle. + assert((MemVT == MVT::v2i32 && VT == MVT::v2i64) && + "Unexpected memory type"); + int ShuffleMask[] = {0, 2, -1, -1}; + Src = DAG.getVectorShuffle(MVT::v4i32, dl, DAG.getBitcast(MVT::v4i32, Src), + DAG.getUNDEF(MVT::v4i32), ShuffleMask); + // Now we have 4 elements instead of 2. + // Expand the index. 
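+ // The extra index lanes may be left undef, but the mask is widened with
+ // zeroes below, so those lanes can never actually be stored.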
+ MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), 4); + Index = ExtendToType(Index, NewIndexVT, DAG); + + // Expand the mask with zeroes + // Mask may be <2 x i64> or <2 x i1> at this moment + assert((MaskVT == MVT::v2i1 || MaskVT == MVT::v2i64) && + "Unexpected mask type"); + MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), 4); + Mask = ExtendToType(Mask, ExtMaskVT, DAG, true); + VT = MVT::v4i32; + } + + unsigned NumElts = VT.getVectorNumElements(); + if (!Subtarget->hasVLX() && !VT.is512BitVector() && + !Index.getSimpleValueType().is512BitVector()) { + // AVX512F supports only 512-bit vectors. Or data or index should + // be 512 bit wide. If now the both index and data are 256-bit, but + // the vector contains 8 elements, we just sign-extend the index + if (IndexVT == MVT::v8i32) + // Just extend index + Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index); + else { + // The minimal number of elts in scatter is 8 + NumElts = 8; + // Index + MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts); + // Use original index here, do not modify the index twice + Index = ExtendToType(N->getIndex(), NewIndexVT, DAG); + if (IndexVT.getScalarType() == MVT::i32) + Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index); + + // Mask + // At this point we have promoted mask operand + assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type"); + MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts); + // Use the original mask here, do not modify the mask twice + Mask = ExtendToType(N->getMask(), ExtMaskVT, DAG, true); + + // The value that should be stored + MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts); + Src = ExtendToType(Src, NewVT, DAG); + } + } + // If the mask is "wide" at this point - truncate it to i1 vector + MVT BitMaskVT = MVT::getVectorVT(MVT::i1, NumElts); + Mask = DAG.getNode(ISD::TRUNCATE, dl, BitMaskVT, Mask); + + // The mask is killed by scatter, add it to the values + SDVTList VTs = DAG.getVTList(BitMaskVT, MVT::Other); + SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index}; + NewScatter = DAG.getMaskedScatter(VTs, N->getMemoryVT(), dl, Ops, + N->getMemOperand()); + DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1)); + return SDValue(NewScatter.getNode(), 0); +} + +static SDValue LowerMLOAD(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + + MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode()); + MVT VT = Op.getSimpleValueType(); + SDValue Mask = N->getMask(); + SDLoc dl(Op); + + if (Subtarget->hasAVX512() && !Subtarget->hasVLX() && + !VT.is512BitVector() && Mask.getValueType() == MVT::v8i1) { + // This operation is legal for targets with VLX, but without + // VLX the vector should be widened to 512 bit + unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits(); + MVT WideDataVT = MVT::getVectorVT(VT.getScalarType(), NumEltsInWideVec); + MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec); + SDValue Src0 = N->getSrc0(); + Src0 = ExtendToType(Src0, WideDataVT, DAG); + Mask = ExtendToType(Mask, WideMaskVT, DAG, true); + SDValue NewLoad = DAG.getMaskedLoad(WideDataVT, dl, N->getChain(), + N->getBasePtr(), Mask, Src0, + N->getMemoryVT(), N->getMemOperand(), + N->getExtensionType()); + + SDValue Exract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, + NewLoad.getValue(0), + DAG.getIntPtrConstant(0, dl)); + SDValue RetOps[] = {Exract, NewLoad.getValue(1)}; + return DAG.getMergeValues(RetOps, dl); + } + return Op; +} + +static SDValue LowerMSTORE(SDValue Op, const 
X86Subtarget *Subtarget, + SelectionDAG &DAG) { + MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode()); + SDValue DataToStore = N->getValue(); + MVT VT = DataToStore.getSimpleValueType(); + SDValue Mask = N->getMask(); + SDLoc dl(Op); + + if (Subtarget->hasAVX512() && !Subtarget->hasVLX() && + !VT.is512BitVector() && Mask.getValueType() == MVT::v8i1) { + // This operation is legal for targets with VLX, but without + // VLX the vector should be widened to 512 bit + unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits(); + MVT WideDataVT = MVT::getVectorVT(VT.getScalarType(), NumEltsInWideVec); + MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec); + DataToStore = ExtendToType(DataToStore, WideDataVT, DAG); + Mask = ExtendToType(Mask, WideMaskVT, DAG, true); + return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(), + Mask, N->getMemoryVT(), N->getMemOperand(), + N->isTruncatingStore()); + } + return Op; +} + +static SDValue LowerMGATHER(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + assert(Subtarget->hasAVX512() && + "MGATHER/MSCATTER are supported on AVX-512 arch only"); + + MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode()); + SDLoc dl(Op); + MVT VT = Op.getSimpleValueType(); + SDValue Index = N->getIndex(); + SDValue Mask = N->getMask(); + SDValue Src0 = N->getValue(); + MVT IndexVT = Index.getSimpleValueType(); + MVT MaskVT = Mask.getSimpleValueType(); + + unsigned NumElts = VT.getVectorNumElements(); + assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op"); + + if (!Subtarget->hasVLX() && !VT.is512BitVector() && + !Index.getSimpleValueType().is512BitVector()) { + // AVX512F supports only 512-bit vectors. Or data or index should + // be 512 bit wide. If now the both index and data are 256-bit, but + // the vector contains 8 elements, we just sign-extend the index + if (NumElts == 8) { + Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index); + SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2), + N->getOperand(3), Index }; + DAG.UpdateNodeOperands(N, Ops); + return Op; + } + + // Minimal number of elements in Gather + NumElts = 8; + // Index + MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts); + Index = ExtendToType(Index, NewIndexVT, DAG); + if (IndexVT.getScalarType() == MVT::i32) + Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index); + + // Mask + MVT MaskBitVT = MVT::getVectorVT(MVT::i1, NumElts); + // At this point we have promoted mask operand + assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type"); + MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts); + Mask = ExtendToType(Mask, ExtMaskVT, DAG, true); + Mask = DAG.getNode(ISD::TRUNCATE, dl, MaskBitVT, Mask); + + // The pass-thru value + MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts); + Src0 = ExtendToType(Src0, NewVT, DAG); + + SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index }; + SDValue NewGather = DAG.getMaskedGather(DAG.getVTList(NewVT, MVT::Other), + N->getMemoryVT(), dl, Ops, + N->getMemOperand()); + SDValue Exract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, + NewGather.getValue(0), + DAG.getIntPtrConstant(0, dl)); + SDValue RetOps[] = {Exract, NewGather.getValue(1)}; + return DAG.getMergeValues(RetOps, dl); + } + return Op; +} + +SDValue X86TargetLowering::LowerGC_TRANSITION_START(SDValue Op, + SelectionDAG &DAG) const { + // TODO: Eventually, the lowering of these nodes should be informed by or + // deferred to the GC strategy 
for the function in which they appear. For + // now, however, they must be lowered to something. Since they are logically + // no-ops in the case of a null GC strategy (or a GC strategy which does not + // require special handling for these nodes), lower them as literal NOOPs for + // the time being. + SmallVector<SDValue, 2> Ops; + + Ops.push_back(Op.getOperand(0)); + if (Op->getGluedNode()) + Ops.push_back(Op->getOperand(Op->getNumOperands() - 1)); + + SDLoc OpDL(Op); + SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0); + + return NOOP; +} + +SDValue X86TargetLowering::LowerGC_TRANSITION_END(SDValue Op, + SelectionDAG &DAG) const { + // TODO: Eventually, the lowering of these nodes should be informed by or + // deferred to the GC strategy for the function in which they appear. For + // now, however, they must be lowered to something. Since they are logically + // no-ops in the case of a null GC strategy (or a GC strategy which does not + // require special handling for these nodes), lower them as literal NOOPs for + // the time being. + SmallVector<SDValue, 2> Ops; + + Ops.push_back(Op.getOperand(0)); + if (Op->getGluedNode()) + Ops.push_back(Op->getOperand(Op->getNumOperands() - 1)); + + SDLoc OpDL(Op); + SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0); + + return NOOP; +} + +/// LowerOperation - Provide custom lowering hooks for some operations. +/// +SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { + switch (Op.getOpcode()) { + default: llvm_unreachable("Should not custom lower this!"); + case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG); + case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: + return LowerCMP_SWAP(Op, Subtarget, DAG); + case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG); + case ISD::ATOMIC_LOAD_SUB: return LowerLOAD_SUB(Op,DAG); + case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op,DAG); + case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); + case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG); + case ISD::VECTOR_SHUFFLE: return lowerVectorShuffle(Op, Subtarget, DAG); + case ISD::VSELECT: return LowerVSELECT(Op, DAG); + case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); + case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); + case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG); + case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG); + case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG); + case ISD::ConstantPool: return LowerConstantPool(Op, DAG); + case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); + case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); + case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG); + case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); + case ISD::SHL_PARTS: + case ISD::SRA_PARTS: + case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG); + case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG); + case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); + case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG); + case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG); + case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG); + case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG); + case ISD::SIGN_EXTEND_VECTOR_INREG: + return LowerSIGN_EXTEND_VECTOR_INREG(Op, Subtarget, 
DAG); + case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG); + case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG); + case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG); + case ISD::LOAD: return LowerExtendedLoad(Op, Subtarget, DAG); + case ISD::FABS: + case ISD::FNEG: return LowerFABSorFNEG(Op, DAG); + case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); + case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG); + case ISD::SETCC: return LowerSETCC(Op, DAG); + case ISD::SETCCE: return LowerSETCCE(Op, DAG); + case ISD::SELECT: return LowerSELECT(Op, DAG); + case ISD::BRCOND: return LowerBRCOND(Op, DAG); + case ISD::JumpTable: return LowerJumpTable(Op, DAG); + case ISD::VASTART: return LowerVASTART(Op, DAG); + case ISD::VAARG: return LowerVAARG(Op, DAG); + case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG); + case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, Subtarget, DAG); + case ISD::INTRINSIC_VOID: + case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG); + case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); + case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); + case ISD::FRAME_TO_ARGS_OFFSET: + return LowerFRAME_TO_ARGS_OFFSET(Op, DAG); + case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); + case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG); + case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG); + case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG); + case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG); + case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG); + case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); + case ISD::CTLZ: return LowerCTLZ(Op, Subtarget, DAG); + case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ_ZERO_UNDEF(Op, Subtarget, DAG); + case ISD::CTTZ: + case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, DAG); + case ISD::MUL: return LowerMUL(Op, Subtarget, DAG); + case ISD::UMUL_LOHI: + case ISD::SMUL_LOHI: return LowerMUL_LOHI(Op, Subtarget, DAG); + case ISD::ROTL: return LowerRotate(Op, Subtarget, DAG); + case ISD::SRA: + case ISD::SRL: + case ISD::SHL: return LowerShift(Op, Subtarget, DAG); + case ISD::SADDO: + case ISD::UADDO: + case ISD::SSUBO: + case ISD::USUBO: + case ISD::SMULO: + case ISD::UMULO: return LowerXALUO(Op, DAG); + case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG); + case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG); + case ISD::ADDC: + case ISD::ADDE: + case ISD::SUBC: + case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG); + case ISD::ADD: return LowerADD(Op, DAG); + case ISD::SUB: return LowerSUB(Op, DAG); + case ISD::SMAX: + case ISD::SMIN: + case ISD::UMAX: + case ISD::UMIN: return LowerMINMAX(Op, DAG); + case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG); + case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG); + case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG); + case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG); + case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG); + case ISD::GC_TRANSITION_START: + return LowerGC_TRANSITION_START(Op, DAG); + case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION_END(Op, DAG); + } +} + +/// ReplaceNodeResults - Replace a node with an illegal result type +/// with a new node built out of custom code. 
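+/// The replacement values are pushed onto Results in the same order as the
+/// original node's results.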
+void X86TargetLowering::ReplaceNodeResults(SDNode *N, + SmallVectorImpl<SDValue>&Results, + SelectionDAG &DAG) const { + SDLoc dl(N); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + switch (N->getOpcode()) { + default: + llvm_unreachable("Do not know how to custom type legalize this operation!"); + case X86ISD::AVG: { + // Legalize types for X86ISD::AVG by expanding vectors. + assert(Subtarget->hasSSE2() && "Requires at least SSE2!"); + + auto InVT = N->getValueType(0); + auto InVTSize = InVT.getSizeInBits(); + const unsigned RegSize = + (InVTSize > 128) ? ((InVTSize > 256) ? 512 : 256) : 128; + assert((!Subtarget->hasAVX512() || RegSize < 512) && + "512-bit vector requires AVX512"); + assert((!Subtarget->hasAVX2() || RegSize < 256) && + "256-bit vector requires AVX2"); + + auto ElemVT = InVT.getVectorElementType(); + auto RegVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, + RegSize / ElemVT.getSizeInBits()); + assert(RegSize % InVT.getSizeInBits() == 0); + unsigned NumConcat = RegSize / InVT.getSizeInBits(); + + SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT)); + Ops[0] = N->getOperand(0); + SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops); + Ops[0] = N->getOperand(1); + SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops); + + SDValue Res = DAG.getNode(X86ISD::AVG, dl, RegVT, InVec0, InVec1); + Results.push_back(DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InVT, Res, + DAG.getIntPtrConstant(0, dl))); + return; + } + // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32. + case X86ISD::FMINC: + case X86ISD::FMIN: + case X86ISD::FMAXC: + case X86ISD::FMAX: { + EVT VT = N->getValueType(0); + assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX."); + SDValue UNDEF = DAG.getUNDEF(VT); + SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, + N->getOperand(0), UNDEF); + SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, + N->getOperand(1), UNDEF); + Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS)); + return; + } + case ISD::SIGN_EXTEND_INREG: + case ISD::ADDC: + case ISD::ADDE: + case ISD::SUBC: + case ISD::SUBE: + // We don't want to expand or promote these. + return; + case ISD::SDIV: + case ISD::UDIV: + case ISD::SREM: + case ISD::UREM: + case ISD::SDIVREM: + case ISD::UDIVREM: { + SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG); + Results.push_back(V); + return; + } + case ISD::FP_TO_SINT: + case ISD::FP_TO_UINT: { + bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT; + + std::pair<SDValue,SDValue> Vals = + FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true); + SDValue FIST = Vals.first, StackSlot = Vals.second; + if (FIST.getNode()) { + EVT VT = N->getValueType(0); + // Return a load from the stack slot. 
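+ // The FIST path stores the converted integer into a stack temporary and
+ // returns the store's chain, so the illegal-typed result is produced by
+ // reloading that slot; when no stack slot was used, FIST itself is the
+ // converted value.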
+ if (StackSlot.getNode()) + Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot, + MachinePointerInfo(), + false, false, false, 0)); + else + Results.push_back(FIST); + } + return; + } + case ISD::UINT_TO_FP: { + assert(Subtarget->hasSSE2() && "Requires at least SSE2!"); + if (N->getOperand(0).getValueType() != MVT::v2i32 || + N->getValueType(0) != MVT::v2f32) + return; + SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, + N->getOperand(0)); + SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, + MVT::f64); + SDValue VBias = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2f64, Bias, Bias); + SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn, + DAG.getBitcast(MVT::v2i64, VBias)); + Or = DAG.getBitcast(MVT::v2f64, Or); + // TODO: Are there any fast-math-flags to propagate here? + SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias); + Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub)); + return; + } + case ISD::FP_ROUND: { + if (!TLI.isTypeLegal(N->getOperand(0).getValueType())) + return; + SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0)); + Results.push_back(V); + return; + } + case ISD::FP_EXTEND: { + // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND. + // No other ValueType for FP_EXTEND should reach this point. + assert(N->getValueType(0) == MVT::v2f32 && + "Do not know how to legalize this Node"); + return; + } + case ISD::INTRINSIC_W_CHAIN: { + unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); + switch (IntNo) { + default : llvm_unreachable("Do not know how to custom type " + "legalize this intrinsic operation!"); + case Intrinsic::x86_rdtsc: + return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget, + Results); + case Intrinsic::x86_rdtscp: + return getReadTimeStampCounter(N, dl, X86ISD::RDTSCP_DAG, DAG, Subtarget, + Results); + case Intrinsic::x86_rdpmc: + return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results); + } + } + case ISD::INTRINSIC_WO_CHAIN: { + if (SDValue V = LowerINTRINSIC_WO_CHAIN(SDValue(N, 0), Subtarget, DAG)) + Results.push_back(V); + return; + } + case ISD::READCYCLECOUNTER: { + return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget, + Results); + } + case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: { + EVT T = N->getValueType(0); + assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair"); + bool Regs64bit = T == MVT::i128; + MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32; + SDValue cpInL, cpInH; + cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2), + DAG.getConstant(0, dl, HalfT)); + cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2), + DAG.getConstant(1, dl, HalfT)); + cpInL = DAG.getCopyToReg(N->getOperand(0), dl, + Regs64bit ? X86::RAX : X86::EAX, + cpInL, SDValue()); + cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl, + Regs64bit ? X86::RDX : X86::EDX, + cpInH, cpInL.getValue(1)); + SDValue swapInL, swapInH; + swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3), + DAG.getConstant(0, dl, HalfT)); + swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3), + DAG.getConstant(1, dl, HalfT)); + swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl, + Regs64bit ? X86::RBX : X86::EBX, + swapInL, cpInH.getValue(1)); + swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl, + Regs64bit ? 
X86::RCX : X86::ECX, + swapInH, swapInL.getValue(1)); + SDValue Ops[] = { swapInH.getValue(0), + N->getOperand(1), + swapInH.getValue(1) }; + SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); + MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand(); + unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_DAG : + X86ISD::LCMPXCHG8_DAG; + SDValue Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO); + SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl, + Regs64bit ? X86::RAX : X86::EAX, + HalfT, Result.getValue(1)); + SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl, + Regs64bit ? X86::RDX : X86::EDX, + HalfT, cpOutL.getValue(2)); + SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)}; + + SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS, + MVT::i32, cpOutH.getValue(2)); + SDValue Success = + DAG.getNode(X86ISD::SETCC, dl, MVT::i8, + DAG.getConstant(X86::COND_E, dl, MVT::i8), EFLAGS); + Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1)); + + Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF)); + Results.push_back(Success); + Results.push_back(EFLAGS.getValue(1)); + return; + } + case ISD::ATOMIC_SWAP: + case ISD::ATOMIC_LOAD_ADD: + case ISD::ATOMIC_LOAD_SUB: + case ISD::ATOMIC_LOAD_AND: + case ISD::ATOMIC_LOAD_OR: + case ISD::ATOMIC_LOAD_XOR: + case ISD::ATOMIC_LOAD_NAND: + case ISD::ATOMIC_LOAD_MIN: + case ISD::ATOMIC_LOAD_MAX: + case ISD::ATOMIC_LOAD_UMIN: + case ISD::ATOMIC_LOAD_UMAX: + case ISD::ATOMIC_LOAD: { + // Delegate to generic TypeLegalization. Situations we can really handle + // should have already been dealt with by AtomicExpandPass.cpp. + break; + } + case ISD::BITCAST: { + assert(Subtarget->hasSSE2() && "Requires at least SSE2!"); + EVT DstVT = N->getValueType(0); + EVT SrcVT = N->getOperand(0)->getValueType(0); + + if (SrcVT != MVT::f64 || + (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8)) + return; + + unsigned NumElts = DstVT.getVectorNumElements(); + EVT SVT = DstVT.getVectorElementType(); + EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2); + SDValue Expanded = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, + MVT::v2f64, N->getOperand(0)); + SDValue ToVecInt = DAG.getBitcast(WiderVT, Expanded); + + if (ExperimentalVectorWideningLegalization) { + // If we are legalizing vectors by widening, we already have the desired + // legal vector type, just return it. 
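+ // (For example, an f64 -> v2i32 bitcast has already been rewritten above as
+ // a v2f64 -> v4i32 bitcast, and v4i32 is exactly the widened type the
+ // legalizer expects for the illegal v2i32 result.)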
+ Results.push_back(ToVecInt); + return; + } + + SmallVector<SDValue, 8> Elts; + for (unsigned i = 0, e = NumElts; i != e; ++i) + Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, + ToVecInt, DAG.getIntPtrConstant(i, dl))); + + Results.push_back(DAG.getNode(ISD::BUILD_VECTOR, dl, DstVT, Elts)); + } + } +} + +const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { + switch ((X86ISD::NodeType)Opcode) { + case X86ISD::FIRST_NUMBER: break; + case X86ISD::BSF: return "X86ISD::BSF"; + case X86ISD::BSR: return "X86ISD::BSR"; + case X86ISD::SHLD: return "X86ISD::SHLD"; + case X86ISD::SHRD: return "X86ISD::SHRD"; + case X86ISD::FAND: return "X86ISD::FAND"; + case X86ISD::FANDN: return "X86ISD::FANDN"; + case X86ISD::FOR: return "X86ISD::FOR"; + case X86ISD::FXOR: return "X86ISD::FXOR"; + case X86ISD::FILD: return "X86ISD::FILD"; + case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG"; + case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM"; + case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM"; + case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM"; + case X86ISD::FLD: return "X86ISD::FLD"; + case X86ISD::FST: return "X86ISD::FST"; + case X86ISD::CALL: return "X86ISD::CALL"; + case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG"; + case X86ISD::RDTSCP_DAG: return "X86ISD::RDTSCP_DAG"; + case X86ISD::RDPMC_DAG: return "X86ISD::RDPMC_DAG"; + case X86ISD::BT: return "X86ISD::BT"; + case X86ISD::CMP: return "X86ISD::CMP"; + case X86ISD::COMI: return "X86ISD::COMI"; + case X86ISD::UCOMI: return "X86ISD::UCOMI"; + case X86ISD::CMPM: return "X86ISD::CMPM"; + case X86ISD::CMPMU: return "X86ISD::CMPMU"; + case X86ISD::CMPM_RND: return "X86ISD::CMPM_RND"; + case X86ISD::SETCC: return "X86ISD::SETCC"; + case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY"; + case X86ISD::FSETCC: return "X86ISD::FSETCC"; + case X86ISD::FGETSIGNx86: return "X86ISD::FGETSIGNx86"; + case X86ISD::CMOV: return "X86ISD::CMOV"; + case X86ISD::BRCOND: return "X86ISD::BRCOND"; + case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG"; + case X86ISD::IRET: return "X86ISD::IRET"; + case X86ISD::REP_STOS: return "X86ISD::REP_STOS"; + case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS"; + case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg"; + case X86ISD::Wrapper: return "X86ISD::Wrapper"; + case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP"; + case X86ISD::MOVDQ2Q: return "X86ISD::MOVDQ2Q"; + case X86ISD::MMX_MOVD2W: return "X86ISD::MMX_MOVD2W"; + case X86ISD::MMX_MOVW2D: return "X86ISD::MMX_MOVW2D"; + case X86ISD::PEXTRB: return "X86ISD::PEXTRB"; + case X86ISD::PEXTRW: return "X86ISD::PEXTRW"; + case X86ISD::INSERTPS: return "X86ISD::INSERTPS"; + case X86ISD::PINSRB: return "X86ISD::PINSRB"; + case X86ISD::PINSRW: return "X86ISD::PINSRW"; + case X86ISD::MMX_PINSRW: return "X86ISD::MMX_PINSRW"; + case X86ISD::PSHUFB: return "X86ISD::PSHUFB"; + case X86ISD::ANDNP: return "X86ISD::ANDNP"; + case X86ISD::PSIGN: return "X86ISD::PSIGN"; + case X86ISD::BLENDI: return "X86ISD::BLENDI"; + case X86ISD::SHRUNKBLEND: return "X86ISD::SHRUNKBLEND"; + case X86ISD::ADDUS: return "X86ISD::ADDUS"; + case X86ISD::SUBUS: return "X86ISD::SUBUS"; + case X86ISD::HADD: return "X86ISD::HADD"; + case X86ISD::HSUB: return "X86ISD::HSUB"; + case X86ISD::FHADD: return "X86ISD::FHADD"; + case X86ISD::FHSUB: return "X86ISD::FHSUB"; + case X86ISD::ABS: return "X86ISD::ABS"; + case X86ISD::CONFLICT: return "X86ISD::CONFLICT"; + case X86ISD::FMAX: return "X86ISD::FMAX"; + case X86ISD::FMAX_RND: 
return "X86ISD::FMAX_RND"; + case X86ISD::FMIN: return "X86ISD::FMIN"; + case X86ISD::FMIN_RND: return "X86ISD::FMIN_RND"; + case X86ISD::FMAXC: return "X86ISD::FMAXC"; + case X86ISD::FMINC: return "X86ISD::FMINC"; + case X86ISD::FRSQRT: return "X86ISD::FRSQRT"; + case X86ISD::FRCP: return "X86ISD::FRCP"; + case X86ISD::EXTRQI: return "X86ISD::EXTRQI"; + case X86ISD::INSERTQI: return "X86ISD::INSERTQI"; + case X86ISD::TLSADDR: return "X86ISD::TLSADDR"; + case X86ISD::TLSBASEADDR: return "X86ISD::TLSBASEADDR"; + case X86ISD::TLSCALL: return "X86ISD::TLSCALL"; + case X86ISD::EH_SJLJ_SETJMP: return "X86ISD::EH_SJLJ_SETJMP"; + case X86ISD::EH_SJLJ_LONGJMP: return "X86ISD::EH_SJLJ_LONGJMP"; + case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN"; + case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN"; + case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m"; + case X86ISD::FNSTSW16r: return "X86ISD::FNSTSW16r"; + case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG"; + case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG"; + case X86ISD::LCMPXCHG16_DAG: return "X86ISD::LCMPXCHG16_DAG"; + case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL"; + case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD"; + case X86ISD::VZEXT: return "X86ISD::VZEXT"; + case X86ISD::VSEXT: return "X86ISD::VSEXT"; + case X86ISD::VTRUNC: return "X86ISD::VTRUNC"; + case X86ISD::VTRUNCS: return "X86ISD::VTRUNCS"; + case X86ISD::VTRUNCUS: return "X86ISD::VTRUNCUS"; + case X86ISD::VINSERT: return "X86ISD::VINSERT"; + case X86ISD::VFPEXT: return "X86ISD::VFPEXT"; + case X86ISD::VFPROUND: return "X86ISD::VFPROUND"; + case X86ISD::CVTDQ2PD: return "X86ISD::CVTDQ2PD"; + case X86ISD::CVTUDQ2PD: return "X86ISD::CVTUDQ2PD"; + case X86ISD::CVT2MASK: return "X86ISD::CVT2MASK"; + case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ"; + case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ"; + case X86ISD::VSHL: return "X86ISD::VSHL"; + case X86ISD::VSRL: return "X86ISD::VSRL"; + case X86ISD::VSRA: return "X86ISD::VSRA"; + case X86ISD::VSHLI: return "X86ISD::VSHLI"; + case X86ISD::VSRLI: return "X86ISD::VSRLI"; + case X86ISD::VSRAI: return "X86ISD::VSRAI"; + case X86ISD::CMPP: return "X86ISD::CMPP"; + case X86ISD::PCMPEQ: return "X86ISD::PCMPEQ"; + case X86ISD::PCMPGT: return "X86ISD::PCMPGT"; + case X86ISD::PCMPEQM: return "X86ISD::PCMPEQM"; + case X86ISD::PCMPGTM: return "X86ISD::PCMPGTM"; + case X86ISD::ADD: return "X86ISD::ADD"; + case X86ISD::SUB: return "X86ISD::SUB"; + case X86ISD::ADC: return "X86ISD::ADC"; + case X86ISD::SBB: return "X86ISD::SBB"; + case X86ISD::SMUL: return "X86ISD::SMUL"; + case X86ISD::UMUL: return "X86ISD::UMUL"; + case X86ISD::SMUL8: return "X86ISD::SMUL8"; + case X86ISD::UMUL8: return "X86ISD::UMUL8"; + case X86ISD::SDIVREM8_SEXT_HREG: return "X86ISD::SDIVREM8_SEXT_HREG"; + case X86ISD::UDIVREM8_ZEXT_HREG: return "X86ISD::UDIVREM8_ZEXT_HREG"; + case X86ISD::INC: return "X86ISD::INC"; + case X86ISD::DEC: return "X86ISD::DEC"; + case X86ISD::OR: return "X86ISD::OR"; + case X86ISD::XOR: return "X86ISD::XOR"; + case X86ISD::AND: return "X86ISD::AND"; + case X86ISD::BEXTR: return "X86ISD::BEXTR"; + case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM"; + case X86ISD::PTEST: return "X86ISD::PTEST"; + case X86ISD::TESTP: return "X86ISD::TESTP"; + case X86ISD::TESTM: return "X86ISD::TESTM"; + case X86ISD::TESTNM: return "X86ISD::TESTNM"; + case X86ISD::KORTEST: return "X86ISD::KORTEST"; + case X86ISD::KTEST: return "X86ISD::KTEST"; + case X86ISD::PACKSS: return "X86ISD::PACKSS"; + case X86ISD::PACKUS: return "X86ISD::PACKUS"; + case 
X86ISD::PALIGNR: return "X86ISD::PALIGNR"; + case X86ISD::VALIGN: return "X86ISD::VALIGN"; + case X86ISD::PSHUFD: return "X86ISD::PSHUFD"; + case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW"; + case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW"; + case X86ISD::SHUFP: return "X86ISD::SHUFP"; + case X86ISD::SHUF128: return "X86ISD::SHUF128"; + case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS"; + case X86ISD::MOVLHPD: return "X86ISD::MOVLHPD"; + case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS"; + case X86ISD::MOVLPS: return "X86ISD::MOVLPS"; + case X86ISD::MOVLPD: return "X86ISD::MOVLPD"; + case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP"; + case X86ISD::MOVSHDUP: return "X86ISD::MOVSHDUP"; + case X86ISD::MOVSLDUP: return "X86ISD::MOVSLDUP"; + case X86ISD::MOVSD: return "X86ISD::MOVSD"; + case X86ISD::MOVSS: return "X86ISD::MOVSS"; + case X86ISD::UNPCKL: return "X86ISD::UNPCKL"; + case X86ISD::UNPCKH: return "X86ISD::UNPCKH"; + case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST"; + case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM"; + case X86ISD::SUBV_BROADCAST: return "X86ISD::SUBV_BROADCAST"; + case X86ISD::VEXTRACT: return "X86ISD::VEXTRACT"; + case X86ISD::VPERMILPV: return "X86ISD::VPERMILPV"; + case X86ISD::VPERMILPI: return "X86ISD::VPERMILPI"; + case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128"; + case X86ISD::VPERMV: return "X86ISD::VPERMV"; + case X86ISD::VPERMV3: return "X86ISD::VPERMV3"; + case X86ISD::VPERMIV3: return "X86ISD::VPERMIV3"; + case X86ISD::VPERMI: return "X86ISD::VPERMI"; + case X86ISD::VPTERNLOG: return "X86ISD::VPTERNLOG"; + case X86ISD::VFIXUPIMM: return "X86ISD::VFIXUPIMM"; + case X86ISD::VRANGE: return "X86ISD::VRANGE"; + case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ"; + case X86ISD::PMULDQ: return "X86ISD::PMULDQ"; + case X86ISD::PSADBW: return "X86ISD::PSADBW"; + case X86ISD::DBPSADBW: return "X86ISD::DBPSADBW"; + case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS"; + case X86ISD::VAARG_64: return "X86ISD::VAARG_64"; + case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA"; + case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER"; + case X86ISD::MFENCE: return "X86ISD::MFENCE"; + case X86ISD::SFENCE: return "X86ISD::SFENCE"; + case X86ISD::LFENCE: return "X86ISD::LFENCE"; + case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA"; + case X86ISD::SAHF: return "X86ISD::SAHF"; + case X86ISD::RDRAND: return "X86ISD::RDRAND"; + case X86ISD::RDSEED: return "X86ISD::RDSEED"; + case X86ISD::VPMADDUBSW: return "X86ISD::VPMADDUBSW"; + case X86ISD::VPMADDWD: return "X86ISD::VPMADDWD"; + case X86ISD::VPROT: return "X86ISD::VPROT"; + case X86ISD::VPROTI: return "X86ISD::VPROTI"; + case X86ISD::VPSHA: return "X86ISD::VPSHA"; + case X86ISD::VPSHL: return "X86ISD::VPSHL"; + case X86ISD::VPCOM: return "X86ISD::VPCOM"; + case X86ISD::VPCOMU: return "X86ISD::VPCOMU"; + case X86ISD::FMADD: return "X86ISD::FMADD"; + case X86ISD::FMSUB: return "X86ISD::FMSUB"; + case X86ISD::FNMADD: return "X86ISD::FNMADD"; + case X86ISD::FNMSUB: return "X86ISD::FNMSUB"; + case X86ISD::FMADDSUB: return "X86ISD::FMADDSUB"; + case X86ISD::FMSUBADD: return "X86ISD::FMSUBADD"; + case X86ISD::FMADD_RND: return "X86ISD::FMADD_RND"; + case X86ISD::FNMADD_RND: return "X86ISD::FNMADD_RND"; + case X86ISD::FMSUB_RND: return "X86ISD::FMSUB_RND"; + case X86ISD::FNMSUB_RND: return "X86ISD::FNMSUB_RND"; + case X86ISD::FMADDSUB_RND: return "X86ISD::FMADDSUB_RND"; + case X86ISD::FMSUBADD_RND: return "X86ISD::FMSUBADD_RND"; + case X86ISD::VRNDSCALE: return "X86ISD::VRNDSCALE"; + case X86ISD::VREDUCE: 
return "X86ISD::VREDUCE"; + case X86ISD::VGETMANT: return "X86ISD::VGETMANT"; + case X86ISD::PCMPESTRI: return "X86ISD::PCMPESTRI"; + case X86ISD::PCMPISTRI: return "X86ISD::PCMPISTRI"; + case X86ISD::XTEST: return "X86ISD::XTEST"; + case X86ISD::COMPRESS: return "X86ISD::COMPRESS"; + case X86ISD::EXPAND: return "X86ISD::EXPAND"; + case X86ISD::SELECT: return "X86ISD::SELECT"; + case X86ISD::ADDSUB: return "X86ISD::ADDSUB"; + case X86ISD::RCP28: return "X86ISD::RCP28"; + case X86ISD::EXP2: return "X86ISD::EXP2"; + case X86ISD::RSQRT28: return "X86ISD::RSQRT28"; + case X86ISD::FADD_RND: return "X86ISD::FADD_RND"; + case X86ISD::FSUB_RND: return "X86ISD::FSUB_RND"; + case X86ISD::FMUL_RND: return "X86ISD::FMUL_RND"; + case X86ISD::FDIV_RND: return "X86ISD::FDIV_RND"; + case X86ISD::FSQRT_RND: return "X86ISD::FSQRT_RND"; + case X86ISD::FGETEXP_RND: return "X86ISD::FGETEXP_RND"; + case X86ISD::SCALEF: return "X86ISD::SCALEF"; + case X86ISD::ADDS: return "X86ISD::ADDS"; + case X86ISD::SUBS: return "X86ISD::SUBS"; + case X86ISD::AVG: return "X86ISD::AVG"; + case X86ISD::MULHRS: return "X86ISD::MULHRS"; + case X86ISD::SINT_TO_FP_RND: return "X86ISD::SINT_TO_FP_RND"; + case X86ISD::UINT_TO_FP_RND: return "X86ISD::UINT_TO_FP_RND"; + case X86ISD::FP_TO_SINT_RND: return "X86ISD::FP_TO_SINT_RND"; + case X86ISD::FP_TO_UINT_RND: return "X86ISD::FP_TO_UINT_RND"; + case X86ISD::VFPCLASS: return "X86ISD::VFPCLASS"; + case X86ISD::VFPCLASSS: return "X86ISD::VFPCLASSS"; + } + return nullptr; +} + +// isLegalAddressingMode - Return true if the addressing mode represented +// by AM is legal for this target, for a load/store of the specified type. +bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL, + const AddrMode &AM, Type *Ty, + unsigned AS) const { + // X86 supports extremely general addressing modes. + CodeModel::Model M = getTargetMachine().getCodeModel(); + Reloc::Model R = getTargetMachine().getRelocationModel(); + + // X86 allows a sign-extended 32-bit immediate field as a displacement. + if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr)) + return false; + + if (AM.BaseGV) { + unsigned GVFlags = + Subtarget->ClassifyGlobalReference(AM.BaseGV, getTargetMachine()); + + // If a reference to this global requires an extra load, we can't fold it. + if (isGlobalStubReference(GVFlags)) + return false; + + // If BaseGV requires a register for the PIC base, we cannot also have a + // BaseReg specified. + if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags)) + return false; + + // If lower 4G is not available, then we must use rip-relative addressing. + if ((M != CodeModel::Small || R != Reloc::Static) && + Subtarget->is64Bit() && (AM.BaseOffs || AM.Scale > 1)) + return false; + } + + switch (AM.Scale) { + case 0: + case 1: + case 2: + case 4: + case 8: + // These scales always work. + break; + case 3: + case 5: + case 9: + // These scales are formed with basereg+scalereg. Only accept if there is + // no basereg yet. + if (AM.HasBaseReg) + return false; + break; + default: // Other stuff never works. + return false; + } + + return true; +} + +bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const { + unsigned Bits = Ty->getScalarSizeInBits(); + + // 8-bit shifts are always expensive, but versions with a scalar amount aren't + // particularly cheaper than those without. + if (Bits == 8) + return false; + + // On AVX2 there are new vpsllv[dq] instructions (and other shifts), that make + // variable shifts just as cheap as scalar ones. 
+ if (Subtarget->hasInt256() && (Bits == 32 || Bits == 64)) + return false; + + // Otherwise, it's significantly cheaper to shift by a scalar amount than by a + // fully general vector. + return true; +} + +bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const { + if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) + return false; + unsigned NumBits1 = Ty1->getPrimitiveSizeInBits(); + unsigned NumBits2 = Ty2->getPrimitiveSizeInBits(); + return NumBits1 > NumBits2; +} + +bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const { + if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) + return false; + + if (!isTypeLegal(EVT::getEVT(Ty1))) + return false; + + assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop"); + + // Assuming the caller doesn't have a zeroext or signext return parameter, + // truncation all the way down to i1 is valid. + return true; +} + +bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const { + return isInt<32>(Imm); +} + +bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const { + // Can also use sub to handle negated immediates. + return isInt<32>(Imm); +} + +bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const { + if (!VT1.isInteger() || !VT2.isInteger()) + return false; + unsigned NumBits1 = VT1.getSizeInBits(); + unsigned NumBits2 = VT2.getSizeInBits(); + return NumBits1 > NumBits2; +} + +bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const { + // x86-64 implicitly zero-extends 32-bit results in 64-bit registers. + return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget->is64Bit(); +} + +bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const { + // x86-64 implicitly zero-extends 32-bit results in 64-bit registers. + return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit(); +} + +bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const { + EVT VT1 = Val.getValueType(); + if (isZExtFree(VT1, VT2)) + return true; + + if (Val.getOpcode() != ISD::LOAD) + return false; + + if (!VT1.isSimple() || !VT1.isInteger() || + !VT2.isSimple() || !VT2.isInteger()) + return false; + + switch (VT1.getSimpleVT().SimpleTy) { + default: break; + case MVT::i8: + case MVT::i16: + case MVT::i32: + // X86 has 8, 16, and 32-bit zero-extending loads. + return true; + } + + return false; +} + +bool X86TargetLowering::isVectorLoadExtDesirable(SDValue) const { return true; } + +bool +X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { + if (!Subtarget->hasAnyFMA()) + return false; + + VT = VT.getScalarType(); + + if (!VT.isSimple()) + return false; + + switch (VT.getSimpleVT().SimpleTy) { + case MVT::f32: + case MVT::f64: + return true; + default: + break; + } + + return false; +} + +bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const { + // i16 instructions are longer (0x66 prefix) and potentially slower. + return !(VT1 == MVT::i32 && VT2 == MVT::i16); +} + +/// isShuffleMaskLegal - Targets can use this to indicate that they only +/// support *some* VECTOR_SHUFFLE operations, those with specific masks. +/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values +/// are assumed to be legal. +bool +X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M, + EVT VT) const { + if (!VT.isSimple()) + return false; + + // Not for i1 vectors + if (VT.getSimpleVT().getScalarType() == MVT::i1) + return false; + + // Very little shuffling can be done for 64-bit vectors right now. 
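+ // (64-bit vectors are the MMX-sized types, for which only a handful of
+ // fixed patterns exist, so general shuffles are rejected outright.)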
+ if (VT.getSimpleVT().getSizeInBits() == 64) + return false; + + // We only care that the types being shuffled are legal. The lowering can + // handle any possible shuffle mask that results. + return isTypeLegal(VT.getSimpleVT()); +} + +bool +X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask, + EVT VT) const { + // Just delegate to the generic legality, clear masks aren't special. + return isShuffleMaskLegal(Mask, VT); +} + +//===----------------------------------------------------------------------===// +// X86 Scheduler Hooks +//===----------------------------------------------------------------------===// + +/// Utility function to emit xbegin specifying the start of an RTM region. +static MachineBasicBlock *EmitXBegin(MachineInstr *MI, MachineBasicBlock *MBB, + const TargetInstrInfo *TII) { + DebugLoc DL = MI->getDebugLoc(); + + const BasicBlock *BB = MBB->getBasicBlock(); + MachineFunction::iterator I = ++MBB->getIterator(); + + // For the v = xbegin(), we generate + // + // thisMBB: + // xbegin sinkMBB + // + // mainMBB: + // eax = -1 + // + // sinkMBB: + // v = eax + + MachineBasicBlock *thisMBB = MBB; + MachineFunction *MF = MBB->getParent(); + MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB); + MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB); + MF->insert(I, mainMBB); + MF->insert(I, sinkMBB); + + // Transfer the remainder of BB and its successor edges to sinkMBB. + sinkMBB->splice(sinkMBB->begin(), MBB, + std::next(MachineBasicBlock::iterator(MI)), MBB->end()); + sinkMBB->transferSuccessorsAndUpdatePHIs(MBB); + + // thisMBB: + // xbegin sinkMBB + // # fallthrough to mainMBB + // # abortion to sinkMBB + BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(sinkMBB); + thisMBB->addSuccessor(mainMBB); + thisMBB->addSuccessor(sinkMBB); + + // mainMBB: + // EAX = -1 + BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), X86::EAX).addImm(-1); + mainMBB->addSuccessor(sinkMBB); + + // sinkMBB: + // EAX is live into the sinkMBB + sinkMBB->addLiveIn(X86::EAX); + BuildMI(*sinkMBB, sinkMBB->begin(), DL, + TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg()) + .addReg(X86::EAX); + + MI->eraseFromParent(); + return sinkMBB; +} + +// FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8 +// or XMM0_V32I8 in AVX all of this code can be replaced with that +// in the .td file. 
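+// EmitPCMPSTRM expands the PCMPxSTRM pseudos: it rebuilds the real
+// (V)PCMPISTRM/(V)PCMPESTRM instruction from the pseudo's explicit operands
+// and then copies the implicit XMM0 result into the pseudo's destination
+// register.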
+static MachineBasicBlock *EmitPCMPSTRM(MachineInstr *MI, MachineBasicBlock *BB, + const TargetInstrInfo *TII) { + unsigned Opc; + switch (MI->getOpcode()) { + default: llvm_unreachable("illegal opcode!"); + case X86::PCMPISTRM128REG: Opc = X86::PCMPISTRM128rr; break; + case X86::VPCMPISTRM128REG: Opc = X86::VPCMPISTRM128rr; break; + case X86::PCMPISTRM128MEM: Opc = X86::PCMPISTRM128rm; break; + case X86::VPCMPISTRM128MEM: Opc = X86::VPCMPISTRM128rm; break; + case X86::PCMPESTRM128REG: Opc = X86::PCMPESTRM128rr; break; + case X86::VPCMPESTRM128REG: Opc = X86::VPCMPESTRM128rr; break; + case X86::PCMPESTRM128MEM: Opc = X86::PCMPESTRM128rm; break; + case X86::VPCMPESTRM128MEM: Opc = X86::VPCMPESTRM128rm; break; + } + + DebugLoc dl = MI->getDebugLoc(); + MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc)); + + unsigned NumArgs = MI->getNumOperands(); + for (unsigned i = 1; i < NumArgs; ++i) { + MachineOperand &Op = MI->getOperand(i); + if (!(Op.isReg() && Op.isImplicit())) + MIB.addOperand(Op); + } + if (MI->hasOneMemOperand()) + MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); + + BuildMI(*BB, MI, dl, + TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg()) + .addReg(X86::XMM0); + + MI->eraseFromParent(); + return BB; +} + +// FIXME: Custom handling because TableGen doesn't support multiple implicit +// defs in an instruction pattern +static MachineBasicBlock *EmitPCMPSTRI(MachineInstr *MI, MachineBasicBlock *BB, + const TargetInstrInfo *TII) { + unsigned Opc; + switch (MI->getOpcode()) { + default: llvm_unreachable("illegal opcode!"); + case X86::PCMPISTRIREG: Opc = X86::PCMPISTRIrr; break; + case X86::VPCMPISTRIREG: Opc = X86::VPCMPISTRIrr; break; + case X86::PCMPISTRIMEM: Opc = X86::PCMPISTRIrm; break; + case X86::VPCMPISTRIMEM: Opc = X86::VPCMPISTRIrm; break; + case X86::PCMPESTRIREG: Opc = X86::PCMPESTRIrr; break; + case X86::VPCMPESTRIREG: Opc = X86::VPCMPESTRIrr; break; + case X86::PCMPESTRIMEM: Opc = X86::PCMPESTRIrm; break; + case X86::VPCMPESTRIMEM: Opc = X86::VPCMPESTRIrm; break; + } + + DebugLoc dl = MI->getDebugLoc(); + MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc)); + + unsigned NumArgs = MI->getNumOperands(); // remove the results + for (unsigned i = 1; i < NumArgs; ++i) { + MachineOperand &Op = MI->getOperand(i); + if (!(Op.isReg() && Op.isImplicit())) + MIB.addOperand(Op); + } + if (MI->hasOneMemOperand()) + MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); + + BuildMI(*BB, MI, dl, + TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg()) + .addReg(X86::ECX); + + MI->eraseFromParent(); + return BB; +} + +static MachineBasicBlock *EmitWRPKRU(MachineInstr *MI, MachineBasicBlock *BB, + const X86Subtarget *Subtarget) { + DebugLoc dl = MI->getDebugLoc(); + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + + // insert input VAL into EAX + BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EAX) + .addReg(MI->getOperand(0).getReg()); + // insert zero to ECX + BuildMI(*BB, MI, dl, TII->get(X86::XOR32rr), X86::ECX) + .addReg(X86::ECX) + .addReg(X86::ECX); + // insert zero to EDX + BuildMI(*BB, MI, dl, TII->get(X86::XOR32rr), X86::EDX) + .addReg(X86::EDX) + .addReg(X86::EDX); + // insert WRPKRU instruction + BuildMI(*BB, MI, dl, TII->get(X86::WRPKRUr)); + + MI->eraseFromParent(); // The pseudo is gone now. 
+ return BB; +} + +static MachineBasicBlock *EmitRDPKRU(MachineInstr *MI, MachineBasicBlock *BB, + const X86Subtarget *Subtarget) { + DebugLoc dl = MI->getDebugLoc(); + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + + // insert zero to ECX + BuildMI(*BB, MI, dl, TII->get(X86::XOR32rr), X86::ECX) + .addReg(X86::ECX) + .addReg(X86::ECX); + // insert RDPKRU instruction + BuildMI(*BB, MI, dl, TII->get(X86::RDPKRUr)); + BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg()) + .addReg(X86::EAX); + + MI->eraseFromParent(); // The pseudo is gone now. + return BB; +} + +static MachineBasicBlock *EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB, + const X86Subtarget *Subtarget) { + DebugLoc dl = MI->getDebugLoc(); + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + // Address into RAX/EAX, other two args into ECX, EDX. + unsigned MemOpc = Subtarget->is64Bit() ? X86::LEA64r : X86::LEA32r; + unsigned MemReg = Subtarget->is64Bit() ? X86::RAX : X86::EAX; + MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg); + for (int i = 0; i < X86::AddrNumOperands; ++i) + MIB.addOperand(MI->getOperand(i)); + + unsigned ValOps = X86::AddrNumOperands; + BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX) + .addReg(MI->getOperand(ValOps).getReg()); + BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX) + .addReg(MI->getOperand(ValOps+1).getReg()); + + // The instruction doesn't actually take any operands though. + BuildMI(*BB, MI, dl, TII->get(X86::MONITORrrr)); + + MI->eraseFromParent(); // The pseudo is gone now. + return BB; +} + +MachineBasicBlock * +X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr *MI, + MachineBasicBlock *MBB) const { + // Emit va_arg instruction on X86-64. + + // Operands to this pseudo-instruction: + // 0 ) Output : destination address (reg) + // 1-5) Input : va_list address (addr, i64mem) + // 6 ) ArgSize : Size (in bytes) of vararg type + // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset + // 8 ) Align : Alignment of type + // 9 ) EFLAGS (implicit-def) + + assert(MI->getNumOperands() == 10 && "VAARG_64 should have 10 operands!"); + static_assert(X86::AddrNumOperands == 5, + "VAARG_64 assumes 5 address operands"); + + unsigned DestReg = MI->getOperand(0).getReg(); + MachineOperand &Base = MI->getOperand(1); + MachineOperand &Scale = MI->getOperand(2); + MachineOperand &Index = MI->getOperand(3); + MachineOperand &Disp = MI->getOperand(4); + MachineOperand &Segment = MI->getOperand(5); + unsigned ArgSize = MI->getOperand(6).getImm(); + unsigned ArgMode = MI->getOperand(7).getImm(); + unsigned Align = MI->getOperand(8).getImm(); + + // Memory Reference + assert(MI->hasOneMemOperand() && "Expected VAARG_64 to have one memoperand"); + MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin(); + MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end(); + + // Machine Information + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); + const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64); + const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32); + DebugLoc DL = MI->getDebugLoc(); + + // struct va_list { + // i32 gp_offset + // i32 fp_offset + // i64 overflow_area (address) + // i64 reg_save_area (address) + // } + // sizeof(va_list) = 24 + // alignment(va_list) = 8 + + unsigned TotalNumIntRegs = 6; + unsigned TotalNumXMMRegs = 8; + bool UseGPOffset = (ArgMode == 1); + bool UseFPOffset = 
(ArgMode == 2); + unsigned MaxOffset = TotalNumIntRegs * 8 + + (UseFPOffset ? TotalNumXMMRegs * 16 : 0); + + /* Align ArgSize to a multiple of 8 */ + unsigned ArgSizeA8 = (ArgSize + 7) & ~7; + bool NeedsAlign = (Align > 8); + + MachineBasicBlock *thisMBB = MBB; + MachineBasicBlock *overflowMBB; + MachineBasicBlock *offsetMBB; + MachineBasicBlock *endMBB; + + unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB + unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB + unsigned OffsetReg = 0; + + if (!UseGPOffset && !UseFPOffset) { + // If we only pull from the overflow region, we don't create a branch. + // We don't need to alter control flow. + OffsetDestReg = 0; // unused + OverflowDestReg = DestReg; + + offsetMBB = nullptr; + overflowMBB = thisMBB; + endMBB = thisMBB; + } else { + // First emit code to check if gp_offset (or fp_offset) is below the bound. + // If so, pull the argument from reg_save_area. (branch to offsetMBB) + // If not, pull from overflow_area. (branch to overflowMBB) + // + // thisMBB + // | . + // | . + // offsetMBB overflowMBB + // | . + // | . + // endMBB + + // Registers for the PHI in endMBB + OffsetDestReg = MRI.createVirtualRegister(AddrRegClass); + OverflowDestReg = MRI.createVirtualRegister(AddrRegClass); + + const BasicBlock *LLVM_BB = MBB->getBasicBlock(); + MachineFunction *MF = MBB->getParent(); + overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB); + offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB); + endMBB = MF->CreateMachineBasicBlock(LLVM_BB); + + MachineFunction::iterator MBBIter = ++MBB->getIterator(); + + // Insert the new basic blocks + MF->insert(MBBIter, offsetMBB); + MF->insert(MBBIter, overflowMBB); + MF->insert(MBBIter, endMBB); + + // Transfer the remainder of MBB and its successor edges to endMBB. + endMBB->splice(endMBB->begin(), thisMBB, + std::next(MachineBasicBlock::iterator(MI)), thisMBB->end()); + endMBB->transferSuccessorsAndUpdatePHIs(thisMBB); + + // Make offsetMBB and overflowMBB successors of thisMBB + thisMBB->addSuccessor(offsetMBB); + thisMBB->addSuccessor(overflowMBB); + + // endMBB is a successor of both offsetMBB and overflowMBB + offsetMBB->addSuccessor(endMBB); + overflowMBB->addSuccessor(endMBB); + + // Load the offset value into a register + OffsetReg = MRI.createVirtualRegister(OffsetRegClass); + BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg) + .addOperand(Base) + .addOperand(Scale) + .addOperand(Index) + .addDisp(Disp, UseFPOffset ? 4 : 0) + .addOperand(Segment) + .setMemRefs(MMOBegin, MMOEnd); + + // Check if there is enough room left to pull this argument. + BuildMI(thisMBB, DL, TII->get(X86::CMP32ri)) + .addReg(OffsetReg) + .addImm(MaxOffset + 8 - ArgSizeA8); + + // Branch to "overflowMBB" if offset >= max + // Fall through to "offsetMBB" otherwise + BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE))) + .addMBB(overflowMBB); + } + + // In offsetMBB, emit code to use the reg_save_area. + if (offsetMBB) { + assert(OffsetReg != 0); + + // Read the reg_save_area address. 
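+ // reg_save_area sits at offset 16 in the va_list (gp_offset at 0, fp_offset
+ // at 4, overflow_area at 8), hence the displacement of 16 below.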
+ unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass); + BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg) + .addOperand(Base) + .addOperand(Scale) + .addOperand(Index) + .addDisp(Disp, 16) + .addOperand(Segment) + .setMemRefs(MMOBegin, MMOEnd); + + // Zero-extend the offset + unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass); + BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64) + .addImm(0) + .addReg(OffsetReg) + .addImm(X86::sub_32bit); + + // Add the offset to the reg_save_area to get the final address. + BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg) + .addReg(OffsetReg64) + .addReg(RegSaveReg); + + // Compute the offset for the next argument + unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass); + BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg) + .addReg(OffsetReg) + .addImm(UseFPOffset ? 16 : 8); + + // Store it back into the va_list. + BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr)) + .addOperand(Base) + .addOperand(Scale) + .addOperand(Index) + .addDisp(Disp, UseFPOffset ? 4 : 0) + .addOperand(Segment) + .addReg(NextOffsetReg) + .setMemRefs(MMOBegin, MMOEnd); + + // Jump to endMBB + BuildMI(offsetMBB, DL, TII->get(X86::JMP_1)) + .addMBB(endMBB); + } + + // + // Emit code to use overflow area + // + + // Load the overflow_area address into a register. + unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass); + BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg) + .addOperand(Base) + .addOperand(Scale) + .addOperand(Index) + .addDisp(Disp, 8) + .addOperand(Segment) + .setMemRefs(MMOBegin, MMOEnd); + + // If we need to align it, do so. Otherwise, just copy the address + // to OverflowDestReg. + if (NeedsAlign) { + // Align the overflow address + assert((Align & (Align-1)) == 0 && "Alignment must be a power of 2"); + unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass); + + // aligned_addr = (addr + (align-1)) & ~(align-1) + BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg) + .addReg(OverflowAddrReg) + .addImm(Align-1); + + BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg) + .addReg(TmpReg) + .addImm(~(uint64_t)(Align-1)); + } else { + BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg) + .addReg(OverflowAddrReg); + } + + // Compute the next overflow address after this argument. + // (the overflow address should be kept 8-byte aligned) + unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass); + BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg) + .addReg(OverflowDestReg) + .addImm(ArgSizeA8); + + // Store the new overflow address. + BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr)) + .addOperand(Base) + .addOperand(Scale) + .addOperand(Index) + .addDisp(Disp, 8) + .addOperand(Segment) + .addReg(NextAddrReg) + .setMemRefs(MMOBegin, MMOEnd); + + // If we branched, emit the PHI to the front of endMBB. + if (offsetMBB) { + BuildMI(*endMBB, endMBB->begin(), DL, + TII->get(X86::PHI), DestReg) + .addReg(OffsetDestReg).addMBB(offsetMBB) + .addReg(OverflowDestReg).addMBB(overflowMBB); + } + + // Erase the pseudo instruction + MI->eraseFromParent(); + + return endMBB; +} + +MachineBasicBlock * +X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter( + MachineInstr *MI, + MachineBasicBlock *MBB) const { + // Emit code to save XMM registers to the stack. 
The ABI says that the + // number of registers to save is given in %al, so it's theoretically + // possible to do an indirect jump trick to avoid saving all of them, + // however this code takes a simpler approach and just executes all + // of the stores if %al is non-zero. It's less code, and it's probably + // easier on the hardware branch predictor, and stores aren't all that + // expensive anyway. + + // Create the new basic blocks. One block contains all the XMM stores, + // and one block is the final destination regardless of whether any + // stores were performed. + const BasicBlock *LLVM_BB = MBB->getBasicBlock(); + MachineFunction *F = MBB->getParent(); + MachineFunction::iterator MBBIter = ++MBB->getIterator(); + MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB); + F->insert(MBBIter, XMMSaveMBB); + F->insert(MBBIter, EndMBB); + + // Transfer the remainder of MBB and its successor edges to EndMBB. + EndMBB->splice(EndMBB->begin(), MBB, + std::next(MachineBasicBlock::iterator(MI)), MBB->end()); + EndMBB->transferSuccessorsAndUpdatePHIs(MBB); + + // The original block will now fall through to the XMM save block. + MBB->addSuccessor(XMMSaveMBB); + // The XMMSaveMBB will fall through to the end block. + XMMSaveMBB->addSuccessor(EndMBB); + + // Now add the instructions. + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + DebugLoc DL = MI->getDebugLoc(); + + unsigned CountReg = MI->getOperand(0).getReg(); + int64_t RegSaveFrameIndex = MI->getOperand(1).getImm(); + int64_t VarArgsFPOffset = MI->getOperand(2).getImm(); + + if (!Subtarget->isCallingConvWin64(F->getFunction()->getCallingConv())) { + // If %al is 0, branch around the XMM save block. + BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg); + BuildMI(MBB, DL, TII->get(X86::JE_1)).addMBB(EndMBB); + MBB->addSuccessor(EndMBB); + } + + // Make sure the last operand is EFLAGS, which gets clobbered by the branch + // that was just emitted, but clearly shouldn't be "saved". + assert((MI->getNumOperands() <= 3 || + !MI->getOperand(MI->getNumOperands() - 1).isReg() || + MI->getOperand(MI->getNumOperands() - 1).getReg() == X86::EFLAGS) + && "Expected last argument to be EFLAGS"); + unsigned MOVOpc = Subtarget->hasFp256() ? X86::VMOVAPSmr : X86::MOVAPSmr; + // In the XMM save block, save all the XMM argument registers. + for (int i = 3, e = MI->getNumOperands() - 1; i != e; ++i) { + int64_t Offset = (i - 3) * 16 + VarArgsFPOffset; + MachineMemOperand *MMO = F->getMachineMemOperand( + MachinePointerInfo::getFixedStack(*F, RegSaveFrameIndex, Offset), + MachineMemOperand::MOStore, + /*Size=*/16, /*Align=*/16); + BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc)) + .addFrameIndex(RegSaveFrameIndex) + .addImm(/*Scale=*/1) + .addReg(/*IndexReg=*/0) + .addImm(/*Disp=*/Offset) + .addReg(/*Segment=*/0) + .addReg(MI->getOperand(i).getReg()) + .addMemOperand(MMO); + } + + MI->eraseFromParent(); // The pseudo instruction is gone now. + + return EndMBB; +} + +// The EFLAGS operand of SelectItr might be missing a kill marker +// because there were multiple uses of EFLAGS, and ISel didn't know +// which to mark. Figure out whether SelectItr should have had a +// kill marker, and set it if it should. Returns the correct kill +// marker value. +static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr, + MachineBasicBlock* BB, + const TargetRegisterInfo* TRI) { + // Scan forward through BB for a use/def of EFLAGS. 
+ MachineBasicBlock::iterator miI(std::next(SelectItr)); + for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) { + const MachineInstr& mi = *miI; + if (mi.readsRegister(X86::EFLAGS)) + return false; + if (mi.definesRegister(X86::EFLAGS)) + break; // Should have kill-flag - update below. + } + + // If we hit the end of the block, check whether EFLAGS is live into a + // successor. + if (miI == BB->end()) { + for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(), + sEnd = BB->succ_end(); + sItr != sEnd; ++sItr) { + MachineBasicBlock* succ = *sItr; + if (succ->isLiveIn(X86::EFLAGS)) + return false; + } + } + + // We found a def, or hit the end of the basic block and EFLAGS wasn't live + // out. SelectMI should have a kill flag on EFLAGS. + SelectItr->addRegisterKilled(X86::EFLAGS, TRI); + return true; +} + +// Return true if it is OK for this CMOV pseudo-opcode to be cascaded +// together with other CMOV pseudo-opcodes into a single basic-block with +// conditional jump around it. +static bool isCMOVPseudo(MachineInstr *MI) { + switch (MI->getOpcode()) { + case X86::CMOV_FR32: + case X86::CMOV_FR64: + case X86::CMOV_GR8: + case X86::CMOV_GR16: + case X86::CMOV_GR32: + case X86::CMOV_RFP32: + case X86::CMOV_RFP64: + case X86::CMOV_RFP80: + case X86::CMOV_V2F64: + case X86::CMOV_V2I64: + case X86::CMOV_V4F32: + case X86::CMOV_V4F64: + case X86::CMOV_V4I64: + case X86::CMOV_V16F32: + case X86::CMOV_V8F32: + case X86::CMOV_V8F64: + case X86::CMOV_V8I64: + case X86::CMOV_V8I1: + case X86::CMOV_V16I1: + case X86::CMOV_V32I1: + case X86::CMOV_V64I1: + return true; + + default: + return false; + } +} + +MachineBasicBlock * +X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, + MachineBasicBlock *BB) const { + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + DebugLoc DL = MI->getDebugLoc(); + + // To "insert" a SELECT_CC instruction, we actually have to insert the + // diamond control-flow pattern. The incoming instruction knows the + // destination vreg to set, the condition code register to branch on, the + // true/false values to select between, and a branch opcode to use. + const BasicBlock *LLVM_BB = BB->getBasicBlock(); + MachineFunction::iterator It = ++BB->getIterator(); + + // thisMBB: + // ... + // TrueVal = ... + // cmpTY ccX, r1, r2 + // bCC copy1MBB + // fallthrough --> copy0MBB + MachineBasicBlock *thisMBB = BB; + MachineFunction *F = BB->getParent(); + + // This code lowers all pseudo-CMOV instructions. Generally it lowers these + // as described above, by inserting a BB, and then making a PHI at the join + // point to select the true and false operands of the CMOV in the PHI. + // + // The code also handles two different cases of multiple CMOV opcodes + // in a row. + // + // Case 1: + // In this case, there are multiple CMOVs in a row, all which are based on + // the same condition setting (or the exact opposite condition setting). + // In this case we can lower all the CMOVs using a single inserted BB, and + // then make a number of PHIs at the join point to model the CMOVs. The only + // trickiness here, is that in a case like: + // + // t2 = CMOV cond1 t1, f1 + // t3 = CMOV cond1 t2, f2 + // + // when rewriting this into PHIs, we have to perform some renaming on the + // temps since you cannot have a PHI operand refer to a PHI result earlier + // in the same block. The "simple" but wrong lowering would be: + // + // t2 = PHI t1(BB1), f1(BB2) + // t3 = PHI t2(BB1), f2(BB2) + // + // but clearly t2 is not defined in BB1, so that is incorrect. 
The proper + // renaming is to note that on the path through BB1, t2 is really just a + // copy of t1, and do that renaming, properly generating: + // + // t2 = PHI t1(BB1), f1(BB2) + // t3 = PHI t1(BB1), f2(BB2) + // + // Case 2, we lower cascaded CMOVs such as + // + // (CMOV (CMOV F, T, cc1), T, cc2) + // + // to two successives branches. For that, we look for another CMOV as the + // following instruction. + // + // Without this, we would add a PHI between the two jumps, which ends up + // creating a few copies all around. For instance, for + // + // (sitofp (zext (fcmp une))) + // + // we would generate: + // + // ucomiss %xmm1, %xmm0 + // movss <1.0f>, %xmm0 + // movaps %xmm0, %xmm1 + // jne .LBB5_2 + // xorps %xmm1, %xmm1 + // .LBB5_2: + // jp .LBB5_4 + // movaps %xmm1, %xmm0 + // .LBB5_4: + // retq + // + // because this custom-inserter would have generated: + // + // A + // | \ + // | B + // | / + // C + // | \ + // | D + // | / + // E + // + // A: X = ...; Y = ... + // B: empty + // C: Z = PHI [X, A], [Y, B] + // D: empty + // E: PHI [X, C], [Z, D] + // + // If we lower both CMOVs in a single step, we can instead generate: + // + // A + // | \ + // | C + // | /| + // |/ | + // | | + // | D + // | / + // E + // + // A: X = ...; Y = ... + // D: empty + // E: PHI [X, A], [X, C], [Y, D] + // + // Which, in our sitofp/fcmp example, gives us something like: + // + // ucomiss %xmm1, %xmm0 + // movss <1.0f>, %xmm0 + // jne .LBB5_4 + // jp .LBB5_4 + // xorps %xmm0, %xmm0 + // .LBB5_4: + // retq + // + MachineInstr *CascadedCMOV = nullptr; + MachineInstr *LastCMOV = MI; + X86::CondCode CC = X86::CondCode(MI->getOperand(3).getImm()); + X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC); + MachineBasicBlock::iterator NextMIIt = + std::next(MachineBasicBlock::iterator(MI)); + + // Check for case 1, where there are multiple CMOVs with the same condition + // first. Of the two cases of multiple CMOV lowerings, case 1 reduces the + // number of jumps the most. + + if (isCMOVPseudo(MI)) { + // See if we have a string of CMOVS with the same condition. + while (NextMIIt != BB->end() && + isCMOVPseudo(NextMIIt) && + (NextMIIt->getOperand(3).getImm() == CC || + NextMIIt->getOperand(3).getImm() == OppCC)) { + LastCMOV = &*NextMIIt; + ++NextMIIt; + } + } + + // This checks for case 2, but only do this if we didn't already find + // case 1, as indicated by LastCMOV == MI. + if (LastCMOV == MI && + NextMIIt != BB->end() && NextMIIt->getOpcode() == MI->getOpcode() && + NextMIIt->getOperand(2).getReg() == MI->getOperand(2).getReg() && + NextMIIt->getOperand(1).getReg() == MI->getOperand(0).getReg()) { + CascadedCMOV = &*NextMIIt; + } + + MachineBasicBlock *jcc1MBB = nullptr; + + // If we have a cascaded CMOV, we lower it to two successive branches to + // the same block. EFLAGS is used by both, so mark it as live in the second. + if (CascadedCMOV) { + jcc1MBB = F->CreateMachineBasicBlock(LLVM_BB); + F->insert(It, jcc1MBB); + jcc1MBB->addLiveIn(X86::EFLAGS); + } + + MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); + F->insert(It, copy0MBB); + F->insert(It, sinkMBB); + + // If the EFLAGS register isn't dead in the terminator, then claim that it's + // live into the sink and copy blocks. + const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo(); + + MachineInstr *LastEFLAGSUser = CascadedCMOV ? 
CascadedCMOV : LastCMOV; + if (!LastEFLAGSUser->killsRegister(X86::EFLAGS) && + !checkAndUpdateEFLAGSKill(LastEFLAGSUser, BB, TRI)) { + copy0MBB->addLiveIn(X86::EFLAGS); + sinkMBB->addLiveIn(X86::EFLAGS); + } + + // Transfer the remainder of BB and its successor edges to sinkMBB. + sinkMBB->splice(sinkMBB->begin(), BB, + std::next(MachineBasicBlock::iterator(LastCMOV)), BB->end()); + sinkMBB->transferSuccessorsAndUpdatePHIs(BB); + + // Add the true and fallthrough blocks as its successors. + if (CascadedCMOV) { + // The fallthrough block may be jcc1MBB, if we have a cascaded CMOV. + BB->addSuccessor(jcc1MBB); + + // In that case, jcc1MBB will itself fallthrough the copy0MBB, and + // jump to the sinkMBB. + jcc1MBB->addSuccessor(copy0MBB); + jcc1MBB->addSuccessor(sinkMBB); + } else { + BB->addSuccessor(copy0MBB); + } + + // The true block target of the first (or only) branch is always sinkMBB. + BB->addSuccessor(sinkMBB); + + // Create the conditional branch instruction. + unsigned Opc = X86::GetCondBranchFromCond(CC); + BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB); + + if (CascadedCMOV) { + unsigned Opc2 = X86::GetCondBranchFromCond( + (X86::CondCode)CascadedCMOV->getOperand(3).getImm()); + BuildMI(jcc1MBB, DL, TII->get(Opc2)).addMBB(sinkMBB); + } + + // copy0MBB: + // %FalseValue = ... + // # fallthrough to sinkMBB + copy0MBB->addSuccessor(sinkMBB); + + // sinkMBB: + // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] + // ... + MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI); + MachineBasicBlock::iterator MIItEnd = + std::next(MachineBasicBlock::iterator(LastCMOV)); + MachineBasicBlock::iterator SinkInsertionPoint = sinkMBB->begin(); + DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable; + MachineInstrBuilder MIB; + + // As we are creating the PHIs, we have to be careful if there is more than + // one. Later CMOVs may reference the results of earlier CMOVs, but later + // PHIs have to reference the individual true/false inputs from earlier PHIs. + // That also means that PHI construction must work forward from earlier to + // later, and that the code must maintain a mapping from earlier PHI's + // destination registers, and the registers that went into the PHI. + + for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) { + unsigned DestReg = MIIt->getOperand(0).getReg(); + unsigned Op1Reg = MIIt->getOperand(1).getReg(); + unsigned Op2Reg = MIIt->getOperand(2).getReg(); + + // If this CMOV we are generating is the opposite condition from + // the jump we generated, then we have to swap the operands for the + // PHI that is going to be generated. + if (MIIt->getOperand(3).getImm() == OppCC) + std::swap(Op1Reg, Op2Reg); + + if (RegRewriteTable.find(Op1Reg) != RegRewriteTable.end()) + Op1Reg = RegRewriteTable[Op1Reg].first; + + if (RegRewriteTable.find(Op2Reg) != RegRewriteTable.end()) + Op2Reg = RegRewriteTable[Op2Reg].second; + + MIB = BuildMI(*sinkMBB, SinkInsertionPoint, DL, + TII->get(X86::PHI), DestReg) + .addReg(Op1Reg).addMBB(copy0MBB) + .addReg(Op2Reg).addMBB(thisMBB); + + // Add this PHI to the rewrite table. + RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg); + } + + // If we have a cascaded CMOV, the second Jcc provides the same incoming + // value as the first Jcc (the True operand of the SELECT_CC/CMOV nodes). + if (CascadedCMOV) { + MIB.addReg(MI->getOperand(2).getReg()).addMBB(jcc1MBB); + // Copy the PHI result to the register defined by the second CMOV. 
+ BuildMI(*sinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())), + DL, TII->get(TargetOpcode::COPY), + CascadedCMOV->getOperand(0).getReg()) + .addReg(MI->getOperand(0).getReg()); + CascadedCMOV->eraseFromParent(); + } + + // Now remove the CMOV(s). + for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ) + (MIIt++)->eraseFromParent(); + + return sinkMBB; +} + +MachineBasicBlock * +X86TargetLowering::EmitLoweredAtomicFP(MachineInstr *MI, + MachineBasicBlock *BB) const { + // Combine the following atomic floating-point modification pattern: + // a.store(reg OP a.load(acquire), release) + // Transform them into: + // OPss (%gpr), %xmm + // movss %xmm, (%gpr) + // Or sd equivalent for 64-bit operations. + unsigned MOp, FOp; + switch (MI->getOpcode()) { + default: llvm_unreachable("unexpected instr type for EmitLoweredAtomicFP"); + case X86::RELEASE_FADD32mr: MOp = X86::MOVSSmr; FOp = X86::ADDSSrm; break; + case X86::RELEASE_FADD64mr: MOp = X86::MOVSDmr; FOp = X86::ADDSDrm; break; + } + const X86InstrInfo *TII = Subtarget->getInstrInfo(); + DebugLoc DL = MI->getDebugLoc(); + MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); + MachineOperand MSrc = MI->getOperand(0); + unsigned VSrc = MI->getOperand(5).getReg(); + const MachineOperand &Disp = MI->getOperand(3); + MachineOperand ZeroDisp = MachineOperand::CreateImm(0); + bool hasDisp = Disp.isGlobal() || Disp.isImm(); + if (hasDisp && MSrc.isReg()) + MSrc.setIsKill(false); + MachineInstrBuilder MIM = BuildMI(*BB, MI, DL, TII->get(MOp)) + .addOperand(/*Base=*/MSrc) + .addImm(/*Scale=*/1) + .addReg(/*Index=*/0) + .addDisp(hasDisp ? Disp : ZeroDisp, /*off=*/0) + .addReg(0); + MachineInstr *MIO = BuildMI(*BB, (MachineInstr *)MIM, DL, TII->get(FOp), + MRI.createVirtualRegister(MRI.getRegClass(VSrc))) + .addReg(VSrc) + .addOperand(/*Base=*/MSrc) + .addImm(/*Scale=*/1) + .addReg(/*Index=*/0) + .addDisp(hasDisp ? Disp : ZeroDisp, /*off=*/0) + .addReg(/*Segment=*/0); + MIM.addReg(MIO->getOperand(0).getReg(), RegState::Kill); + MI->eraseFromParent(); // The pseudo instruction is gone now. + return BB; +} + +MachineBasicBlock * +X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, + MachineBasicBlock *BB) const { + MachineFunction *MF = BB->getParent(); + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + DebugLoc DL = MI->getDebugLoc(); + const BasicBlock *LLVM_BB = BB->getBasicBlock(); + + assert(MF->shouldSplitStack()); + + const bool Is64Bit = Subtarget->is64Bit(); + const bool IsLP64 = Subtarget->isTarget64BitLP64(); + + const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS; + const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30; + + // BB: + // ... [Till the alloca] + // If stacklet is not large enough, jump to mallocMBB + // + // bumpMBB: + // Allocate by subtracting from RSP + // Jump to continueMBB + // + // mallocMBB: + // Allocate by call to runtime + // + // continueMBB: + // ... 
+ // [rest of original BB] + // + + MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB); + + MachineRegisterInfo &MRI = MF->getRegInfo(); + const TargetRegisterClass *AddrRegClass = + getRegClassFor(getPointerTy(MF->getDataLayout())); + + unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass), + bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass), + tmpSPVReg = MRI.createVirtualRegister(AddrRegClass), + SPLimitVReg = MRI.createVirtualRegister(AddrRegClass), + sizeVReg = MI->getOperand(1).getReg(), + physSPReg = IsLP64 || Subtarget->isTargetNaCl64() ? X86::RSP : X86::ESP; + + MachineFunction::iterator MBBIter = ++BB->getIterator(); + + MF->insert(MBBIter, bumpMBB); + MF->insert(MBBIter, mallocMBB); + MF->insert(MBBIter, continueMBB); + + continueMBB->splice(continueMBB->begin(), BB, + std::next(MachineBasicBlock::iterator(MI)), BB->end()); + continueMBB->transferSuccessorsAndUpdatePHIs(BB); + + // Add code to the main basic block to check if the stack limit has been hit, + // and if so, jump to mallocMBB otherwise to bumpMBB. + BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg); + BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg) + .addReg(tmpSPVReg).addReg(sizeVReg); + BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr)) + .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg) + .addReg(SPLimitVReg); + BuildMI(BB, DL, TII->get(X86::JG_1)).addMBB(mallocMBB); + + // bumpMBB simply decreases the stack pointer, since we know the current + // stacklet has enough space. + BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg) + .addReg(SPLimitVReg); + BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg) + .addReg(SPLimitVReg); + BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB); + + // Calls into a routine in libgcc to allocate more space from the heap. + const uint32_t *RegMask = + Subtarget->getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C); + if (IsLP64) { + BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI) + .addReg(sizeVReg); + BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32)) + .addExternalSymbol("__morestack_allocate_stack_space") + .addRegMask(RegMask) + .addReg(X86::RDI, RegState::Implicit) + .addReg(X86::RAX, RegState::ImplicitDefine); + } else if (Is64Bit) { + BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI) + .addReg(sizeVReg); + BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32)) + .addExternalSymbol("__morestack_allocate_stack_space") + .addRegMask(RegMask) + .addReg(X86::EDI, RegState::Implicit) + .addReg(X86::EAX, RegState::ImplicitDefine); + } else { + BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg) + .addImm(12); + BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg); + BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32)) + .addExternalSymbol("__morestack_allocate_stack_space") + .addRegMask(RegMask) + .addReg(X86::EAX, RegState::ImplicitDefine); + } + + if (!Is64Bit) + BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg) + .addImm(16); + + BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg) + .addReg(IsLP64 ? X86::RAX : X86::EAX); + BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB); + + // Set up the CFG correctly. 
+ BB->addSuccessor(bumpMBB); + BB->addSuccessor(mallocMBB); + mallocMBB->addSuccessor(continueMBB); + bumpMBB->addSuccessor(continueMBB); + + // Take care of the PHI nodes. + BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI), + MI->getOperand(0).getReg()) + .addReg(mallocPtrVReg).addMBB(mallocMBB) + .addReg(bumpSPPtrVReg).addMBB(bumpMBB); + + // Delete the original pseudo instruction. + MI->eraseFromParent(); + + // And we're done. + return continueMBB; +} + +MachineBasicBlock * +X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI, + MachineBasicBlock *BB) const { + assert(!Subtarget->isTargetMachO()); + DebugLoc DL = MI->getDebugLoc(); + MachineInstr *ResumeMI = Subtarget->getFrameLowering()->emitStackProbe( + *BB->getParent(), *BB, MI, DL, false); + MachineBasicBlock *ResumeBB = ResumeMI->getParent(); + MI->eraseFromParent(); // The pseudo instruction is gone now. + return ResumeBB; +} + +MachineBasicBlock * +X86TargetLowering::EmitLoweredCatchRet(MachineInstr *MI, + MachineBasicBlock *BB) const { + MachineFunction *MF = BB->getParent(); + const TargetInstrInfo &TII = *Subtarget->getInstrInfo(); + MachineBasicBlock *TargetMBB = MI->getOperand(0).getMBB(); + DebugLoc DL = MI->getDebugLoc(); + + assert(!isAsynchronousEHPersonality( + classifyEHPersonality(MF->getFunction()->getPersonalityFn())) && + "SEH does not use catchret!"); + + // Only 32-bit EH needs to worry about manually restoring stack pointers. + if (!Subtarget->is32Bit()) + return BB; + + // C++ EH creates a new target block to hold the restore code, and wires up + // the new block to the return destination with a normal JMP_4. + MachineBasicBlock *RestoreMBB = + MF->CreateMachineBasicBlock(BB->getBasicBlock()); + assert(BB->succ_size() == 1); + MF->insert(std::next(BB->getIterator()), RestoreMBB); + RestoreMBB->transferSuccessorsAndUpdatePHIs(BB); + BB->addSuccessor(RestoreMBB); + MI->getOperand(0).setMBB(RestoreMBB); + + auto RestoreMBBI = RestoreMBB->begin(); + BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::EH_RESTORE)); + BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::JMP_4)).addMBB(TargetMBB); + return BB; +} + +MachineBasicBlock * +X86TargetLowering::EmitLoweredCatchPad(MachineInstr *MI, + MachineBasicBlock *BB) const { + MachineFunction *MF = BB->getParent(); + const Constant *PerFn = MF->getFunction()->getPersonalityFn(); + bool IsSEH = isAsynchronousEHPersonality(classifyEHPersonality(PerFn)); + // Only 32-bit SEH requires special handling for catchpad. + if (IsSEH && Subtarget->is32Bit()) { + const TargetInstrInfo &TII = *Subtarget->getInstrInfo(); + DebugLoc DL = MI->getDebugLoc(); + BuildMI(*BB, MI, DL, TII.get(X86::EH_RESTORE)); + } + MI->eraseFromParent(); + return BB; +} + +MachineBasicBlock * +X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI, + MachineBasicBlock *BB) const { + // This is pretty easy. We're taking the value that we received from + // our load from the relocation, sticking it in either RDI (x86-64) + // or EAX and doing an indirect call. The return value will then + // be in the normal return register. + MachineFunction *F = BB->getParent(); + const X86InstrInfo *TII = Subtarget->getInstrInfo(); + DebugLoc DL = MI->getDebugLoc(); + + assert(Subtarget->isTargetDarwin() && "Darwin only instr emitted?"); + assert(MI->getOperand(3).isGlobal() && "This should be a global"); + + // Get a register mask for the lowered call. + // FIXME: The 32-bit calls have non-standard calling conventions. Use a + // proper register mask. 
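As a point of reference (an assumption about typical Darwin codegen, not something this patch states), the 64-bit sequence built below corresponds to a source-level thread-local access along these lines; the variable and function names are hypothetical:

// Hypothetical source that reaches this lowering on Darwin x86-64:
thread_local int tls_counter;                 // illustrative variable only
int *addressOfTLS() { return &tls_counter; }  // lowered to the pattern sketched below

// Expected shape of the emitted code (sketch):
//   movq  _tls_counter@TLVP(%rip), %rdi   # load the TLV descriptor address
//   callq *(%rdi)                         # indirect call through the descriptor
//   # address of the thread-local is returned in %rax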
+ const uint32_t *RegMask = + Subtarget->is64Bit() ? + Subtarget->getRegisterInfo()->getDarwinTLSCallPreservedMask() : + Subtarget->getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C); + if (Subtarget->is64Bit()) { + MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, + TII->get(X86::MOV64rm), X86::RDI) + .addReg(X86::RIP) + .addImm(0).addReg(0) + .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, + MI->getOperand(3).getTargetFlags()) + .addReg(0); + MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m)); + addDirectMem(MIB, X86::RDI); + MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask); + } else if (F->getTarget().getRelocationModel() != Reloc::PIC_) { + MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, + TII->get(X86::MOV32rm), X86::EAX) + .addReg(0) + .addImm(0).addReg(0) + .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, + MI->getOperand(3).getTargetFlags()) + .addReg(0); + MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m)); + addDirectMem(MIB, X86::EAX); + MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask); + } else { + MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, + TII->get(X86::MOV32rm), X86::EAX) + .addReg(TII->getGlobalBaseReg(F)) + .addImm(0).addReg(0) + .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, + MI->getOperand(3).getTargetFlags()) + .addReg(0); + MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m)); + addDirectMem(MIB, X86::EAX); + MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask); + } + + MI->eraseFromParent(); // The pseudo instruction is gone now. + return BB; +} + +MachineBasicBlock * +X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI, + MachineBasicBlock *MBB) const { + DebugLoc DL = MI->getDebugLoc(); + MachineFunction *MF = MBB->getParent(); + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + + const BasicBlock *BB = MBB->getBasicBlock(); + MachineFunction::iterator I = ++MBB->getIterator(); + + // Memory Reference + MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin(); + MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end(); + + unsigned DstReg; + unsigned MemOpndSlot = 0; + + unsigned CurOp = 0; + + DstReg = MI->getOperand(CurOp++).getReg(); + const TargetRegisterClass *RC = MRI.getRegClass(DstReg); + assert(RC->hasType(MVT::i32) && "Invalid destination!"); + unsigned mainDstReg = MRI.createVirtualRegister(RC); + unsigned restoreDstReg = MRI.createVirtualRegister(RC); + + MemOpndSlot = CurOp; + + MVT PVT = getPointerTy(MF->getDataLayout()); + assert((PVT == MVT::i64 || PVT == MVT::i32) && + "Invalid Pointer Size!"); + + // For v = setjmp(buf), we generate + // + // thisMBB: + // buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB + // SjLjSetup restoreMBB + // + // mainMBB: + // v_main = 0 + // + // sinkMBB: + // v = phi(main, restore) + // + // restoreMBB: + // if base pointer being used, load it from frame + // v_restore = 1 + + MachineBasicBlock *thisMBB = MBB; + MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB); + MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB); + MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB); + MF->insert(I, mainMBB); + MF->insert(I, sinkMBB); + MF->push_back(restoreMBB); + restoreMBB->setHasAddressTaken(); + + MachineInstrBuilder MIB; + + // Transfer the remainder of BB and its successor edges to sinkMBB. 
+ sinkMBB->splice(sinkMBB->begin(), MBB, + std::next(MachineBasicBlock::iterator(MI)), MBB->end()); + sinkMBB->transferSuccessorsAndUpdatePHIs(MBB); + + // thisMBB: + unsigned PtrStoreOpc = 0; + unsigned LabelReg = 0; + const int64_t LabelOffset = 1 * PVT.getStoreSize(); + Reloc::Model RM = MF->getTarget().getRelocationModel(); + bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) && + (RM == Reloc::Static || RM == Reloc::DynamicNoPIC); + + // Prepare IP either in reg or imm. + if (!UseImmLabel) { + PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr; + const TargetRegisterClass *PtrRC = getRegClassFor(PVT); + LabelReg = MRI.createVirtualRegister(PtrRC); + if (Subtarget->is64Bit()) { + MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg) + .addReg(X86::RIP) + .addImm(0) + .addReg(0) + .addMBB(restoreMBB) + .addReg(0); + } else { + const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII); + MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg) + .addReg(XII->getGlobalBaseReg(MF)) + .addImm(0) + .addReg(0) + .addMBB(restoreMBB, Subtarget->ClassifyBlockAddressReference()) + .addReg(0); + } + } else + PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi; + // Store IP + MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc)); + for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { + if (i == X86::AddrDisp) + MIB.addDisp(MI->getOperand(MemOpndSlot + i), LabelOffset); + else + MIB.addOperand(MI->getOperand(MemOpndSlot + i)); + } + if (!UseImmLabel) + MIB.addReg(LabelReg); + else + MIB.addMBB(restoreMBB); + MIB.setMemRefs(MMOBegin, MMOEnd); + // Setup + MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup)) + .addMBB(restoreMBB); + + const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); + MIB.addRegMask(RegInfo->getNoPreservedMask()); + thisMBB->addSuccessor(mainMBB); + thisMBB->addSuccessor(restoreMBB); + + // mainMBB: + // EAX = 0 + BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg); + mainMBB->addSuccessor(sinkMBB); + + // sinkMBB: + BuildMI(*sinkMBB, sinkMBB->begin(), DL, + TII->get(X86::PHI), DstReg) + .addReg(mainDstReg).addMBB(mainMBB) + .addReg(restoreDstReg).addMBB(restoreMBB); + + // restoreMBB: + if (RegInfo->hasBasePointer(*MF)) { + const bool Uses64BitFramePtr = + Subtarget->isTarget64BitLP64() || Subtarget->isTargetNaCl64(); + X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>(); + X86FI->setRestoreBasePointer(MF); + unsigned FramePtr = RegInfo->getFrameRegister(*MF); + unsigned BasePtr = RegInfo->getBaseRegister(); + unsigned Opm = Uses64BitFramePtr ? 
X86::MOV64rm : X86::MOV32rm; + addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr), + FramePtr, true, X86FI->getRestoreBasePointerOffset()) + .setMIFlag(MachineInstr::FrameSetup); + } + BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1); + BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB); + restoreMBB->addSuccessor(sinkMBB); + + MI->eraseFromParent(); + return sinkMBB; +} + +MachineBasicBlock * +X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI, + MachineBasicBlock *MBB) const { + DebugLoc DL = MI->getDebugLoc(); + MachineFunction *MF = MBB->getParent(); + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + + // Memory Reference + MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin(); + MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end(); + + MVT PVT = getPointerTy(MF->getDataLayout()); + assert((PVT == MVT::i64 || PVT == MVT::i32) && + "Invalid Pointer Size!"); + + const TargetRegisterClass *RC = + (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass; + unsigned Tmp = MRI.createVirtualRegister(RC); + // Since FP is only updated here but NOT referenced, it's treated as GPR. + const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); + unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP; + unsigned SP = RegInfo->getStackRegister(); + + MachineInstrBuilder MIB; + + const int64_t LabelOffset = 1 * PVT.getStoreSize(); + const int64_t SPOffset = 2 * PVT.getStoreSize(); + + unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm; + unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r; + + // Reload FP + MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), FP); + for (unsigned i = 0; i < X86::AddrNumOperands; ++i) + MIB.addOperand(MI->getOperand(i)); + MIB.setMemRefs(MMOBegin, MMOEnd); + // Reload IP + MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), Tmp); + for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { + if (i == X86::AddrDisp) + MIB.addDisp(MI->getOperand(i), LabelOffset); + else + MIB.addOperand(MI->getOperand(i)); + } + MIB.setMemRefs(MMOBegin, MMOEnd); + // Reload SP + MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), SP); + for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { + if (i == X86::AddrDisp) + MIB.addDisp(MI->getOperand(i), SPOffset); + else + MIB.addOperand(MI->getOperand(i)); + } + MIB.setMemRefs(MMOBegin, MMOEnd); + // Jump + BuildMI(*MBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp); + + MI->eraseFromParent(); + return MBB; +} + +// Replace 213-type (isel default) FMA3 instructions with 231-type for +// accumulator loops. Writing back to the accumulator allows the coalescer +// to remove extra copies in the loop. +// FIXME: Do this on AVX512. We don't support 231 variants yet (PR23937). +MachineBasicBlock * +X86TargetLowering::emitFMA3Instr(MachineInstr *MI, + MachineBasicBlock *MBB) const { + MachineOperand &AddendOp = MI->getOperand(3); + + // Bail out early if the addend isn't a register - we can't switch these. + if (!AddendOp.isReg()) + return MBB; + + MachineFunction &MF = *MBB->getParent(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + + // Check whether the addend is defined by a PHI: + assert(MRI.hasOneDef(AddendOp.getReg()) && "Multiple defs in SSA?"); + MachineInstr &AddendDef = *MRI.def_instr_begin(AddendOp.getReg()); + if (!AddendDef.isPHI()) + return MBB; + + // Look for the following pattern: + // loop: + // %addend = phi [%entry, 0], [%loop, %result] + // ... 
+ // %result<tied1> = FMA213 %m2<tied0>, %m1, %addend + + // Replace with: + // loop: + // %addend = phi [%entry, 0], [%loop, %result] + // ... + // %result<tied1> = FMA231 %addend<tied0>, %m1, %m2 + + for (unsigned i = 1, e = AddendDef.getNumOperands(); i < e; i += 2) { + assert(AddendDef.getOperand(i).isReg()); + MachineOperand PHISrcOp = AddendDef.getOperand(i); + MachineInstr &PHISrcInst = *MRI.def_instr_begin(PHISrcOp.getReg()); + if (&PHISrcInst == MI) { + // Found a matching instruction. + unsigned NewFMAOpc = 0; + switch (MI->getOpcode()) { + case X86::VFMADDPDr213r: NewFMAOpc = X86::VFMADDPDr231r; break; + case X86::VFMADDPSr213r: NewFMAOpc = X86::VFMADDPSr231r; break; + case X86::VFMADDSDr213r: NewFMAOpc = X86::VFMADDSDr231r; break; + case X86::VFMADDSSr213r: NewFMAOpc = X86::VFMADDSSr231r; break; + case X86::VFMSUBPDr213r: NewFMAOpc = X86::VFMSUBPDr231r; break; + case X86::VFMSUBPSr213r: NewFMAOpc = X86::VFMSUBPSr231r; break; + case X86::VFMSUBSDr213r: NewFMAOpc = X86::VFMSUBSDr231r; break; + case X86::VFMSUBSSr213r: NewFMAOpc = X86::VFMSUBSSr231r; break; + case X86::VFNMADDPDr213r: NewFMAOpc = X86::VFNMADDPDr231r; break; + case X86::VFNMADDPSr213r: NewFMAOpc = X86::VFNMADDPSr231r; break; + case X86::VFNMADDSDr213r: NewFMAOpc = X86::VFNMADDSDr231r; break; + case X86::VFNMADDSSr213r: NewFMAOpc = X86::VFNMADDSSr231r; break; + case X86::VFNMSUBPDr213r: NewFMAOpc = X86::VFNMSUBPDr231r; break; + case X86::VFNMSUBPSr213r: NewFMAOpc = X86::VFNMSUBPSr231r; break; + case X86::VFNMSUBSDr213r: NewFMAOpc = X86::VFNMSUBSDr231r; break; + case X86::VFNMSUBSSr213r: NewFMAOpc = X86::VFNMSUBSSr231r; break; + case X86::VFMADDSUBPDr213r: NewFMAOpc = X86::VFMADDSUBPDr231r; break; + case X86::VFMADDSUBPSr213r: NewFMAOpc = X86::VFMADDSUBPSr231r; break; + case X86::VFMSUBADDPDr213r: NewFMAOpc = X86::VFMSUBADDPDr231r; break; + case X86::VFMSUBADDPSr213r: NewFMAOpc = X86::VFMSUBADDPSr231r; break; + + case X86::VFMADDPDr213rY: NewFMAOpc = X86::VFMADDPDr231rY; break; + case X86::VFMADDPSr213rY: NewFMAOpc = X86::VFMADDPSr231rY; break; + case X86::VFMSUBPDr213rY: NewFMAOpc = X86::VFMSUBPDr231rY; break; + case X86::VFMSUBPSr213rY: NewFMAOpc = X86::VFMSUBPSr231rY; break; + case X86::VFNMADDPDr213rY: NewFMAOpc = X86::VFNMADDPDr231rY; break; + case X86::VFNMADDPSr213rY: NewFMAOpc = X86::VFNMADDPSr231rY; break; + case X86::VFNMSUBPDr213rY: NewFMAOpc = X86::VFNMSUBPDr231rY; break; + case X86::VFNMSUBPSr213rY: NewFMAOpc = X86::VFNMSUBPSr231rY; break; + case X86::VFMADDSUBPDr213rY: NewFMAOpc = X86::VFMADDSUBPDr231rY; break; + case X86::VFMADDSUBPSr213rY: NewFMAOpc = X86::VFMADDSUBPSr231rY; break; + case X86::VFMSUBADDPDr213rY: NewFMAOpc = X86::VFMSUBADDPDr231rY; break; + case X86::VFMSUBADDPSr213rY: NewFMAOpc = X86::VFMSUBADDPSr231rY; break; + default: llvm_unreachable("Unrecognized FMA variant."); + } + + const TargetInstrInfo &TII = *Subtarget->getInstrInfo(); + MachineInstrBuilder MIB = + BuildMI(MF, MI->getDebugLoc(), TII.get(NewFMAOpc)) + .addOperand(MI->getOperand(0)) + .addOperand(MI->getOperand(3)) + .addOperand(MI->getOperand(2)) + .addOperand(MI->getOperand(1)); + MBB->insert(MachineBasicBlock::iterator(MI), MIB); + MI->eraseFromParent(); + } + } + + return MBB; +} + +MachineBasicBlock * +X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, + MachineBasicBlock *BB) const { + switch (MI->getOpcode()) { + default: llvm_unreachable("Unexpected instr type to insert"); + case X86::TAILJMPd64: + case X86::TAILJMPr64: + case X86::TAILJMPm64: + case X86::TAILJMPd64_REX: + case X86::TAILJMPr64_REX: + 
case X86::TAILJMPm64_REX: + llvm_unreachable("TAILJMP64 would not be touched here."); + case X86::TCRETURNdi64: + case X86::TCRETURNri64: + case X86::TCRETURNmi64: + return BB; + case X86::WIN_ALLOCA: + return EmitLoweredWinAlloca(MI, BB); + case X86::CATCHRET: + return EmitLoweredCatchRet(MI, BB); + case X86::CATCHPAD: + return EmitLoweredCatchPad(MI, BB); + case X86::SEG_ALLOCA_32: + case X86::SEG_ALLOCA_64: + return EmitLoweredSegAlloca(MI, BB); + case X86::TLSCall_32: + case X86::TLSCall_64: + return EmitLoweredTLSCall(MI, BB); + case X86::CMOV_FR32: + case X86::CMOV_FR64: + case X86::CMOV_FR128: + case X86::CMOV_GR8: + case X86::CMOV_GR16: + case X86::CMOV_GR32: + case X86::CMOV_RFP32: + case X86::CMOV_RFP64: + case X86::CMOV_RFP80: + case X86::CMOV_V2F64: + case X86::CMOV_V2I64: + case X86::CMOV_V4F32: + case X86::CMOV_V4F64: + case X86::CMOV_V4I64: + case X86::CMOV_V16F32: + case X86::CMOV_V8F32: + case X86::CMOV_V8F64: + case X86::CMOV_V8I64: + case X86::CMOV_V8I1: + case X86::CMOV_V16I1: + case X86::CMOV_V32I1: + case X86::CMOV_V64I1: + return EmitLoweredSelect(MI, BB); + + case X86::RDFLAGS32: + case X86::RDFLAGS64: { + DebugLoc DL = MI->getDebugLoc(); + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + unsigned PushF = + MI->getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64; + unsigned Pop = + MI->getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r; + BuildMI(*BB, MI, DL, TII->get(PushF)); + BuildMI(*BB, MI, DL, TII->get(Pop), MI->getOperand(0).getReg()); + + MI->eraseFromParent(); // The pseudo is gone now. + return BB; + } + + case X86::WRFLAGS32: + case X86::WRFLAGS64: { + DebugLoc DL = MI->getDebugLoc(); + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + unsigned Push = + MI->getOpcode() == X86::WRFLAGS32 ? X86::PUSH32r : X86::PUSH64r; + unsigned PopF = + MI->getOpcode() == X86::WRFLAGS32 ? X86::POPF32 : X86::POPF64; + BuildMI(*BB, MI, DL, TII->get(Push)).addReg(MI->getOperand(0).getReg()); + BuildMI(*BB, MI, DL, TII->get(PopF)); + + MI->eraseFromParent(); // The pseudo is gone now. + return BB; + } + + case X86::RELEASE_FADD32mr: + case X86::RELEASE_FADD64mr: + return EmitLoweredAtomicFP(MI, BB); + + case X86::FP32_TO_INT16_IN_MEM: + case X86::FP32_TO_INT32_IN_MEM: + case X86::FP32_TO_INT64_IN_MEM: + case X86::FP64_TO_INT16_IN_MEM: + case X86::FP64_TO_INT32_IN_MEM: + case X86::FP64_TO_INT64_IN_MEM: + case X86::FP80_TO_INT16_IN_MEM: + case X86::FP80_TO_INT32_IN_MEM: + case X86::FP80_TO_INT64_IN_MEM: { + MachineFunction *F = BB->getParent(); + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + DebugLoc DL = MI->getDebugLoc(); + + // Change the floating point control register to use "round towards zero" + // mode when truncating to an integer value. + int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false); + addFrameReference(BuildMI(*BB, MI, DL, + TII->get(X86::FNSTCW16m)), CWFrameIdx); + + // Load the old value of the high byte of the control word... + unsigned OldCW = + F->getRegInfo().createVirtualRegister(&X86::GR16RegClass); + addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW), + CWFrameIdx); + + // Set the high part to be round to zero... + addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx) + .addImm(0xC7F); + + // Reload the modified control word now... 
+ addFrameReference(BuildMI(*BB, MI, DL, + TII->get(X86::FLDCW16m)), CWFrameIdx); + + // Restore the memory image of control word to original value + addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx) + .addReg(OldCW); + + // Get the X86 opcode to use. + unsigned Opc; + switch (MI->getOpcode()) { + default: llvm_unreachable("illegal opcode!"); + case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break; + case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break; + case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break; + case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break; + case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break; + case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break; + case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break; + case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break; + case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break; + } + + X86AddressMode AM; + MachineOperand &Op = MI->getOperand(0); + if (Op.isReg()) { + AM.BaseType = X86AddressMode::RegBase; + AM.Base.Reg = Op.getReg(); + } else { + AM.BaseType = X86AddressMode::FrameIndexBase; + AM.Base.FrameIndex = Op.getIndex(); + } + Op = MI->getOperand(1); + if (Op.isImm()) + AM.Scale = Op.getImm(); + Op = MI->getOperand(2); + if (Op.isImm()) + AM.IndexReg = Op.getImm(); + Op = MI->getOperand(3); + if (Op.isGlobal()) { + AM.GV = Op.getGlobal(); + } else { + AM.Disp = Op.getImm(); + } + addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM) + .addReg(MI->getOperand(X86::AddrNumOperands).getReg()); + + // Reload the original control word now. + addFrameReference(BuildMI(*BB, MI, DL, + TII->get(X86::FLDCW16m)), CWFrameIdx); + + MI->eraseFromParent(); // The pseudo instruction is gone now. + return BB; + } + // String/text processing lowering. + case X86::PCMPISTRM128REG: + case X86::VPCMPISTRM128REG: + case X86::PCMPISTRM128MEM: + case X86::VPCMPISTRM128MEM: + case X86::PCMPESTRM128REG: + case X86::VPCMPESTRM128REG: + case X86::PCMPESTRM128MEM: + case X86::VPCMPESTRM128MEM: + assert(Subtarget->hasSSE42() && + "Target must have SSE4.2 or AVX features enabled"); + return EmitPCMPSTRM(MI, BB, Subtarget->getInstrInfo()); + + // String/text processing lowering. + case X86::PCMPISTRIREG: + case X86::VPCMPISTRIREG: + case X86::PCMPISTRIMEM: + case X86::VPCMPISTRIMEM: + case X86::PCMPESTRIREG: + case X86::VPCMPESTRIREG: + case X86::PCMPESTRIMEM: + case X86::VPCMPESTRIMEM: + assert(Subtarget->hasSSE42() && + "Target must have SSE4.2 or AVX features enabled"); + return EmitPCMPSTRI(MI, BB, Subtarget->getInstrInfo()); + + // Thread synchronization. + case X86::MONITOR: + return EmitMonitor(MI, BB, Subtarget); + // PKU feature + case X86::WRPKRU: + return EmitWRPKRU(MI, BB, Subtarget); + case X86::RDPKRU: + return EmitRDPKRU(MI, BB, Subtarget); + // xbegin + case X86::XBEGIN: + return EmitXBegin(MI, BB, Subtarget->getInstrInfo()); + + case X86::VASTART_SAVE_XMM_REGS: + return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB); + + case X86::VAARG_64: + return EmitVAARG64WithCustomInserter(MI, BB); + + case X86::EH_SjLj_SetJmp32: + case X86::EH_SjLj_SetJmp64: + return emitEHSjLjSetJmp(MI, BB); + + case X86::EH_SjLj_LongJmp32: + case X86::EH_SjLj_LongJmp64: + return emitEHSjLjLongJmp(MI, BB); + + case TargetOpcode::STATEPOINT: + // As an implementation detail, STATEPOINT shares the STACKMAP format at + // this point in the process. We diverge later. 
+ return emitPatchPoint(MI, BB); + + case TargetOpcode::STACKMAP: + case TargetOpcode::PATCHPOINT: + return emitPatchPoint(MI, BB); + + case X86::VFMADDPDr213r: + case X86::VFMADDPSr213r: + case X86::VFMADDSDr213r: + case X86::VFMADDSSr213r: + case X86::VFMSUBPDr213r: + case X86::VFMSUBPSr213r: + case X86::VFMSUBSDr213r: + case X86::VFMSUBSSr213r: + case X86::VFNMADDPDr213r: + case X86::VFNMADDPSr213r: + case X86::VFNMADDSDr213r: + case X86::VFNMADDSSr213r: + case X86::VFNMSUBPDr213r: + case X86::VFNMSUBPSr213r: + case X86::VFNMSUBSDr213r: + case X86::VFNMSUBSSr213r: + case X86::VFMADDSUBPDr213r: + case X86::VFMADDSUBPSr213r: + case X86::VFMSUBADDPDr213r: + case X86::VFMSUBADDPSr213r: + case X86::VFMADDPDr213rY: + case X86::VFMADDPSr213rY: + case X86::VFMSUBPDr213rY: + case X86::VFMSUBPSr213rY: + case X86::VFNMADDPDr213rY: + case X86::VFNMADDPSr213rY: + case X86::VFNMSUBPDr213rY: + case X86::VFNMSUBPSr213rY: + case X86::VFMADDSUBPDr213rY: + case X86::VFMADDSUBPSr213rY: + case X86::VFMSUBADDPDr213rY: + case X86::VFMSUBADDPSr213rY: + return emitFMA3Instr(MI, BB); + } +} + +//===----------------------------------------------------------------------===// +// X86 Optimization Hooks +//===----------------------------------------------------------------------===// + +void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op, + APInt &KnownZero, + APInt &KnownOne, + const SelectionDAG &DAG, + unsigned Depth) const { + unsigned BitWidth = KnownZero.getBitWidth(); + unsigned Opc = Op.getOpcode(); + assert((Opc >= ISD::BUILTIN_OP_END || + Opc == ISD::INTRINSIC_WO_CHAIN || + Opc == ISD::INTRINSIC_W_CHAIN || + Opc == ISD::INTRINSIC_VOID) && + "Should use MaskedValueIsZero if you don't know whether Op" + " is a target node!"); + + KnownZero = KnownOne = APInt(BitWidth, 0); // Don't know anything. + switch (Opc) { + default: break; + case X86ISD::ADD: + case X86ISD::SUB: + case X86ISD::ADC: + case X86ISD::SBB: + case X86ISD::SMUL: + case X86ISD::UMUL: + case X86ISD::INC: + case X86ISD::DEC: + case X86ISD::OR: + case X86ISD::XOR: + case X86ISD::AND: + // These nodes' second result is a boolean. + if (Op.getResNo() == 0) + break; + // Fallthrough + case X86ISD::SETCC: + KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1); + break; + case ISD::INTRINSIC_WO_CHAIN: { + unsigned IntId = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); + unsigned NumLoBits = 0; + switch (IntId) { + default: break; + case Intrinsic::x86_sse_movmsk_ps: + case Intrinsic::x86_avx_movmsk_ps_256: + case Intrinsic::x86_sse2_movmsk_pd: + case Intrinsic::x86_avx_movmsk_pd_256: + case Intrinsic::x86_mmx_pmovmskb: + case Intrinsic::x86_sse2_pmovmskb_128: + case Intrinsic::x86_avx2_pmovmskb: { + // High bits of movmskp{s|d}, pmovmskb are known zero. + switch (IntId) { + default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 
+ case Intrinsic::x86_sse_movmsk_ps: NumLoBits = 4; break; + case Intrinsic::x86_avx_movmsk_ps_256: NumLoBits = 8; break; + case Intrinsic::x86_sse2_movmsk_pd: NumLoBits = 2; break; + case Intrinsic::x86_avx_movmsk_pd_256: NumLoBits = 4; break; + case Intrinsic::x86_mmx_pmovmskb: NumLoBits = 8; break; + case Intrinsic::x86_sse2_pmovmskb_128: NumLoBits = 16; break; + case Intrinsic::x86_avx2_pmovmskb: NumLoBits = 32; break; + } + KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - NumLoBits); + break; + } + } + break; + } + } +} + +unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode( + SDValue Op, + const SelectionDAG &, + unsigned Depth) const { + // SETCC_CARRY sets the dest to ~0 for true or 0 for false. + if (Op.getOpcode() == X86ISD::SETCC_CARRY) + return Op.getValueType().getScalarSizeInBits(); + + // Fallback case. + return 1; +} + +/// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the +/// node is a GlobalAddress + offset. +bool X86TargetLowering::isGAPlusOffset(SDNode *N, + const GlobalValue* &GA, + int64_t &Offset) const { + if (N->getOpcode() == X86ISD::Wrapper) { + if (isa<GlobalAddressSDNode>(N->getOperand(0))) { + GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal(); + Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset(); + return true; + } + } + return TargetLowering::isGAPlusOffset(N, GA, Offset); +} + +/// PerformShuffleCombine256 - Performs shuffle combines for 256-bit vectors. +/// FIXME: This could be expanded to support 512 bit vectors as well. +static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget* Subtarget) { + SDLoc dl(N); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); + SDValue V1 = SVOp->getOperand(0); + SDValue V2 = SVOp->getOperand(1); + MVT VT = SVOp->getSimpleValueType(0); + unsigned NumElems = VT.getVectorNumElements(); + + if (V1.getOpcode() == ISD::CONCAT_VECTORS && + V2.getOpcode() == ISD::CONCAT_VECTORS) { + // + // 0,0,0,... + // | + // V UNDEF BUILD_VECTOR UNDEF + // \ / \ / + // CONCAT_VECTOR CONCAT_VECTOR + // \ / + // \ / + // RESULT: V + zero extended + // + if (V2.getOperand(0).getOpcode() != ISD::BUILD_VECTOR || + V2.getOperand(1).getOpcode() != ISD::UNDEF || + V1.getOperand(1).getOpcode() != ISD::UNDEF) + return SDValue(); + + if (!ISD::isBuildVectorAllZeros(V2.getOperand(0).getNode())) + return SDValue(); + + // To match the shuffle mask, the first half of the mask should + // be exactly the first vector, and all the rest a splat with the + // first element of the second one. + for (unsigned i = 0; i != NumElems/2; ++i) + if (!isUndefOrEqual(SVOp->getMaskElt(i), i) || + !isUndefOrEqual(SVOp->getMaskElt(i+NumElems/2), NumElems)) + return SDValue(); + + // If V1 is coming from a vector load then just fold to a VZEXT_LOAD. + if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(V1.getOperand(0))) { + if (Ld->hasNUsesOfValue(1, 0)) { + SDVTList Tys = DAG.getVTList(MVT::v4i64, MVT::Other); + SDValue Ops[] = { Ld->getChain(), Ld->getBasePtr() }; + SDValue ResNode = + DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, + Ld->getMemoryVT(), + Ld->getPointerInfo(), + Ld->getAlignment(), + false/*isVolatile*/, true/*ReadMem*/, + false/*WriteMem*/); + + // Make sure the newly-created LOAD is in the same position as Ld in + // terms of dependency. We create a TokenFactor for Ld and ResNode, + // and update uses of Ld's output chain to use the TokenFactor. 
+ if (Ld->hasAnyUseOfValue(1)) { + SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, + SDValue(Ld, 1), SDValue(ResNode.getNode(), 1)); + DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain); + DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(Ld, 1), + SDValue(ResNode.getNode(), 1)); + } + + return DAG.getBitcast(VT, ResNode); + } + } + + // Emit a zeroed vector and insert the desired subvector on its + // first half. + SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl); + SDValue InsV = Insert128BitVector(Zeros, V1.getOperand(0), 0, DAG, dl); + return DCI.CombineTo(N, InsV); + } + + return SDValue(); +} + +/// \brief Combine an arbitrary chain of shuffles into a single instruction if +/// possible. +/// +/// This is the leaf of the recursive combinine below. When we have found some +/// chain of single-use x86 shuffle instructions and accumulated the combined +/// shuffle mask represented by them, this will try to pattern match that mask +/// into either a single instruction if there is a special purpose instruction +/// for this operation, or into a PSHUFB instruction which is a fully general +/// instruction but should only be used to replace chains over a certain depth. +static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask, + int Depth, bool HasPSHUFB, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { + assert(!Mask.empty() && "Cannot combine an empty shuffle mask!"); + + // Find the operand that enters the chain. Note that multiple uses are OK + // here, we're not going to remove the operand we find. + SDValue Input = Op.getOperand(0); + while (Input.getOpcode() == ISD::BITCAST) + Input = Input.getOperand(0); + + MVT VT = Input.getSimpleValueType(); + MVT RootVT = Root.getSimpleValueType(); + SDLoc DL(Root); + + if (Mask.size() == 1) { + int Index = Mask[0]; + assert((Index >= 0 || Index == SM_SentinelUndef || + Index == SM_SentinelZero) && + "Invalid shuffle index found!"); + + // We may end up with an accumulated mask of size 1 as a result of + // widening of shuffle operands (see function canWidenShuffleElements). + // If the only shuffle index is equal to SM_SentinelZero then propagate + // a zero vector. Otherwise, the combine shuffle mask is a no-op shuffle + // mask, and therefore the entire chain of shuffles can be folded away. + if (Index == SM_SentinelZero) + DCI.CombineTo(Root.getNode(), getZeroVector(RootVT, Subtarget, DAG, DL)); + else + DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Input), + /*AddTo*/ true); + return true; + } + + // Use the float domain if the operand type is a floating point type. + bool FloatDomain = VT.isFloatingPoint(); + + // For floating point shuffles, we don't have free copies in the shuffle + // instructions or the ability to load as part of the instruction, so + // canonicalize their shuffles to UNPCK or MOV variants. + // + // Note that even with AVX we prefer the PSHUFD form of shuffle for integer + // vectors because it can have a load folded into it that UNPCK cannot. This + // doesn't preclude something switching to the shorter encoding post-RA. + // + // FIXME: Should teach these routines about AVX vector widths. + if (FloatDomain && VT.is128BitVector()) { + if (Mask.equals({0, 0}) || Mask.equals({1, 1})) { + bool Lo = Mask.equals({0, 0}); + unsigned Shuffle; + MVT ShuffleVT; + // Check if we have SSE3 which will let us use MOVDDUP. 
That instruction + // is no slower than UNPCKLPD but has the option to fold the input operand + // into even an unaligned memory load. + if (Lo && Subtarget->hasSSE3()) { + Shuffle = X86ISD::MOVDDUP; + ShuffleVT = MVT::v2f64; + } else { + // We have MOVLHPS and MOVHLPS throughout SSE and they encode smaller + // than the UNPCK variants. + Shuffle = Lo ? X86ISD::MOVLHPS : X86ISD::MOVHLPS; + ShuffleVT = MVT::v4f32; + } + if (Depth == 1 && Root->getOpcode() == Shuffle) + return false; // Nothing to do! + Op = DAG.getBitcast(ShuffleVT, Input); + DCI.AddToWorklist(Op.getNode()); + if (Shuffle == X86ISD::MOVDDUP) + Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op); + else + Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op); + DCI.AddToWorklist(Op.getNode()); + DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Op), + /*AddTo*/ true); + return true; + } + if (Subtarget->hasSSE3() && + (Mask.equals({0, 0, 2, 2}) || Mask.equals({1, 1, 3, 3}))) { + bool Lo = Mask.equals({0, 0, 2, 2}); + unsigned Shuffle = Lo ? X86ISD::MOVSLDUP : X86ISD::MOVSHDUP; + MVT ShuffleVT = MVT::v4f32; + if (Depth == 1 && Root->getOpcode() == Shuffle) + return false; // Nothing to do! + Op = DAG.getBitcast(ShuffleVT, Input); + DCI.AddToWorklist(Op.getNode()); + Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op); + DCI.AddToWorklist(Op.getNode()); + DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Op), + /*AddTo*/ true); + return true; + } + if (Mask.equals({0, 0, 1, 1}) || Mask.equals({2, 2, 3, 3})) { + bool Lo = Mask.equals({0, 0, 1, 1}); + unsigned Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH; + MVT ShuffleVT = MVT::v4f32; + if (Depth == 1 && Root->getOpcode() == Shuffle) + return false; // Nothing to do! + Op = DAG.getBitcast(ShuffleVT, Input); + DCI.AddToWorklist(Op.getNode()); + Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op); + DCI.AddToWorklist(Op.getNode()); + DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Op), + /*AddTo*/ true); + return true; + } + } + + // We always canonicalize the 8 x i16 and 16 x i8 shuffles into their UNPCK + // variants as none of these have single-instruction variants that are + // superior to the UNPCK formulation. + if (!FloatDomain && VT.is128BitVector() && + (Mask.equals({0, 0, 1, 1, 2, 2, 3, 3}) || + Mask.equals({4, 4, 5, 5, 6, 6, 7, 7}) || + Mask.equals({0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7}) || + Mask.equals( + {8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15}))) { + bool Lo = Mask[0] == 0; + unsigned Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH; + if (Depth == 1 && Root->getOpcode() == Shuffle) + return false; // Nothing to do! + MVT ShuffleVT; + switch (Mask.size()) { + case 8: + ShuffleVT = MVT::v8i16; + break; + case 16: + ShuffleVT = MVT::v16i8; + break; + default: + llvm_unreachable("Impossible mask size!"); + }; + Op = DAG.getBitcast(ShuffleVT, Input); + DCI.AddToWorklist(Op.getNode()); + Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op); + DCI.AddToWorklist(Op.getNode()); + DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Op), + /*AddTo*/ true); + return true; + } + + // Don't try to re-form single instruction chains under any circumstances now + // that we've done encoding canonicalization for them. + if (Depth < 2) + return false; + + // If we have 3 or more shuffle instructions or a chain involving PSHUFB, we + // can replace them with a single PSHUFB instruction profitably. Intel's + // manuals suggest only using PSHUFB if doing so replacing 5 instructions, but + // in practice PSHUFB tends to be *very* fast so we're more aggressive. 
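Since the byte-mask construction in the block that follows is fairly dense, here is a standalone sketch of the same widening arithmetic, for illustration only; it folds the undef and zero cases together (the real code keeps them distinct, using 255, which also has bit 7 set):

// How a per-element shuffle mask is widened to a per-byte PSHUFB control
// vector. A control byte with bit 7 set makes PSHUFB zero that output byte.
#include <cstdint>
#include <vector>

static std::vector<uint8_t> widenToPshufbMask(const std::vector<int> &ElemMask,
                                              int VectorBytes) {
  const int Ratio = VectorBytes / (int)ElemMask.size(); // bytes per mask element
  std::vector<uint8_t> ByteMask;
  for (int i = 0; i < VectorBytes; ++i) {
    int M = ElemMask[i / Ratio];
    if (M < 0) {                  // undef or zero lane (simplified here)
      ByteMask.push_back(0x80);   // bit 7 set -> destination byte becomes zero
      continue;
    }
    ByteMask.push_back(uint8_t(Ratio * M + i % Ratio)); // byte within the source element
  }
  return ByteMask;
}
// e.g. a v4i32 mask {2,2,3,3} on a 16-byte vector widens to
// {8,9,10,11, 8,9,10,11, 12,13,14,15, 12,13,14,15}.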
+ if ((Depth >= 3 || HasPSHUFB) && Subtarget->hasSSSE3()) { + SmallVector<SDValue, 16> PSHUFBMask; + int NumBytes = VT.getSizeInBits() / 8; + int Ratio = NumBytes / Mask.size(); + for (int i = 0; i < NumBytes; ++i) { + if (Mask[i / Ratio] == SM_SentinelUndef) { + PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8)); + continue; + } + int M = Mask[i / Ratio] != SM_SentinelZero + ? Ratio * Mask[i / Ratio] + i % Ratio + : 255; + PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8)); + } + MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes); + Op = DAG.getBitcast(ByteVT, Input); + DCI.AddToWorklist(Op.getNode()); + SDValue PSHUFBMaskOp = + DAG.getNode(ISD::BUILD_VECTOR, DL, ByteVT, PSHUFBMask); + DCI.AddToWorklist(PSHUFBMaskOp.getNode()); + Op = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Op, PSHUFBMaskOp); + DCI.AddToWorklist(Op.getNode()); + DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Op), + /*AddTo*/ true); + return true; + } + + // Failed to find any combines. + return false; +} + +/// \brief Fully generic combining of x86 shuffle instructions. +/// +/// This should be the last combine run over the x86 shuffle instructions. Once +/// they have been fully optimized, this will recursively consider all chains +/// of single-use shuffle instructions, build a generic model of the cumulative +/// shuffle operation, and check for simpler instructions which implement this +/// operation. We use this primarily for two purposes: +/// +/// 1) Collapse generic shuffles to specialized single instructions when +/// equivalent. In most cases, this is just an encoding size win, but +/// sometimes we will collapse multiple generic shuffles into a single +/// special-purpose shuffle. +/// 2) Look for sequences of shuffle instructions with 3 or more total +/// instructions, and replace them with the slightly more expensive SSSE3 +/// PSHUFB instruction if available. We do this as the last combining step +/// to ensure we avoid using PSHUFB if we can implement the shuffle with +/// a suitable short sequence of other instructions. The PHUFB will either +/// use a register or have to read from memory and so is slightly (but only +/// slightly) more expensive than the other shuffle instructions. +/// +/// Because this is inherently a quadratic operation (for each shuffle in +/// a chain, we recurse up the chain), the depth is limited to 8 instructions. +/// This should never be an issue in practice as the shuffle lowering doesn't +/// produce sequences of more than 8 instructions. +/// +/// FIXME: We will currently miss some cases where the redundant shuffling +/// would simplify under the threshold for PSHUFB formation because of +/// combine-ordering. To fix this, we should do the redundant instruction +/// combining in this recursive walk. +static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root, + ArrayRef<int> RootMask, + int Depth, bool HasPSHUFB, + SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { + // Bound the depth of our recursive combine because this is ultimately + // quadratic in nature. + if (Depth > 8) + return false; + + // Directly rip through bitcasts to find the underlying operand. + while (Op.getOpcode() == ISD::BITCAST && Op.getOperand(0).hasOneUse()) + Op = Op.getOperand(0); + + MVT VT = Op.getSimpleValueType(); + if (!VT.isVector()) + return false; // Bail if we hit a non-vector. 
+ + assert(Root.getSimpleValueType().isVector() && + "Shuffles operate on vector types!"); + assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() && + "Can only combine shuffles of the same vector register size."); + + if (!isTargetShuffle(Op.getOpcode())) + return false; + SmallVector<int, 16> OpMask; + bool IsUnary; + bool HaveMask = getTargetShuffleMask(Op.getNode(), VT, OpMask, IsUnary); + // We only can combine unary shuffles which we can decode the mask for. + if (!HaveMask || !IsUnary) + return false; + + assert(VT.getVectorNumElements() == OpMask.size() && + "Different mask size from vector size!"); + assert(((RootMask.size() > OpMask.size() && + RootMask.size() % OpMask.size() == 0) || + (OpMask.size() > RootMask.size() && + OpMask.size() % RootMask.size() == 0) || + OpMask.size() == RootMask.size()) && + "The smaller number of elements must divide the larger."); + int RootRatio = std::max<int>(1, OpMask.size() / RootMask.size()); + int OpRatio = std::max<int>(1, RootMask.size() / OpMask.size()); + assert(((RootRatio == 1 && OpRatio == 1) || + (RootRatio == 1) != (OpRatio == 1)) && + "Must not have a ratio for both incoming and op masks!"); + + SmallVector<int, 16> Mask; + Mask.reserve(std::max(OpMask.size(), RootMask.size())); + + // Merge this shuffle operation's mask into our accumulated mask. Note that + // this shuffle's mask will be the first applied to the input, followed by the + // root mask to get us all the way to the root value arrangement. The reason + // for this order is that we are recursing up the operation chain. + for (int i = 0, e = std::max(OpMask.size(), RootMask.size()); i < e; ++i) { + int RootIdx = i / RootRatio; + if (RootMask[RootIdx] < 0) { + // This is a zero or undef lane, we're done. + Mask.push_back(RootMask[RootIdx]); + continue; + } + + int RootMaskedIdx = RootMask[RootIdx] * RootRatio + i % RootRatio; + int OpIdx = RootMaskedIdx / OpRatio; + if (OpMask[OpIdx] < 0) { + // The incoming lanes are zero or undef, it doesn't matter which ones we + // are using. + Mask.push_back(OpMask[OpIdx]); + continue; + } + + // Ok, we have non-zero lanes, map them through. + Mask.push_back(OpMask[OpIdx] * OpRatio + + RootMaskedIdx % OpRatio); + } + + // See if we can recurse into the operand to combine more things. + switch (Op.getOpcode()) { + case X86ISD::PSHUFB: + HasPSHUFB = true; + case X86ISD::PSHUFD: + case X86ISD::PSHUFHW: + case X86ISD::PSHUFLW: + if (Op.getOperand(0).hasOneUse() && + combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1, + HasPSHUFB, DAG, DCI, Subtarget)) + return true; + break; + + case X86ISD::UNPCKL: + case X86ISD::UNPCKH: + assert(Op.getOperand(0) == Op.getOperand(1) && + "We only combine unary shuffles!"); + // We can't check for single use, we have to check that this shuffle is the + // only user. + if (Op->isOnlyUserOf(Op.getOperand(0).getNode()) && + combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1, + HasPSHUFB, DAG, DCI, Subtarget)) + return true; + break; + } + + // Minor canonicalization of the accumulated shuffle mask to make it easier + // to match below. All this does is detect masks with squential pairs of + // elements, and shrink them to the half-width mask. It does this in a loop + // so it will reduce the size of the mask to the minimal width mask which + // performs an equivalent shuffle. 
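// Illustrative sketch (not part of this patch): the mask composition done by
// the merge loop above, as a standalone helper. The operand mask is applied to
// the input first and the root mask last, so each output lane is mapped
// through the root mask and then through the operand mask, with the two ratios
// bridging masks of different element widths; negative entries model the
// zero/undef sentinels.
#include <algorithm>
#include <vector>

static std::vector<int> composeMasks(const std::vector<int> &RootMask,
                                     const std::vector<int> &OpMask) {
  int RootRatio = std::max<int>(1, int(OpMask.size() / RootMask.size()));
  int OpRatio = std::max<int>(1, int(RootMask.size() / OpMask.size()));
  int NumElts = int(std::max(OpMask.size(), RootMask.size()));
  std::vector<int> Mask;
  for (int i = 0; i != NumElts; ++i) {
    int RootIdx = i / RootRatio;
    if (RootMask[RootIdx] < 0) {
      Mask.push_back(RootMask[RootIdx]); // zero/undef propagates unchanged
      continue;
    }
    int RootMaskedIdx = RootMask[RootIdx] * RootRatio + i % RootRatio;
    int OpIdx = RootMaskedIdx / OpRatio;
    if (OpMask[OpIdx] < 0) {
      Mask.push_back(OpMask[OpIdx]);     // inner lane is zero/undef
      continue;
    }
    Mask.push_back(OpMask[OpIdx] * OpRatio + RootMaskedIdx % OpRatio);
  }
  return Mask;
}
// e.g. a v4i32 root mask {2,3,0,1} over a v8i16 operand mask {1,0,3,2,5,4,7,6}
// composes to the v8i16 mask {5,4,7,6,1,0,3,2}. The widening step that follows
// (canWidenShuffleElements) would leave that result alone, but would shrink a
// mask made of sequential pairs, e.g. {2,3,0,1}, down to {1,0}.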
+ SmallVector<int, 16> WidenedMask; + while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) { + Mask = std::move(WidenedMask); + WidenedMask.clear(); + } + + return combineX86ShuffleChain(Op, Root, Mask, Depth, HasPSHUFB, DAG, DCI, + Subtarget); +} + +/// \brief Get the PSHUF-style mask from PSHUF node. +/// +/// This is a very minor wrapper around getTargetShuffleMask to easy forming v4 +/// PSHUF-style masks that can be reused with such instructions. +static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) { + MVT VT = N.getSimpleValueType(); + SmallVector<int, 4> Mask; + bool IsUnary; + bool HaveMask = getTargetShuffleMask(N.getNode(), VT, Mask, IsUnary); + (void)HaveMask; + assert(HaveMask); + + // If we have more than 128-bits, only the low 128-bits of shuffle mask + // matter. Check that the upper masks are repeats and remove them. + if (VT.getSizeInBits() > 128) { + int LaneElts = 128 / VT.getScalarSizeInBits(); +#ifndef NDEBUG + for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i) + for (int j = 0; j < LaneElts; ++j) + assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) && + "Mask doesn't repeat in high 128-bit lanes!"); +#endif + Mask.resize(LaneElts); + } + + switch (N.getOpcode()) { + case X86ISD::PSHUFD: + return Mask; + case X86ISD::PSHUFLW: + Mask.resize(4); + return Mask; + case X86ISD::PSHUFHW: + Mask.erase(Mask.begin(), Mask.begin() + 4); + for (int &M : Mask) + M -= 4; + return Mask; + default: + llvm_unreachable("No valid shuffle instruction found!"); + } +} + +/// \brief Search for a combinable shuffle across a chain ending in pshufd. +/// +/// We walk up the chain and look for a combinable shuffle, skipping over +/// shuffles that we could hoist this shuffle's transformation past without +/// altering anything. +static SDValue +combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask, + SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI) { + assert(N.getOpcode() == X86ISD::PSHUFD && + "Called with something other than an x86 128-bit half shuffle!"); + SDLoc DL(N); + + // Walk up a single-use chain looking for a combinable shuffle. Keep a stack + // of the shuffles in the chain so that we can form a fresh chain to replace + // this one. + SmallVector<SDValue, 8> Chain; + SDValue V = N.getOperand(0); + for (; V.hasOneUse(); V = V.getOperand(0)) { + switch (V.getOpcode()) { + default: + return SDValue(); // Nothing combined! + + case ISD::BITCAST: + // Skip bitcasts as we always know the type for the target specific + // instructions. + continue; + + case X86ISD::PSHUFD: + // Found another dword shuffle. + break; + + case X86ISD::PSHUFLW: + // Check that the low words (being shuffled) are the identity in the + // dword shuffle, and the high words are self-contained. + if (Mask[0] != 0 || Mask[1] != 1 || + !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4)) + return SDValue(); + + Chain.push_back(V); + continue; + + case X86ISD::PSHUFHW: + // Check that the high words (being shuffled) are the identity in the + // dword shuffle, and the low words are self-contained. + if (Mask[2] != 2 || Mask[3] != 3 || + !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2)) + return SDValue(); + + Chain.push_back(V); + continue; + + case X86ISD::UNPCKL: + case X86ISD::UNPCKH: + // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword + // shuffle into a preceding word shuffle. 
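// Worked example (editorial aside, not from the patch) for the unpack case
// handled next: punpcklwd %xmm0, %xmm0 produces the word pattern
// {w0,w0, w1,w1, w2,w2, w3,w3}, so dword i of its result is the pair
// (w_i, w_i). A PSHUFD applied afterwards merely permutes those duplicated
// pairs, which is the same as first permuting words 0..3 with a PSHUFLW and
// then unpacking; hence the search below for a PSHUFLW (or, for UNPCKH, a
// PSHUFHW) into which the dword shuffle can be folded.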
+ if (V.getSimpleValueType().getVectorElementType() != MVT::i8 && + V.getSimpleValueType().getVectorElementType() != MVT::i16) + return SDValue(); + + // Search for a half-shuffle which we can combine with. + unsigned CombineOp = + V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW; + if (V.getOperand(0) != V.getOperand(1) || + !V->isOnlyUserOf(V.getOperand(0).getNode())) + return SDValue(); + Chain.push_back(V); + V = V.getOperand(0); + do { + switch (V.getOpcode()) { + default: + return SDValue(); // Nothing to combine. + + case X86ISD::PSHUFLW: + case X86ISD::PSHUFHW: + if (V.getOpcode() == CombineOp) + break; + + Chain.push_back(V); + + // Fallthrough! + case ISD::BITCAST: + V = V.getOperand(0); + continue; + } + break; + } while (V.hasOneUse()); + break; + } + // Break out of the loop if we break out of the switch. + break; + } + + if (!V.hasOneUse()) + // We fell out of the loop without finding a viable combining instruction. + return SDValue(); + + // Merge this node's mask and our incoming mask. + SmallVector<int, 4> VMask = getPSHUFShuffleMask(V); + for (int &M : Mask) + M = VMask[M]; + V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0), + getV4X86ShuffleImm8ForMask(Mask, DL, DAG)); + + // Rebuild the chain around this new shuffle. + while (!Chain.empty()) { + SDValue W = Chain.pop_back_val(); + + if (V.getValueType() != W.getOperand(0).getValueType()) + V = DAG.getBitcast(W.getOperand(0).getValueType(), V); + + switch (W.getOpcode()) { + default: + llvm_unreachable("Only PSHUF and UNPCK instructions get here!"); + + case X86ISD::UNPCKL: + case X86ISD::UNPCKH: + V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V); + break; + + case X86ISD::PSHUFD: + case X86ISD::PSHUFLW: + case X86ISD::PSHUFHW: + V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1)); + break; + } + } + if (V.getValueType() != N.getValueType()) + V = DAG.getBitcast(N.getValueType(), V); + + // Return the new chain to replace N. + return V; +} + +/// \brief Search for a combinable shuffle across a chain ending in pshuflw or +/// pshufhw. +/// +/// We walk up the chain, skipping shuffles of the other half and looking +/// through shuffles which switch halves trying to find a shuffle of the same +/// pair of dwords. +static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef<int> Mask, + SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI) { + assert( + (N.getOpcode() == X86ISD::PSHUFLW || N.getOpcode() == X86ISD::PSHUFHW) && + "Called with something other than an x86 128-bit half shuffle!"); + SDLoc DL(N); + unsigned CombineOpcode = N.getOpcode(); + + // Walk up a single-use chain looking for a combinable shuffle. + SDValue V = N.getOperand(0); + for (; V.hasOneUse(); V = V.getOperand(0)) { + switch (V.getOpcode()) { + default: + return false; // Nothing combined! + + case ISD::BITCAST: + // Skip bitcasts as we always know the type for the target specific + // instructions. + continue; + + case X86ISD::PSHUFLW: + case X86ISD::PSHUFHW: + if (V.getOpcode() == CombineOpcode) + break; + + // Other-half shuffles are no-ops. + continue; + } + // Break out of the loop if we break out of the switch. + break; + } + + if (!V.hasOneUse()) + // We fell out of the loop without finding a viable combining instruction. + return false; + + // Combine away the bottom node as its shuffle will be accumulated into + // a preceding shuffle. + DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true); + + // Record the old value. 
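// Illustrative sketch (not part of this patch): how a 4-lane mask is folded
// through the mask of the shuffle feeding it, and how the result packs into
// the 2-bits-per-lane immediate that PSHUFD/PSHUFLW/PSHUFHW take (the same
// layout the nearby getV4X86ShuffleImm8ForMask calls produce as an SDValue).
static unsigned mergeV4MasksToImm8(int Mask[4], const int VMask[4]) {
  // Lane i of the outer shuffle reads lane Mask[i] of the inner shuffle's
  // result, i.e. lane VMask[Mask[i]] of the inner shuffle's input.
  for (int i = 0; i != 4; ++i)
    Mask[i] = VMask[Mask[i]];
  // Pack the four 2-bit selectors, lane 0 in the lowest bits.
  return unsigned(Mask[0] & 3) | unsigned(Mask[1] & 3) << 2 |
         unsigned(Mask[2] & 3) << 4 | unsigned(Mask[3] & 3) << 6;
}
// e.g. Mask {1,0,3,2} folded through VMask {2,3,0,1} becomes {3,2,1,0},
// i.e. imm8 0x1B.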
+ SDValue Old = V; + + // Merge this node's mask and our incoming mask (adjusted to account for all + // the pshufd instructions encountered). + SmallVector<int, 4> VMask = getPSHUFShuffleMask(V); + for (int &M : Mask) + M = VMask[M]; + V = DAG.getNode(V.getOpcode(), DL, MVT::v8i16, V.getOperand(0), + getV4X86ShuffleImm8ForMask(Mask, DL, DAG)); + + // Check that the shuffles didn't cancel each other out. If not, we need to + // combine to the new one. + if (Old != V) + // Replace the combinable shuffle with the combined one, updating all users + // so that we re-evaluate the chain here. + DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true); + + return true; +} + +/// \brief Try to combine x86 target specific shuffles. +static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { + SDLoc DL(N); + MVT VT = N.getSimpleValueType(); + SmallVector<int, 4> Mask; + + switch (N.getOpcode()) { + case X86ISD::PSHUFD: + case X86ISD::PSHUFLW: + case X86ISD::PSHUFHW: + Mask = getPSHUFShuffleMask(N); + assert(Mask.size() == 4); + break; + case X86ISD::UNPCKL: { + // Combine X86ISD::UNPCKL and ISD::VECTOR_SHUFFLE into X86ISD::UNPCKH, in + // which X86ISD::UNPCKL has a ISD::UNDEF operand, and ISD::VECTOR_SHUFFLE + // moves upper half elements into the lower half part. For example: + // + // t2: v16i8 = vector_shuffle<8,9,10,11,12,13,14,15,u,u,u,u,u,u,u,u> t1, + // undef:v16i8 + // t3: v16i8 = X86ISD::UNPCKL undef:v16i8, t2 + // + // will be combined to: + // + // t3: v16i8 = X86ISD::UNPCKH undef:v16i8, t1 + + // This is only for 128-bit vectors. From SSE4.1 onward this combine may not + // happen due to advanced instructions. + if (!VT.is128BitVector()) + return SDValue(); + + auto Op0 = N.getOperand(0); + auto Op1 = N.getOperand(1); + if (Op0.getOpcode() == ISD::UNDEF && + Op1.getNode()->getOpcode() == ISD::VECTOR_SHUFFLE) { + ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op1.getNode())->getMask(); + + unsigned NumElts = VT.getVectorNumElements(); + SmallVector<int, 8> ExpectedMask(NumElts, -1); + std::iota(ExpectedMask.begin(), ExpectedMask.begin() + NumElts / 2, + NumElts / 2); + + auto ShufOp = Op1.getOperand(0); + if (isShuffleEquivalent(Op1, ShufOp, Mask, ExpectedMask)) + return DAG.getNode(X86ISD::UNPCKH, DL, VT, N.getOperand(0), ShufOp); + } + return SDValue(); + } + case X86ISD::BLENDI: { + SDValue V0 = N->getOperand(0); + SDValue V1 = N->getOperand(1); + assert(VT == V0.getSimpleValueType() && VT == V1.getSimpleValueType() && + "Unexpected input vector types"); + + // Canonicalize a v2f64 blend with a mask of 2 by swapping the vector + // operands and changing the mask to 1. This saves us a bunch of + // pattern-matching possibilities related to scalar math ops in SSE/AVX. + // x86InstrInfo knows how to commute this back after instruction selection + // if it would help register allocation. + + // TODO: If optimizing for size or a processor that doesn't suffer from + // partial register update stalls, this should be transformed into a MOVSD + // instruction because a MOVSD is 1-2 bytes smaller than a BLENDPD. + + if (VT == MVT::v2f64) + if (auto *Mask = dyn_cast<ConstantSDNode>(N->getOperand(2))) + if (Mask->getZExtValue() == 2 && !isShuffleFoldableLoad(V0)) { + SDValue NewMask = DAG.getConstant(1, DL, MVT::i8); + return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V0, NewMask); + } + + return SDValue(); + } + default: + return SDValue(); + } + + // Nuke no-op shuffles that show up after combining. 
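// Aside (editorial, not from the patch) on the v2f64 BLENDI canonicalization
// handled above: the BLENDPD immediate has one bit per element, and a set bit
// i selects element i from the second source. So blendpd $2, %xmm1, %xmm0
// (element 1 from %xmm1, element 0 from %xmm0) computes the same vector as
// blendpd $1 with the two sources swapped, which is why a mask of 2 can be
// rewritten as a mask of 1 with commuted operands.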
+ if (isNoopShuffleMask(Mask)) + return DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true); + + // Look for simplifications involving one or two shuffle instructions. + SDValue V = N.getOperand(0); + switch (N.getOpcode()) { + default: + break; + case X86ISD::PSHUFLW: + case X86ISD::PSHUFHW: + assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!"); + + if (combineRedundantHalfShuffle(N, Mask, DAG, DCI)) + return SDValue(); // We combined away this shuffle, so we're done. + + // See if this reduces to a PSHUFD which is no more expensive and can + // combine with more operations. Note that it has to at least flip the + // dwords as otherwise it would have been removed as a no-op. + if (makeArrayRef(Mask).equals({2, 3, 0, 1})) { + int DMask[] = {0, 1, 2, 3}; + int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2; + DMask[DOffset + 0] = DOffset + 1; + DMask[DOffset + 1] = DOffset + 0; + MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2); + V = DAG.getBitcast(DVT, V); + DCI.AddToWorklist(V.getNode()); + V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V, + getV4X86ShuffleImm8ForMask(DMask, DL, DAG)); + DCI.AddToWorklist(V.getNode()); + return DAG.getBitcast(VT, V); + } + + // Look for shuffle patterns which can be implemented as a single unpack. + // FIXME: This doesn't handle the location of the PSHUFD generically, and + // only works when we have a PSHUFD followed by two half-shuffles. + if (Mask[0] == Mask[1] && Mask[2] == Mask[3] && + (V.getOpcode() == X86ISD::PSHUFLW || + V.getOpcode() == X86ISD::PSHUFHW) && + V.getOpcode() != N.getOpcode() && + V.hasOneUse()) { + SDValue D = V.getOperand(0); + while (D.getOpcode() == ISD::BITCAST && D.hasOneUse()) + D = D.getOperand(0); + if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) { + SmallVector<int, 4> VMask = getPSHUFShuffleMask(V); + SmallVector<int, 4> DMask = getPSHUFShuffleMask(D); + int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4; + int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4; + int WordMask[8]; + for (int i = 0; i < 4; ++i) { + WordMask[i + NOffset] = Mask[i] + NOffset; + WordMask[i + VOffset] = VMask[i] + VOffset; + } + // Map the word mask through the DWord mask. + int MappedMask[8]; + for (int i = 0; i < 8; ++i) + MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2; + if (makeArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) || + makeArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) { + // We can replace all three shuffles with an unpack. + V = DAG.getBitcast(VT, D.getOperand(0)); + DCI.AddToWorklist(V.getNode()); + return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL + : X86ISD::UNPCKH, + DL, VT, V, V); + } + } + } + + break; + + case X86ISD::PSHUFD: + if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG, DCI)) + return NewN; + + break; + } + + return SDValue(); +} + +/// \brief Try to combine a shuffle into a target-specific add-sub node. +/// +/// We combine this directly on the abstract vector shuffle nodes so it is +/// easier to generically match. We also insert dummy vector shuffle nodes for +/// the operands which explicitly discard the lanes which are unused by this +/// operation to try to flow through the rest of the combiner the fact that +/// they're unused. 
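// Worked example (editorial aside, not from the patch): for v4f32, ADDSUBPS
// computes { a0-b0, a1+b1, a2-b2, a3+b3 }. The combine below therefore looks
// for shuffle(FSUB(a,b), FADD(a,b)) with mask {0,5,2,7}: lanes 0 and 2 are
// taken from the subtract, lanes 1 and 3 (indices 5 and 7 address the second
// shuffle operand) from the add, which is exactly the ADDSUB lane pattern.
// The v2f64 mask {0,3} and the v8f32/v4f64 masks checked below are the same
// pattern at other widths.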
+static SDValue combineShuffleToAddSub(SDNode *N, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(N); + EVT VT = N->getValueType(0); + if ((!Subtarget->hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) && + (!Subtarget->hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64))) + return SDValue(); + + // We only handle target-independent shuffles. + // FIXME: It would be easy and harmless to use the target shuffle mask + // extraction tool to support more. + if (N->getOpcode() != ISD::VECTOR_SHUFFLE) + return SDValue(); + + auto *SVN = cast<ShuffleVectorSDNode>(N); + SmallVector<int, 8> Mask; + for (int M : SVN->getMask()) + Mask.push_back(M); + + SDValue V1 = N->getOperand(0); + SDValue V2 = N->getOperand(1); + + // We require the first shuffle operand to be the FSUB node, and the second to + // be the FADD node. + if (V1.getOpcode() == ISD::FADD && V2.getOpcode() == ISD::FSUB) { + ShuffleVectorSDNode::commuteMask(Mask); + std::swap(V1, V2); + } else if (V1.getOpcode() != ISD::FSUB || V2.getOpcode() != ISD::FADD) + return SDValue(); + + // If there are other uses of these operations we can't fold them. + if (!V1->hasOneUse() || !V2->hasOneUse()) + return SDValue(); + + // Ensure that both operations have the same operands. Note that we can + // commute the FADD operands. + SDValue LHS = V1->getOperand(0), RHS = V1->getOperand(1); + if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) && + (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS)) + return SDValue(); + + // We're looking for blends between FADD and FSUB nodes. We insist on these + // nodes being lined up in a specific expected pattern. + if (!(isShuffleEquivalent(V1, V2, Mask, {0, 3}) || + isShuffleEquivalent(V1, V2, Mask, {0, 5, 2, 7}) || + isShuffleEquivalent(V1, V2, Mask, {0, 9, 2, 11, 4, 13, 6, 15}))) + return SDValue(); + + return DAG.getNode(X86ISD::ADDSUB, DL, VT, LHS, RHS); +} + +/// PerformShuffleCombine - Performs several different shuffle combines. +static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { + SDLoc dl(N); + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + EVT VT = N->getValueType(0); + + // Don't create instructions with illegal types after legalize types has run. + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(VT.getVectorElementType())) + return SDValue(); + + // If we have legalized the vector types, look for blends of FADD and FSUB + // nodes that we can fuse into an ADDSUB node. + if (TLI.isTypeLegal(VT)) + if (SDValue AddSub = combineShuffleToAddSub(N, Subtarget, DAG)) + return AddSub; + + // Combine 256-bit vector shuffles. This is only profitable when in AVX mode + if (TLI.isTypeLegal(VT) && Subtarget->hasFp256() && VT.is256BitVector() && + N->getOpcode() == ISD::VECTOR_SHUFFLE) + return PerformShuffleCombine256(N, DAG, DCI, Subtarget); + + // During Type Legalization, when promoting illegal vector types, + // the backend might introduce new shuffle dag nodes and bitcasts. + // + // This code performs the following transformation: + // fold: (shuffle (bitcast (BINOP A, B)), Undef, <Mask>) -> + // (shuffle (BINOP (bitcast A), (bitcast B)), Undef, <Mask>) + // + // We do this only if both the bitcast and the BINOP dag nodes have + // one use. Also, perform this transformation only if the new binary + // operation is legal. 
This is to avoid introducing dag nodes that + // potentially need to be further expanded (or custom lowered) into a + // less optimal sequence of dag nodes. + if (!DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() && + N1.getOpcode() == ISD::UNDEF && N0.hasOneUse() && + N0.getOpcode() == ISD::BITCAST) { + SDValue BC0 = N0.getOperand(0); + EVT SVT = BC0.getValueType(); + unsigned Opcode = BC0.getOpcode(); + unsigned NumElts = VT.getVectorNumElements(); + + if (BC0.hasOneUse() && SVT.isVector() && + SVT.getVectorNumElements() * 2 == NumElts && + TLI.isOperationLegal(Opcode, VT)) { + bool CanFold = false; + switch (Opcode) { + default : break; + case ISD::ADD : + case ISD::FADD : + case ISD::SUB : + case ISD::FSUB : + case ISD::MUL : + case ISD::FMUL : + CanFold = true; + } + + unsigned SVTNumElts = SVT.getVectorNumElements(); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); + for (unsigned i = 0, e = SVTNumElts; i != e && CanFold; ++i) + CanFold = SVOp->getMaskElt(i) == (int)(i * 2); + for (unsigned i = SVTNumElts, e = NumElts; i != e && CanFold; ++i) + CanFold = SVOp->getMaskElt(i) < 0; + + if (CanFold) { + SDValue BC00 = DAG.getBitcast(VT, BC0.getOperand(0)); + SDValue BC01 = DAG.getBitcast(VT, BC0.getOperand(1)); + SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01); + return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, &SVOp->getMask()[0]); + } + } + } + + // Combine a vector_shuffle that is equal to build_vector load1, load2, load3, + // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are + // consecutive, non-overlapping, and in the right order. + SmallVector<SDValue, 16> Elts; + for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) + Elts.push_back(getShuffleScalarElt(N, i, DAG, 0)); + + if (SDValue LD = EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true)) + return LD; + + if (isTargetShuffle(N->getOpcode())) { + SDValue Shuffle = + PerformTargetShuffleCombine(SDValue(N, 0), DAG, DCI, Subtarget); + if (Shuffle.getNode()) + return Shuffle; + + // Try recursively combining arbitrary sequences of x86 shuffle + // instructions into higher-order shuffles. We do this after combining + // specific PSHUF instruction sequences into their minimal form so that we + // can evaluate how many specialized shuffle instructions are involved in + // a particular chain. + SmallVector<int, 1> NonceMask; // Just a placeholder. + NonceMask.push_back(0); + if (combineX86ShufflesRecursively(SDValue(N, 0), SDValue(N, 0), NonceMask, + /*Depth*/ 1, /*HasPSHUFB*/ false, DAG, + DCI, Subtarget)) + return SDValue(); // This routine will use CombineTo to replace N. + } + + return SDValue(); +} + +/// XFormVExtractWithShuffleIntoLoad - Check if a vector extract from a target +/// specific shuffle of a load can be folded into a single element load. +/// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but +/// shuffles have been custom lowered so we need to handle those here. +static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI) { + if (DCI.isBeforeLegalizeOps()) + return SDValue(); + + SDValue InVec = N->getOperand(0); + SDValue EltNo = N->getOperand(1); + + if (!isa<ConstantSDNode>(EltNo)) + return SDValue(); + + EVT OriginalVT = InVec.getValueType(); + + if (InVec.getOpcode() == ISD::BITCAST) { + // Don't duplicate a load with other uses. 
+ if (!InVec.hasOneUse()) + return SDValue(); + EVT BCVT = InVec.getOperand(0).getValueType(); + if (!BCVT.isVector() || + BCVT.getVectorNumElements() != OriginalVT.getVectorNumElements()) + return SDValue(); + InVec = InVec.getOperand(0); + } + + EVT CurrentVT = InVec.getValueType(); + + if (!isTargetShuffle(InVec.getOpcode())) + return SDValue(); + + // Don't duplicate a load with other uses. + if (!InVec.hasOneUse()) + return SDValue(); + + SmallVector<int, 16> ShuffleMask; + bool UnaryShuffle; + if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(), + ShuffleMask, UnaryShuffle)) + return SDValue(); + + // Select the input vector, guarding against out of range extract vector. + unsigned NumElems = CurrentVT.getVectorNumElements(); + int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue(); + int Idx = (Elt > (int)NumElems) ? -1 : ShuffleMask[Elt]; + SDValue LdNode = (Idx < (int)NumElems) ? InVec.getOperand(0) + : InVec.getOperand(1); + + // If inputs to shuffle are the same for both ops, then allow 2 uses + unsigned AllowedUses = InVec.getNumOperands() > 1 && + InVec.getOperand(0) == InVec.getOperand(1) ? 2 : 1; + + if (LdNode.getOpcode() == ISD::BITCAST) { + // Don't duplicate a load with other uses. + if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0)) + return SDValue(); + + AllowedUses = 1; // only allow 1 load use if we have a bitcast + LdNode = LdNode.getOperand(0); + } + + if (!ISD::isNormalLoad(LdNode.getNode())) + return SDValue(); + + LoadSDNode *LN0 = cast<LoadSDNode>(LdNode); + + if (!LN0 ||!LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile()) + return SDValue(); + + EVT EltVT = N->getValueType(0); + // If there's a bitcast before the shuffle, check if the load type and + // alignment is valid. + unsigned Align = LN0->getAlignment(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + unsigned NewAlign = DAG.getDataLayout().getABITypeAlignment( + EltVT.getTypeForEVT(*DAG.getContext())); + + if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, EltVT)) + return SDValue(); + + // All checks match so transform back to vector_shuffle so that DAG combiner + // can finish the job + SDLoc dl(N); + + // Create shuffle node taking into account the case that its a unary shuffle + SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(CurrentVT) + : InVec.getOperand(1); + Shuffle = DAG.getVectorShuffle(CurrentVT, dl, + InVec.getOperand(0), Shuffle, + &ShuffleMask[0]); + Shuffle = DAG.getBitcast(OriginalVT, Shuffle); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle, + EltNo); +} + +static SDValue PerformBITCASTCombine(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + SDValue N0 = N->getOperand(0); + EVT VT = N->getValueType(0); + + // Detect bitcasts between i32 to x86mmx low word. Since MMX types are + // special and don't usually play with other vector types, it's better to + // handle them early to be sure we emit efficient code by avoiding + // store-load conversions. + if (VT == MVT::x86mmx && N0.getOpcode() == ISD::BUILD_VECTOR && + N0.getValueType() == MVT::v2i32 && + isNullConstant(N0.getOperand(1))) { + SDValue N00 = N0->getOperand(0); + if (N00.getValueType() == MVT::i32) + return DAG.getNode(X86ISD::MMX_MOVW2D, SDLoc(N00), VT, N00); + } + + // Convert a bitcasted integer logic operation that has one bitcasted + // floating-point operand and one constant operand into a floating-point + // logic operation. 
This may create a load of the constant, but that is + // cheaper than materializing the constant in an integer register and + // transferring it to an SSE register or transferring the SSE operand to + // integer register and back. + unsigned FPOpcode; + switch (N0.getOpcode()) { + case ISD::AND: FPOpcode = X86ISD::FAND; break; + case ISD::OR: FPOpcode = X86ISD::FOR; break; + case ISD::XOR: FPOpcode = X86ISD::FXOR; break; + default: return SDValue(); + } + if (((Subtarget->hasSSE1() && VT == MVT::f32) || + (Subtarget->hasSSE2() && VT == MVT::f64)) && + isa<ConstantSDNode>(N0.getOperand(1)) && + N0.getOperand(0).getOpcode() == ISD::BITCAST && + N0.getOperand(0).getOperand(0).getValueType() == VT) { + SDValue N000 = N0.getOperand(0).getOperand(0); + SDValue FPConst = DAG.getBitcast(VT, N0.getOperand(1)); + return DAG.getNode(FPOpcode, SDLoc(N0), VT, N000, FPConst); + } + + return SDValue(); +} + +/// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index +/// generation and convert it from being a bunch of shuffles and extracts +/// into a somewhat faster sequence. For i686, the best sequence is apparently +/// storing the value and loading scalars back, while for x64 we should +/// use 64-bit extracts and shifts. +static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI) { + if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI)) + return NewOp; + + SDValue InputVector = N->getOperand(0); + SDLoc dl(InputVector); + // Detect mmx to i32 conversion through a v2i32 elt extract. + if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() && + N->getValueType(0) == MVT::i32 && + InputVector.getValueType() == MVT::v2i32) { + + // The bitcast source is a direct mmx result. + SDValue MMXSrc = InputVector.getNode()->getOperand(0); + if (MMXSrc.getValueType() == MVT::x86mmx) + return DAG.getNode(X86ISD::MMX_MOVD2W, SDLoc(InputVector), + N->getValueType(0), + InputVector.getNode()->getOperand(0)); + + // The mmx is indirect: (i64 extract_elt (v1i64 bitcast (x86mmx ...))). + if (MMXSrc.getOpcode() == ISD::EXTRACT_VECTOR_ELT && MMXSrc.hasOneUse() && + MMXSrc.getValueType() == MVT::i64) { + SDValue MMXSrcOp = MMXSrc.getOperand(0); + if (MMXSrcOp.hasOneUse() && MMXSrcOp.getOpcode() == ISD::BITCAST && + MMXSrcOp.getValueType() == MVT::v1i64 && + MMXSrcOp.getOperand(0).getValueType() == MVT::x86mmx) + return DAG.getNode(X86ISD::MMX_MOVD2W, SDLoc(InputVector), + N->getValueType(0), MMXSrcOp.getOperand(0)); + } + } + + EVT VT = N->getValueType(0); + + if (VT == MVT::i1 && isa<ConstantSDNode>(N->getOperand(1)) && + InputVector.getOpcode() == ISD::BITCAST && + isa<ConstantSDNode>(InputVector.getOperand(0))) { + uint64_t ExtractedElt = + cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); + uint64_t InputValue = + cast<ConstantSDNode>(InputVector.getOperand(0))->getZExtValue(); + uint64_t Res = (InputValue >> ExtractedElt) & 1; + return DAG.getConstant(Res, dl, MVT::i1); + } + // Only operate on vectors of 4 elements, where the alternative shuffling + // gets to be more expensive. + if (InputVector.getValueType() != MVT::v4i32) + return SDValue(); + + // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a + // single use which is a sign-extend or zero-extend, and all elements are + // used. 
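// Illustrative sketch (not part of this patch): the lane recovery the 64-bit
// path below performs once these checks succeed. With little-endian lane
// order, bitcasting the v4i32 to v2i64 puts lanes 0/1 into the low/high halves
// of the first i64 and lanes 2/3 into the second, so four element extracts
// become two i64 extracts plus shifts and truncations.
#include <cstdint>
static void recoverLanes(uint64_t BottomHalf, uint64_t TopHalf,
                         uint32_t Lane[4]) {
  Lane[0] = uint32_t(BottomHalf);       // truncate
  Lane[1] = uint32_t(BottomHalf >> 32); // shift, then truncate
  Lane[2] = uint32_t(TopHalf);
  Lane[3] = uint32_t(TopHalf >> 32);
}
// The DAG code uses an arithmetic shift (SRA); after truncation to 32 bits the
// result is the same as the logical shift used here.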
+ SmallVector<SDNode *, 4> Uses; + unsigned ExtractedElements = 0; + for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(), + UE = InputVector.getNode()->use_end(); UI != UE; ++UI) { + if (UI.getUse().getResNo() != InputVector.getResNo()) + return SDValue(); + + SDNode *Extract = *UI; + if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT) + return SDValue(); + + if (Extract->getValueType(0) != MVT::i32) + return SDValue(); + if (!Extract->hasOneUse()) + return SDValue(); + if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND && + Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND) + return SDValue(); + if (!isa<ConstantSDNode>(Extract->getOperand(1))) + return SDValue(); + + // Record which element was extracted. + ExtractedElements |= + 1 << cast<ConstantSDNode>(Extract->getOperand(1))->getZExtValue(); + + Uses.push_back(Extract); + } + + // If not all the elements were used, this may not be worthwhile. + if (ExtractedElements != 15) + return SDValue(); + + // Ok, we've now decided to do the transformation. + // If 64-bit shifts are legal, use the extract-shift sequence, + // otherwise bounce the vector off the cache. + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + SDValue Vals[4]; + + if (TLI.isOperationLegal(ISD::SRA, MVT::i64)) { + SDValue Cst = DAG.getBitcast(MVT::v2i64, InputVector); + auto &DL = DAG.getDataLayout(); + EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy(DL); + SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst, + DAG.getConstant(0, dl, VecIdxTy)); + SDValue TopHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst, + DAG.getConstant(1, dl, VecIdxTy)); + + SDValue ShAmt = DAG.getConstant( + 32, dl, DAG.getTargetLoweringInfo().getShiftAmountTy(MVT::i64, DL)); + Vals[0] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BottomHalf); + Vals[1] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, + DAG.getNode(ISD::SRA, dl, MVT::i64, BottomHalf, ShAmt)); + Vals[2] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, TopHalf); + Vals[3] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, + DAG.getNode(ISD::SRA, dl, MVT::i64, TopHalf, ShAmt)); + } else { + // Store the value to a temporary stack slot. + SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType()); + SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr, + MachinePointerInfo(), false, false, 0); + + EVT ElementType = InputVector.getValueType().getVectorElementType(); + unsigned EltSize = ElementType.getSizeInBits() / 8; + + // Replace each use (extract) with a load of the appropriate element. + for (unsigned i = 0; i < 4; ++i) { + uint64_t Offset = EltSize * i; + auto PtrVT = TLI.getPointerTy(DAG.getDataLayout()); + SDValue OffsetVal = DAG.getConstant(Offset, dl, PtrVT); + + SDValue ScalarAddr = + DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, OffsetVal); + + // Load the scalar. + Vals[i] = DAG.getLoad(ElementType, dl, Ch, + ScalarAddr, MachinePointerInfo(), + false, false, false, 0); + + } + } + + // Replace the extracts + for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(), + UE = Uses.end(); UI != UE; ++UI) { + SDNode *Extract = *UI; + + SDValue Idx = Extract->getOperand(1); + uint64_t IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); + DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), Vals[IdxVal]); + } + + // The replacement was made in place; don't return anything. 
+ return SDValue(); +} + +static SDValue +transformVSELECTtoBlendVECTOR_SHUFFLE(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + SDLoc dl(N); + SDValue Cond = N->getOperand(0); + SDValue LHS = N->getOperand(1); + SDValue RHS = N->getOperand(2); + + if (Cond.getOpcode() == ISD::SIGN_EXTEND) { + SDValue CondSrc = Cond->getOperand(0); + if (CondSrc->getOpcode() == ISD::SIGN_EXTEND_INREG) + Cond = CondSrc->getOperand(0); + } + + if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) + return SDValue(); + + // A vselect where all conditions and data are constants can be optimized into + // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR(). + if (ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) && + ISD::isBuildVectorOfConstantSDNodes(RHS.getNode())) + return SDValue(); + + unsigned MaskValue = 0; + if (!BUILD_VECTORtoBlendMask(cast<BuildVectorSDNode>(Cond), MaskValue)) + return SDValue(); + + MVT VT = N->getSimpleValueType(0); + unsigned NumElems = VT.getVectorNumElements(); + SmallVector<int, 8> ShuffleMask(NumElems, -1); + for (unsigned i = 0; i < NumElems; ++i) { + // Be sure we emit undef where we can. + if (Cond.getOperand(i)->getOpcode() == ISD::UNDEF) + ShuffleMask[i] = -1; + else + ShuffleMask[i] = i + NumElems * ((MaskValue >> i) & 1); + } + + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (!TLI.isShuffleMaskLegal(ShuffleMask, VT)) + return SDValue(); + return DAG.getVectorShuffle(VT, dl, LHS, RHS, &ShuffleMask[0]); +} + +/// PerformSELECTCombine - Do target-specific dag combines on SELECT and VSELECT +/// nodes. +static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { + SDLoc DL(N); + SDValue Cond = N->getOperand(0); + // Get the LHS/RHS of the select. + SDValue LHS = N->getOperand(1); + SDValue RHS = N->getOperand(2); + EVT VT = LHS.getValueType(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + + // If we have SSE[12] support, try to form min/max nodes. SSE min/max + // instructions match the semantics of the common C idiom x<y?x:y but not + // x<=y?x:y, because of how they handle negative zero (which can be + // ignored in unsafe-math mode). + // We also try to create v2f32 min/max nodes, which we later widen to v4f32. + if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() && + VT != MVT::f80 && VT != MVT::f128 && + (TLI.isTypeLegal(VT) || VT == MVT::v2f32) && + (Subtarget->hasSSE2() || + (Subtarget->hasSSE1() && VT.getScalarType() == MVT::f32))) { + ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); + + unsigned Opcode = 0; + // Check for x CC y ? x : y. + if (DAG.isEqualTo(LHS, Cond.getOperand(0)) && + DAG.isEqualTo(RHS, Cond.getOperand(1))) { + switch (CC) { + default: break; + case ISD::SETULT: + // Converting this to a min would handle NaNs incorrectly, and swapping + // the operands would cause it to handle comparisons between positive + // and negative zero incorrectly. + if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) { + if (!DAG.getTarget().Options.UnsafeFPMath && + !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) + break; + std::swap(LHS, RHS); + } + Opcode = X86ISD::FMIN; + break; + case ISD::SETOLE: + // Converting this to a min would handle comparisons between positive + // and negative zero incorrectly. 
+ if (!DAG.getTarget().Options.UnsafeFPMath && + !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) + break; + Opcode = X86ISD::FMIN; + break; + case ISD::SETULE: + // Converting this to a min would handle both negative zeros and NaNs + // incorrectly, but we can swap the operands to fix both. + std::swap(LHS, RHS); + case ISD::SETOLT: + case ISD::SETLT: + case ISD::SETLE: + Opcode = X86ISD::FMIN; + break; + + case ISD::SETOGE: + // Converting this to a max would handle comparisons between positive + // and negative zero incorrectly. + if (!DAG.getTarget().Options.UnsafeFPMath && + !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) + break; + Opcode = X86ISD::FMAX; + break; + case ISD::SETUGT: + // Converting this to a max would handle NaNs incorrectly, and swapping + // the operands would cause it to handle comparisons between positive + // and negative zero incorrectly. + if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) { + if (!DAG.getTarget().Options.UnsafeFPMath && + !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) + break; + std::swap(LHS, RHS); + } + Opcode = X86ISD::FMAX; + break; + case ISD::SETUGE: + // Converting this to a max would handle both negative zeros and NaNs + // incorrectly, but we can swap the operands to fix both. + std::swap(LHS, RHS); + case ISD::SETOGT: + case ISD::SETGT: + case ISD::SETGE: + Opcode = X86ISD::FMAX; + break; + } + // Check for x CC y ? y : x -- a min/max with reversed arms. + } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) && + DAG.isEqualTo(RHS, Cond.getOperand(0))) { + switch (CC) { + default: break; + case ISD::SETOGE: + // Converting this to a min would handle comparisons between positive + // and negative zero incorrectly, and swapping the operands would + // cause it to handle NaNs incorrectly. + if (!DAG.getTarget().Options.UnsafeFPMath && + !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) { + if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) + break; + std::swap(LHS, RHS); + } + Opcode = X86ISD::FMIN; + break; + case ISD::SETUGT: + // Converting this to a min would handle NaNs incorrectly. + if (!DAG.getTarget().Options.UnsafeFPMath && + (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))) + break; + Opcode = X86ISD::FMIN; + break; + case ISD::SETUGE: + // Converting this to a min would handle both negative zeros and NaNs + // incorrectly, but we can swap the operands to fix both. + std::swap(LHS, RHS); + case ISD::SETOGT: + case ISD::SETGT: + case ISD::SETGE: + Opcode = X86ISD::FMIN; + break; + + case ISD::SETULT: + // Converting this to a max would handle NaNs incorrectly. + if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) + break; + Opcode = X86ISD::FMAX; + break; + case ISD::SETOLE: + // Converting this to a max would handle comparisons between positive + // and negative zero incorrectly, and swapping the operands would + // cause it to handle NaNs incorrectly. + if (!DAG.getTarget().Options.UnsafeFPMath && + !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) { + if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) + break; + std::swap(LHS, RHS); + } + Opcode = X86ISD::FMAX; + break; + case ISD::SETULE: + // Converting this to a max would handle both negative zeros and NaNs + // incorrectly, but we can swap the operands to fix both. 
+ std::swap(LHS, RHS); + case ISD::SETOLT: + case ISD::SETLT: + case ISD::SETLE: + Opcode = X86ISD::FMAX; + break; + } + } + + if (Opcode) + return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS); + } + + EVT CondVT = Cond.getValueType(); + if (Subtarget->hasAVX512() && VT.isVector() && CondVT.isVector() && + CondVT.getVectorElementType() == MVT::i1) { + // v16i8 (select v16i1, v16i8, v16i8) does not have a proper + // lowering on KNL. In this case we convert it to + // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction. + // The same situation for all 128 and 256-bit vectors of i8 and i16. + // Since SKX these selects have a proper lowering. + EVT OpVT = LHS.getValueType(); + if ((OpVT.is128BitVector() || OpVT.is256BitVector()) && + (OpVT.getVectorElementType() == MVT::i8 || + OpVT.getVectorElementType() == MVT::i16) && + !(Subtarget->hasBWI() && Subtarget->hasVLX())) { + Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, OpVT, Cond); + DCI.AddToWorklist(Cond.getNode()); + return DAG.getNode(N->getOpcode(), DL, OpVT, Cond, LHS, RHS); + } + } + // If this is a select between two integer constants, try to do some + // optimizations. + if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) { + if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS)) + // Don't do this for crazy integer types. + if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) { + // If this is efficiently invertible, canonicalize the LHSC/RHSC values + // so that TrueC (the true value) is larger than FalseC. + bool NeedsCondInvert = false; + + if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) && + // Efficiently invertible. + (Cond.getOpcode() == ISD::SETCC || // setcc -> invertible. + (Cond.getOpcode() == ISD::XOR && // xor(X, C) -> invertible. + isa<ConstantSDNode>(Cond.getOperand(1))))) { + NeedsCondInvert = true; + std::swap(TrueC, FalseC); + } + + // Optimize C ? 8 : 0 -> zext(C) << 3. Likewise for any pow2/0. + if (FalseC->getAPIntValue() == 0 && + TrueC->getAPIntValue().isPowerOf2()) { + if (NeedsCondInvert) // Invert the condition if needed. + Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, + DAG.getConstant(1, DL, Cond.getValueType())); + + // Zero extend the condition if needed. + Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond); + + unsigned ShAmt = TrueC->getAPIntValue().logBase2(); + return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond, + DAG.getConstant(ShAmt, DL, MVT::i8)); + } + + // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst. + if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) { + if (NeedsCondInvert) // Invert the condition if needed. + Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, + DAG.getConstant(1, DL, Cond.getValueType())); + + // Zero extend the condition if needed. + Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, + FalseC->getValueType(0), Cond); + return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, + SDValue(FalseC, 0)); + } + + // Optimize cases that will turn into an LEA instruction. This requires + // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9). 
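// Illustrative sketch (not part of this patch): the scalar identities behind
// the select-of-constants folds in this block, with Cond already normalized to
// 0 or 1 and TrueC > FalseC.
#include <cstdint>
static uint64_t selectPow2(uint64_t Cond, unsigned Log2TrueC) {
  return Cond << Log2TrueC;    // Cond ? 2^k : 0, i.e. zext(Cond) << k
}
static uint64_t selectByDiff(uint64_t Cond, uint64_t FalseC, uint64_t Diff) {
  return Cond * Diff + FalseC; // Cond ? FalseC + Diff : FalseC
}
// When Diff is 1, 2, 3, 4, 5, 8 or 9 the multiply-add maps onto an add or a
// single LEA; e.g. Cond ? 13 : 5 has Diff == 8 and becomes
// leaq 5(,%rcx,8), %rax once Cond has been zero-extended into %rcx.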
+ if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) { + uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue(); + if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff; + + bool isFastMultiplier = false; + if (Diff < 10) { + switch ((unsigned char)Diff) { + default: break; + case 1: // result = add base, cond + case 2: // result = lea base( , cond*2) + case 3: // result = lea base(cond, cond*2) + case 4: // result = lea base( , cond*4) + case 5: // result = lea base(cond, cond*4) + case 8: // result = lea base( , cond*8) + case 9: // result = lea base(cond, cond*8) + isFastMultiplier = true; + break; + } + } + + if (isFastMultiplier) { + APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue(); + if (NeedsCondInvert) // Invert the condition if needed. + Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, + DAG.getConstant(1, DL, Cond.getValueType())); + + // Zero extend the condition if needed. + Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), + Cond); + // Scale the condition by the difference. + if (Diff != 1) + Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond, + DAG.getConstant(Diff, DL, + Cond.getValueType())); + + // Add the base if non-zero. + if (FalseC->getAPIntValue() != 0) + Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, + SDValue(FalseC, 0)); + return Cond; + } + } + } + } + + // Canonicalize max and min: + // (x > y) ? x : y -> (x >= y) ? x : y + // (x < y) ? x : y -> (x <= y) ? x : y + // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates + // the need for an extra compare + // against zero. e.g. + // (x - y) > 0 : (x - y) ? 0 -> (x - y) >= 0 : (x - y) ? 0 + // subl %esi, %edi + // testl %edi, %edi + // movl $0, %eax + // cmovgl %edi, %eax + // => + // xorl %eax, %eax + // subl %esi, $edi + // cmovsl %eax, %edi + if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC && + DAG.isEqualTo(LHS, Cond.getOperand(0)) && + DAG.isEqualTo(RHS, Cond.getOperand(1))) { + ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); + switch (CC) { + default: break; + case ISD::SETLT: + case ISD::SETGT: { + ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE; + Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(), + Cond.getOperand(0), Cond.getOperand(1), NewCC); + return DAG.getNode(ISD::SELECT, DL, VT, Cond, LHS, RHS); + } + } + } + + // Early exit check + if (!TLI.isTypeLegal(VT)) + return SDValue(); + + // Match VSELECTs into subs with unsigned saturation. + if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC && + // psubus is available in SSE2 and AVX2 for i8 and i16 vectors. + ((Subtarget->hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) || + (Subtarget->hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) { + ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); + + // Check if one of the arms of the VSELECT is a zero vector. If it's on the + // left side invert the predicate to simplify logic below. + SDValue Other; + if (ISD::isBuildVectorAllZeros(LHS.getNode())) { + Other = RHS; + CC = ISD::getSetCCInverse(CC, true); + } else if (ISD::isBuildVectorAllZeros(RHS.getNode())) { + Other = LHS; + } + + if (Other.getNode() && Other->getNumOperands() == 2 && + DAG.isEqualTo(Other->getOperand(0), Cond.getOperand(0))) { + SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1); + SDValue CondRHS = Cond->getOperand(1); + + // Look for a general sub with unsigned saturation first. + // x >= y ? 
x-y : 0 --> subus x, y + // x > y ? x-y : 0 --> subus x, y + if ((CC == ISD::SETUGE || CC == ISD::SETUGT) && + Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS)) + return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS); + + if (auto *OpRHSBV = dyn_cast<BuildVectorSDNode>(OpRHS)) + if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode()) { + if (auto *CondRHSBV = dyn_cast<BuildVectorSDNode>(CondRHS)) + if (auto *CondRHSConst = CondRHSBV->getConstantSplatNode()) + // If the RHS is a constant we have to reverse the const + // canonicalization. + // x > C-1 ? x+-C : 0 --> subus x, C + if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD && + CondRHSConst->getAPIntValue() == + (-OpRHSConst->getAPIntValue() - 1)) + return DAG.getNode( + X86ISD::SUBUS, DL, VT, OpLHS, + DAG.getConstant(-OpRHSConst->getAPIntValue(), DL, VT)); + + // Another special case: If C was a sign bit, the sub has been + // canonicalized into a xor. + // FIXME: Would it be better to use computeKnownBits to determine + // whether it's safe to decanonicalize the xor? + // x s< 0 ? x^C : 0 --> subus x, C + if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR && + ISD::isBuildVectorAllZeros(CondRHS.getNode()) && + OpRHSConst->getAPIntValue().isSignBit()) + // Note that we have to rebuild the RHS constant here to ensure we + // don't rely on particular values of undef lanes. + return DAG.getNode( + X86ISD::SUBUS, DL, VT, OpLHS, + DAG.getConstant(OpRHSConst->getAPIntValue(), DL, VT)); + } + } + } + + // Simplify vector selection if condition value type matches vselect + // operand type + if (N->getOpcode() == ISD::VSELECT && CondVT == VT) { + assert(Cond.getValueType().isVector() && + "vector select expects a vector selector!"); + + bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode()); + bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode()); + + // Try invert the condition if true value is not all 1s and false value + // is not all 0s. + if (!TValIsAllOnes && !FValIsAllZeros && + // Check if the selector will be produced by CMPP*/PCMP* + Cond.getOpcode() == ISD::SETCC && + // Check if SETCC has already been promoted + TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) == + CondVT) { + bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode()); + bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode()); + + if (TValIsAllZeros || FValIsAllOnes) { + SDValue CC = Cond.getOperand(2); + ISD::CondCode NewCC = + ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(), + Cond.getOperand(0).getValueType().isInteger()); + Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1), NewCC); + std::swap(LHS, RHS); + TValIsAllOnes = FValIsAllOnes; + FValIsAllZeros = TValIsAllZeros; + } + } + + if (TValIsAllOnes || FValIsAllZeros) { + SDValue Ret; + + if (TValIsAllOnes && FValIsAllZeros) + Ret = Cond; + else if (TValIsAllOnes) + Ret = + DAG.getNode(ISD::OR, DL, CondVT, Cond, DAG.getBitcast(CondVT, RHS)); + else if (FValIsAllZeros) + Ret = DAG.getNode(ISD::AND, DL, CondVT, Cond, + DAG.getBitcast(CondVT, LHS)); + + return DAG.getBitcast(VT, Ret); + } + } + + // We should generate an X86ISD::BLENDI from a vselect if its argument + // is a sign_extend_inreg of an any_extend of a BUILD_VECTOR of + // constants. 
This specific pattern gets generated when we split a + // selector for a 512 bit vector in a machine without AVX512 (but with + // 256-bit vectors), during legalization: + // + // (vselect (sign_extend (any_extend (BUILD_VECTOR)) i1) LHS RHS) + // + // Iff we find this pattern and the build_vectors are built from + // constants, we translate the vselect into a shuffle_vector that we + // know will be matched by LowerVECTOR_SHUFFLEtoBlend. + if ((N->getOpcode() == ISD::VSELECT || + N->getOpcode() == X86ISD::SHRUNKBLEND) && + !DCI.isBeforeLegalize() && !VT.is512BitVector()) { + SDValue Shuffle = transformVSELECTtoBlendVECTOR_SHUFFLE(N, DAG, Subtarget); + if (Shuffle.getNode()) + return Shuffle; + } + + // If this is a *dynamic* select (non-constant condition) and we can match + // this node with one of the variable blend instructions, restructure the + // condition so that the blends can use the high bit of each element and use + // SimplifyDemandedBits to simplify the condition operand. + if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() && + !DCI.isBeforeLegalize() && + !ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) { + unsigned BitWidth = Cond.getValueType().getScalarSizeInBits(); + + // Don't optimize vector selects that map to mask-registers. + if (BitWidth == 1) + return SDValue(); + + // We can only handle the cases where VSELECT is directly legal on the + // subtarget. We custom lower VSELECT nodes with constant conditions and + // this makes it hard to see whether a dynamic VSELECT will correctly + // lower, so we both check the operation's status and explicitly handle the + // cases where a *dynamic* blend will fail even though a constant-condition + // blend could be custom lowered. + // FIXME: We should find a better way to handle this class of problems. + // Potentially, we should combine constant-condition vselect nodes + // pre-legalization into shuffles and not mark as many types as custom + // lowered. + if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT)) + return SDValue(); + // FIXME: We don't support i16-element blends currently. We could and + // should support them by making *all* the bits in the condition be set + // rather than just the high bit and using an i8-element blend. + if (VT.getVectorElementType() == MVT::i16) + return SDValue(); + // Dynamic blending was only available from SSE4.1 onward. + if (VT.is128BitVector() && !Subtarget->hasSSE41()) + return SDValue(); + // Byte blends are only available in AVX2 + if (VT == MVT::v32i8 && !Subtarget->hasAVX2()) + return SDValue(); + + assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size"); + APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1); + + APInt KnownZero, KnownOne; + TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(), + DCI.isBeforeLegalizeOps()); + if (TLO.ShrinkDemandedConstant(Cond, DemandedMask) || + TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne, + TLO)) { + // If we changed the computation somewhere in the DAG, this change + // will affect all users of Cond. + // Make sure it is fine and update all the nodes so that we do not + // use the generic VSELECT anymore. Otherwise, we may perform + // wrong optimizations as we messed up with the actual expectation + // for the vector boolean values. + if (Cond != TLO.Old) { + // Check all uses of that condition operand to check whether it will be + // consumed by non-BLEND instructions, which may depend on all bits are + // set properly. 
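// Aside (editorial, not from the patch): the variable blends targeted here
// (BLENDVPS/BLENDVPD/PBLENDVB) select each lane using only the most
// significant bit of the corresponding condition element, which is why the
// code above asks SimplifyDemandedBits for just the high bit of each element.
// Any other user of Cond may still depend on the full all-ones/all-zeros
// boolean pattern, so the scan below gives up if a non-VSELECT user is found.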
+ for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end(); + I != E; ++I) + if (I->getOpcode() != ISD::VSELECT) + // TODO: Add other opcodes eventually lowered into BLEND. + return SDValue(); + + // Update all the users of the condition, before committing the change, + // so that the VSELECT optimizations that expect the correct vector + // boolean value will not be triggered. + for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end(); + I != E; ++I) + DAG.ReplaceAllUsesOfValueWith( + SDValue(*I, 0), + DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(*I), I->getValueType(0), + Cond, I->getOperand(1), I->getOperand(2))); + DCI.CommitTargetLoweringOpt(TLO); + return SDValue(); + } + // At this point, only Cond is changed. Change the condition + // just for N to keep the opportunity to optimize all other + // users their own way. + DAG.ReplaceAllUsesOfValueWith( + SDValue(N, 0), + DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(N), N->getValueType(0), + TLO.New, N->getOperand(1), N->getOperand(2))); + return SDValue(); + } + } + + return SDValue(); +} + +// Check whether a boolean test is testing a boolean value generated by +// X86ISD::SETCC. If so, return the operand of that SETCC and proper condition +// code. +// +// Simplify the following patterns: +// (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or +// (Op (CMP (SETCC Cond EFLAGS) 0) NEQ) +// to (Op EFLAGS Cond) +// +// (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or +// (Op (CMP (SETCC Cond EFLAGS) 1) NEQ) +// to (Op EFLAGS !Cond) +// +// where Op could be BRCOND or CMOV. +// +static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) { + // Quit if not CMP and SUB with its value result used. + if (Cmp.getOpcode() != X86ISD::CMP && + (Cmp.getOpcode() != X86ISD::SUB || Cmp.getNode()->hasAnyUseOfValue(0))) + return SDValue(); + + // Quit if not used as a boolean value. + if (CC != X86::COND_E && CC != X86::COND_NE) + return SDValue(); + + // Check CMP operands. One of them should be 0 or 1 and the other should be + // an SetCC or extended from it. + SDValue Op1 = Cmp.getOperand(0); + SDValue Op2 = Cmp.getOperand(1); + + SDValue SetCC; + const ConstantSDNode* C = nullptr; + bool needOppositeCond = (CC == X86::COND_E); + bool checkAgainstTrue = false; // Is it a comparison against 1? + + if ((C = dyn_cast<ConstantSDNode>(Op1))) + SetCC = Op2; + else if ((C = dyn_cast<ConstantSDNode>(Op2))) + SetCC = Op1; + else // Quit if all operands are not constants. + return SDValue(); + + if (C->getZExtValue() == 1) { + needOppositeCond = !needOppositeCond; + checkAgainstTrue = true; + } else if (C->getZExtValue() != 0) + // Quit if the constant is neither 0 or 1. + return SDValue(); + + bool truncatedToBoolWithAnd = false; + // Skip (zext $x), (trunc $x), or (and $x, 1) node. + while (SetCC.getOpcode() == ISD::ZERO_EXTEND || + SetCC.getOpcode() == ISD::TRUNCATE || + SetCC.getOpcode() == ISD::AND) { + if (SetCC.getOpcode() == ISD::AND) { + int OpIdx = -1; + if (isOneConstant(SetCC.getOperand(0))) + OpIdx = 1; + if (isOneConstant(SetCC.getOperand(1))) + OpIdx = 0; + if (OpIdx == -1) + break; + SetCC = SetCC.getOperand(OpIdx); + truncatedToBoolWithAnd = true; + } else + SetCC = SetCC.getOperand(0); + } + + switch (SetCC.getOpcode()) { + case X86ISD::SETCC_CARRY: + // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to + // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1, + // i.e. it's a comparison against true but the result of SETCC_CARRY is not + // truncated to i1 using 'and'. 
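+    // (For example, SETCC_CARRY may produce -1; comparing -1 against the
+    // constant 1 with EQ is false even though the carry was set, so the fold
+    // is only safe once the value has been masked down to 0 or 1 with 'and'.)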
+ if (checkAgainstTrue && !truncatedToBoolWithAnd) + break; + assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B && + "Invalid use of SETCC_CARRY!"); + // FALL THROUGH + case X86ISD::SETCC: + // Set the condition code or opposite one if necessary. + CC = X86::CondCode(SetCC.getConstantOperandVal(0)); + if (needOppositeCond) + CC = X86::GetOppositeBranchCondition(CC); + return SetCC.getOperand(1); + case X86ISD::CMOV: { + // Check whether false/true value has canonical one, i.e. 0 or 1. + ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0)); + ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1)); + // Quit if true value is not a constant. + if (!TVal) + return SDValue(); + // Quit if false value is not a constant. + if (!FVal) { + SDValue Op = SetCC.getOperand(0); + // Skip 'zext' or 'trunc' node. + if (Op.getOpcode() == ISD::ZERO_EXTEND || + Op.getOpcode() == ISD::TRUNCATE) + Op = Op.getOperand(0); + // A special case for rdrand/rdseed, where 0 is set if false cond is + // found. + if ((Op.getOpcode() != X86ISD::RDRAND && + Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0) + return SDValue(); + } + // Quit if false value is not the constant 0 or 1. + bool FValIsFalse = true; + if (FVal && FVal->getZExtValue() != 0) { + if (FVal->getZExtValue() != 1) + return SDValue(); + // If FVal is 1, opposite cond is needed. + needOppositeCond = !needOppositeCond; + FValIsFalse = false; + } + // Quit if TVal is not the constant opposite of FVal. + if (FValIsFalse && TVal->getZExtValue() != 1) + return SDValue(); + if (!FValIsFalse && TVal->getZExtValue() != 0) + return SDValue(); + CC = X86::CondCode(SetCC.getConstantOperandVal(2)); + if (needOppositeCond) + CC = X86::GetOppositeBranchCondition(CC); + return SetCC.getOperand(3); + } + } + + return SDValue(); +} + +/// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS. +/// Match: +/// (X86or (X86setcc) (X86setcc)) +/// (X86cmp (and (X86setcc) (X86setcc)), 0) +static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0, + X86::CondCode &CC1, SDValue &Flags, + bool &isAnd) { + if (Cond->getOpcode() == X86ISD::CMP) { + if (!isNullConstant(Cond->getOperand(1))) + return false; + + Cond = Cond->getOperand(0); + } + + isAnd = false; + + SDValue SetCC0, SetCC1; + switch (Cond->getOpcode()) { + default: return false; + case ISD::AND: + case X86ISD::AND: + isAnd = true; + // fallthru + case ISD::OR: + case X86ISD::OR: + SetCC0 = Cond->getOperand(0); + SetCC1 = Cond->getOperand(1); + break; + }; + + // Make sure we have SETCC nodes, using the same flags value. + if (SetCC0.getOpcode() != X86ISD::SETCC || + SetCC1.getOpcode() != X86ISD::SETCC || + SetCC0->getOperand(1) != SetCC1->getOperand(1)) + return false; + + CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0); + CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0); + Flags = SetCC0->getOperand(1); + return true; +} + +/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL] +static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { + SDLoc DL(N); + + // If the flag operand isn't dead, don't touch this CMOV. 
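+  // (Editorial note: when the node has two results, result 1 is its
+  // glue/EFLAGS value; rewriting the CMOV here would invalidate any reader of
+  // that second result.)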
+ if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty()) + return SDValue(); + + SDValue FalseOp = N->getOperand(0); + SDValue TrueOp = N->getOperand(1); + X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2); + SDValue Cond = N->getOperand(3); + + if (CC == X86::COND_E || CC == X86::COND_NE) { + switch (Cond.getOpcode()) { + default: break; + case X86ISD::BSR: + case X86ISD::BSF: + // If operand of BSR / BSF are proven never zero, then ZF cannot be set. + if (DAG.isKnownNeverZero(Cond.getOperand(0))) + return (CC == X86::COND_E) ? FalseOp : TrueOp; + } + } + + SDValue Flags; + + Flags = checkBoolTestSetCCCombine(Cond, CC); + if (Flags.getNode() && + // Extra check as FCMOV only supports a subset of X86 cond. + (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC))) { + SDValue Ops[] = { FalseOp, TrueOp, + DAG.getConstant(CC, DL, MVT::i8), Flags }; + return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops); + } + + // If this is a select between two integer constants, try to do some + // optimizations. Note that the operands are ordered the opposite of SELECT + // operands. + if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) { + if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) { + // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is + // larger than FalseC (the false value). + if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) { + CC = X86::GetOppositeBranchCondition(CC); + std::swap(TrueC, FalseC); + std::swap(TrueOp, FalseOp); + } + + // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0. + // This is efficient for any integer data type (including i8/i16) and + // shift amount. + if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) { + Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, + DAG.getConstant(CC, DL, MVT::i8), Cond); + + // Zero extend the condition if needed. + Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond); + + unsigned ShAmt = TrueC->getAPIntValue().logBase2(); + Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond, + DAG.getConstant(ShAmt, DL, MVT::i8)); + if (N->getNumValues() == 2) // Dead flag value? + return DCI.CombineTo(N, Cond, SDValue()); + return Cond; + } + + // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst. This is efficient + // for any integer data type, including i8/i16. + if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) { + Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, + DAG.getConstant(CC, DL, MVT::i8), Cond); + + // Zero extend the condition if needed. + Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, + FalseC->getValueType(0), Cond); + Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, + SDValue(FalseC, 0)); + + if (N->getNumValues() == 2) // Dead flag value? + return DCI.CombineTo(N, Cond, SDValue()); + return Cond; + } + + // Optimize cases that will turn into an LEA instruction. This requires + // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9). 
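+      // (Illustration: a CMOV between the constants 14 (true) and 5 (false)
+      // has a difference of 9, so it is rebuilt below as
+      // zext(setcc(CC)) * 9 + 5, which fits a single lea 5(cond, cond*8).)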
+ if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) { + uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue(); + if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff; + + bool isFastMultiplier = false; + if (Diff < 10) { + switch ((unsigned char)Diff) { + default: break; + case 1: // result = add base, cond + case 2: // result = lea base( , cond*2) + case 3: // result = lea base(cond, cond*2) + case 4: // result = lea base( , cond*4) + case 5: // result = lea base(cond, cond*4) + case 8: // result = lea base( , cond*8) + case 9: // result = lea base(cond, cond*8) + isFastMultiplier = true; + break; + } + } + + if (isFastMultiplier) { + APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue(); + Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, + DAG.getConstant(CC, DL, MVT::i8), Cond); + // Zero extend the condition if needed. + Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), + Cond); + // Scale the condition by the difference. + if (Diff != 1) + Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond, + DAG.getConstant(Diff, DL, Cond.getValueType())); + + // Add the base if non-zero. + if (FalseC->getAPIntValue() != 0) + Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, + SDValue(FalseC, 0)); + if (N->getNumValues() == 2) // Dead flag value? + return DCI.CombineTo(N, Cond, SDValue()); + return Cond; + } + } + } + } + + // Handle these cases: + // (select (x != c), e, c) -> select (x != c), e, x), + // (select (x == c), c, e) -> select (x == c), x, e) + // where the c is an integer constant, and the "select" is the combination + // of CMOV and CMP. + // + // The rationale for this change is that the conditional-move from a constant + // needs two instructions, however, conditional-move from a register needs + // only one instruction. + // + // CAVEAT: By replacing a constant with a symbolic value, it may obscure + // some instruction-combining opportunities. This opt needs to be + // postponed as late as possible. + // + if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) { + // the DCI.xxxx conditions are provided to postpone the optimization as + // late as possible. + + ConstantSDNode *CmpAgainst = nullptr; + if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) && + (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) && + !isa<ConstantSDNode>(Cond.getOperand(0))) { + + if (CC == X86::COND_NE && + CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) { + CC = X86::GetOppositeBranchCondition(CC); + std::swap(TrueOp, FalseOp); + } + + if (CC == X86::COND_E && + CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) { + SDValue Ops[] = { FalseOp, Cond.getOperand(0), + DAG.getConstant(CC, DL, MVT::i8), Cond }; + return DAG.getNode(X86ISD::CMOV, DL, N->getVTList (), Ops); + } + } + } + + // Fold and/or of setcc's to double CMOV: + // (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2) + // (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2) + // + // This combine lets us generate: + // cmovcc1 (jcc1 if we don't have CMOV) + // cmovcc2 (same) + // instead of: + // setcc1 + // setcc2 + // and/or + // cmovne (jne if we don't have CMOV) + // When we can't use the CMOV instruction, it might increase branch + // mispredicts. + // When we can use CMOV, or when there is no mispredict, this improves + // throughput and reduces register pressure. 
+ // + if (CC == X86::COND_NE) { + SDValue Flags; + X86::CondCode CC0, CC1; + bool isAndSetCC; + if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) { + if (isAndSetCC) { + std::swap(FalseOp, TrueOp); + CC0 = X86::GetOppositeBranchCondition(CC0); + CC1 = X86::GetOppositeBranchCondition(CC1); + } + + SDValue LOps[] = {FalseOp, TrueOp, DAG.getConstant(CC0, DL, MVT::i8), + Flags}; + SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), LOps); + SDValue Ops[] = {LCMOV, TrueOp, DAG.getConstant(CC1, DL, MVT::i8), Flags}; + SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops); + DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SDValue(CMOV.getNode(), 1)); + return CMOV; + } + } + + return SDValue(); +} + +/// PerformMulCombine - Optimize a single multiply with constant into two +/// in order to implement it with two cheaper instructions, e.g. +/// LEA + SHL, LEA + LEA. +static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI) { + // An imul is usually smaller than the alternative sequence. + if (DAG.getMachineFunction().getFunction()->optForMinSize()) + return SDValue(); + + if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) + return SDValue(); + + EVT VT = N->getValueType(0); + if (VT != MVT::i64 && VT != MVT::i32) + return SDValue(); + + ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1)); + if (!C) + return SDValue(); + uint64_t MulAmt = C->getZExtValue(); + if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9) + return SDValue(); + + uint64_t MulAmt1 = 0; + uint64_t MulAmt2 = 0; + if ((MulAmt % 9) == 0) { + MulAmt1 = 9; + MulAmt2 = MulAmt / 9; + } else if ((MulAmt % 5) == 0) { + MulAmt1 = 5; + MulAmt2 = MulAmt / 5; + } else if ((MulAmt % 3) == 0) { + MulAmt1 = 3; + MulAmt2 = MulAmt / 3; + } + + SDLoc DL(N); + SDValue NewMul; + if (MulAmt2 && + (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){ + + if (isPowerOf2_64(MulAmt2) && + !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD)) + // If second multiplifer is pow2, issue it first. We want the multiply by + // 3, 5, or 9 to be folded into the addressing mode unless the lone use + // is an add. + std::swap(MulAmt1, MulAmt2); + + if (isPowerOf2_64(MulAmt1)) + NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), + DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8)); + else + NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0), + DAG.getConstant(MulAmt1, DL, VT)); + + if (isPowerOf2_64(MulAmt2)) + NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul, + DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8)); + else + NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul, + DAG.getConstant(MulAmt2, DL, VT)); + } + + if (!NewMul) { + assert(MulAmt != 0 && MulAmt != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) + && "Both cases that could cause potential overflows should have " + "already been handled."); + if (isPowerOf2_64(MulAmt - 1)) + // (mul x, 2^N + 1) => (add (shl x, N), x) + NewMul = DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), + DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), + DAG.getConstant(Log2_64(MulAmt - 1), DL, + MVT::i8))); + + else if (isPowerOf2_64(MulAmt + 1)) + // (mul x, 2^N - 1) => (sub (shl x, N), x) + NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getNode(ISD::SHL, DL, VT, + N->getOperand(0), + DAG.getConstant(Log2_64(MulAmt + 1), + DL, MVT::i8)), N->getOperand(0)); + } + + if (NewMul) + // Do not add new nodes to DAG combiner worklist. 
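+    // (Illustration: with MulAmt == 45 the NewMul built above is (x * 9) * 5,
+    // i.e. two LEAs; with MulAmt == 17 it is (x << 4) + x.)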
+ DCI.CombineTo(N, NewMul, false); + + return SDValue(); +} + +static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1); + EVT VT = N0.getValueType(); + + // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2)) + // since the result of setcc_c is all zero's or all ones. + if (VT.isInteger() && !VT.isVector() && + N1C && N0.getOpcode() == ISD::AND && + N0.getOperand(1).getOpcode() == ISD::Constant) { + SDValue N00 = N0.getOperand(0); + APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue(); + APInt ShAmt = N1C->getAPIntValue(); + Mask = Mask.shl(ShAmt); + bool MaskOK = false; + // We can handle cases concerning bit-widening nodes containing setcc_c if + // we carefully interrogate the mask to make sure we are semantics + // preserving. + // The transform is not safe if the result of C1 << C2 exceeds the bitwidth + // of the underlying setcc_c operation if the setcc_c was zero extended. + // Consider the following example: + // zext(setcc_c) -> i32 0x0000FFFF + // c1 -> i32 0x0000FFFF + // c2 -> i32 0x00000001 + // (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE + // (and setcc_c, (c1 << c2)) -> i32 0x0000FFFE + if (N00.getOpcode() == X86ISD::SETCC_CARRY) { + MaskOK = true; + } else if (N00.getOpcode() == ISD::SIGN_EXTEND && + N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { + MaskOK = true; + } else if ((N00.getOpcode() == ISD::ZERO_EXTEND || + N00.getOpcode() == ISD::ANY_EXTEND) && + N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { + MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits()); + } + if (MaskOK && Mask != 0) { + SDLoc DL(N); + return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT)); + } + } + + // Hardware support for vector shifts is sparse which makes us scalarize the + // vector operations in many cases. Also, on sandybridge ADD is faster than + // shl. + // (shl V, 1) -> add V,V + if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1)) + if (auto *N1SplatC = N1BV->getConstantSplatNode()) { + assert(N0.getValueType().isVector() && "Invalid vector shift type"); + // We shift all of the values by one. In many cases we do not have + // hardware support for this operation. This is better expressed as an ADD + // of two values. + if (N1SplatC->getAPIntValue() == 1) + return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0); + } + + return SDValue(); +} + +static SDValue PerformSRACombine(SDNode *N, SelectionDAG &DAG) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + EVT VT = N0.getValueType(); + unsigned Size = VT.getSizeInBits(); + + // fold (ashr (shl, a, [56,48,32,24,16]), SarConst) + // into (shl, (sext (a), [56,48,32,24,16] - SarConst)) or + // into (lshr, (sext (a), SarConst - [56,48,32,24,16])) + // depending on sign of (SarConst - [56,48,32,24,16]) + + // sexts in X86 are MOVs. The MOVs have the same code size + // as above SHIFTs (only SHIFT on 1 has lower code size). + // However the MOVs have 2 advantages to a SHIFT: + // 1. MOVs can write to a register that differs from source + // 2. 
MOVs accept memory operands + + if (!VT.isInteger() || VT.isVector() || N1.getOpcode() != ISD::Constant || + N0.getOpcode() != ISD::SHL || !N0.hasOneUse() || + N0.getOperand(1).getOpcode() != ISD::Constant) + return SDValue(); + + SDValue N00 = N0.getOperand(0); + SDValue N01 = N0.getOperand(1); + APInt ShlConst = (cast<ConstantSDNode>(N01))->getAPIntValue(); + APInt SarConst = (cast<ConstantSDNode>(N1))->getAPIntValue(); + EVT CVT = N1.getValueType(); + + if (SarConst.isNegative()) + return SDValue(); + + for (MVT SVT : MVT::integer_valuetypes()) { + unsigned ShiftSize = SVT.getSizeInBits(); + // skipping types without corresponding sext/zext and + // ShlConst that is not one of [56,48,32,24,16] + if (ShiftSize < 8 || ShiftSize > 64 || ShlConst != Size - ShiftSize) + continue; + SDLoc DL(N); + SDValue NN = + DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT)); + SarConst = SarConst - (Size - ShiftSize); + if (SarConst == 0) + return NN; + else if (SarConst.isNegative()) + return DAG.getNode(ISD::SHL, DL, VT, NN, + DAG.getConstant(-SarConst, DL, CVT)); + else + return DAG.getNode(ISD::SRA, DL, VT, NN, + DAG.getConstant(SarConst, DL, CVT)); + } + return SDValue(); +} + +/// \brief Returns a vector of 0s if the node in input is a vector logical +/// shift by a constant amount which is known to be bigger than or equal +/// to the vector element size in bits. +static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + EVT VT = N->getValueType(0); + + if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 && + (!Subtarget->hasInt256() || + (VT != MVT::v4i64 && VT != MVT::v8i32 && VT != MVT::v16i16))) + return SDValue(); + + SDValue Amt = N->getOperand(1); + SDLoc DL(N); + if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Amt)) + if (auto *AmtSplat = AmtBV->getConstantSplatNode()) { + APInt ShiftAmt = AmtSplat->getAPIntValue(); + unsigned MaxAmount = + VT.getSimpleVT().getVectorElementType().getSizeInBits(); + + // SSE2/AVX2 logical shifts always return a vector of 0s + // if the shift amount is bigger than or equal to + // the element size. The constant shift amount will be + // encoded as a 8-bit immediate. + if (ShiftAmt.trunc(8).uge(MaxAmount)) + return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, DL); + } + + return SDValue(); +} + +/// PerformShiftCombine - Combine shifts. +static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { + if (N->getOpcode() == ISD::SHL) + if (SDValue V = PerformSHLCombine(N, DAG)) + return V; + + if (N->getOpcode() == ISD::SRA) + if (SDValue V = PerformSRACombine(N, DAG)) + return V; + + // Try to fold this logical shift into a zero vector. + if (N->getOpcode() != ISD::SRA) + if (SDValue V = performShiftToAllZeros(N, DAG, Subtarget)) + return V; + + return SDValue(); +} + +// CMPEQCombine - Recognize the distinctive (AND (setcc ...) (setcc ..)) +// where both setccs reference the same FP CMP, and rewrite for CMPEQSS +// and friends. Likewise for OR -> CMPNEQSS. +static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { + unsigned opcode; + + // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but + // we're requiring SSE2 for both. 
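+  // (Illustration: an ordered scalar float "a == b" is lowered to two SETCCs,
+  // COND_E and COND_NP, of a single UCOMISS; the code below folds that pair
+  // into one CMPEQSS-style compare whose all-ones/all-zeros result is then
+  // reduced to bit 0 (or into a mask-register compare on AVX-512).)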
+ if (Subtarget->hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + SDValue CMP0 = N0->getOperand(1); + SDValue CMP1 = N1->getOperand(1); + SDLoc DL(N); + + // The SETCCs should both refer to the same CMP. + if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1) + return SDValue(); + + SDValue CMP00 = CMP0->getOperand(0); + SDValue CMP01 = CMP0->getOperand(1); + EVT VT = CMP00.getValueType(); + + if (VT == MVT::f32 || VT == MVT::f64) { + bool ExpectingFlags = false; + // Check for any users that want flags: + for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end(); + !ExpectingFlags && UI != UE; ++UI) + switch (UI->getOpcode()) { + default: + case ISD::BR_CC: + case ISD::BRCOND: + case ISD::SELECT: + ExpectingFlags = true; + break; + case ISD::CopyToReg: + case ISD::SIGN_EXTEND: + case ISD::ZERO_EXTEND: + case ISD::ANY_EXTEND: + break; + } + + if (!ExpectingFlags) { + enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0); + enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0); + + if (cc1 == X86::COND_E || cc1 == X86::COND_NE) { + X86::CondCode tmp = cc0; + cc0 = cc1; + cc1 = tmp; + } + + if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) || + (cc0 == X86::COND_NE && cc1 == X86::COND_P)) { + // FIXME: need symbolic constants for these magic numbers. + // See X86ATTInstPrinter.cpp:printSSECC(). + unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4; + if (Subtarget->hasAVX512()) { + SDValue FSetCC = DAG.getNode(X86ISD::FSETCC, DL, MVT::i1, CMP00, + CMP01, + DAG.getConstant(x86cc, DL, MVT::i8)); + if (N->getValueType(0) != MVT::i1) + return DAG.getNode(ISD::ZERO_EXTEND, DL, N->getValueType(0), + FSetCC); + return FSetCC; + } + SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL, + CMP00.getValueType(), CMP00, CMP01, + DAG.getConstant(x86cc, DL, + MVT::i8)); + + bool is64BitFP = (CMP00.getValueType() == MVT::f64); + MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32; + + if (is64BitFP && !Subtarget->is64Bit()) { + // On a 32-bit target, we cannot bitcast the 64-bit float to a + // 64-bit integer, since that's not a legal type. Since + // OnesOrZeroesF is all ones of all zeroes, we don't need all the + // bits, but can do this little dance to extract the lowest 32 bits + // and work with those going forward. + SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, + OnesOrZeroesF); + SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64); + OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, + Vector32, DAG.getIntPtrConstant(0, DL)); + IntVT = MVT::i32; + } + + SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF); + SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI, + DAG.getConstant(1, DL, IntVT)); + SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, + ANDed); + return OneBitOfTruth; + } + } + } + } + return SDValue(); +} + +/// CanFoldXORWithAllOnes - Test whether the XOR operand is a AllOnes vector +/// so it can be folded inside ANDNP. +static bool CanFoldXORWithAllOnes(const SDNode *N) { + EVT VT = N->getValueType(0); + + // Match direct AllOnes for 128 and 256-bit vectors + if (ISD::isBuildVectorAllOnes(N)) + return true; + + // Look through a bit convert. 
+ if (N->getOpcode() == ISD::BITCAST) + N = N->getOperand(0).getNode(); + + // Sometimes the operand may come from a insert_subvector building a 256-bit + // allones vector + if (VT.is256BitVector() && + N->getOpcode() == ISD::INSERT_SUBVECTOR) { + SDValue V1 = N->getOperand(0); + SDValue V2 = N->getOperand(1); + + if (V1.getOpcode() == ISD::INSERT_SUBVECTOR && + V1.getOperand(0).getOpcode() == ISD::UNDEF && + ISD::isBuildVectorAllOnes(V1.getOperand(1).getNode()) && + ISD::isBuildVectorAllOnes(V2.getNode())) + return true; + } + + return false; +} + +// On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized +// register. In most cases we actually compare or select YMM-sized registers +// and mixing the two types creates horrible code. This method optimizes +// some of the transition sequences. +static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { + EVT VT = N->getValueType(0); + if (!VT.is256BitVector()) + return SDValue(); + + assert((N->getOpcode() == ISD::ANY_EXTEND || + N->getOpcode() == ISD::ZERO_EXTEND || + N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node"); + + SDValue Narrow = N->getOperand(0); + EVT NarrowVT = Narrow->getValueType(0); + if (!NarrowVT.is128BitVector()) + return SDValue(); + + if (Narrow->getOpcode() != ISD::XOR && + Narrow->getOpcode() != ISD::AND && + Narrow->getOpcode() != ISD::OR) + return SDValue(); + + SDValue N0 = Narrow->getOperand(0); + SDValue N1 = Narrow->getOperand(1); + SDLoc DL(Narrow); + + // The Left side has to be a trunc. + if (N0.getOpcode() != ISD::TRUNCATE) + return SDValue(); + + // The type of the truncated inputs. + EVT WideVT = N0->getOperand(0)->getValueType(0); + if (WideVT != VT) + return SDValue(); + + // The right side has to be a 'trunc' or a constant vector. + bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE; + ConstantSDNode *RHSConstSplat = nullptr; + if (auto *RHSBV = dyn_cast<BuildVectorSDNode>(N1)) + RHSConstSplat = RHSBV->getConstantSplatNode(); + if (!RHSTrunc && !RHSConstSplat) + return SDValue(); + + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + + if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), WideVT)) + return SDValue(); + + // Set N0 and N1 to hold the inputs to the new wide operation. + N0 = N0->getOperand(0); + if (RHSConstSplat) { + N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getVectorElementType(), + SDValue(RHSConstSplat, 0)); + SmallVector<SDValue, 8> C(WideVT.getVectorNumElements(), N1); + N1 = DAG.getNode(ISD::BUILD_VECTOR, DL, WideVT, C); + } else if (RHSTrunc) { + N1 = N1->getOperand(0); + } + + // Generate the wide operation. 
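+  // (Illustration: for N == (zero_extend (and (trunc A), (trunc B))) with A
+  // and B of the wide 256-bit type, Op below is the AND computed directly in
+  // the wide type; the switch that follows re-applies the narrowing, an AND
+  // with a low-bits mask for zero_extend or a sign_extend_inreg for
+  // sign_extend.)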
+ SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, WideVT, N0, N1); + unsigned Opcode = N->getOpcode(); + switch (Opcode) { + case ISD::ANY_EXTEND: + return Op; + case ISD::ZERO_EXTEND: { + unsigned InBits = NarrowVT.getScalarSizeInBits(); + APInt Mask = APInt::getAllOnesValue(InBits); + Mask = Mask.zext(VT.getScalarSizeInBits()); + return DAG.getNode(ISD::AND, DL, VT, + Op, DAG.getConstant(Mask, DL, VT)); + } + case ISD::SIGN_EXTEND: + return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, + Op, DAG.getValueType(NarrowVT)); + default: + llvm_unreachable("Unexpected opcode"); + } +} + +static SDValue VectorZextCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + SDLoc DL(N); + + // A vector zext_in_reg may be represented as a shuffle, + // feeding into a bitcast (this represents anyext) feeding into + // an and with a mask. + // We'd like to try to combine that into a shuffle with zero + // plus a bitcast, removing the and. + if (N0.getOpcode() != ISD::BITCAST || + N0.getOperand(0).getOpcode() != ISD::VECTOR_SHUFFLE) + return SDValue(); + + // The other side of the AND should be a splat of 2^C, where C + // is the number of bits in the source type. + if (N1.getOpcode() == ISD::BITCAST) + N1 = N1.getOperand(0); + if (N1.getOpcode() != ISD::BUILD_VECTOR) + return SDValue(); + BuildVectorSDNode *Vector = cast<BuildVectorSDNode>(N1); + + ShuffleVectorSDNode *Shuffle = cast<ShuffleVectorSDNode>(N0.getOperand(0)); + EVT SrcType = Shuffle->getValueType(0); + + // We expect a single-source shuffle + if (Shuffle->getOperand(1)->getOpcode() != ISD::UNDEF) + return SDValue(); + + unsigned SrcSize = SrcType.getScalarSizeInBits(); + + APInt SplatValue, SplatUndef; + unsigned SplatBitSize; + bool HasAnyUndefs; + if (!Vector->isConstantSplat(SplatValue, SplatUndef, + SplatBitSize, HasAnyUndefs)) + return SDValue(); + + unsigned ResSize = N1.getValueType().getScalarSizeInBits(); + // Make sure the splat matches the mask we expect + if (SplatBitSize > ResSize || + (SplatValue + 1).exactLogBase2() != (int)SrcSize) + return SDValue(); + + // Make sure the input and output size make sense + if (SrcSize >= ResSize || ResSize % SrcSize) + return SDValue(); + + // We expect a shuffle of the form <0, u, u, u, 1, u, u, u...> + // The number of u's between each two values depends on the ratio between + // the source and dest type. + unsigned ZextRatio = ResSize / SrcSize; + bool IsZext = true; + for (unsigned i = 0; i < SrcType.getVectorNumElements(); ++i) { + if (i % ZextRatio) { + if (Shuffle->getMaskElt(i) > 0) { + // Expected undef + IsZext = false; + break; + } + } else { + if (Shuffle->getMaskElt(i) != (int)(i / ZextRatio)) { + // Expected element number + IsZext = false; + break; + } + } + } + + if (!IsZext) + return SDValue(); + + // Ok, perform the transformation - replace the shuffle with + // a shuffle of the form <0, k, k, k, 1, k, k, k> with zero + // (instead of undef) where the k elements come from the zero vector. 
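+  // (Illustration: for a v16i8 source zero-extended 4x, ZextRatio == 4 and the
+  // mask built below is <0,16,16,16, 1,16,16,16, ...>; indices >= 16 select
+  // lanes of the zero vector that replaces the former undef operand.)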
+ SmallVector<int, 8> Mask; + unsigned NumElems = SrcType.getVectorNumElements(); + for (unsigned i = 0; i < NumElems; ++i) + if (i % ZextRatio) + Mask.push_back(NumElems); + else + Mask.push_back(i / ZextRatio); + + SDValue NewShuffle = DAG.getVectorShuffle(Shuffle->getValueType(0), DL, + Shuffle->getOperand(0), DAG.getConstant(0, DL, SrcType), Mask); + return DAG.getBitcast(N0.getValueType(), NewShuffle); +} + +/// If both input operands of a logic op are being cast from floating point +/// types, try to convert this into a floating point logic node to avoid +/// unnecessary moves from SSE to integer registers. +static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + unsigned FPOpcode = ISD::DELETED_NODE; + if (N->getOpcode() == ISD::AND) + FPOpcode = X86ISD::FAND; + else if (N->getOpcode() == ISD::OR) + FPOpcode = X86ISD::FOR; + else if (N->getOpcode() == ISD::XOR) + FPOpcode = X86ISD::FXOR; + + assert(FPOpcode != ISD::DELETED_NODE && + "Unexpected input node for FP logic conversion"); + + EVT VT = N->getValueType(0); + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + SDLoc DL(N); + if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST && + ((Subtarget->hasSSE1() && VT == MVT::i32) || + (Subtarget->hasSSE2() && VT == MVT::i64))) { + SDValue N00 = N0.getOperand(0); + SDValue N10 = N1.getOperand(0); + EVT N00Type = N00.getValueType(); + EVT N10Type = N10.getValueType(); + if (N00Type.isFloatingPoint() && N10Type.isFloatingPoint()) { + SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10); + return DAG.getBitcast(VT, FPLogic); + } + } + return SDValue(); +} + +static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { + if (DCI.isBeforeLegalizeOps()) + return SDValue(); + + if (SDValue Zext = VectorZextCombine(N, DAG, DCI, Subtarget)) + return Zext; + + if (SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget)) + return R; + + if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget)) + return FPLogic; + + EVT VT = N->getValueType(0); + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + SDLoc DL(N); + + // Create BEXTR instructions + // BEXTR is ((X >> imm) & (2**size-1)) + if (VT == MVT::i32 || VT == MVT::i64) { + // Check for BEXTR. + if ((Subtarget->hasBMI() || Subtarget->hasTBM()) && + (N0.getOpcode() == ISD::SRA || N0.getOpcode() == ISD::SRL)) { + ConstantSDNode *MaskNode = dyn_cast<ConstantSDNode>(N1); + ConstantSDNode *ShiftNode = dyn_cast<ConstantSDNode>(N0.getOperand(1)); + if (MaskNode && ShiftNode) { + uint64_t Mask = MaskNode->getZExtValue(); + uint64_t Shift = ShiftNode->getZExtValue(); + if (isMask_64(Mask)) { + uint64_t MaskSize = countPopulation(Mask); + if (Shift + MaskSize <= VT.getSizeInBits()) + return DAG.getNode(X86ISD::BEXTR, DL, VT, N0.getOperand(0), + DAG.getConstant(Shift | (MaskSize << 8), DL, + VT)); + } + } + } // BEXTR + + return SDValue(); + } + + // Want to form ANDNP nodes: + // 1) In the hopes of then easily combining them with OR and AND nodes + // to form PBLEND/PSIGN. 
+ // 2) To match ANDN packed intrinsics + if (VT != MVT::v2i64 && VT != MVT::v4i64) + return SDValue(); + + // Check LHS for vnot + if (N0.getOpcode() == ISD::XOR && + //ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode())) + CanFoldXORWithAllOnes(N0.getOperand(1).getNode())) + return DAG.getNode(X86ISD::ANDNP, DL, VT, N0.getOperand(0), N1); + + // Check RHS for vnot + if (N1.getOpcode() == ISD::XOR && + //ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode())) + CanFoldXORWithAllOnes(N1.getOperand(1).getNode())) + return DAG.getNode(X86ISD::ANDNP, DL, VT, N1.getOperand(0), N0); + + return SDValue(); +} + +static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { + if (DCI.isBeforeLegalizeOps()) + return SDValue(); + + if (SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget)) + return R; + + if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget)) + return FPLogic; + + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + EVT VT = N->getValueType(0); + + // look for psign/blend + if (VT == MVT::v2i64 || VT == MVT::v4i64) { + if (!Subtarget->hasSSSE3() || + (VT == MVT::v4i64 && !Subtarget->hasInt256())) + return SDValue(); + + // Canonicalize pandn to RHS + if (N0.getOpcode() == X86ISD::ANDNP) + std::swap(N0, N1); + // or (and (m, y), (pandn m, x)) + if (N0.getOpcode() == ISD::AND && N1.getOpcode() == X86ISD::ANDNP) { + SDValue Mask = N1.getOperand(0); + SDValue X = N1.getOperand(1); + SDValue Y; + if (N0.getOperand(0) == Mask) + Y = N0.getOperand(1); + if (N0.getOperand(1) == Mask) + Y = N0.getOperand(0); + + // Check to see if the mask appeared in both the AND and ANDNP and + if (!Y.getNode()) + return SDValue(); + + // Validate that X, Y, and Mask are BIT_CONVERTS, and see through them. + // Look through mask bitcast. + if (Mask.getOpcode() == ISD::BITCAST) + Mask = Mask.getOperand(0); + if (X.getOpcode() == ISD::BITCAST) + X = X.getOperand(0); + if (Y.getOpcode() == ISD::BITCAST) + Y = Y.getOperand(0); + + EVT MaskVT = Mask.getValueType(); + + // Validate that the Mask operand is a vector sra node. + // FIXME: what to do for bytes, since there is a psignb/pblendvb, but + // there is no psrai.b + unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits(); + unsigned SraAmt = ~0; + if (Mask.getOpcode() == ISD::SRA) { + if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Mask.getOperand(1))) + if (auto *AmtConst = AmtBV->getConstantSplatNode()) + SraAmt = AmtConst->getZExtValue(); + } else if (Mask.getOpcode() == X86ISD::VSRAI) { + SDValue SraC = Mask.getOperand(1); + SraAmt = cast<ConstantSDNode>(SraC)->getZExtValue(); + } + if ((SraAmt + 1) != EltBits) + return SDValue(); + + SDLoc DL(N); + + // Now we know we at least have a plendvb with the mask val. See if + // we can form a psignb/w/d. + // psign = x.type == y.type == mask.type && y = sub(0, x); + if (Y.getOpcode() == ISD::SUB && Y.getOperand(1) == X && + ISD::isBuildVectorAllZeros(Y.getOperand(0).getNode()) && + X.getValueType() == MaskVT && Y.getValueType() == MaskVT) { + assert((EltBits == 8 || EltBits == 16 || EltBits == 32) && + "Unsupported VT for PSIGN"); + Mask = DAG.getNode(X86ISD::PSIGN, DL, MaskVT, X, Mask.getOperand(0)); + return DAG.getBitcast(VT, Mask); + } + // PBLENDVB only available on SSE 4.1 + if (!Subtarget->hasSSE41()) + return SDValue(); + + MVT BlendVT = (VT == MVT::v4i64) ? 
MVT::v32i8 : MVT::v16i8; + + X = DAG.getBitcast(BlendVT, X); + Y = DAG.getBitcast(BlendVT, Y); + Mask = DAG.getBitcast(BlendVT, Mask); + Mask = DAG.getNode(ISD::VSELECT, DL, BlendVT, Mask, Y, X); + return DAG.getBitcast(VT, Mask); + } + } + + if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64) + return SDValue(); + + // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c) + bool OptForSize = DAG.getMachineFunction().getFunction()->optForSize(); + + // SHLD/SHRD instructions have lower register pressure, but on some + // platforms they have higher latency than the equivalent + // series of shifts/or that would otherwise be generated. + // Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions + // have higher latencies and we are not optimizing for size. + if (!OptForSize && Subtarget->isSHLDSlow()) + return SDValue(); + + if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL) + std::swap(N0, N1); + if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL) + return SDValue(); + if (!N0.hasOneUse() || !N1.hasOneUse()) + return SDValue(); + + SDValue ShAmt0 = N0.getOperand(1); + if (ShAmt0.getValueType() != MVT::i8) + return SDValue(); + SDValue ShAmt1 = N1.getOperand(1); + if (ShAmt1.getValueType() != MVT::i8) + return SDValue(); + if (ShAmt0.getOpcode() == ISD::TRUNCATE) + ShAmt0 = ShAmt0.getOperand(0); + if (ShAmt1.getOpcode() == ISD::TRUNCATE) + ShAmt1 = ShAmt1.getOperand(0); + + SDLoc DL(N); + unsigned Opc = X86ISD::SHLD; + SDValue Op0 = N0.getOperand(0); + SDValue Op1 = N1.getOperand(0); + if (ShAmt0.getOpcode() == ISD::SUB) { + Opc = X86ISD::SHRD; + std::swap(Op0, Op1); + std::swap(ShAmt0, ShAmt1); + } + + unsigned Bits = VT.getSizeInBits(); + if (ShAmt1.getOpcode() == ISD::SUB) { + SDValue Sum = ShAmt1.getOperand(0); + if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) { + SDValue ShAmt1Op1 = ShAmt1.getOperand(1); + if (ShAmt1Op1.getNode()->getOpcode() == ISD::TRUNCATE) + ShAmt1Op1 = ShAmt1Op1.getOperand(0); + if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0) + return DAG.getNode(Opc, DL, VT, + Op0, Op1, + DAG.getNode(ISD::TRUNCATE, DL, + MVT::i8, ShAmt0)); + } + } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) { + ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0); + if (ShAmt0C && + ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue() == Bits) + return DAG.getNode(Opc, DL, VT, + N0.getOperand(0), N1.getOperand(0), + DAG.getNode(ISD::TRUNCATE, DL, + MVT::i8, ShAmt0)); + } + + return SDValue(); +} + +// Generate NEG and CMOV for integer abs. +static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) { + EVT VT = N->getValueType(0); + + // Since X86 does not have CMOV for 8-bit integer, we don't convert + // 8-bit integer abs to NEG and CMOV. + if (VT.isInteger() && VT.getSizeInBits() == 8) + return SDValue(); + + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + SDLoc DL(N); + + // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1) + // and change it to SUB and CMOV. + if (VT.isInteger() && N->getOpcode() == ISD::XOR && + N0.getOpcode() == ISD::ADD && + N0.getOperand(1) == N1 && + N1.getOpcode() == ISD::SRA && + N1.getOperand(0) == N0.getOperand(0)) + if (ConstantSDNode *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1))) + if (Y1C->getAPIntValue() == VT.getSizeInBits()-1) { + // Generate SUB & CMOV. 
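+        // (Illustration: for i32, abs(x) written as (x + s) ^ s with
+        // s = x >> 31 (arithmetic shift) becomes: Neg = 0 - x, which also
+        // defines the flags, then the CMOV selects Neg when that subtraction
+        // result is >= 0 and x otherwise.)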
+ SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32), + DAG.getConstant(0, DL, VT), N0.getOperand(0)); + + SDValue Ops[] = { N0.getOperand(0), Neg, + DAG.getConstant(X86::COND_GE, DL, MVT::i8), + SDValue(Neg.getNode(), 1) }; + return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue), Ops); + } + return SDValue(); +} + +// Try to turn tests against the signbit in the form of: +// XOR(TRUNCATE(SRL(X, size(X)-1)), 1) +// into: +// SETGT(X, -1) +static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) { + // This is only worth doing if the output type is i8. + if (N->getValueType(0) != MVT::i8) + return SDValue(); + + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + + // We should be performing an xor against a truncated shift. + if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse()) + return SDValue(); + + // Make sure we are performing an xor against one. + if (!isOneConstant(N1)) + return SDValue(); + + // SetCC on x86 zero extends so only act on this if it's a logical shift. + SDValue Shift = N0.getOperand(0); + if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse()) + return SDValue(); + + // Make sure we are truncating from one of i16, i32 or i64. + EVT ShiftTy = Shift.getValueType(); + if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64) + return SDValue(); + + // Make sure the shift amount extracts the sign bit. + if (!isa<ConstantSDNode>(Shift.getOperand(1)) || + Shift.getConstantOperandVal(1) != ShiftTy.getSizeInBits() - 1) + return SDValue(); + + // Create a greater-than comparison against -1. + // N.B. Using SETGE against 0 works but we want a canonical looking + // comparison, using SETGT matches up with what TranslateX86CC. + SDLoc DL(N); + SDValue ShiftOp = Shift.getOperand(0); + EVT ShiftOpTy = ShiftOp.getValueType(); + SDValue Cond = DAG.getSetCC(DL, MVT::i8, ShiftOp, + DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT); + return Cond; +} + +static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { + if (DCI.isBeforeLegalizeOps()) + return SDValue(); + + if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG)) + return RV; + + if (Subtarget->hasCMov()) + if (SDValue RV = performIntegerAbsCombine(N, DAG)) + return RV; + + if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget)) + return FPLogic; + + return SDValue(); +} + +/// This function detects the AVG pattern between vectors of unsigned i8/i16, +/// which is c = (a + b + 1) / 2, and replace this operation with the efficient +/// X86ISD::AVG instruction. +static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG, + const X86Subtarget *Subtarget, SDLoc DL) { + if (!VT.isVector() || !VT.isSimple()) + return SDValue(); + EVT InVT = In.getValueType(); + unsigned NumElems = VT.getVectorNumElements(); + + EVT ScalarVT = VT.getVectorElementType(); + if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) && + isPowerOf2_32(NumElems))) + return SDValue(); + + // InScalarVT is the intermediate type in AVG pattern and it should be greater + // than the original input type (i8/i16). 
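+  // (Editorial note: the pattern being matched is the rounded unsigned
+  // average (a + b + 1) >> 1 evaluated in a type wide enough that the add
+  // cannot wrap, which is exactly what PAVGB/PAVGW compute.)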
+ EVT InScalarVT = InVT.getVectorElementType(); + if (InScalarVT.getSizeInBits() <= ScalarVT.getSizeInBits()) + return SDValue(); + + if (Subtarget->hasAVX512()) { + if (VT.getSizeInBits() > 512) + return SDValue(); + } else if (Subtarget->hasAVX2()) { + if (VT.getSizeInBits() > 256) + return SDValue(); + } else { + if (VT.getSizeInBits() > 128) + return SDValue(); + } + + // Detect the following pattern: + // + // %1 = zext <N x i8> %a to <N x i32> + // %2 = zext <N x i8> %b to <N x i32> + // %3 = add nuw nsw <N x i32> %1, <i32 1 x N> + // %4 = add nuw nsw <N x i32> %3, %2 + // %5 = lshr <N x i32> %N, <i32 1 x N> + // %6 = trunc <N x i32> %5 to <N x i8> + // + // In AVX512, the last instruction can also be a trunc store. + + if (In.getOpcode() != ISD::SRL) + return SDValue(); + + // A lambda checking the given SDValue is a constant vector and each element + // is in the range [Min, Max]. + auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) { + BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(V); + if (!BV || !BV->isConstant()) + return false; + for (unsigned i = 0, e = V.getNumOperands(); i < e; i++) { + ConstantSDNode *C = dyn_cast<ConstantSDNode>(V.getOperand(i)); + if (!C) + return false; + uint64_t Val = C->getZExtValue(); + if (Val < Min || Val > Max) + return false; + } + return true; + }; + + // Check if each element of the vector is left-shifted by one. + auto LHS = In.getOperand(0); + auto RHS = In.getOperand(1); + if (!IsConstVectorInRange(RHS, 1, 1)) + return SDValue(); + if (LHS.getOpcode() != ISD::ADD) + return SDValue(); + + // Detect a pattern of a + b + 1 where the order doesn't matter. + SDValue Operands[3]; + Operands[0] = LHS.getOperand(0); + Operands[1] = LHS.getOperand(1); + + // Take care of the case when one of the operands is a constant vector whose + // element is in the range [1, 256]. + if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) && + Operands[0].getOpcode() == ISD::ZERO_EXTEND && + Operands[0].getOperand(0).getValueType() == VT) { + // The pattern is detected. Subtract one from the constant vector, then + // demote it and emit X86ISD::AVG instruction. + SDValue One = DAG.getConstant(1, DL, InScalarVT); + SDValue Ones = DAG.getNode(ISD::BUILD_VECTOR, DL, InVT, + SmallVector<SDValue, 8>(NumElems, One)); + Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], Ones); + Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]); + return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0), + Operands[1]); + } + + if (Operands[0].getOpcode() == ISD::ADD) + std::swap(Operands[0], Operands[1]); + else if (Operands[1].getOpcode() != ISD::ADD) + return SDValue(); + Operands[2] = Operands[1].getOperand(0); + Operands[1] = Operands[1].getOperand(1); + + // Now we have three operands of two additions. Check that one of them is a + // constant vector with ones, and the other two are promoted from i8/i16. + for (int i = 0; i < 3; ++i) { + if (!IsConstVectorInRange(Operands[i], 1, 1)) + continue; + std::swap(Operands[i], Operands[2]); + + // Check if Operands[0] and Operands[1] are results of type promotion. + for (int j = 0; j < 2; ++j) + if (Operands[j].getOpcode() != ISD::ZERO_EXTEND || + Operands[j].getOperand(0).getValueType() != VT) + return SDValue(); + + // The pattern is detected, emit X86ISD::AVG instruction. 
+ return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0), + Operands[1].getOperand(0)); + } + + return SDValue(); +} + +/// PerformLOADCombine - Do target-specific dag combines on LOAD nodes. +static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { + LoadSDNode *Ld = cast<LoadSDNode>(N); + EVT RegVT = Ld->getValueType(0); + EVT MemVT = Ld->getMemoryVT(); + SDLoc dl(Ld); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + + // For chips with slow 32-byte unaligned loads, break the 32-byte operation + // into two 16-byte operations. + ISD::LoadExtType Ext = Ld->getExtensionType(); + bool Fast; + unsigned AddressSpace = Ld->getAddressSpace(); + unsigned Alignment = Ld->getAlignment(); + if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() && + Ext == ISD::NON_EXTLOAD && + TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT, + AddressSpace, Alignment, &Fast) && !Fast) { + unsigned NumElems = RegVT.getVectorNumElements(); + if (NumElems < 2) + return SDValue(); + + SDValue Ptr = Ld->getBasePtr(); + SDValue Increment = + DAG.getConstant(16, dl, TLI.getPointerTy(DAG.getDataLayout())); + + EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), + NumElems/2); + SDValue Load1 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, + Ld->getPointerInfo(), Ld->isVolatile(), + Ld->isNonTemporal(), Ld->isInvariant(), + Alignment); + Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment); + SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, + Ld->getPointerInfo(), Ld->isVolatile(), + Ld->isNonTemporal(), Ld->isInvariant(), + std::min(16U, Alignment)); + SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, + Load1.getValue(1), + Load2.getValue(1)); + + SDValue NewVec = DAG.getUNDEF(RegVT); + NewVec = Insert128BitVector(NewVec, Load1, 0, DAG, dl); + NewVec = Insert128BitVector(NewVec, Load2, NumElems/2, DAG, dl); + return DCI.CombineTo(N, NewVec, TF, true); + } + + return SDValue(); +} + +/// PerformMLOADCombine - Resolve extending loads +static SDValue PerformMLOADCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { + MaskedLoadSDNode *Mld = cast<MaskedLoadSDNode>(N); + if (Mld->getExtensionType() != ISD::SEXTLOAD) + return SDValue(); + + EVT VT = Mld->getValueType(0); + unsigned NumElems = VT.getVectorNumElements(); + EVT LdVT = Mld->getMemoryVT(); + SDLoc dl(Mld); + + assert(LdVT != VT && "Cannot extend to the same type"); + unsigned ToSz = VT.getVectorElementType().getSizeInBits(); + unsigned FromSz = LdVT.getVectorElementType().getSizeInBits(); + // From, To sizes and ElemCount must be pow of two + assert (isPowerOf2_32(NumElems * FromSz * ToSz) && + "Unexpected size for extending masked load"); + + unsigned SizeRatio = ToSz / FromSz; + assert(SizeRatio * NumElems * FromSz == VT.getSizeInBits()); + + // Create a type on which we perform the shuffle + EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), + LdVT.getScalarType(), NumElems*SizeRatio); + assert(WideVecVT.getSizeInBits() == VT.getSizeInBits()); + + // Convert Src0 value + SDValue WideSrc0 = DAG.getBitcast(WideVecVT, Mld->getSrc0()); + if (Mld->getSrc0().getOpcode() != ISD::UNDEF) { + SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1); + for (unsigned i = 0; i != NumElems; ++i) + ShuffleVec[i] = i * SizeRatio; + + // Can't shuffle using an illegal type. 
+ assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) && + "WideVecVT should be legal"); + WideSrc0 = DAG.getVectorShuffle(WideVecVT, dl, WideSrc0, + DAG.getUNDEF(WideVecVT), &ShuffleVec[0]); + } + // Prepare the new mask + SDValue NewMask; + SDValue Mask = Mld->getMask(); + if (Mask.getValueType() == VT) { + // Mask and original value have the same type + NewMask = DAG.getBitcast(WideVecVT, Mask); + SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1); + for (unsigned i = 0; i != NumElems; ++i) + ShuffleVec[i] = i * SizeRatio; + for (unsigned i = NumElems; i != NumElems * SizeRatio; ++i) + ShuffleVec[i] = NumElems * SizeRatio; + NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask, + DAG.getConstant(0, dl, WideVecVT), + &ShuffleVec[0]); + } + else { + assert(Mask.getValueType().getVectorElementType() == MVT::i1); + unsigned WidenNumElts = NumElems*SizeRatio; + unsigned MaskNumElts = VT.getVectorNumElements(); + EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, + WidenNumElts); + + unsigned NumConcat = WidenNumElts / MaskNumElts; + SmallVector<SDValue, 16> Ops(NumConcat); + SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType()); + Ops[0] = Mask; + for (unsigned i = 1; i != NumConcat; ++i) + Ops[i] = ZeroVal; + + NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops); + } + + SDValue WideLd = DAG.getMaskedLoad(WideVecVT, dl, Mld->getChain(), + Mld->getBasePtr(), NewMask, WideSrc0, + Mld->getMemoryVT(), Mld->getMemOperand(), + ISD::NON_EXTLOAD); + SDValue NewVec = DAG.getNode(X86ISD::VSEXT, dl, VT, WideLd); + return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true); +} +/// PerformMSTORECombine - Resolve truncating stores +static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N); + if (!Mst->isTruncatingStore()) + return SDValue(); + + EVT VT = Mst->getValue().getValueType(); + unsigned NumElems = VT.getVectorNumElements(); + EVT StVT = Mst->getMemoryVT(); + SDLoc dl(Mst); + + assert(StVT != VT && "Cannot truncate to the same type"); + unsigned FromSz = VT.getVectorElementType().getSizeInBits(); + unsigned ToSz = StVT.getVectorElementType().getSizeInBits(); + + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + + // The truncating store is legal in some cases. For example + // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw + // are designated for truncate store. + // In this case we don't need any further transformations. + if (TLI.isTruncStoreLegal(VT, StVT)) + return SDValue(); + + // From, To sizes and ElemCount must be pow of two + assert (isPowerOf2_32(NumElems * FromSz * ToSz) && + "Unexpected size for truncating masked store"); + // We are going to use the original vector elt for storing. + // Accumulated smaller vector elements must be a multiple of the store size. + assert (((NumElems * FromSz) % ToSz) == 0 && + "Unexpected ratio for truncating masked store"); + + unsigned SizeRatio = FromSz / ToSz; + assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits()); + + // Create a type on which we perform the shuffle + EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), + StVT.getScalarType(), NumElems*SizeRatio); + + assert(WideVecVT.getSizeInBits() == VT.getSizeInBits()); + + SDValue WideVec = DAG.getBitcast(WideVecVT, Mst->getValue()); + SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1); + for (unsigned i = 0; i != NumElems; ++i) + ShuffleVec[i] = i * SizeRatio; + + // Can't shuffle using an illegal type. 
+ assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) && + "WideVecVT should be legal"); + + SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec, + DAG.getUNDEF(WideVecVT), + &ShuffleVec[0]); + + SDValue NewMask; + SDValue Mask = Mst->getMask(); + if (Mask.getValueType() == VT) { + // Mask and original value have the same type + NewMask = DAG.getBitcast(WideVecVT, Mask); + for (unsigned i = 0; i != NumElems; ++i) + ShuffleVec[i] = i * SizeRatio; + for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i) + ShuffleVec[i] = NumElems*SizeRatio; + NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask, + DAG.getConstant(0, dl, WideVecVT), + &ShuffleVec[0]); + } + else { + assert(Mask.getValueType().getVectorElementType() == MVT::i1); + unsigned WidenNumElts = NumElems*SizeRatio; + unsigned MaskNumElts = VT.getVectorNumElements(); + EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, + WidenNumElts); + + unsigned NumConcat = WidenNumElts / MaskNumElts; + SmallVector<SDValue, 16> Ops(NumConcat); + SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType()); + Ops[0] = Mask; + for (unsigned i = 1; i != NumConcat; ++i) + Ops[i] = ZeroVal; + + NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops); + } + + return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal, + Mst->getBasePtr(), NewMask, StVT, + Mst->getMemOperand(), false); +} +/// PerformSTORECombine - Do target-specific dag combines on STORE nodes. +static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + StoreSDNode *St = cast<StoreSDNode>(N); + EVT VT = St->getValue().getValueType(); + EVT StVT = St->getMemoryVT(); + SDLoc dl(St); + SDValue StoredVal = St->getOperand(1); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + + // If we are saving a concatenation of two XMM registers and 32-byte stores + // are slow, such as on Sandy Bridge, perform two 16-byte stores. + bool Fast; + unsigned AddressSpace = St->getAddressSpace(); + unsigned Alignment = St->getAlignment(); + if (VT.is256BitVector() && StVT == VT && + TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT, + AddressSpace, Alignment, &Fast) && !Fast) { + unsigned NumElems = VT.getVectorNumElements(); + if (NumElems < 2) + return SDValue(); + + SDValue Value0 = Extract128BitVector(StoredVal, 0, DAG, dl); + SDValue Value1 = Extract128BitVector(StoredVal, NumElems/2, DAG, dl); + + SDValue Stride = + DAG.getConstant(16, dl, TLI.getPointerTy(DAG.getDataLayout())); + SDValue Ptr0 = St->getBasePtr(); + SDValue Ptr1 = DAG.getNode(ISD::ADD, dl, Ptr0.getValueType(), Ptr0, Stride); + + SDValue Ch0 = DAG.getStore(St->getChain(), dl, Value0, Ptr0, + St->getPointerInfo(), St->isVolatile(), + St->isNonTemporal(), Alignment); + SDValue Ch1 = DAG.getStore(St->getChain(), dl, Value1, Ptr1, + St->getPointerInfo(), St->isVolatile(), + St->isNonTemporal(), + std::min(16U, Alignment)); + return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1); + } + + // Optimize trunc store (of multiple scalars) to shuffle and store. + // First, pack all of the elements in one place. Next, store to memory + // in fewer chunks. + if (St->isTruncatingStore() && VT.isVector()) { + // Check if we can detect an AVG pattern from the truncation. If yes, + // replace the trunc store by a normal store with the result of X86ISD::AVG + // instruction. 
+ SDValue Avg = + detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG, Subtarget, dl); + if (Avg.getNode()) + return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(), + St->getPointerInfo(), St->isVolatile(), + St->isNonTemporal(), St->getAlignment()); + + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + unsigned NumElems = VT.getVectorNumElements(); + assert(StVT != VT && "Cannot truncate to the same type"); + unsigned FromSz = VT.getVectorElementType().getSizeInBits(); + unsigned ToSz = StVT.getVectorElementType().getSizeInBits(); + + // The truncating store is legal in some cases. For example + // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw + // are designated for truncate store. + // In this case we don't need any further transformations. + if (TLI.isTruncStoreLegal(VT, StVT)) + return SDValue(); + + // From, To sizes and ElemCount must be pow of two + if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue(); + // We are going to use the original vector elt for storing. + // Accumulated smaller vector elements must be a multiple of the store size. + if (0 != (NumElems * FromSz) % ToSz) return SDValue(); + + unsigned SizeRatio = FromSz / ToSz; + + assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits()); + + // Create a type on which we perform the shuffle + EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), + StVT.getScalarType(), NumElems*SizeRatio); + + assert(WideVecVT.getSizeInBits() == VT.getSizeInBits()); + + SDValue WideVec = DAG.getBitcast(WideVecVT, St->getValue()); + SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1); + for (unsigned i = 0; i != NumElems; ++i) + ShuffleVec[i] = i * SizeRatio; + + // Can't shuffle using an illegal type. + if (!TLI.isTypeLegal(WideVecVT)) + return SDValue(); + + SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec, + DAG.getUNDEF(WideVecVT), + &ShuffleVec[0]); + // At this point all of the data is stored at the bottom of the + // register. We now need to save it to mem. + + // Find the largest store unit + MVT StoreType = MVT::i8; + for (MVT Tp : MVT::integer_valuetypes()) { + if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz) + StoreType = Tp; + } + + // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64. + if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 && + (64 <= NumElems * ToSz)) + StoreType = MVT::f64; + + // Bitcast the original vector into a vector of store-size units + EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(), + StoreType, VT.getSizeInBits()/StoreType.getSizeInBits()); + assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits()); + SDValue ShuffWide = DAG.getBitcast(StoreVecVT, Shuff); + SmallVector<SDValue, 8> Chains; + SDValue Increment = DAG.getConstant(StoreType.getSizeInBits() / 8, dl, + TLI.getPointerTy(DAG.getDataLayout())); + SDValue Ptr = St->getBasePtr(); + + // Perform one or more big stores into memory. + for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) { + SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, + StoreType, ShuffWide, + DAG.getIntPtrConstant(i, dl)); + SDValue Ch = DAG.getStore(St->getChain(), dl, SubVec, Ptr, + St->getPointerInfo(), St->isVolatile(), + St->isNonTemporal(), St->getAlignment()); + Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment); + Chains.push_back(Ch); + } + + return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains); + } + + // Turn load->store of MMX types into GPR load/stores. 
This avoids clobbering + // the FP state in cases where an emms may be missing. + // A preferable solution to the general problem is to figure out the right + // places to insert EMMS. This qualifies as a quick hack. + + // Similarly, turn load->store of i64 into double load/stores in 32-bit mode. + if (VT.getSizeInBits() != 64) + return SDValue(); + + const Function *F = DAG.getMachineFunction().getFunction(); + bool NoImplicitFloatOps = F->hasFnAttribute(Attribute::NoImplicitFloat); + bool F64IsLegal = + !Subtarget->useSoftFloat() && !NoImplicitFloatOps && Subtarget->hasSSE2(); + if ((VT.isVector() || + (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) && + isa<LoadSDNode>(St->getValue()) && + !cast<LoadSDNode>(St->getValue())->isVolatile() && + St->getChain().hasOneUse() && !St->isVolatile()) { + SDNode* LdVal = St->getValue().getNode(); + LoadSDNode *Ld = nullptr; + int TokenFactorIndex = -1; + SmallVector<SDValue, 8> Ops; + SDNode* ChainVal = St->getChain().getNode(); + // Must be a store of a load. We currently handle two cases: the load + // is a direct child, and it's under an intervening TokenFactor. It is + // possible to dig deeper under nested TokenFactors. + if (ChainVal == LdVal) + Ld = cast<LoadSDNode>(St->getChain()); + else if (St->getValue().hasOneUse() && + ChainVal->getOpcode() == ISD::TokenFactor) { + for (unsigned i = 0, e = ChainVal->getNumOperands(); i != e; ++i) { + if (ChainVal->getOperand(i).getNode() == LdVal) { + TokenFactorIndex = i; + Ld = cast<LoadSDNode>(St->getValue()); + } else + Ops.push_back(ChainVal->getOperand(i)); + } + } + + if (!Ld || !ISD::isNormalLoad(Ld)) + return SDValue(); + + // If this is not the MMX case, i.e. we are just turning i64 load/store + // into f64 load/store, avoid the transformation if there are multiple + // uses of the loaded value. + if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0)) + return SDValue(); + + SDLoc LdDL(Ld); + SDLoc StDL(N); + // If we are a 64-bit capable x86, lower to a single movq load/store pair. + // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store + // pair instead. + if (Subtarget->is64Bit() || F64IsLegal) { + MVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64; + SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(), + Ld->getPointerInfo(), Ld->isVolatile(), + Ld->isNonTemporal(), Ld->isInvariant(), + Ld->getAlignment()); + SDValue NewChain = NewLd.getValue(1); + if (TokenFactorIndex != -1) { + Ops.push_back(NewChain); + NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops); + } + return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(), + St->getPointerInfo(), + St->isVolatile(), St->isNonTemporal(), + St->getAlignment()); + } + + // Otherwise, lower to two pairs of 32-bit loads / stores. 
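+    // For example (illustrative), an i64 copy through memory on a 32-bit
+    // target without usable f64 becomes roughly:
+    //   movl  (%src), %eax
+    //   movl 4(%src), %edx
+    //   movl  %eax,  (%dst)
+    //   movl  %edx, 4(%dst)
+    // i.e. two i32 loads at offsets 0 and 4 followed by two i32 stores.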
+ SDValue LoAddr = Ld->getBasePtr(); + SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr, + DAG.getConstant(4, LdDL, MVT::i32)); + + SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr, + Ld->getPointerInfo(), + Ld->isVolatile(), Ld->isNonTemporal(), + Ld->isInvariant(), Ld->getAlignment()); + SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr, + Ld->getPointerInfo().getWithOffset(4), + Ld->isVolatile(), Ld->isNonTemporal(), + Ld->isInvariant(), + MinAlign(Ld->getAlignment(), 4)); + + SDValue NewChain = LoLd.getValue(1); + if (TokenFactorIndex != -1) { + Ops.push_back(LoLd); + Ops.push_back(HiLd); + NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops); + } + + LoAddr = St->getBasePtr(); + HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr, + DAG.getConstant(4, StDL, MVT::i32)); + + SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr, + St->getPointerInfo(), + St->isVolatile(), St->isNonTemporal(), + St->getAlignment()); + SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr, + St->getPointerInfo().getWithOffset(4), + St->isVolatile(), + St->isNonTemporal(), + MinAlign(St->getAlignment(), 4)); + return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt); + } + + // This is similar to the above case, but here we handle a scalar 64-bit + // integer store that is extracted from a vector on a 32-bit target. + // If we have SSE2, then we can treat it like a floating-point double + // to get past legalization. The execution dependencies fixup pass will + // choose the optimal machine instruction for the store if this really is + // an integer or v2f32 rather than an f64. + if (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit() && + St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) { + SDValue OldExtract = St->getOperand(1); + SDValue ExtOp0 = OldExtract.getOperand(0); + unsigned VecSize = ExtOp0.getValueSizeInBits(); + EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64); + SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0); + SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, + BitCast, OldExtract.getOperand(1)); + return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(), + St->getPointerInfo(), St->isVolatile(), + St->isNonTemporal(), St->getAlignment()); + } + + return SDValue(); +} + +/// Return 'true' if this vector operation is "horizontal" +/// and return the operands for the horizontal operation in LHS and RHS. A +/// horizontal operation performs the binary operation on successive elements +/// of its first operand, then on successive elements of its second operand, +/// returning the resulting values in a vector. For example, if +/// A = < float a0, float a1, float a2, float a3 > +/// and +/// B = < float b0, float b1, float b2, float b3 > +/// then the result of doing a horizontal operation on A and B is +/// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >. +/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form +/// A horizontal-op B, for some already available A and B, and if so then LHS is +/// set to A, RHS to B, and the routine returns 'true'. +/// Note that the binary operation should have the property that if one of the +/// operands is UNDEF then the result is UNDEF. 
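+/// When this returns 'true', the callers below can emit a single horizontal
+/// instruction (e.g. HADDPS/HADDPD, or PHADDW/PHADDD for the integer
+/// combines) in place of the shuffle-plus-vertical-op sequence. (Illustrative
+/// mapping; the exact opcode depends on the type and the calling combine.)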
+static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) { + // Look for the following pattern: if + // A = < float a0, float a1, float a2, float a3 > + // B = < float b0, float b1, float b2, float b3 > + // and + // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6> + // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7> + // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 > + // which is A horizontal-op B. + + // At least one of the operands should be a vector shuffle. + if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE && + RHS.getOpcode() != ISD::VECTOR_SHUFFLE) + return false; + + MVT VT = LHS.getSimpleValueType(); + + assert((VT.is128BitVector() || VT.is256BitVector()) && + "Unsupported vector type for horizontal add/sub"); + + // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to + // operate independently on 128-bit lanes. + unsigned NumElts = VT.getVectorNumElements(); + unsigned NumLanes = VT.getSizeInBits()/128; + unsigned NumLaneElts = NumElts / NumLanes; + assert((NumLaneElts % 2 == 0) && + "Vector type should have an even number of elements in each lane"); + unsigned HalfLaneElts = NumLaneElts/2; + + // View LHS in the form + // LHS = VECTOR_SHUFFLE A, B, LMask + // If LHS is not a shuffle then pretend it is the shuffle + // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1> + // NOTE: in what follows a default initialized SDValue represents an UNDEF of + // type VT. + SDValue A, B; + SmallVector<int, 16> LMask(NumElts); + if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) { + if (LHS.getOperand(0).getOpcode() != ISD::UNDEF) + A = LHS.getOperand(0); + if (LHS.getOperand(1).getOpcode() != ISD::UNDEF) + B = LHS.getOperand(1); + ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask(); + std::copy(Mask.begin(), Mask.end(), LMask.begin()); + } else { + if (LHS.getOpcode() != ISD::UNDEF) + A = LHS; + for (unsigned i = 0; i != NumElts; ++i) + LMask[i] = i; + } + + // Likewise, view RHS in the form + // RHS = VECTOR_SHUFFLE C, D, RMask + SDValue C, D; + SmallVector<int, 16> RMask(NumElts); + if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) { + if (RHS.getOperand(0).getOpcode() != ISD::UNDEF) + C = RHS.getOperand(0); + if (RHS.getOperand(1).getOpcode() != ISD::UNDEF) + D = RHS.getOperand(1); + ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask(); + std::copy(Mask.begin(), Mask.end(), RMask.begin()); + } else { + if (RHS.getOpcode() != ISD::UNDEF) + C = RHS; + for (unsigned i = 0; i != NumElts; ++i) + RMask[i] = i; + } + + // Check that the shuffles are both shuffling the same vectors. + if (!(A == C && B == D) && !(A == D && B == C)) + return false; + + // If everything is UNDEF then bail out: it would be better to fold to UNDEF. + if (!A.getNode() && !B.getNode()) + return false; + + // If A and B occur in reverse order in RHS, then "swap" them (which means + // rewriting the mask). + if (A != C) + ShuffleVectorSDNode::commuteMask(RMask); + + // At this point LHS and RHS are equivalent to + // LHS = VECTOR_SHUFFLE A, B, LMask + // RHS = VECTOR_SHUFFLE A, B, RMask + // Check that the masks correspond to performing a horizontal operation. + for (unsigned l = 0; l != NumElts; l += NumLaneElts) { + for (unsigned i = 0; i != NumLaneElts; ++i) { + int LIdx = LMask[i+l], RIdx = RMask[i+l]; + + // Ignore any UNDEF components. 
+ if (LIdx < 0 || RIdx < 0 || + (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) || + (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts))) + continue; + + // Check that successive elements are being operated on. If not, this is + // not a horizontal operation. + unsigned Src = (i/HalfLaneElts); // each lane is split between srcs + int Index = 2*(i%HalfLaneElts) + NumElts*Src + l; + if (!(LIdx == Index && RIdx == Index + 1) && + !(IsCommutative && LIdx == Index + 1 && RIdx == Index)) + return false; + } + } + + LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it. + RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it. + return true; +} + +/// Do target-specific dag combines on floating point adds. +static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + EVT VT = N->getValueType(0); + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + + // Try to synthesize horizontal adds from adds of shuffles. + if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) || + (Subtarget->hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) && + isHorizontalBinOp(LHS, RHS, true)) + return DAG.getNode(X86ISD::FHADD, SDLoc(N), VT, LHS, RHS); + return SDValue(); +} + +/// Do target-specific dag combines on floating point subs. +static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + EVT VT = N->getValueType(0); + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + + // Try to synthesize horizontal subs from subs of shuffles. + if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) || + (Subtarget->hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) && + isHorizontalBinOp(LHS, RHS, false)) + return DAG.getNode(X86ISD::FHSUB, SDLoc(N), VT, LHS, RHS); + return SDValue(); +} + +/// Truncate a group of v4i32 into v16i8/v8i16 using X86ISD::PACKUS. +static SDValue +combineVectorTruncationWithPACKUS(SDNode *N, SelectionDAG &DAG, + SmallVector<SDValue, 8> &Regs) { + assert(Regs.size() > 0 && (Regs[0].getValueType() == MVT::v4i32 || + Regs[0].getValueType() == MVT::v2i64)); + EVT OutVT = N->getValueType(0); + EVT OutSVT = OutVT.getVectorElementType(); + EVT InVT = Regs[0].getValueType(); + EVT InSVT = InVT.getVectorElementType(); + SDLoc DL(N); + + // First, use mask to unset all bits that won't appear in the result. + assert((OutSVT == MVT::i8 || OutSVT == MVT::i16) && + "OutSVT can only be either i8 or i16."); + SDValue MaskVal = + DAG.getConstant(OutSVT == MVT::i8 ? 0xFF : 0xFFFF, DL, InSVT); + SDValue MaskVec = DAG.getNode( + ISD::BUILD_VECTOR, DL, InVT, + SmallVector<SDValue, 8>(InVT.getVectorNumElements(), MaskVal)); + for (auto &Reg : Regs) + Reg = DAG.getNode(ISD::AND, DL, InVT, MaskVec, Reg); + + MVT UnpackedVT, PackedVT; + if (OutSVT == MVT::i8) { + UnpackedVT = MVT::v8i16; + PackedVT = MVT::v16i8; + } else { + UnpackedVT = MVT::v4i32; + PackedVT = MVT::v8i16; + } + + // In each iteration, truncate the type by a half size. 
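+  // Rough walk-through (illustrative, for a v16i32 -> v16i8 truncation):
+  // we start from four v4i32 registers whose high bits were masked off above.
+  // Each round bitcasts the registers to the unpacked type and PACKUSes
+  // adjacent pairs, halving the register count (and, conceptually, the
+  // element width: i32 -> i16 -> i8) until one packed register remains.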
+  auto RegNum = Regs.size();
+  for (unsigned j = 1, e = InSVT.getSizeInBits() / OutSVT.getSizeInBits();
+       j < e; j *= 2, RegNum /= 2) {
+    for (unsigned i = 0; i < RegNum; i++)
+      Regs[i] = DAG.getNode(ISD::BITCAST, DL, UnpackedVT, Regs[i]);
+    for (unsigned i = 0; i < RegNum / 2; i++)
+      Regs[i] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[i * 2],
+                            Regs[i * 2 + 1]);
+  }
+
+  // If the type of the result is v8i8, we need to do one more X86ISD::PACKUS,
+  // and then extract a subvector as the result since v8i8 is not a legal type.
+  if (OutVT == MVT::v8i8) {
+    Regs[0] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[0], Regs[0]);
+    Regs[0] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT, Regs[0],
+                          DAG.getIntPtrConstant(0, DL));
+    return Regs[0];
+  } else if (RegNum > 1) {
+    Regs.resize(RegNum);
+    return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
+  } else
+    return Regs[0];
+}
+
+/// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS.
+static SDValue
+combineVectorTruncationWithPACKSS(SDNode *N, SelectionDAG &DAG,
+                                  SmallVector<SDValue, 8> &Regs) {
+  assert(Regs.size() > 0 && Regs[0].getValueType() == MVT::v4i32);
+  EVT OutVT = N->getValueType(0);
+  SDLoc DL(N);
+
+  // Shift left by 16 bits, then arithmetic-shift right by 16 bits.
+  SDValue ShAmt = DAG.getConstant(16, DL, MVT::i32);
+  for (auto &Reg : Regs) {
+    Reg = getTargetVShiftNode(X86ISD::VSHLI, DL, MVT::v4i32, Reg, ShAmt, DAG);
+    Reg = getTargetVShiftNode(X86ISD::VSRAI, DL, MVT::v4i32, Reg, ShAmt, DAG);
+  }
+
+  for (unsigned i = 0, e = Regs.size() / 2; i < e; i++)
+    Regs[i] = DAG.getNode(X86ISD::PACKSS, DL, MVT::v8i16, Regs[i * 2],
+                          Regs[i * 2 + 1]);
+
+  if (Regs.size() > 2) {
+    Regs.resize(Regs.size() / 2);
+    return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
+  } else
+    return Regs[0];
+}
+
+/// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into
+/// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type
+/// legalization the truncation will be translated into a BUILD_VECTOR with each
+/// element that is extracted from a vector and then truncated, and it is
+/// difficult to do this optimization based on them.
+static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
+                                       const X86Subtarget *Subtarget) {
+  EVT OutVT = N->getValueType(0);
+  if (!OutVT.isVector())
+    return SDValue();
+
+  SDValue In = N->getOperand(0);
+  if (!In.getValueType().isSimple())
+    return SDValue();
+
+  EVT InVT = In.getValueType();
+  unsigned NumElems = OutVT.getVectorNumElements();
+
+  // TODO: On AVX2, the behavior of X86ISD::PACKUS is different from that on
+  // SSE2, and we need to take care of it specially.
+  // AVX512 provides vpmovdb.
+  if (!Subtarget->hasSSE2() || Subtarget->hasAVX2())
+    return SDValue();
+
+  EVT OutSVT = OutVT.getVectorElementType();
+  EVT InSVT = InVT.getVectorElementType();
+  if (!((InSVT == MVT::i32 || InSVT == MVT::i64) &&
+        (OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
+        NumElems >= 8))
+    return SDValue();
+
+  // SSSE3's pshufb results in fewer instructions in the cases below.
+  if (Subtarget->hasSSSE3() && NumElems == 8 &&
+      ((OutSVT == MVT::i8 && InSVT != MVT::i64) ||
+       (InSVT == MVT::i32 && OutSVT == MVT::i16)))
+    return SDValue();
+
+  SDLoc DL(N);
+
+  // Split a long vector into vectors of legal type.
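+  // E.g. (illustrative) a v16i32 source occupies 512 bits, so it is split
+  // into four v4i32 subvectors extracted at element offsets 0, 4, 8 and 12;
+  // a v8i64 source would likewise be split into four v2i64 subvectors.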
+  unsigned RegNum = InVT.getSizeInBits() / 128;
+  SmallVector<SDValue, 8> SubVec(RegNum);
+  if (InSVT == MVT::i32) {
+    for (unsigned i = 0; i < RegNum; i++)
+      SubVec[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
+                              DAG.getIntPtrConstant(i * 4, DL));
+  } else {
+    for (unsigned i = 0; i < RegNum; i++)
+      SubVec[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
+                              DAG.getIntPtrConstant(i * 2, DL));
+  }
+
+  // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
+  // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
+  // truncate 2 x v4i32 to v8i16.
+  if (Subtarget->hasSSE41() || OutSVT == MVT::i8)
+    return combineVectorTruncationWithPACKUS(N, DAG, SubVec);
+  else if (InSVT == MVT::i32)
+    return combineVectorTruncationWithPACKSS(N, DAG, SubVec);
+  else
+    return SDValue();
+}
+
+static SDValue PerformTRUNCATECombine(SDNode *N, SelectionDAG &DAG,
+                                      const X86Subtarget *Subtarget) {
+  // Try to detect AVG pattern first.
+  SDValue Avg = detectAVGPattern(N->getOperand(0), N->getValueType(0), DAG,
+                                 Subtarget, SDLoc(N));
+  if (Avg.getNode())
+    return Avg;
+
+  return combineVectorTruncation(N, DAG, Subtarget);
+}
+
+/// Do target-specific dag combines on floating point negations.
+static SDValue PerformFNEGCombine(SDNode *N, SelectionDAG &DAG,
+                                  const X86Subtarget *Subtarget) {
+  EVT VT = N->getValueType(0);
+  EVT SVT = VT.getScalarType();
+  SDValue Arg = N->getOperand(0);
+  SDLoc DL(N);
+
+  // Let legalize expand this if it isn't a legal type yet.
+  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
+    return SDValue();
+
+  // If we're negating a FMUL node on a target with FMA, then we can avoid the
+  // use of a constant by performing (-0 - A*B) instead.
+  // FIXME: Check rounding control flags as well once it becomes available.
+  if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
+      Arg->getFlags()->hasNoSignedZeros() && Subtarget->hasAnyFMA()) {
+    SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
+    return DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
+                       Arg.getOperand(1), Zero);
+  }
+
+  // If we're negating a FMA node, then we can adjust the
+  // instruction to include the extra negation.
+  if (Arg.hasOneUse()) {
+    switch (Arg.getOpcode()) {
+    case X86ISD::FMADD:
+      return DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
+                         Arg.getOperand(1), Arg.getOperand(2));
+    case X86ISD::FMSUB:
+      return DAG.getNode(X86ISD::FNMADD, DL, VT, Arg.getOperand(0),
+                         Arg.getOperand(1), Arg.getOperand(2));
+    case X86ISD::FNMADD:
+      return DAG.getNode(X86ISD::FMSUB, DL, VT, Arg.getOperand(0),
+                         Arg.getOperand(1), Arg.getOperand(2));
+    case X86ISD::FNMSUB:
+      return DAG.getNode(X86ISD::FMADD, DL, VT, Arg.getOperand(0),
+                         Arg.getOperand(1), Arg.getOperand(2));
+    }
+  }
+  return SDValue();
+}
+
+static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
+                                 const X86Subtarget *Subtarget) {
+  EVT VT = N->getValueType(0);
+  if (VT.is512BitVector() && !Subtarget->hasDQI()) {
+    // VXORPS, VORPS, VANDPS, VANDNPS are supported only under DQ extension.
+    // These logic operations may be executed in the integer domain.
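+    // For example (illustrative): without AVX512DQ, (v16f32 FXOR a, b) is
+    // rewritten below as
+    //   bitcast to v16i32, ISD::XOR, bitcast back to v16f32
+    // which the backend can then select as an EVEX integer op such as VPXORD.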
+ SDLoc dl(N); + MVT IntScalar = MVT::getIntegerVT(VT.getScalarSizeInBits()); + MVT IntVT = MVT::getVectorVT(IntScalar, VT.getVectorNumElements()); + + SDValue Op0 = DAG.getNode(ISD::BITCAST, dl, IntVT, N->getOperand(0)); + SDValue Op1 = DAG.getNode(ISD::BITCAST, dl, IntVT, N->getOperand(1)); + unsigned IntOpcode = 0; + switch (N->getOpcode()) { + default: llvm_unreachable("Unexpected FP logic op"); + case X86ISD::FOR: IntOpcode = ISD::OR; break; + case X86ISD::FXOR: IntOpcode = ISD::XOR; break; + case X86ISD::FAND: IntOpcode = ISD::AND; break; + case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break; + } + SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1); + return DAG.getNode(ISD::BITCAST, dl, VT, IntOp); + } + return SDValue(); +} +/// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes. +static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR); + + // F[X]OR(0.0, x) -> x + if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) + if (C->getValueAPF().isPosZero()) + return N->getOperand(1); + + // F[X]OR(x, 0.0) -> x + if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1))) + if (C->getValueAPF().isPosZero()) + return N->getOperand(0); + + return lowerX86FPLogicOp(N, DAG, Subtarget); +} + +/// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes. +static SDValue PerformFMinFMaxCombine(SDNode *N, SelectionDAG &DAG) { + assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX); + + // Only perform optimizations if UnsafeMath is used. + if (!DAG.getTarget().Options.UnsafeFPMath) + return SDValue(); + + // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes + // into FMINC and FMAXC, which are Commutative operations. + unsigned NewOp = 0; + switch (N->getOpcode()) { + default: llvm_unreachable("unknown opcode"); + case X86ISD::FMIN: NewOp = X86ISD::FMINC; break; + case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break; + } + + return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0), + N->getOperand(0), N->getOperand(1)); +} + +static SDValue performFMinNumFMaxNumCombine(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + if (Subtarget->useSoftFloat()) + return SDValue(); + + // TODO: Check for global or instruction-level "nnan". In that case, we + // should be able to lower to FMAX/FMIN alone. + // TODO: If an operand is already known to be a NaN or not a NaN, this + // should be an optional swap and FMAX/FMIN. + + EVT VT = N->getValueType(0); + if (!((Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) || + (Subtarget->hasSSE2() && (VT == MVT::f64 || VT == MVT::v2f64)) || + (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64)))) + return SDValue(); + + // This takes at least 3 instructions, so favor a library call when operating + // on a scalar and minimizing code size. 
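+  // (Illustrative) for scalar f32 fmaxnum the node sequence built below maps
+  // to roughly three instructions:
+  //   maxss                          ; FMAX with the operands commuted
+  //   cmpunordss                     ; SETUO: is Op0 a NaN?
+  //   blendvps (or andps/andnps/orps); pick Op1 if Op0 was NaN, else the max
+  // hence the "at least 3 instructions" remark above.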
+ if (!VT.isVector() && DAG.getMachineFunction().getFunction()->optForMinSize()) + return SDValue(); + + SDValue Op0 = N->getOperand(0); + SDValue Op1 = N->getOperand(1); + SDLoc DL(N); + EVT SetCCType = DAG.getTargetLoweringInfo().getSetCCResultType( + DAG.getDataLayout(), *DAG.getContext(), VT); + + // There are 4 possibilities involving NaN inputs, and these are the required + // outputs: + // Op1 + // Num NaN + // ---------------- + // Num | Max | Op0 | + // Op0 ---------------- + // NaN | Op1 | NaN | + // ---------------- + // + // The SSE FP max/min instructions were not designed for this case, but rather + // to implement: + // Min = Op1 < Op0 ? Op1 : Op0 + // Max = Op1 > Op0 ? Op1 : Op0 + // + // So they always return Op0 if either input is a NaN. However, we can still + // use those instructions for fmaxnum by selecting away a NaN input. + + // If either operand is NaN, the 2nd source operand (Op0) is passed through. + auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN; + SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0); + SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType , Op0, Op0, ISD::SETUO); + + // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands + // are NaN, the NaN value of Op1 is the result. + auto SelectOpcode = VT.isVector() ? ISD::VSELECT : ISD::SELECT; + return DAG.getNode(SelectOpcode, DL, VT, IsOp0Nan, Op1, MinOrMax); +} + +/// Do target-specific dag combines on X86ISD::FAND nodes. +static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + // FAND(0.0, x) -> 0.0 + if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) + if (C->getValueAPF().isPosZero()) + return N->getOperand(0); + + // FAND(x, 0.0) -> 0.0 + if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1))) + if (C->getValueAPF().isPosZero()) + return N->getOperand(1); + + return lowerX86FPLogicOp(N, DAG, Subtarget); +} + +/// Do target-specific dag combines on X86ISD::FANDN nodes +static SDValue PerformFANDNCombine(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + // FANDN(0.0, x) -> x + if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) + if (C->getValueAPF().isPosZero()) + return N->getOperand(1); + + // FANDN(x, 0.0) -> 0.0 + if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1))) + if (C->getValueAPF().isPosZero()) + return N->getOperand(1); + + return lowerX86FPLogicOp(N, DAG, Subtarget); +} + +static SDValue PerformBTCombine(SDNode *N, + SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI) { + // BT ignores high bits in the bit index operand. 
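+  // E.g. (illustrative) for a 64-bit BT only the low Log2(64) = 6 bits of the
+  // bit index are demanded, so a preceding (and %idx, 63) can be stripped by
+  // the SimplifyDemandedBits call below.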
+ SDValue Op1 = N->getOperand(1); + if (Op1.hasOneUse()) { + unsigned BitWidth = Op1.getValueSizeInBits(); + APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth)); + APInt KnownZero, KnownOne; + TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), + !DCI.isBeforeLegalizeOps()); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) || + TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO)) + DCI.CommitTargetLoweringOpt(TLO); + } + return SDValue(); +} + +static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) { + SDValue Op = N->getOperand(0); + if (Op.getOpcode() == ISD::BITCAST) + Op = Op.getOperand(0); + EVT VT = N->getValueType(0), OpVT = Op.getValueType(); + if (Op.getOpcode() == X86ISD::VZEXT_LOAD && + VT.getVectorElementType().getSizeInBits() == + OpVT.getVectorElementType().getSizeInBits()) { + return DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op); + } + return SDValue(); +} + +static SDValue PerformSIGN_EXTEND_INREGCombine(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + EVT VT = N->getValueType(0); + if (!VT.isVector()) + return SDValue(); + + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + EVT ExtraVT = cast<VTSDNode>(N1)->getVT(); + SDLoc dl(N); + + // The SIGN_EXTEND_INREG to v4i64 is expensive operation on the + // both SSE and AVX2 since there is no sign-extended shift right + // operation on a vector with 64-bit elements. + //(sext_in_reg (v4i64 anyext (v4i32 x )), ExtraVT) -> + // (v4i64 sext (v4i32 sext_in_reg (v4i32 x , ExtraVT))) + if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND || + N0.getOpcode() == ISD::SIGN_EXTEND)) { + SDValue N00 = N0.getOperand(0); + + // EXTLOAD has a better solution on AVX2, + // it may be replaced with X86ISD::VSEXT node. + if (N00.getOpcode() == ISD::LOAD && Subtarget->hasInt256()) + if (!ISD::isNormalLoad(N00.getNode())) + return SDValue(); + + if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) { + SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, + N00, N1); + return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp); + } + } + return SDValue(); +} + +/// sext(add_nsw(x, C)) --> add(sext(x), C_sext) +/// Promoting a sign extension ahead of an 'add nsw' exposes opportunities +/// to combine math ops, use an LEA, or use a complex addressing mode. This can +/// eliminate extend, add, and shift instructions. +static SDValue promoteSextBeforeAddNSW(SDNode *Sext, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + // TODO: This should be valid for other integer types. + EVT VT = Sext->getValueType(0); + if (VT != MVT::i64) + return SDValue(); + + // We need an 'add nsw' feeding into the 'sext'. + SDValue Add = Sext->getOperand(0); + if (Add.getOpcode() != ISD::ADD || !Add->getFlags()->hasNoSignedWrap()) + return SDValue(); + + // Having a constant operand to the 'add' ensures that we are not increasing + // the instruction count because the constant is extended for free below. + // A constant operand can also become the displacement field of an LEA. + auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1)); + if (!AddOp1) + return SDValue(); + + // Don't make the 'add' bigger if there's no hope of combining it with some + // other 'add' or 'shl' instruction. + // TODO: It may be profitable to generate simpler LEA instructions in place + // of single 'add' instructions, but the cost model for selecting an LEA + // currently has a high threshold. 
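+  // Worked example (illustrative):
+  //   t1 = add nsw i32 %x, 5
+  //   t2 = sext t1 to i64            ; t2 feeds an add/shl (address math)
+  // becomes
+  //   t1 = sext i32 %x to i64
+  //   t2 = add nsw i64 t1, 5
+  // so the +5 can later fold into an LEA or a complex addressing mode.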
+ bool HasLEAPotential = false; + for (auto *User : Sext->uses()) { + if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) { + HasLEAPotential = true; + break; + } + } + if (!HasLEAPotential) + return SDValue(); + + // Everything looks good, so pull the 'sext' ahead of the 'add'. + int64_t AddConstant = AddOp1->getSExtValue(); + SDValue AddOp0 = Add.getOperand(0); + SDValue NewSext = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(Sext), VT, AddOp0); + SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT); + + // The wider add is guaranteed to not wrap because both operands are + // sign-extended. + SDNodeFlags Flags; + Flags.setNoSignedWrap(true); + return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewSext, NewConstant, &Flags); +} + +static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { + SDValue N0 = N->getOperand(0); + EVT VT = N->getValueType(0); + EVT SVT = VT.getScalarType(); + EVT InVT = N0.getValueType(); + EVT InSVT = InVT.getScalarType(); + SDLoc DL(N); + + // (i8,i32 sext (sdivrem (i8 x, i8 y)) -> + // (i8,i32 (sdivrem_sext_hreg (i8 x, i8 y) + // This exposes the sext to the sdivrem lowering, so that it directly extends + // from AH (which we otherwise need to do contortions to access). + if (N0.getOpcode() == ISD::SDIVREM && N0.getResNo() == 1 && + InVT == MVT::i8 && VT == MVT::i32) { + SDVTList NodeTys = DAG.getVTList(MVT::i8, VT); + SDValue R = DAG.getNode(X86ISD::SDIVREM8_SEXT_HREG, DL, NodeTys, + N0.getOperand(0), N0.getOperand(1)); + DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0)); + return R.getValue(1); + } + + if (!DCI.isBeforeLegalizeOps()) { + if (InVT == MVT::i1) { + SDValue Zero = DAG.getConstant(0, DL, VT); + SDValue AllOnes = + DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), DL, VT); + return DAG.getNode(ISD::SELECT, DL, VT, N0, AllOnes, Zero); + } + return SDValue(); + } + + if (VT.isVector() && Subtarget->hasSSE2()) { + auto ExtendVecSize = [&DAG](SDLoc DL, SDValue N, unsigned Size) { + EVT InVT = N.getValueType(); + EVT OutVT = EVT::getVectorVT(*DAG.getContext(), InVT.getScalarType(), + Size / InVT.getScalarSizeInBits()); + SmallVector<SDValue, 8> Opnds(Size / InVT.getSizeInBits(), + DAG.getUNDEF(InVT)); + Opnds[0] = N; + return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Opnds); + }; + + // If target-size is less than 128-bits, extend to a type that would extend + // to 128 bits, extend that and extract the original target vector. + if (VT.getSizeInBits() < 128 && !(128 % VT.getSizeInBits()) && + (SVT == MVT::i64 || SVT == MVT::i32 || SVT == MVT::i16) && + (InSVT == MVT::i32 || InSVT == MVT::i16 || InSVT == MVT::i8)) { + unsigned Scale = 128 / VT.getSizeInBits(); + EVT ExVT = + EVT::getVectorVT(*DAG.getContext(), SVT, 128 / SVT.getSizeInBits()); + SDValue Ex = ExtendVecSize(DL, N0, Scale * InVT.getSizeInBits()); + SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, ExVT, Ex); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SExt, + DAG.getIntPtrConstant(0, DL)); + } + + // If target-size is 128-bits, then convert to ISD::SIGN_EXTEND_VECTOR_INREG + // which ensures lowering to X86ISD::VSEXT (pmovsx*). 
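+    // E.g. (illustrative) (v4i32 sign_extend (v4i16 x)) widens x with undef
+    // elements to v8i16 and then emits sign_extend_vector_inreg, which
+    // typically selects to a single PMOVSXWD.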
+ if (VT.getSizeInBits() == 128 && + (SVT == MVT::i64 || SVT == MVT::i32 || SVT == MVT::i16) && + (InSVT == MVT::i32 || InSVT == MVT::i16 || InSVT == MVT::i8)) { + SDValue ExOp = ExtendVecSize(DL, N0, 128); + return DAG.getSignExtendVectorInReg(ExOp, DL, VT); + } + + // On pre-AVX2 targets, split into 128-bit nodes of + // ISD::SIGN_EXTEND_VECTOR_INREG. + if (!Subtarget->hasInt256() && !(VT.getSizeInBits() % 128) && + (SVT == MVT::i64 || SVT == MVT::i32 || SVT == MVT::i16) && + (InSVT == MVT::i32 || InSVT == MVT::i16 || InSVT == MVT::i8)) { + unsigned NumVecs = VT.getSizeInBits() / 128; + unsigned NumSubElts = 128 / SVT.getSizeInBits(); + EVT SubVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumSubElts); + EVT InSubVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubElts); + + SmallVector<SDValue, 8> Opnds; + for (unsigned i = 0, Offset = 0; i != NumVecs; + ++i, Offset += NumSubElts) { + SDValue SrcVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InSubVT, N0, + DAG.getIntPtrConstant(Offset, DL)); + SrcVec = ExtendVecSize(DL, SrcVec, 128); + SrcVec = DAG.getSignExtendVectorInReg(SrcVec, DL, SubVT); + Opnds.push_back(SrcVec); + } + return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Opnds); + } + } + + if (Subtarget->hasAVX() && VT.is256BitVector()) + if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget)) + return R; + + if (SDValue NewAdd = promoteSextBeforeAddNSW(N, DAG, Subtarget)) + return NewAdd; + + return SDValue(); +} + +static SDValue PerformFMACombine(SDNode *N, SelectionDAG &DAG, + const X86Subtarget* Subtarget) { + SDLoc dl(N); + EVT VT = N->getValueType(0); + + // Let legalize expand this if it isn't a legal type yet. + if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) + return SDValue(); + + EVT ScalarVT = VT.getScalarType(); + if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget->hasAnyFMA()) + return SDValue(); + + SDValue A = N->getOperand(0); + SDValue B = N->getOperand(1); + SDValue C = N->getOperand(2); + + bool NegA = (A.getOpcode() == ISD::FNEG); + bool NegB = (B.getOpcode() == ISD::FNEG); + bool NegC = (C.getOpcode() == ISD::FNEG); + + // Negative multiplication when NegA xor NegB + bool NegMul = (NegA != NegB); + if (NegA) + A = A.getOperand(0); + if (NegB) + B = B.getOperand(0); + if (NegC) + C = C.getOperand(0); + + unsigned Opcode; + if (!NegMul) + Opcode = (!NegC) ? X86ISD::FMADD : X86ISD::FMSUB; + else + Opcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB; + + return DAG.getNode(Opcode, dl, VT, A, B, C); +} + +static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { + // (i32 zext (and (i8 x86isd::setcc_carry), 1)) -> + // (and (i32 x86isd::setcc_carry), 1) + // This eliminates the zext. This transformation is necessary because + // ISD::SETCC is always legalized to i8. 
+ SDLoc dl(N); + SDValue N0 = N->getOperand(0); + EVT VT = N->getValueType(0); + + if (N0.getOpcode() == ISD::AND && + N0.hasOneUse() && + N0.getOperand(0).hasOneUse()) { + SDValue N00 = N0.getOperand(0); + if (N00.getOpcode() == X86ISD::SETCC_CARRY) { + if (!isOneConstant(N0.getOperand(1))) + return SDValue(); + return DAG.getNode(ISD::AND, dl, VT, + DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, + N00.getOperand(0), N00.getOperand(1)), + DAG.getConstant(1, dl, VT)); + } + } + + if (N0.getOpcode() == ISD::TRUNCATE && + N0.hasOneUse() && + N0.getOperand(0).hasOneUse()) { + SDValue N00 = N0.getOperand(0); + if (N00.getOpcode() == X86ISD::SETCC_CARRY) { + return DAG.getNode(ISD::AND, dl, VT, + DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, + N00.getOperand(0), N00.getOperand(1)), + DAG.getConstant(1, dl, VT)); + } + } + + if (VT.is256BitVector()) + if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget)) + return R; + + // (i8,i32 zext (udivrem (i8 x, i8 y)) -> + // (i8,i32 (udivrem_zext_hreg (i8 x, i8 y) + // This exposes the zext to the udivrem lowering, so that it directly extends + // from AH (which we otherwise need to do contortions to access). + if (N0.getOpcode() == ISD::UDIVREM && + N0.getResNo() == 1 && N0.getValueType() == MVT::i8 && + VT == MVT::i32) { + SDVTList NodeTys = DAG.getVTList(MVT::i8, VT); + SDValue R = DAG.getNode(X86ISD::UDIVREM8_ZEXT_HREG, dl, NodeTys, + N0.getOperand(0), N0.getOperand(1)); + DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0)); + return R.getValue(1); + } + + return SDValue(); +} + +// Optimize x == -y --> x+y == 0 +// x != -y --> x+y != 0 +static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG, + const X86Subtarget* Subtarget) { + ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get(); + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + EVT VT = N->getValueType(0); + SDLoc DL(N); + + if ((CC == ISD::SETNE || CC == ISD::SETEQ) && LHS.getOpcode() == ISD::SUB) + if (isNullConstant(LHS.getOperand(0)) && LHS.hasOneUse()) { + SDValue addV = DAG.getNode(ISD::ADD, DL, LHS.getValueType(), RHS, + LHS.getOperand(1)); + return DAG.getSetCC(DL, N->getValueType(0), addV, + DAG.getConstant(0, DL, addV.getValueType()), CC); + } + if ((CC == ISD::SETNE || CC == ISD::SETEQ) && RHS.getOpcode() == ISD::SUB) + if (isNullConstant(RHS.getOperand(0)) && RHS.hasOneUse()) { + SDValue addV = DAG.getNode(ISD::ADD, DL, RHS.getValueType(), LHS, + RHS.getOperand(1)); + return DAG.getSetCC(DL, N->getValueType(0), addV, + DAG.getConstant(0, DL, addV.getValueType()), CC); + } + + if (VT.getScalarType() == MVT::i1 && + (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) { + bool IsSEXT0 = + (LHS.getOpcode() == ISD::SIGN_EXTEND) && + (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1); + bool IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode()); + + if (!IsSEXT0 || !IsVZero1) { + // Swap the operands and update the condition code. 
+      std::swap(LHS, RHS);
+      CC = ISD::getSetCCSwappedOperands(CC);
+
+      IsSEXT0 = (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
+                (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1);
+      IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
+    }
+
+    if (IsSEXT0 && IsVZero1) {
+      assert(VT == LHS.getOperand(0).getValueType() &&
+             "Unexpected operand type");
+      if (CC == ISD::SETGT)
+        return DAG.getConstant(0, DL, VT);
+      if (CC == ISD::SETLE)
+        return DAG.getConstant(1, DL, VT);
+      if (CC == ISD::SETEQ || CC == ISD::SETGE)
+        return DAG.getNOT(DL, LHS.getOperand(0), VT);
+
+      assert((CC == ISD::SETNE || CC == ISD::SETLT) &&
+             "Unexpected condition code!");
+      return LHS.getOperand(0);
+    }
+  }
+
+  return SDValue();
+}
+
+static SDValue PerformGatherScatterCombine(SDNode *N, SelectionDAG &DAG) {
+  SDLoc DL(N);
+  // Gather and Scatter instructions use k-registers for masks. The type of
+  // the masks is v*i1. So the mask will be truncated anyway.
+  // The SIGN_EXTEND_INREG may be dropped.
+  SDValue Mask = N->getOperand(2);
+  if (Mask.getOpcode() == ISD::SIGN_EXTEND_INREG) {
+    SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
+    NewOps[2] = Mask.getOperand(0);
+    DAG.UpdateNodeOperands(N, NewOps);
+  }
+  return SDValue();
+}
+
+// Helper function of PerformSETCCCombine. It is to materialize "setb reg"
+// as "sbb reg,reg", since it can be extended without zext and produces
+// an all-ones bit which is more useful than 0/1 in some cases.
+static SDValue MaterializeSETB(SDLoc DL, SDValue EFLAGS, SelectionDAG &DAG,
+                               MVT VT) {
+  if (VT == MVT::i8)
+    return DAG.getNode(ISD::AND, DL, VT,
+                       DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
+                                   DAG.getConstant(X86::COND_B, DL, MVT::i8),
+                                   EFLAGS),
+                       DAG.getConstant(1, DL, VT));
+  assert (VT == MVT::i1 && "Unexpected type for SETCC node");
+  return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1,
+                     DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
+                                 DAG.getConstant(X86::COND_B, DL, MVT::i8),
+                                 EFLAGS));
+}
+
+// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
+static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG,
+                                   TargetLowering::DAGCombinerInfo &DCI,
+                                   const X86Subtarget *Subtarget) {
+  SDLoc DL(N);
+  X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
+  SDValue EFLAGS = N->getOperand(1);
+
+  if (CC == X86::COND_A) {
+    // Try to convert COND_A into COND_B in an attempt to facilitate
+    // materializing "setb reg".
+    //
+    // Do not flip "e > c", where "c" is a constant, because Cmp instruction
+    // cannot take an immediate as its first operand.
+    //
+    if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
+        EFLAGS.getValueType().isInteger() &&
+        !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
+      SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
+                                   EFLAGS.getNode()->getVTList(),
+                                   EFLAGS.getOperand(1), EFLAGS.getOperand(0));
+      SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
+      return MaterializeSETB(DL, NewEFLAGS, DAG, N->getSimpleValueType(0));
+    }
+  }
+
+  // Materialize "setb reg" as "sbb reg,reg", since it can be extended without
+  // a zext and produces an all-ones bit which is more useful than 0/1 in some
+  // cases.
+  if (CC == X86::COND_B)
+    return MaterializeSETB(DL, EFLAGS, DAG, N->getSimpleValueType(0));
+
+  if (SDValue Flags = checkBoolTestSetCCCombine(EFLAGS, CC)) {
+    SDValue Cond = DAG.getConstant(CC, DL, MVT::i8);
+    return DAG.getNode(X86ISD::SETCC, DL, N->getVTList(), Cond, Flags);
+  }
+
+  return SDValue();
+}
+
+// Optimize branch condition evaluation.
+// +static SDValue PerformBrCondCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { + SDLoc DL(N); + SDValue Chain = N->getOperand(0); + SDValue Dest = N->getOperand(1); + SDValue EFLAGS = N->getOperand(3); + X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2)); + + if (SDValue Flags = checkBoolTestSetCCCombine(EFLAGS, CC)) { + SDValue Cond = DAG.getConstant(CC, DL, MVT::i8); + return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), Chain, Dest, Cond, + Flags); + } + + return SDValue(); +} + +static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N, + SelectionDAG &DAG) { + // Take advantage of vector comparisons producing 0 or -1 in each lane to + // optimize away operation when it's from a constant. + // + // The general transformation is: + // UNARYOP(AND(VECTOR_CMP(x,y), constant)) --> + // AND(VECTOR_CMP(x,y), constant2) + // constant2 = UNARYOP(constant) + + // Early exit if this isn't a vector operation, the operand of the + // unary operation isn't a bitwise AND, or if the sizes of the operations + // aren't the same. + EVT VT = N->getValueType(0); + if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND || + N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC || + VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits()) + return SDValue(); + + // Now check that the other operand of the AND is a constant. We could + // make the transformation for non-constant splats as well, but it's unclear + // that would be a benefit as it would not eliminate any operations, just + // perform one more step in scalar code before moving to the vector unit. + if (BuildVectorSDNode *BV = + dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) { + // Bail out if the vector isn't a constant. + if (!BV->isConstant()) + return SDValue(); + + // Everything checks out. Build up the new and improved node. + SDLoc DL(N); + EVT IntVT = BV->getValueType(0); + // Create a new constant of the appropriate type for the transformed + // DAG. + SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0)); + // The AND node needs bitcasts to/from an integer vector type around it. + SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst); + SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, + N->getOperand(0)->getOperand(0), MaskConst); + SDValue Res = DAG.getBitcast(VT, NewAnd); + return Res; + } + + return SDValue(); +} + +static SDValue PerformUINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + SDValue Op0 = N->getOperand(0); + EVT VT = N->getValueType(0); + EVT InVT = Op0.getValueType(); + EVT InSVT = InVT.getScalarType(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + + // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32)) + // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32)) + if (InVT.isVector() && (InSVT == MVT::i8 || InSVT == MVT::i16)) { + SDLoc dl(N); + EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, + InVT.getVectorNumElements()); + SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0); + + if (TLI.isOperationLegal(ISD::UINT_TO_FP, DstVT)) + return DAG.getNode(ISD::UINT_TO_FP, dl, VT, P); + + return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P); + } + + return SDValue(); +} + +static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + // First try to optimize away the conversion entirely when it's + // conditionally from a constant. Vectors only. 
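+  // E.g. (illustrative):
+  //   sint_to_fp (and (setcc a, b, cc), <i32 1, 1, 1, 1>)
+  // can become
+  //   and (setcc a, b, cc), <float 1.0, 1.0, 1.0, 1.0>   ; via bitcasts
+  // because each setcc lane is all-ones or all-zeros, so the AND either keeps
+  // the pre-converted constant lane or yields +0.0.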
+ if (SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG)) + return Res; + + // Now move on to more general possibilities. + SDValue Op0 = N->getOperand(0); + EVT VT = N->getValueType(0); + EVT InVT = Op0.getValueType(); + EVT InSVT = InVT.getScalarType(); + + // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32)) + // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32)) + if (InVT.isVector() && (InSVT == MVT::i8 || InSVT == MVT::i16)) { + SDLoc dl(N); + EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, + InVT.getVectorNumElements()); + SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0); + return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P); + } + + // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have + // a 32-bit target where SSE doesn't support i64->FP operations. + if (!Subtarget->useSoftFloat() && Op0.getOpcode() == ISD::LOAD) { + LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode()); + EVT LdVT = Ld->getValueType(0); + + // This transformation is not supported if the result type is f16 + if (VT == MVT::f16) + return SDValue(); + + if (!Ld->isVolatile() && !VT.isVector() && + ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() && + !Subtarget->is64Bit() && LdVT == MVT::i64) { + SDValue FILDChain = Subtarget->getTargetLowering()->BuildFILD( + SDValue(N, 0), LdVT, Ld->getChain(), Op0, DAG); + DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1)); + return FILDChain; + } + } + return SDValue(); +} + +// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS +static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG, + X86TargetLowering::DAGCombinerInfo &DCI) { + // If the LHS and RHS of the ADC node are zero, then it can't overflow and + // the result is either zero or one (depending on the input carry bit). + // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1. + if (X86::isZeroNode(N->getOperand(0)) && + X86::isZeroNode(N->getOperand(1)) && + // We don't have a good way to replace an EFLAGS use, so only do this when + // dead right now. + SDValue(N, 1).use_empty()) { + SDLoc DL(N); + EVT VT = N->getValueType(0); + SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1)); + SDValue Res1 = DAG.getNode(ISD::AND, DL, VT, + DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, + DAG.getConstant(X86::COND_B, DL, + MVT::i8), + N->getOperand(2)), + DAG.getConstant(1, DL, VT)); + return DCI.CombineTo(N, Res1, CarryOut); + } + + return SDValue(); +} + +// fold (add Y, (sete X, 0)) -> adc 0, Y +// (add Y, (setne X, 0)) -> sbb -1, Y +// (sub (sete X, 0), Y) -> sbb 0, Y +// (sub (setne X, 0), Y) -> adc -1, Y +static SDValue OptimizeConditionalInDecrement(SDNode *N, SelectionDAG &DAG) { + SDLoc DL(N); + + // Look through ZExts. + SDValue Ext = N->getOperand(N->getOpcode() == ISD::SUB ? 
1 : 0); + if (Ext.getOpcode() != ISD::ZERO_EXTEND || !Ext.hasOneUse()) + return SDValue(); + + SDValue SetCC = Ext.getOperand(0); + if (SetCC.getOpcode() != X86ISD::SETCC || !SetCC.hasOneUse()) + return SDValue(); + + X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0); + if (CC != X86::COND_E && CC != X86::COND_NE) + return SDValue(); + + SDValue Cmp = SetCC.getOperand(1); + if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() || + !X86::isZeroNode(Cmp.getOperand(1)) || + !Cmp.getOperand(0).getValueType().isInteger()) + return SDValue(); + + SDValue CmpOp0 = Cmp.getOperand(0); + SDValue NewCmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, CmpOp0, + DAG.getConstant(1, DL, CmpOp0.getValueType())); + + SDValue OtherVal = N->getOperand(N->getOpcode() == ISD::SUB ? 0 : 1); + if (CC == X86::COND_NE) + return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::ADC : X86ISD::SBB, + DL, OtherVal.getValueType(), OtherVal, + DAG.getConstant(-1ULL, DL, OtherVal.getValueType()), + NewCmp); + return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::SBB : X86ISD::ADC, + DL, OtherVal.getValueType(), OtherVal, + DAG.getConstant(0, DL, OtherVal.getValueType()), NewCmp); +} + +/// PerformADDCombine - Do target-specific dag combines on integer adds. +static SDValue PerformAddCombine(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + EVT VT = N->getValueType(0); + SDValue Op0 = N->getOperand(0); + SDValue Op1 = N->getOperand(1); + + // Try to synthesize horizontal adds from adds of shuffles. + if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) || + (Subtarget->hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) && + isHorizontalBinOp(Op0, Op1, true)) + return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1); + + return OptimizeConditionalInDecrement(N, DAG); +} + +static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + SDValue Op0 = N->getOperand(0); + SDValue Op1 = N->getOperand(1); + + // X86 can't encode an immediate LHS of a sub. See if we can push the + // negation into a preceding instruction. + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) { + // If the RHS of the sub is a XOR with one use and a constant, invert the + // immediate. Then add one to the LHS of the sub so we can turn + // X-Y -> X+~Y+1, saving one register. + if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR && + isa<ConstantSDNode>(Op1.getOperand(1))) { + APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue(); + EVT VT = Op0.getValueType(); + SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT, + Op1.getOperand(0), + DAG.getConstant(~XorC, SDLoc(Op1), VT)); + return DAG.getNode(ISD::ADD, SDLoc(N), VT, NewXor, + DAG.getConstant(C->getAPIntValue() + 1, SDLoc(N), VT)); + } + } + + // Try to synthesize horizontal adds from adds of shuffles. 
+ EVT VT = N->getValueType(0); + if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) || + (Subtarget->hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) && + isHorizontalBinOp(Op0, Op1, true)) + return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1); + + return OptimizeConditionalInDecrement(N, DAG); +} + +/// performVZEXTCombine - Performs build vector combines +static SDValue performVZEXTCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { + SDLoc DL(N); + MVT VT = N->getSimpleValueType(0); + SDValue Op = N->getOperand(0); + MVT OpVT = Op.getSimpleValueType(); + MVT OpEltVT = OpVT.getVectorElementType(); + unsigned InputBits = OpEltVT.getSizeInBits() * VT.getVectorNumElements(); + + // (vzext (bitcast (vzext (x)) -> (vzext x) + SDValue V = Op; + while (V.getOpcode() == ISD::BITCAST) + V = V.getOperand(0); + + if (V != Op && V.getOpcode() == X86ISD::VZEXT) { + MVT InnerVT = V.getSimpleValueType(); + MVT InnerEltVT = InnerVT.getVectorElementType(); + + // If the element sizes match exactly, we can just do one larger vzext. This + // is always an exact type match as vzext operates on integer types. + if (OpEltVT == InnerEltVT) { + assert(OpVT == InnerVT && "Types must match for vzext!"); + return DAG.getNode(X86ISD::VZEXT, DL, VT, V.getOperand(0)); + } + + // The only other way we can combine them is if only a single element of the + // inner vzext is used in the input to the outer vzext. + if (InnerEltVT.getSizeInBits() < InputBits) + return SDValue(); + + // In this case, the inner vzext is completely dead because we're going to + // only look at bits inside of the low element. Just do the outer vzext on + // a bitcast of the input to the inner. + return DAG.getNode(X86ISD::VZEXT, DL, VT, DAG.getBitcast(OpVT, V)); + } + + // Check if we can bypass extracting and re-inserting an element of an input + // vector. Essentially: + // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x) + if (V.getOpcode() == ISD::SCALAR_TO_VECTOR && + V.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT && + V.getOperand(0).getSimpleValueType().getSizeInBits() == InputBits) { + SDValue ExtractedV = V.getOperand(0); + SDValue OrigV = ExtractedV.getOperand(0); + if (isNullConstant(ExtractedV.getOperand(1))) { + MVT OrigVT = OrigV.getSimpleValueType(); + // Extract a subvector if necessary... 
+ if (OrigVT.getSizeInBits() > OpVT.getSizeInBits()) { + int Ratio = OrigVT.getSizeInBits() / OpVT.getSizeInBits(); + OrigVT = MVT::getVectorVT(OrigVT.getVectorElementType(), + OrigVT.getVectorNumElements() / Ratio); + OrigV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigVT, OrigV, + DAG.getIntPtrConstant(0, DL)); + } + Op = DAG.getBitcast(OpVT, OrigV); + return DAG.getNode(X86ISD::VZEXT, DL, VT, Op); + } + } + + return SDValue(); +} + +SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + switch (N->getOpcode()) { + default: break; + case ISD::EXTRACT_VECTOR_ELT: + return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, DCI); + case ISD::VSELECT: + case ISD::SELECT: + case X86ISD::SHRUNKBLEND: + return PerformSELECTCombine(N, DAG, DCI, Subtarget); + case ISD::BITCAST: return PerformBITCASTCombine(N, DAG, Subtarget); + case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI, Subtarget); + case ISD::ADD: return PerformAddCombine(N, DAG, Subtarget); + case ISD::SUB: return PerformSubCombine(N, DAG, Subtarget); + case X86ISD::ADC: return PerformADCCombine(N, DAG, DCI); + case ISD::MUL: return PerformMulCombine(N, DAG, DCI); + case ISD::SHL: + case ISD::SRA: + case ISD::SRL: return PerformShiftCombine(N, DAG, DCI, Subtarget); + case ISD::AND: return PerformAndCombine(N, DAG, DCI, Subtarget); + case ISD::OR: return PerformOrCombine(N, DAG, DCI, Subtarget); + case ISD::XOR: return PerformXorCombine(N, DAG, DCI, Subtarget); + case ISD::LOAD: return PerformLOADCombine(N, DAG, DCI, Subtarget); + case ISD::MLOAD: return PerformMLOADCombine(N, DAG, DCI, Subtarget); + case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget); + case ISD::MSTORE: return PerformMSTORECombine(N, DAG, Subtarget); + case ISD::SINT_TO_FP: return PerformSINT_TO_FPCombine(N, DAG, Subtarget); + case ISD::UINT_TO_FP: return PerformUINT_TO_FPCombine(N, DAG, Subtarget); + case ISD::FADD: return PerformFADDCombine(N, DAG, Subtarget); + case ISD::FSUB: return PerformFSUBCombine(N, DAG, Subtarget); + case ISD::FNEG: return PerformFNEGCombine(N, DAG, Subtarget); + case ISD::TRUNCATE: return PerformTRUNCATECombine(N, DAG, Subtarget); + case X86ISD::FXOR: + case X86ISD::FOR: return PerformFORCombine(N, DAG, Subtarget); + case X86ISD::FMIN: + case X86ISD::FMAX: return PerformFMinFMaxCombine(N, DAG); + case ISD::FMINNUM: + case ISD::FMAXNUM: return performFMinNumFMaxNumCombine(N, DAG, + Subtarget); + case X86ISD::FAND: return PerformFANDCombine(N, DAG, Subtarget); + case X86ISD::FANDN: return PerformFANDNCombine(N, DAG, Subtarget); + case X86ISD::BT: return PerformBTCombine(N, DAG, DCI); + case X86ISD::VZEXT_MOVL: return PerformVZEXT_MOVLCombine(N, DAG); +// TODO: refactor the [SU]DIVREM8_[SZ]EXT_HREG code so that it's not duplicated. 
+  case ISD::ANY_EXTEND:
+  case ISD::ZERO_EXTEND:    return PerformZExtCombine(N, DAG, DCI, Subtarget);
+  case ISD::SIGN_EXTEND:    return PerformSExtCombine(N, DAG, DCI, Subtarget);
+  case ISD::SIGN_EXTEND_INREG:
+    return PerformSIGN_EXTEND_INREGCombine(N, DAG, Subtarget);
+  case ISD::SETCC:          return PerformISDSETCCCombine(N, DAG, Subtarget);
+  case X86ISD::SETCC:       return PerformSETCCCombine(N, DAG, DCI, Subtarget);
+  case X86ISD::BRCOND:      return PerformBrCondCombine(N, DAG, DCI, Subtarget);
+  case X86ISD::VZEXT:       return performVZEXTCombine(N, DAG, DCI, Subtarget);
+  case X86ISD::SHUFP:       // Handle all target specific shuffles
+  case X86ISD::PALIGNR:
+  case X86ISD::BLENDI:
+  case X86ISD::UNPCKH:
+  case X86ISD::UNPCKL:
+  case X86ISD::MOVHLPS:
+  case X86ISD::MOVLHPS:
+  case X86ISD::PSHUFB:
+  case X86ISD::PSHUFD:
+  case X86ISD::PSHUFHW:
+  case X86ISD::PSHUFLW:
+  case X86ISD::MOVSS:
+  case X86ISD::MOVSD:
+  case X86ISD::VPERMILPI:
+  case X86ISD::VPERM2X128:
+  case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI, Subtarget);
+  case ISD::FMA:            return PerformFMACombine(N, DAG, Subtarget);
+  case ISD::MGATHER:
+  case ISD::MSCATTER:       return PerformGatherScatterCombine(N, DAG);
+  }
+
+  return SDValue();
+}
+
+/// isTypeDesirableForOp - Return true if the target has native support for
+/// the specified value type and it is 'desirable' to use the type for the
+/// given node type. e.g. On x86 i16 is legal, but undesirable since i16
+/// instruction encodings are longer and some i16 instructions are slow.
+bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
+  if (!isTypeLegal(VT))
+    return false;
+  if (VT != MVT::i16)
+    return true;
+
+  switch (Opc) {
+  default:
+    return true;
+  case ISD::LOAD:
+  case ISD::SIGN_EXTEND:
+  case ISD::ZERO_EXTEND:
+  case ISD::ANY_EXTEND:
+  case ISD::SHL:
+  case ISD::SRL:
+  case ISD::SUB:
+  case ISD::ADD:
+  case ISD::MUL:
+  case ISD::AND:
+  case ISD::OR:
+  case ISD::XOR:
+    return false;
+  }
+}
+
+/// This function checks if any of the users of EFLAGS copies the EFLAGS. We
+/// know that the code that lowers COPY of EFLAGS has to use the stack, and if
+/// we don't adjust the stack we clobber the first frame index.
+/// See X86InstrInfo::copyPhysReg.
+bool X86TargetLowering::hasCopyImplyingStackAdjustment(
+    MachineFunction *MF) const {
+  const MachineRegisterInfo &MRI = MF->getRegInfo();
+
+  return any_of(MRI.reg_instructions(X86::EFLAGS),
+                [](const MachineInstr &RI) { return RI.isCopy(); });
+}
+
+/// IsDesirableToPromoteOp - This method queries the target whether it is
+/// beneficial for dag combiner to promote the specified node. If true, it
+/// should return the desired promotion type by reference.
+bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
+  EVT VT = Op.getValueType();
+  if (VT != MVT::i16)
+    return false;
+
+  bool Promote = false;
+  bool Commute = false;
+  switch (Op.getOpcode()) {
+  default: break;
+  case ISD::LOAD: {
+    LoadSDNode *LD = cast<LoadSDNode>(Op);
+    // If the non-extending load has a single use and it's not live out, then it
+    // might be folded.
+    if (LD->getExtensionType() == ISD::NON_EXTLOAD /*&&
+                                                     Op.hasOneUse()*/) {
+      for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
+                                UE = Op.getNode()->use_end(); UI != UE; ++UI) {
+        // The only case where we'd want to promote LOAD (rather than it being
+        // promoted as an operand) is when its only use is liveout.
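+        // (A value that is live out of the block shows up here as a use by an
+        // ISD::CopyToReg node, which is exactly what this loop checks for.)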
+ if (UI->getOpcode() != ISD::CopyToReg) + return false; + } + } + Promote = true; + break; + } + case ISD::SIGN_EXTEND: + case ISD::ZERO_EXTEND: + case ISD::ANY_EXTEND: + Promote = true; + break; + case ISD::SHL: + case ISD::SRL: { + SDValue N0 = Op.getOperand(0); + // Look out for (store (shl (load), x)). + if (MayFoldLoad(N0) && MayFoldIntoStore(Op)) + return false; + Promote = true; + break; + } + case ISD::ADD: + case ISD::MUL: + case ISD::AND: + case ISD::OR: + case ISD::XOR: + Commute = true; + // fallthrough + case ISD::SUB: { + SDValue N0 = Op.getOperand(0); + SDValue N1 = Op.getOperand(1); + if (!Commute && MayFoldLoad(N1)) + return false; + // Avoid disabling potential load folding opportunities. + if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op))) + return false; + if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op))) + return false; + Promote = true; + } + } + + PVT = MVT::i32; + return Promote; +} + +//===----------------------------------------------------------------------===// +// X86 Inline Assembly Support +//===----------------------------------------------------------------------===// + +// Helper to match a string separated by whitespace. +static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) { + S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace. + + for (StringRef Piece : Pieces) { + if (!S.startswith(Piece)) // Check if the piece matches. + return false; + + S = S.substr(Piece.size()); + StringRef::size_type Pos = S.find_first_not_of(" \t"); + if (Pos == 0) // We matched a prefix. + return false; + + S = S.substr(Pos); + } + + return S.empty(); +} + +static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) { + + if (AsmPieces.size() == 3 || AsmPieces.size() == 4) { + if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") && + std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") && + std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) { + + if (AsmPieces.size() == 3) + return true; + else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}")) + return true; + } + } + return false; +} + +bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const { + InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue()); + + std::string AsmStr = IA->getAsmString(); + + IntegerType *Ty = dyn_cast<IntegerType>(CI->getType()); + if (!Ty || Ty->getBitWidth() % 16 != 0) + return false; + + // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a" + SmallVector<StringRef, 4> AsmPieces; + SplitString(AsmStr, AsmPieces, ";\n"); + + switch (AsmPieces.size()) { + default: return false; + case 1: + // FIXME: this should verify that we are targeting a 486 or better. If not, + // we will turn this bswap into something that will be lowered to logical + // ops instead of emitting the bswap asm. For now, we don't support 486 or + // lower so don't worry about this. + // bswap $0 + if (matchAsm(AsmPieces[0], {"bswap", "$0"}) || + matchAsm(AsmPieces[0], {"bswapl", "$0"}) || + matchAsm(AsmPieces[0], {"bswapq", "$0"}) || + matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) || + matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) || + matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) { + // No need to check constraints, nothing other than the equivalent of + // "=r,0" would be valid here. 
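+      // For example (illustrative, not part of the surrounding code):
+      // GCC-style inline asm such as
+      //   asm("bswap %0" : "=r"(x) : "0"(x));
+      // on a 32- or 64-bit integer reaches this point as "bswap $0" with
+      // constraint string "=r,0" and is replaced by an llvm.bswap.* call below.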
+ return IntrinsicLowering::LowerToByteSwap(CI); + } + + // rorw $$8, ${0:w} --> llvm.bswap.i16 + if (CI->getType()->isIntegerTy(16) && + IA->getConstraintString().compare(0, 5, "=r,0,") == 0 && + (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) || + matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) { + AsmPieces.clear(); + StringRef ConstraintsStr = IA->getConstraintString(); + SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ","); + array_pod_sort(AsmPieces.begin(), AsmPieces.end()); + if (clobbersFlagRegisters(AsmPieces)) + return IntrinsicLowering::LowerToByteSwap(CI); + } + break; + case 3: + if (CI->getType()->isIntegerTy(32) && + IA->getConstraintString().compare(0, 5, "=r,0,") == 0 && + matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) && + matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) && + matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) { + AsmPieces.clear(); + StringRef ConstraintsStr = IA->getConstraintString(); + SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ","); + array_pod_sort(AsmPieces.begin(), AsmPieces.end()); + if (clobbersFlagRegisters(AsmPieces)) + return IntrinsicLowering::LowerToByteSwap(CI); + } + + if (CI->getType()->isIntegerTy(64)) { + InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints(); + if (Constraints.size() >= 2 && + Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" && + Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") { + // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64 + if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) && + matchAsm(AsmPieces[1], {"bswap", "%edx"}) && + matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"})) + return IntrinsicLowering::LowerToByteSwap(CI); + } + } + break; + } + return false; +} + +/// getConstraintType - Given a constraint letter, return the type of +/// constraint it is for this target. +X86TargetLowering::ConstraintType +X86TargetLowering::getConstraintType(StringRef Constraint) const { + if (Constraint.size() == 1) { + switch (Constraint[0]) { + case 'R': + case 'q': + case 'Q': + case 'f': + case 't': + case 'u': + case 'y': + case 'x': + case 'Y': + case 'l': + return C_RegisterClass; + case 'a': + case 'b': + case 'c': + case 'd': + case 'S': + case 'D': + case 'A': + return C_Register; + case 'I': + case 'J': + case 'K': + case 'L': + case 'M': + case 'N': + case 'G': + case 'C': + case 'e': + case 'Z': + return C_Other; + default: + break; + } + } + return TargetLowering::getConstraintType(Constraint); +} + +/// Examine constraint type and operand type and determine a weight value. +/// This object must already have been set up with the operand type +/// and the current alternative constraint selected. +TargetLowering::ConstraintWeight + X86TargetLowering::getSingleConstraintMatchWeight( + AsmOperandInfo &info, const char *constraint) const { + ConstraintWeight weight = CW_Invalid; + Value *CallOperandVal = info.CallOperandVal; + // If we don't have a value, we can't do a match, + // but allow it at the lowest weight. + if (!CallOperandVal) + return CW_Default; + Type *type = CallOperandVal->getType(); + // Look at the constraint type. 
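+  // A couple of concrete outcomes of the switch below (illustrative): an
+  // integer operand constrained to 'a' (EAX) yields CW_SpecificReg, while a
+  // small constant such as 5 under the 'I' constraint (0..31) yields
+  // CW_Constant.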
+ switch (*constraint) { + default: + weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint); + case 'R': + case 'q': + case 'Q': + case 'a': + case 'b': + case 'c': + case 'd': + case 'S': + case 'D': + case 'A': + if (CallOperandVal->getType()->isIntegerTy()) + weight = CW_SpecificReg; + break; + case 'f': + case 't': + case 'u': + if (type->isFloatingPointTy()) + weight = CW_SpecificReg; + break; + case 'y': + if (type->isX86_MMXTy() && Subtarget->hasMMX()) + weight = CW_SpecificReg; + break; + case 'x': + case 'Y': + if (((type->getPrimitiveSizeInBits() == 128) && Subtarget->hasSSE1()) || + ((type->getPrimitiveSizeInBits() == 256) && Subtarget->hasFp256())) + weight = CW_Register; + break; + case 'I': + if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) { + if (C->getZExtValue() <= 31) + weight = CW_Constant; + } + break; + case 'J': + if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { + if (C->getZExtValue() <= 63) + weight = CW_Constant; + } + break; + case 'K': + if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { + if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f)) + weight = CW_Constant; + } + break; + case 'L': + if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { + if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff)) + weight = CW_Constant; + } + break; + case 'M': + if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { + if (C->getZExtValue() <= 3) + weight = CW_Constant; + } + break; + case 'N': + if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { + if (C->getZExtValue() <= 0xff) + weight = CW_Constant; + } + break; + case 'G': + case 'C': + if (isa<ConstantFP>(CallOperandVal)) { + weight = CW_Constant; + } + break; + case 'e': + if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { + if ((C->getSExtValue() >= -0x80000000LL) && + (C->getSExtValue() <= 0x7fffffffLL)) + weight = CW_Constant; + } + break; + case 'Z': + if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { + if (C->getZExtValue() <= 0xffffffff) + weight = CW_Constant; + } + break; + } + return weight; +} + +/// LowerXConstraint - try to replace an X constraint, which matches anything, +/// with another that has more specific requirements based on the type of the +/// corresponding operand. +const char *X86TargetLowering:: +LowerXConstraint(EVT ConstraintVT) const { + // FP X constraints get lowered to SSE1/2 registers if available, otherwise + // 'f' like normal targets. + if (ConstraintVT.isFloatingPoint()) { + if (Subtarget->hasSSE2()) + return "Y"; + if (Subtarget->hasSSE1()) + return "x"; + } + + return TargetLowering::LowerXConstraint(ConstraintVT); +} + +/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops +/// vector. If it is invalid, don't add anything to Ops. +void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, + std::string &Constraint, + std::vector<SDValue>&Ops, + SelectionDAG &DAG) const { + SDValue Result; + + // Only support length 1 constraints for now. 
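+  // For instance (illustrative), 'I' accepts immediates in [0, 31], 'N'
+  // accepts [0, 255], and 'e' accepts any value that fits in a signed 32-bit
+  // immediate, as handled case by case below.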
+ if (Constraint.length() > 1) return; + + char ConstraintLetter = Constraint[0]; + switch (ConstraintLetter) { + default: break; + case 'I': + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { + if (C->getZExtValue() <= 31) { + Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op), + Op.getValueType()); + break; + } + } + return; + case 'J': + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { + if (C->getZExtValue() <= 63) { + Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op), + Op.getValueType()); + break; + } + } + return; + case 'K': + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { + if (isInt<8>(C->getSExtValue())) { + Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op), + Op.getValueType()); + break; + } + } + return; + case 'L': + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { + if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff || + (Subtarget->is64Bit() && C->getZExtValue() == 0xffffffff)) { + Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), + Op.getValueType()); + break; + } + } + return; + case 'M': + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { + if (C->getZExtValue() <= 3) { + Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op), + Op.getValueType()); + break; + } + } + return; + case 'N': + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { + if (C->getZExtValue() <= 255) { + Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op), + Op.getValueType()); + break; + } + } + return; + case 'O': + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { + if (C->getZExtValue() <= 127) { + Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op), + Op.getValueType()); + break; + } + } + return; + case 'e': { + // 32-bit signed value + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { + if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()), + C->getSExtValue())) { + // Widen to 64 bits here to get it sign extended. + Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64); + break; + } + // FIXME gcc accepts some relocatable values here too, but only in certain + // memory models; it's complicated. + } + return; + } + case 'Z': { + // 32-bit unsigned value + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { + if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()), + C->getZExtValue())) { + Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op), + Op.getValueType()); + break; + } + } + // FIXME gcc accepts some relocatable values here too, but only in certain + // memory models; it's complicated. + return; + } + case 'i': { + // Literal immediates are always ok. + if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) { + // Widen to 64 bits here to get it sign extended. + Result = DAG.getTargetConstant(CST->getSExtValue(), SDLoc(Op), MVT::i64); + break; + } + + // In any sort of PIC mode addresses need to be computed at runtime by + // adding in a register or some sort of table lookup. These can't + // be used as immediates. + if (Subtarget->isPICStyleGOT() || Subtarget->isPICStyleStubPIC()) + return; + + // If we are in non-pic codegen mode, we allow the address of a global (with + // an optional displacement) to be used with 'i'. + GlobalAddressSDNode *GA = nullptr; + int64_t Offset = 0; + + // Match either (GA), (GA+C), (GA+C1+C2), etc. 
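+    // For example (illustrative, SomeGlobal is a hypothetical symbol): a
+    // non-PIC reference equivalent to &SomeGlobal + 8 is folded here into a
+    // single TargetGlobalAddress with Offset == 8.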
+ while (1) { + if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) { + Offset += GA->getOffset(); + break; + } else if (Op.getOpcode() == ISD::ADD) { + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { + Offset += C->getZExtValue(); + Op = Op.getOperand(0); + continue; + } + } else if (Op.getOpcode() == ISD::SUB) { + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { + Offset += -C->getZExtValue(); + Op = Op.getOperand(0); + continue; + } + } + + // Otherwise, this isn't something we can handle, reject it. + return; + } + + const GlobalValue *GV = GA->getGlobal(); + // If we require an extra load to get this address, as in PIC mode, we + // can't accept it. + if (isGlobalStubReference( + Subtarget->ClassifyGlobalReference(GV, DAG.getTarget()))) + return; + + Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op), + GA->getValueType(0), Offset); + break; + } + } + + if (Result.getNode()) { + Ops.push_back(Result); + return; + } + return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); +} + +std::pair<unsigned, const TargetRegisterClass *> +X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, + StringRef Constraint, + MVT VT) const { + // First, see if this is a constraint that directly corresponds to an LLVM + // register class. + if (Constraint.size() == 1) { + // GCC Constraint Letters + switch (Constraint[0]) { + default: break; + // TODO: Slight differences here in allocation order and leaving + // RIP in the class. Do they matter any more here than they do + // in the normal allocation? + case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode. + if (Subtarget->is64Bit()) { + if (VT == MVT::i32 || VT == MVT::f32) + return std::make_pair(0U, &X86::GR32RegClass); + if (VT == MVT::i16) + return std::make_pair(0U, &X86::GR16RegClass); + if (VT == MVT::i8 || VT == MVT::i1) + return std::make_pair(0U, &X86::GR8RegClass); + if (VT == MVT::i64 || VT == MVT::f64) + return std::make_pair(0U, &X86::GR64RegClass); + break; + } + // 32-bit fallthrough + case 'Q': // Q_REGS + if (VT == MVT::i32 || VT == MVT::f32) + return std::make_pair(0U, &X86::GR32_ABCDRegClass); + if (VT == MVT::i16) + return std::make_pair(0U, &X86::GR16_ABCDRegClass); + if (VT == MVT::i8 || VT == MVT::i1) + return std::make_pair(0U, &X86::GR8_ABCD_LRegClass); + if (VT == MVT::i64) + return std::make_pair(0U, &X86::GR64_ABCDRegClass); + break; + case 'r': // GENERAL_REGS + case 'l': // INDEX_REGS + if (VT == MVT::i8 || VT == MVT::i1) + return std::make_pair(0U, &X86::GR8RegClass); + if (VT == MVT::i16) + return std::make_pair(0U, &X86::GR16RegClass); + if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget->is64Bit()) + return std::make_pair(0U, &X86::GR32RegClass); + return std::make_pair(0U, &X86::GR64RegClass); + case 'R': // LEGACY_REGS + if (VT == MVT::i8 || VT == MVT::i1) + return std::make_pair(0U, &X86::GR8_NOREXRegClass); + if (VT == MVT::i16) + return std::make_pair(0U, &X86::GR16_NOREXRegClass); + if (VT == MVT::i32 || !Subtarget->is64Bit()) + return std::make_pair(0U, &X86::GR32_NOREXRegClass); + return std::make_pair(0U, &X86::GR64_NOREXRegClass); + case 'f': // FP Stack registers. + // If SSE is enabled for this VT, use f80 to ensure the isel moves the + // value to the correct fpstack register class. 
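+      // (Illustrative: with SSE2 enabled, an f64 operand constrained to 'f'
+      // falls through to RFP80 below, keeping it on the x87 stack rather than
+      // in an SSE register.)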
+ if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT)) + return std::make_pair(0U, &X86::RFP32RegClass); + if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT)) + return std::make_pair(0U, &X86::RFP64RegClass); + return std::make_pair(0U, &X86::RFP80RegClass); + case 'y': // MMX_REGS if MMX allowed. + if (!Subtarget->hasMMX()) break; + return std::make_pair(0U, &X86::VR64RegClass); + case 'Y': // SSE_REGS if SSE2 allowed + if (!Subtarget->hasSSE2()) break; + // FALL THROUGH. + case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed + if (!Subtarget->hasSSE1()) break; + + switch (VT.SimpleTy) { + default: break; + // Scalar SSE types. + case MVT::f32: + case MVT::i32: + return std::make_pair(0U, &X86::FR32RegClass); + case MVT::f64: + case MVT::i64: + return std::make_pair(0U, &X86::FR64RegClass); + // TODO: Handle f128 and i128 in FR128RegClass after it is tested well. + // Vector types. + case MVT::v16i8: + case MVT::v8i16: + case MVT::v4i32: + case MVT::v2i64: + case MVT::v4f32: + case MVT::v2f64: + return std::make_pair(0U, &X86::VR128RegClass); + // AVX types. + case MVT::v32i8: + case MVT::v16i16: + case MVT::v8i32: + case MVT::v4i64: + case MVT::v8f32: + case MVT::v4f64: + return std::make_pair(0U, &X86::VR256RegClass); + case MVT::v8f64: + case MVT::v16f32: + case MVT::v16i32: + case MVT::v8i64: + return std::make_pair(0U, &X86::VR512RegClass); + } + break; + } + } + + // Use the default implementation in TargetLowering to convert the register + // constraint into a member of a register class. + std::pair<unsigned, const TargetRegisterClass*> Res; + Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); + + // Not found as a standard register? + if (!Res.second) { + // Map st(0) -> st(7) -> ST0 + if (Constraint.size() == 7 && Constraint[0] == '{' && + tolower(Constraint[1]) == 's' && + tolower(Constraint[2]) == 't' && + Constraint[3] == '(' && + (Constraint[4] >= '0' && Constraint[4] <= '7') && + Constraint[5] == ')' && + Constraint[6] == '}') { + + Res.first = X86::FP0+Constraint[4]-'0'; + Res.second = &X86::RFP80RegClass; + return Res; + } + + // GCC allows "st(0)" to be called just plain "st". + if (StringRef("{st}").equals_lower(Constraint)) { + Res.first = X86::FP0; + Res.second = &X86::RFP80RegClass; + return Res; + } + + // flags -> EFLAGS + if (StringRef("{flags}").equals_lower(Constraint)) { + Res.first = X86::EFLAGS; + Res.second = &X86::CCRRegClass; + return Res; + } + + // 'A' means EAX + EDX. + if (Constraint == "A") { + Res.first = X86::EAX; + Res.second = &X86::GR32_ADRegClass; + return Res; + } + return Res; + } + + // Otherwise, check to see if this is a register class of the wrong value + // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to + // turn into {ax},{dx}. + // MVT::Other is used to specify clobber names. + if (Res.second->hasType(VT) || VT == MVT::Other) + return Res; // Correct type already, nothing to do. + + // Get a matching integer of the correct size. i.e. "ax" with MVT::32 should + // return "eax". This should even work for things like getting 64bit integer + // registers when given an f64 type. + const TargetRegisterClass *Class = Res.second; + if (Class == &X86::GR8RegClass || Class == &X86::GR16RegClass || + Class == &X86::GR32RegClass || Class == &X86::GR64RegClass) { + unsigned Size = VT.getSizeInBits(); + if (Size == 1) Size = 8; + unsigned DestReg = getX86SubSuperRegisterOrZero(Res.first, Size); + if (DestReg > 0) { + Res.first = DestReg; + Res.second = Size == 8 ? 
&X86::GR8RegClass
+                   : Size == 16 ? &X86::GR16RegClass
+                   : Size == 32 ? &X86::GR32RegClass
+                   : &X86::GR64RegClass;
+      assert(Res.second->contains(Res.first) && "Register in register class");
+    } else {
+      // No register found/type mismatch.
+      Res.first = 0;
+      Res.second = nullptr;
+    }
+  } else if (Class == &X86::FR32RegClass || Class == &X86::FR64RegClass ||
+             Class == &X86::VR128RegClass || Class == &X86::VR256RegClass ||
+             Class == &X86::FR32XRegClass || Class == &X86::FR64XRegClass ||
+             Class == &X86::VR128XRegClass || Class == &X86::VR256XRegClass ||
+             Class == &X86::VR512RegClass) {
+    // Handle references to XMM physical registers that got mapped into the
+    // wrong class. This can happen with constraints like {xmm0} where the
+    // target independent register mapper will just pick the first match it can
+    // find, ignoring the required type.
+
+    // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
+    if (VT == MVT::f32 || VT == MVT::i32)
+      Res.second = &X86::FR32RegClass;
+    else if (VT == MVT::f64 || VT == MVT::i64)
+      Res.second = &X86::FR64RegClass;
+    else if (X86::VR128RegClass.hasType(VT))
+      Res.second = &X86::VR128RegClass;
+    else if (X86::VR256RegClass.hasType(VT))
+      Res.second = &X86::VR256RegClass;
+    else if (X86::VR512RegClass.hasType(VT))
+      Res.second = &X86::VR512RegClass;
+    else {
+      // Type mismatch and not a clobber: return an error.
+      Res.first = 0;
+      Res.second = nullptr;
+    }
+  }
+
+  return Res;
+}
+
+int X86TargetLowering::getScalingFactorCost(const DataLayout &DL,
+                                            const AddrMode &AM, Type *Ty,
+                                            unsigned AS) const {
+  // Scaling factors are not free at all.
+  // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
+  // will take 2 allocations in the out of order engine instead of 1
+  // for plain addressing mode, i.e. inst (reg1).
+  // E.g.,
+  // vaddps (%rsi,%rdx), %ymm0, %ymm1
+  // Requires two allocations (one for the load, one for the computation)
+  // whereas:
+  // vaddps (%rsi), %ymm0, %ymm1
+  // Requires just 1 allocation, i.e., freeing allocations for other operations
+  // and having fewer micro operations to execute.
+  //
+  // For some X86 architectures, this is even worse because for instance for
+  // stores, the complex addressing mode forces the instruction to use the
+  // "load" ports instead of the dedicated "store" port.
+  // E.g., on Haswell:
+  // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
+  // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
+  if (isLegalAddressingMode(DL, AM, Ty, AS))
+    // Scale represents reg2 * scale, thus account for 1
+    // as soon as we use a second register.
+    return AM.Scale != 0;
+  return -1;
+}
+
+bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr) const {
+  // Integer division on x86 is expensive. However, when aggressively optimizing
+  // for code size, we prefer to use a div instruction, as it is usually smaller
+  // than the alternative sequence.
+  // The exception to this is vector division. Since x86 doesn't have vector
+  // integer division, leaving the division as-is is a loss even in terms of
+  // size, because it will have to be scalarized, while the alternative code
+  // sequence can be performed in vector form.
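+  // Illustrative consequence: under minsize a scalar i32 'x / 10' keeps its
+  // single div instruction, while for v4i32 this returns false and the
+  // multiply-based expansion is still used, since scalarizing four divisions
+  // would be larger.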
+ bool OptSize = Attr.hasAttribute(AttributeSet::FunctionIndex, + Attribute::MinSize); + return OptSize && !VT.isVector(); +} diff --git a/contrib/llvm/lib/Target/X86/X86ISelLowering.h b/contrib/llvm/lib/Target/X86/X86ISelLowering.h new file mode 100644 index 0000000..8bb0e5f --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86ISelLowering.h @@ -0,0 +1,1157 @@ +//===-- X86ISelLowering.h - X86 DAG Lowering Interface ----------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the interfaces that X86 uses to lower LLVM code into a +// selection DAG. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_X86_X86ISELLOWERING_H +#define LLVM_LIB_TARGET_X86_X86ISELLOWERING_H + +#include "llvm/CodeGen/CallingConvLower.h" +#include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/Target/TargetLowering.h" +#include "llvm/Target/TargetOptions.h" + +namespace llvm { + class X86Subtarget; + class X86TargetMachine; + + namespace X86ISD { + // X86 Specific DAG Nodes + enum NodeType : unsigned { + // Start the numbering where the builtin ops leave off. + FIRST_NUMBER = ISD::BUILTIN_OP_END, + + /// Bit scan forward. + BSF, + /// Bit scan reverse. + BSR, + + /// Double shift instructions. These correspond to + /// X86::SHLDxx and X86::SHRDxx instructions. + SHLD, + SHRD, + + /// Bitwise logical AND of floating point values. This corresponds + /// to X86::ANDPS or X86::ANDPD. + FAND, + + /// Bitwise logical OR of floating point values. This corresponds + /// to X86::ORPS or X86::ORPD. + FOR, + + /// Bitwise logical XOR of floating point values. This corresponds + /// to X86::XORPS or X86::XORPD. + FXOR, + + /// Bitwise logical ANDNOT of floating point values. This + /// corresponds to X86::ANDNPS or X86::ANDNPD. + FANDN, + + /// These operations represent an abstract X86 call + /// instruction, which includes a bunch of information. In particular the + /// operands of these node are: + /// + /// #0 - The incoming token chain + /// #1 - The callee + /// #2 - The number of arg bytes the caller pushes on the stack. + /// #3 - The number of arg bytes the callee pops off the stack. + /// #4 - The value to pass in AL/AX/EAX (optional) + /// #5 - The value to pass in DL/DX/EDX (optional) + /// + /// The result values of these nodes are: + /// + /// #0 - The outgoing token chain + /// #1 - The first register result value (optional) + /// #2 - The second register result value (optional) + /// + CALL, + + /// This operation implements the lowering for readcyclecounter + RDTSC_DAG, + + /// X86 Read Time-Stamp Counter and Processor ID. + RDTSCP_DAG, + + /// X86 Read Performance Monitoring Counters. + RDPMC_DAG, + + /// X86 compare and logical compare instructions. + CMP, COMI, UCOMI, + + /// X86 bit-test instructions. + BT, + + /// X86 SetCC. Operand 0 is condition code, and operand 1 is the EFLAGS + /// operand, usually produced by a CMP instruction. + SETCC, + + /// X86 Select + SELECT, + + // Same as SETCC except it's materialized with a sbb and the value is all + // one's or all zero's. + SETCC_CARRY, // R = carry_bit ? ~0 : 0 + + /// X86 FP SETCC, implemented with CMP{cc}SS/CMP{cc}SD. + /// Operands are two FP values to compare; result is a mask of + /// 0s or 1s. Generally DTRT for C/C++ with NaNs. 
+ FSETCC, + + /// X86 MOVMSK{pd|ps}, extracts sign bits of two or four FP values, + /// result in an integer GPR. Needs masking for scalar result. + FGETSIGNx86, + + /// X86 conditional moves. Operand 0 and operand 1 are the two values + /// to select from. Operand 2 is the condition code, and operand 3 is the + /// flag operand produced by a CMP or TEST instruction. It also writes a + /// flag result. + CMOV, + + /// X86 conditional branches. Operand 0 is the chain operand, operand 1 + /// is the block to branch if condition is true, operand 2 is the + /// condition code, and operand 3 is the flag operand produced by a CMP + /// or TEST instruction. + BRCOND, + + /// Return with a flag operand. Operand 0 is the chain operand, operand + /// 1 is the number of bytes of stack to pop. + RET_FLAG, + + /// Return from interrupt. Operand 0 is the number of bytes to pop. + IRET, + + /// Repeat fill, corresponds to X86::REP_STOSx. + REP_STOS, + + /// Repeat move, corresponds to X86::REP_MOVSx. + REP_MOVS, + + /// On Darwin, this node represents the result of the popl + /// at function entry, used for PIC code. + GlobalBaseReg, + + /// A wrapper node for TargetConstantPool, + /// TargetExternalSymbol, and TargetGlobalAddress. + Wrapper, + + /// Special wrapper used under X86-64 PIC mode for RIP + /// relative displacements. + WrapperRIP, + + /// Copies a 64-bit value from the low word of an XMM vector + /// to an MMX vector. If you think this is too close to the previous + /// mnemonic, so do I; blame Intel. + MOVDQ2Q, + + /// Copies a 32-bit value from the low word of a MMX + /// vector to a GPR. + MMX_MOVD2W, + + /// Copies a GPR into the low 32-bit word of a MMX vector + /// and zero out the high word. + MMX_MOVW2D, + + /// Extract an 8-bit value from a vector and zero extend it to + /// i32, corresponds to X86::PEXTRB. + PEXTRB, + + /// Extract a 16-bit value from a vector and zero extend it to + /// i32, corresponds to X86::PEXTRW. + PEXTRW, + + /// Insert any element of a 4 x float vector into any element + /// of a destination 4 x floatvector. + INSERTPS, + + /// Insert the lower 8-bits of a 32-bit value to a vector, + /// corresponds to X86::PINSRB. + PINSRB, + + /// Insert the lower 16-bits of a 32-bit value to a vector, + /// corresponds to X86::PINSRW. + PINSRW, MMX_PINSRW, + + /// Shuffle 16 8-bit values within a vector. + PSHUFB, + + /// Compute Sum of Absolute Differences. + PSADBW, + /// Compute Double Block Packed Sum-Absolute-Differences + DBPSADBW, + + /// Bitwise Logical AND NOT of Packed FP values. + ANDNP, + + /// Copy integer sign. + PSIGN, + + /// Blend where the selector is an immediate. + BLENDI, + + /// Blend where the condition has been shrunk. + /// This is used to emphasize that the condition mask is + /// no more valid for generic VSELECT optimizations. + SHRUNKBLEND, + + /// Combined add and sub on an FP vector. + ADDSUB, + + // FP vector ops with rounding mode. + FADD_RND, + FSUB_RND, + FMUL_RND, + FDIV_RND, + FMAX_RND, + FMIN_RND, + FSQRT_RND, + + // FP vector get exponent + FGETEXP_RND, + // Extract Normalized Mantissas + VGETMANT, + // FP Scale + SCALEF, + // Integer add/sub with unsigned saturation. + ADDUS, + SUBUS, + // Integer add/sub with signed saturation. + ADDS, + SUBS, + // Unsigned Integer average + AVG, + /// Integer horizontal add. + HADD, + + /// Integer horizontal sub. + HSUB, + + /// Floating point horizontal add. + FHADD, + + /// Floating point horizontal sub. 
+ FHSUB, + + // Integer absolute value + ABS, + + // Detect Conflicts Within a Vector + CONFLICT, + + /// Floating point max and min. + FMAX, FMIN, + + /// Commutative FMIN and FMAX. + FMAXC, FMINC, + + /// Floating point reciprocal-sqrt and reciprocal approximation. + /// Note that these typically require refinement + /// in order to obtain suitable precision. + FRSQRT, FRCP, + + // Thread Local Storage. + TLSADDR, + + // Thread Local Storage. A call to get the start address + // of the TLS block for the current module. + TLSBASEADDR, + + // Thread Local Storage. When calling to an OS provided + // thunk at the address from an earlier relocation. + TLSCALL, + + // Exception Handling helpers. + EH_RETURN, + + // SjLj exception handling setjmp. + EH_SJLJ_SETJMP, + + // SjLj exception handling longjmp. + EH_SJLJ_LONGJMP, + + /// Tail call return. See X86TargetLowering::LowerCall for + /// the list of operands. + TC_RETURN, + + // Vector move to low scalar and zero higher vector elements. + VZEXT_MOVL, + + // Vector integer zero-extend. + VZEXT, + + // Vector integer signed-extend. + VSEXT, + + // Vector integer truncate. + VTRUNC, + // Vector integer truncate with unsigned/signed saturation. + VTRUNCUS, VTRUNCS, + + // Vector FP extend. + VFPEXT, + + // Vector FP round. + VFPROUND, + + // Vector signed/unsigned integer to double. + CVTDQ2PD, CVTUDQ2PD, + + // Convert a vector to mask, set bits base on MSB. + CVT2MASK, + + // 128-bit vector logical left / right shift + VSHLDQ, VSRLDQ, + + // Vector shift elements + VSHL, VSRL, VSRA, + + // Vector shift elements by immediate + VSHLI, VSRLI, VSRAI, + + // Vector packed double/float comparison. + CMPP, + + // Vector integer comparisons. + PCMPEQ, PCMPGT, + // Vector integer comparisons, the result is in a mask vector. + PCMPEQM, PCMPGTM, + + /// Vector comparison generating mask bits for fp and + /// integer signed and unsigned data types. + CMPM, + CMPMU, + // Vector comparison with rounding mode for FP values + CMPM_RND, + + // Arithmetic operations with FLAGS results. + ADD, SUB, ADC, SBB, SMUL, + INC, DEC, OR, XOR, AND, + + BEXTR, // Bit field extract + + UMUL, // LOW, HI, FLAGS = umul LHS, RHS + + // 8-bit SMUL/UMUL - AX, FLAGS = smul8/umul8 AL, RHS + SMUL8, UMUL8, + + // 8-bit divrem that zero-extend the high result (AH). + UDIVREM8_ZEXT_HREG, + SDIVREM8_SEXT_HREG, + + // X86-specific multiply by immediate. + MUL_IMM, + + // Vector bitwise comparisons. + PTEST, + + // Vector packed fp sign bitwise comparisons. + TESTP, + + // Vector "test" in AVX-512, the result is in a mask vector. + TESTM, + TESTNM, + + // OR/AND test for masks + KORTEST, + KTEST, + + // Several flavors of instructions with vector shuffle behaviors. + PACKSS, + PACKUS, + // Intra-lane alignr + PALIGNR, + // AVX512 inter-lane alignr + VALIGN, + PSHUFD, + PSHUFHW, + PSHUFLW, + SHUFP, + //Shuffle Packed Values at 128-bit granularity + SHUF128, + MOVDDUP, + MOVSHDUP, + MOVSLDUP, + MOVLHPS, + MOVLHPD, + MOVHLPS, + MOVLPS, + MOVLPD, + MOVSD, + MOVSS, + UNPCKL, + UNPCKH, + VPERMILPV, + VPERMILPI, + VPERMV, + VPERMV3, + VPERMIV3, + VPERMI, + VPERM2X128, + // Bitwise ternary logic + VPTERNLOG, + // Fix Up Special Packed Float32/64 values + VFIXUPIMM, + // Range Restriction Calculation For Packed Pairs of Float32/64 values + VRANGE, + // Reduce - Perform Reduction Transformation on scalar\packed FP + VREDUCE, + // RndScale - Round FP Values To Include A Given Number Of Fraction Bits + VRNDSCALE, + // VFPCLASS - Tests Types Of a FP Values for packed types. 
+ VFPCLASS, + // VFPCLASSS - Tests Types Of a FP Values for scalar types. + VFPCLASSS, + // Broadcast scalar to vector + VBROADCAST, + // Broadcast mask to vector + VBROADCASTM, + // Broadcast subvector to vector + SUBV_BROADCAST, + // Insert/Extract vector element + VINSERT, + VEXTRACT, + + /// SSE4A Extraction and Insertion. + EXTRQI, INSERTQI, + + // XOP variable/immediate rotations + VPROT, VPROTI, + // XOP arithmetic/logical shifts + VPSHA, VPSHL, + // XOP signed/unsigned integer comparisons + VPCOM, VPCOMU, + + // Vector multiply packed unsigned doubleword integers + PMULUDQ, + // Vector multiply packed signed doubleword integers + PMULDQ, + // Vector Multiply Packed UnsignedIntegers with Round and Scale + MULHRS, + // Multiply and Add Packed Integers + VPMADDUBSW, VPMADDWD, + // FMA nodes + FMADD, + FNMADD, + FMSUB, + FNMSUB, + FMADDSUB, + FMSUBADD, + // FMA with rounding mode + FMADD_RND, + FNMADD_RND, + FMSUB_RND, + FNMSUB_RND, + FMADDSUB_RND, + FMSUBADD_RND, + + // Compress and expand + COMPRESS, + EXPAND, + + //Convert Unsigned/Integer to Scalar Floating-Point Value + //with rounding mode + SINT_TO_FP_RND, + UINT_TO_FP_RND, + + // Vector float/double to signed/unsigned integer. + FP_TO_SINT_RND, FP_TO_UINT_RND, + // Save xmm argument registers to the stack, according to %al. An operator + // is needed so that this can be expanded with control flow. + VASTART_SAVE_XMM_REGS, + + // Windows's _chkstk call to do stack probing. + WIN_ALLOCA, + + // For allocating variable amounts of stack space when using + // segmented stacks. Check if the current stacklet has enough space, and + // falls back to heap allocation if not. + SEG_ALLOCA, + + // Memory barrier + MEMBARRIER, + MFENCE, + SFENCE, + LFENCE, + + // Store FP status word into i16 register. + FNSTSW16r, + + // Store contents of %ah into %eflags. + SAHF, + + // Get a random integer and indicate whether it is valid in CF. + RDRAND, + + // Get a NIST SP800-90B & C compliant random integer and + // indicate whether it is valid in CF. + RDSEED, + + PCMPISTRI, + PCMPESTRI, + + // Test if in transactional execution. + XTEST, + + // ERI instructions + RSQRT28, RCP28, EXP2, + + // Compare and swap. + LCMPXCHG_DAG = ISD::FIRST_TARGET_MEMORY_OPCODE, + LCMPXCHG8_DAG, + LCMPXCHG16_DAG, + + // Load, scalar_to_vector, and zero extend. + VZEXT_LOAD, + + // Store FP control world into i16 memory. + FNSTCW16m, + + /// This instruction implements FP_TO_SINT with the + /// integer destination in memory and a FP reg source. This corresponds + /// to the X86::FIST*m instructions and the rounding mode change stuff. It + /// has two inputs (token chain and address) and two outputs (int value + /// and token chain). + FP_TO_INT16_IN_MEM, + FP_TO_INT32_IN_MEM, + FP_TO_INT64_IN_MEM, + + /// This instruction implements SINT_TO_FP with the + /// integer source in memory and FP reg result. This corresponds to the + /// X86::FILD*m instructions. It has three inputs (token chain, address, + /// and source type) and two outputs (FP value and token chain). FILD_FLAG + /// also produces a flag). + FILD, + FILD_FLAG, + + /// This instruction implements an extending load to FP stack slots. + /// This corresponds to the X86::FLD32m / X86::FLD64m. It takes a chain + /// operand, ptr to load from, and a ValueType node indicating the type + /// to load to. + FLD, + + /// This instruction implements a truncating store to FP stack + /// slots. This corresponds to the X86::FST32m / X86::FST64m. 
It takes a + /// chain operand, value to store, address, and a ValueType to store it + /// as. + FST, + + /// This instruction grabs the address of the next argument + /// from a va_list. (reads and modifies the va_list in memory) + VAARG_64 + + // WARNING: Do not add anything in the end unless you want the node to + // have memop! In fact, starting from ATOMADD64_DAG all opcodes will be + // thought as target memory ops! + }; + } + + /// Define some predicates that are used for node matching. + namespace X86 { + /// Return true if the specified + /// EXTRACT_SUBVECTOR operand specifies a vector extract that is + /// suitable for input to VEXTRACTF128, VEXTRACTI128 instructions. + bool isVEXTRACT128Index(SDNode *N); + + /// Return true if the specified + /// INSERT_SUBVECTOR operand specifies a subvector insert that is + /// suitable for input to VINSERTF128, VINSERTI128 instructions. + bool isVINSERT128Index(SDNode *N); + + /// Return true if the specified + /// EXTRACT_SUBVECTOR operand specifies a vector extract that is + /// suitable for input to VEXTRACTF64X4, VEXTRACTI64X4 instructions. + bool isVEXTRACT256Index(SDNode *N); + + /// Return true if the specified + /// INSERT_SUBVECTOR operand specifies a subvector insert that is + /// suitable for input to VINSERTF64X4, VINSERTI64X4 instructions. + bool isVINSERT256Index(SDNode *N); + + /// Return the appropriate + /// immediate to extract the specified EXTRACT_SUBVECTOR index + /// with VEXTRACTF128, VEXTRACTI128 instructions. + unsigned getExtractVEXTRACT128Immediate(SDNode *N); + + /// Return the appropriate + /// immediate to insert at the specified INSERT_SUBVECTOR index + /// with VINSERTF128, VINSERT128 instructions. + unsigned getInsertVINSERT128Immediate(SDNode *N); + + /// Return the appropriate + /// immediate to extract the specified EXTRACT_SUBVECTOR index + /// with VEXTRACTF64X4, VEXTRACTI64x4 instructions. + unsigned getExtractVEXTRACT256Immediate(SDNode *N); + + /// Return the appropriate + /// immediate to insert at the specified INSERT_SUBVECTOR index + /// with VINSERTF64x4, VINSERTI64x4 instructions. + unsigned getInsertVINSERT256Immediate(SDNode *N); + + /// Returns true if Elt is a constant zero or floating point constant +0.0. + bool isZeroNode(SDValue Elt); + + /// Returns true of the given offset can be + /// fit into displacement field of the instruction. + bool isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M, + bool hasSymbolicDisplacement = true); + + + /// Determines whether the callee is required to pop its + /// own arguments. Callee pop is necessary to support tail calls. + bool isCalleePop(CallingConv::ID CallingConv, + bool is64Bit, bool IsVarArg, bool TailCallOpt); + + } + + //===--------------------------------------------------------------------===// + // X86 Implementation of the TargetLowering interface + class X86TargetLowering final : public TargetLowering { + public: + explicit X86TargetLowering(const X86TargetMachine &TM, + const X86Subtarget &STI); + + unsigned getJumpTableEncoding() const override; + bool useSoftFloat() const override; + + MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override { + return MVT::i8; + } + + const MCExpr * + LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI, + const MachineBasicBlock *MBB, unsigned uid, + MCContext &Ctx) const override; + + /// Returns relocation base for the given PIC jumptable. 
+    SDValue getPICJumpTableRelocBase(SDValue Table,
+                                     SelectionDAG &DAG) const override;
+    const MCExpr *
+    getPICJumpTableRelocBaseExpr(const MachineFunction *MF,
+                                 unsigned JTI, MCContext &Ctx) const override;
+
+    /// Return the desired alignment for ByVal aggregate
+    /// function arguments in the caller parameter area. For X86, aggregates
+    /// that contain SSE vectors are placed at 16-byte boundaries while the
+    /// rest are at 4-byte boundaries.
+    unsigned getByValTypeAlignment(Type *Ty,
+                                   const DataLayout &DL) const override;
+
+    /// Returns the target specific optimal type for load
+    /// and store operations as a result of memset, memcpy, and memmove
+    /// lowering. If DstAlign is zero, that means the destination
+    /// alignment can satisfy any constraint. Similarly, if SrcAlign is zero it
+    /// means there isn't a need to check it against the alignment requirement,
+    /// probably because the source does not need to be loaded. If 'IsMemset' is
+    /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
+    /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
+    /// source is constant so it does not need to be loaded.
+    /// It returns EVT::Other if the type should be determined using generic
+    /// target-independent logic.
+    EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign,
+                            bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc,
+                            MachineFunction &MF) const override;
+
+    /// Returns true if it's safe to use load / store of the
+    /// specified type to expand memcpy / memset inline. This is mostly true
+    /// for all types except for some special cases. For example, on X86
+    /// targets without SSE2 f64 load / store are done with fldl / fstpl which
+    /// also does type conversion. Note the specified type doesn't have to be
+    /// legal as the hook is used before type legalization.
+    bool isSafeMemOpType(MVT VT) const override;
+
+    /// Returns true if the target allows unaligned memory accesses of the
+    /// specified type. Returns whether it is "fast" in the last argument.
+    bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS, unsigned Align,
+                                        bool *Fast) const override;
+
+    /// Provide custom lowering hooks for some operations.
+    ///
+    SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+
+    /// Replace the results of a node with an illegal result
+    /// type with new values built out of custom code.
+    ///
+    void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
+                            SelectionDAG &DAG) const override;
+
+
+    SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
+
+    /// Return true if the target has native support for
+    /// the specified value type and it is 'desirable' to use the type for the
+    /// given node type. e.g. On x86 i16 is legal, but undesirable since i16
+    /// instruction encodings are longer and some i16 instructions are slow.
+    bool isTypeDesirableForOp(unsigned Opc, EVT VT) const override;
+
+    /// Return true if the target has native support for the
+    /// specified value type and it is 'desirable' to use the type. e.g. On x86
+    /// i16 is legal, but undesirable since i16 instruction encodings are longer
+    /// and some i16 instructions are slow.
+    bool IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const override;
+
+    /// Return true if the MachineFunction contains a COPY which would imply
+    /// HasOpaqueSPAdjustment.
+ bool hasCopyImplyingStackAdjustment(MachineFunction *MF) const override; + + MachineBasicBlock * + EmitInstrWithCustomInserter(MachineInstr *MI, + MachineBasicBlock *MBB) const override; + + + /// This method returns the name of a target specific DAG node. + const char *getTargetNodeName(unsigned Opcode) const override; + + bool isCheapToSpeculateCttz() const override; + + bool isCheapToSpeculateCtlz() const override; + + /// Return the value type to use for ISD::SETCC. + EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, + EVT VT) const override; + + /// Determine which of the bits specified in Mask are known to be either + /// zero or one and return them in the KnownZero/KnownOne bitsets. + void computeKnownBitsForTargetNode(const SDValue Op, + APInt &KnownZero, + APInt &KnownOne, + const SelectionDAG &DAG, + unsigned Depth = 0) const override; + + /// Determine the number of bits in the operation that are sign bits. + unsigned ComputeNumSignBitsForTargetNode(SDValue Op, + const SelectionDAG &DAG, + unsigned Depth) const override; + + bool isGAPlusOffset(SDNode *N, const GlobalValue* &GA, + int64_t &Offset) const override; + + SDValue getReturnAddressFrameIndex(SelectionDAG &DAG) const; + + bool ExpandInlineAsm(CallInst *CI) const override; + + ConstraintType getConstraintType(StringRef Constraint) const override; + + /// Examine constraint string and operand type and determine a weight value. + /// The operand object must already have been set up with the operand type. + ConstraintWeight + getSingleConstraintMatchWeight(AsmOperandInfo &info, + const char *constraint) const override; + + const char *LowerXConstraint(EVT ConstraintVT) const override; + + /// Lower the specified operand into the Ops vector. If it is invalid, don't + /// add anything to Ops. If hasMemory is true it means one of the asm + /// constraint of the inline asm instruction being processed is 'm'. + void LowerAsmOperandForConstraint(SDValue Op, + std::string &Constraint, + std::vector<SDValue> &Ops, + SelectionDAG &DAG) const override; + + unsigned + getInlineAsmMemConstraint(StringRef ConstraintCode) const override { + if (ConstraintCode == "i") + return InlineAsm::Constraint_i; + else if (ConstraintCode == "o") + return InlineAsm::Constraint_o; + else if (ConstraintCode == "v") + return InlineAsm::Constraint_v; + else if (ConstraintCode == "X") + return InlineAsm::Constraint_X; + return TargetLowering::getInlineAsmMemConstraint(ConstraintCode); + } + + /// Given a physical register constraint + /// (e.g. {edx}), return the register number and the register class for the + /// register. This should only be used for C_Register constraints. On + /// error, this returns a register number of 0. + std::pair<unsigned, const TargetRegisterClass *> + getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, + StringRef Constraint, MVT VT) const override; + + /// Return true if the addressing mode represented + /// by AM is legal for this target, for a load/store of the specified type. + bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, + Type *Ty, unsigned AS) const override; + + /// Return true if the specified immediate is legal + /// icmp immediate, that is the target has icmp instructions which can + /// compare a register against the immediate without having to materialize + /// the immediate into a register. 
+    bool isLegalICmpImmediate(int64_t Imm) const override;
+
+    /// Return true if the specified immediate is a legal
+    /// add immediate, that is, the target has add instructions which can
+    /// add a register and the immediate without having to materialize
+    /// the immediate into a register.
+    bool isLegalAddImmediate(int64_t Imm) const override;
+
+    /// \brief Return the cost of the scaling factor used in the addressing
+    /// mode represented by AM for this target, for a load/store
+    /// of the specified type.
+    /// If the AM is supported, the return value must be >= 0.
+    /// If the AM is not supported, it returns a negative value.
+    int getScalingFactorCost(const DataLayout &DL, const AddrMode &AM, Type *Ty,
+                             unsigned AS) const override;
+
+    bool isVectorShiftByScalarCheap(Type *Ty) const override;
+
+    /// Return true if it's free to truncate a value of
+    /// type Ty1 to type Ty2. e.g. On x86 it's free to truncate an i32 value in
+    /// register EAX to i16 by referencing its sub-register AX.
+    bool isTruncateFree(Type *Ty1, Type *Ty2) const override;
+    bool isTruncateFree(EVT VT1, EVT VT2) const override;
+
+    bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const override;
+
+    /// Return true if any actual instruction that defines a
+    /// value of type Ty1 implicitly zero-extends the value to Ty2 in the result
+    /// register. This does not necessarily include registers defined in
+    /// unknown ways, such as incoming arguments, or copies from unknown
+    /// virtual registers. Also, if isTruncateFree(Ty2, Ty1) is true, this
+    /// does not necessarily apply to truncate instructions. e.g. on x86-64,
+    /// all instructions that define 32-bit values implicitly zero-extend the
+    /// result out to 64 bits.
+    bool isZExtFree(Type *Ty1, Type *Ty2) const override;
+    bool isZExtFree(EVT VT1, EVT VT2) const override;
+    bool isZExtFree(SDValue Val, EVT VT2) const override;
+
+    /// Return true if folding a vector load into ExtVal (a sign, zero, or any
+    /// extend node) is profitable.
+    bool isVectorLoadExtDesirable(SDValue) const override;
+
+    /// Return true if an FMA operation is faster than a pair of fmul and fadd
+    /// instructions. fmuladd intrinsics will be expanded to FMAs when this
+    /// method returns true, otherwise fmuladd is expanded to fmul + fadd.
+    bool isFMAFasterThanFMulAndFAdd(EVT VT) const override;
+
+    /// Return true if it's profitable to narrow
+    /// operations of type VT1 to VT2. e.g. on x86, it's profitable to narrow
+    /// from i32 to i8 but not from i32 to i16.
+    bool isNarrowingProfitable(EVT VT1, EVT VT2) const override;
+
+    /// Returns true if the target can instruction select the
+    /// specified FP immediate natively. If false, the legalizer will
+    /// materialize the FP immediate as a load from a constant pool.
+    bool isFPImmLegal(const APFloat &Imm, EVT VT) const override;
+
+    /// Targets can use this to indicate that they only support *some*
+    /// VECTOR_SHUFFLE operations, those with specific masks. By default, if a
+    /// target supports the VECTOR_SHUFFLE node, all mask values are assumed to
+    /// be legal.
+    bool isShuffleMaskLegal(const SmallVectorImpl<int> &Mask,
+                            EVT VT) const override;
+
+    /// Similar to isShuffleMaskLegal. Targets can use this to
+    /// indicate if there is a suitable VECTOR_SHUFFLE that can be used to
+    /// replace a VAND with a constant pool entry.
+ bool isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask, + EVT VT) const override; + + /// If true, then instruction selection should + /// seek to shrink the FP constant of the specified type to a smaller type + /// in order to save space and / or reduce runtime. + bool ShouldShrinkFPConstant(EVT VT) const override { + // Don't shrink FP constpool if SSE2 is available since cvtss2sd is more + // expensive than a straight movsd. On the other hand, it's important to + // shrink long double fp constant since fldt is very slow. + return !X86ScalarSSEf64 || VT == MVT::f80; + } + + /// Return true if we believe it is correct and profitable to reduce the + /// load node to a smaller type. + bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, + EVT NewVT) const override; + + /// Return true if the specified scalar FP type is computed in an SSE + /// register, not on the X87 floating point stack. + bool isScalarFPTypeInSSEReg(EVT VT) const { + return (VT == MVT::f64 && X86ScalarSSEf64) || // f64 is when SSE2 + (VT == MVT::f32 && X86ScalarSSEf32); // f32 is when SSE1 + } + + /// \brief Returns true if it is beneficial to convert a load of a constant + /// to just the constant itself. + bool shouldConvertConstantLoadToIntImm(const APInt &Imm, + Type *Ty) const override; + + /// Return true if EXTRACT_SUBVECTOR is cheap for this result type + /// with this index. + bool isExtractSubvectorCheap(EVT ResVT, unsigned Index) const override; + + /// Intel processors have a unified instruction and data cache + const char * getClearCacheBuiltinName() const override { + return nullptr; // nothing to do, move along. + } + + unsigned getRegisterByName(const char* RegName, EVT VT, + SelectionDAG &DAG) const override; + + /// If a physical register, this returns the register that receives the + /// exception address on entry to an EH pad. + unsigned + getExceptionPointerRegister(const Constant *PersonalityFn) const override; + + /// If a physical register, this returns the register that receives the + /// exception typeid on entry to a landing pad. + unsigned + getExceptionSelectorRegister(const Constant *PersonalityFn) const override; + + /// This method returns a target specific FastISel object, + /// or null if the target does not support "fast" ISel. + FastISel *createFastISel(FunctionLoweringInfo &funcInfo, + const TargetLibraryInfo *libInfo) const override; + + /// Return true if the target stores stack protector cookies at a fixed + /// offset in some non-standard address space, and populates the address + /// space and offset as appropriate. + bool getStackCookieLocation(unsigned &AddressSpace, + unsigned &Offset) const override; + + /// Return true if the target stores SafeStack pointer at a fixed offset in + /// some non-standard address space, and populates the address space and + /// offset as appropriate. + Value *getSafeStackPointerLocation(IRBuilder<> &IRB) const override; + + SDValue BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, SDValue StackSlot, + SelectionDAG &DAG) const; + + bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override; + + bool useLoadStackGuardNode() const override; + /// \brief Customize the preferred legalization strategy for certain types. 
+ LegalizeTypeAction getPreferredVectorAction(EVT VT) const override; + + bool isIntDivCheap(EVT VT, AttributeSet Attr) const override; + + protected: + std::pair<const TargetRegisterClass *, uint8_t> + findRepresentativeClass(const TargetRegisterInfo *TRI, + MVT VT) const override; + + private: + /// Keep a pointer to the X86Subtarget around so that we can + /// make the right decision when generating code for different targets. + const X86Subtarget *Subtarget; + + /// Select between SSE or x87 floating point ops. + /// When SSE is available, use it for f32 operations. + /// When SSE2 is available, use it for f64 operations. + bool X86ScalarSSEf32; + bool X86ScalarSSEf64; + + /// A list of legal FP immediates. + std::vector<APFloat> LegalFPImmediates; + + /// Indicate that this x86 target can instruction + /// select the specified FP immediate natively. + void addLegalFPImmediate(const APFloat& Imm) { + LegalFPImmediates.push_back(Imm); + } + + SDValue LowerCallResult(SDValue Chain, SDValue InFlag, + CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::InputArg> &Ins, + SDLoc dl, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) const; + SDValue LowerMemArgument(SDValue Chain, + CallingConv::ID CallConv, + const SmallVectorImpl<ISD::InputArg> &ArgInfo, + SDLoc dl, SelectionDAG &DAG, + const CCValAssign &VA, MachineFrameInfo *MFI, + unsigned i) const; + SDValue LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, SDValue Arg, + SDLoc dl, SelectionDAG &DAG, + const CCValAssign &VA, + ISD::ArgFlagsTy Flags) const; + + // Call lowering helpers. + + /// Check whether the call is eligible for tail call optimization. Targets + /// that want to do tail call optimization should implement this function. + bool IsEligibleForTailCallOptimization(SDValue Callee, + CallingConv::ID CalleeCC, + bool isVarArg, + bool isCalleeStructRet, + bool isCallerStructRet, + Type *RetTy, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, + const SmallVectorImpl<ISD::InputArg> &Ins, + SelectionDAG& DAG) const; + SDValue EmitTailCallLoadRetAddr(SelectionDAG &DAG, SDValue &OutRetAddr, + SDValue Chain, bool IsTailCall, bool Is64Bit, + int FPDiff, SDLoc dl) const; + + unsigned GetAlignedArgumentStackSize(unsigned StackSize, + SelectionDAG &DAG) const; + + std::pair<SDValue,SDValue> FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, + bool isSigned, + bool isReplace) const; + + SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVSELECT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; + SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const; + SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const; + + SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerGlobalAddress(const GlobalValue *GV, SDLoc dl, + int64_t Offset, SelectionDAG &DAG) const; + SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG 
&DAG) const; + SDValue LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerToBT(SDValue And, ISD::CondCode CC, + SDLoc dl, SelectionDAG &DAG) const; + SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSETCCE(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFRAME_TO_ARGS_OFFSET(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerGC_TRANSITION_START(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerGC_TRANSITION_END(SDValue Op, SelectionDAG &DAG) const; + + SDValue + LowerFormalArguments(SDValue Chain, + CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::InputArg> &Ins, + SDLoc dl, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) const override; + SDValue LowerCall(CallLoweringInfo &CLI, + SmallVectorImpl<SDValue> &InVals) const override; + + SDValue LowerReturn(SDValue Chain, + CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, + SDLoc dl, SelectionDAG &DAG) const override; + + bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const override; + + bool mayBeEmittedAsTailCall(CallInst *CI) const override; + + EVT getTypeForExtArgOrReturn(LLVMContext &Context, EVT VT, + ISD::NodeType ExtendKind) const override; + + bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, + bool isVarArg, + const SmallVectorImpl<ISD::OutputArg> &Outs, + LLVMContext &Context) const override; + + const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const override; + + TargetLoweringBase::AtomicExpansionKind + shouldExpandAtomicLoadInIR(LoadInst *SI) const override; + bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override; + TargetLoweringBase::AtomicExpansionKind + shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override; + + LoadInst * + lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override; + + bool needsCmpXchgNb(Type *MemType) const; + + // Utility function to emit the low-level va_arg code for X86-64. + MachineBasicBlock *EmitVAARG64WithCustomInserter( + MachineInstr *MI, + MachineBasicBlock *MBB) const; + + /// Utility function to emit the xmm reg save portion of va_start. 
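The atomic-expansion hooks a few declarations up (shouldExpandAtomicRMWInIR, needsCmpXchgNb) decide whether an atomicrmw maps onto a single locked instruction or must be rewritten as a compare-exchange loop, for example for nand, which x86 has no instruction for, or for widths that go through cmpxchg8b/cmpxchg16b. A sketch of the loop shape in plain C++ follows before the custom-inserter helpers continue; it illustrates the semantics only, not the IR-level rewrite:

#include <atomic>
#include <cstdint>

// Compare-exchange loop equivalent to expanding "atomicrmw nand": retry
// until ~(Old & V) is installed atomically, returning the previous value.
static uint64_t atomic_fetch_nand(std::atomic<uint64_t> &A, uint64_t V) {
  uint64_t Old = A.load(std::memory_order_relaxed);
  while (!A.compare_exchange_weak(Old, ~(Old & V),
                                  std::memory_order_seq_cst,
                                  std::memory_order_relaxed)) {
    // On failure, Old has been reloaded with the current value; retry.
  }
  return Old;
}

int main() {
  std::atomic<uint64_t> A{0xFFFF};
  return atomic_fetch_nand(A, 0x00FF) == 0xFFFF ? 0 : 1;
}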
+ MachineBasicBlock *EmitVAStartSaveXMMRegsWithCustomInserter( + MachineInstr *BInstr, + MachineBasicBlock *BB) const; + + MachineBasicBlock *EmitLoweredSelect(MachineInstr *I, + MachineBasicBlock *BB) const; + + MachineBasicBlock *EmitLoweredAtomicFP(MachineInstr *I, + MachineBasicBlock *BB) const; + + MachineBasicBlock *EmitLoweredWinAlloca(MachineInstr *MI, + MachineBasicBlock *BB) const; + + MachineBasicBlock *EmitLoweredCatchRet(MachineInstr *MI, + MachineBasicBlock *BB) const; + + MachineBasicBlock *EmitLoweredCatchPad(MachineInstr *MI, + MachineBasicBlock *BB) const; + + MachineBasicBlock *EmitLoweredSegAlloca(MachineInstr *MI, + MachineBasicBlock *BB) const; + + MachineBasicBlock *EmitLoweredTLSCall(MachineInstr *MI, + MachineBasicBlock *BB) const; + + MachineBasicBlock *emitEHSjLjSetJmp(MachineInstr *MI, + MachineBasicBlock *MBB) const; + + MachineBasicBlock *emitEHSjLjLongJmp(MachineInstr *MI, + MachineBasicBlock *MBB) const; + + MachineBasicBlock *emitFMA3Instr(MachineInstr *MI, + MachineBasicBlock *MBB) const; + + /// Emit nodes that will be selected as "test Op0,Op0", or something + /// equivalent, for use with the given x86 condition code. + SDValue EmitTest(SDValue Op0, unsigned X86CC, SDLoc dl, + SelectionDAG &DAG) const; + + /// Emit nodes that will be selected as "cmp Op0,Op1", or something + /// equivalent, for use with the given x86 condition code. + SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, SDLoc dl, + SelectionDAG &DAG) const; + + /// Convert a comparison if required by the subtarget. + SDValue ConvertCmpIfNecessary(SDValue Cmp, SelectionDAG &DAG) const; + + /// Use rsqrt* to speed up sqrt calculations. + SDValue getRsqrtEstimate(SDValue Operand, DAGCombinerInfo &DCI, + unsigned &RefinementSteps, + bool &UseOneConstNR) const override; + + /// Use rcp* to speed up fdiv calculations. + SDValue getRecipEstimate(SDValue Operand, DAGCombinerInfo &DCI, + unsigned &RefinementSteps) const override; + + /// Reassociate floating point divisions into multiply by reciprocal. + unsigned combineRepeatedFPDivisors() const override; + }; + + namespace X86 { + FastISel *createFastISel(FunctionLoweringInfo &funcInfo, + const TargetLibraryInfo *libInfo); + } +} + +#endif // X86ISELLOWERING_H diff --git a/contrib/llvm/lib/Target/X86/X86Instr3DNow.td b/contrib/llvm/lib/Target/X86/X86Instr3DNow.td new file mode 100644 index 0000000..ba1aede --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86Instr3DNow.td @@ -0,0 +1,103 @@ +//===-- X86Instr3DNow.td - The 3DNow! Instruction Set ------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the 3DNow! instruction set, which extends MMX to support +// floating point and also adds a few more random instructions for good measure. +// +//===----------------------------------------------------------------------===// + +class I3DNow<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pat> + : I<o, F, outs, ins, asm, pat>, TB, Requires<[Has3DNow]> { +} + +class I3DNow_binop<bits<8> o, Format F, dag ins, string Mnemonic, list<dag> pat> + : I3DNow<o, F, (outs VR64:$dst), ins, + !strconcat(Mnemonic, "\t{$src2, $dst|$dst, $src2}"), pat>, + Has3DNow0F0FOpcode { + // FIXME: The disassembler doesn't support Has3DNow0F0FOpcode yet. 
+ let isAsmParserOnly = 1; + let Constraints = "$src1 = $dst"; +} + +class I3DNow_conv<bits<8> o, Format F, dag ins, string Mnemonic, list<dag> pat> + : I3DNow<o, F, (outs VR64:$dst), ins, + !strconcat(Mnemonic, "\t{$src, $dst|$dst, $src}"), pat>, + Has3DNow0F0FOpcode { + // FIXME: The disassembler doesn't support Has3DNow0F0FOpcode yet. + let isAsmParserOnly = 1; +} + +multiclass I3DNow_binop_rm<bits<8> opc, string Mn> { + def rr : I3DNow_binop<opc, MRMSrcReg, (ins VR64:$src1, VR64:$src2), Mn, []>; + def rm : I3DNow_binop<opc, MRMSrcMem, (ins VR64:$src1, i64mem:$src2), Mn, []>; +} + +multiclass I3DNow_binop_rm_int<bits<8> opc, string Mn, string Ver = ""> { + def rr : I3DNow_binop<opc, MRMSrcReg, (ins VR64:$src1, VR64:$src2), Mn, + [(set VR64:$dst, (!cast<Intrinsic>( + !strconcat("int_x86_3dnow", Ver, "_", Mn)) VR64:$src1, VR64:$src2))]>; + def rm : I3DNow_binop<opc, MRMSrcMem, (ins VR64:$src1, i64mem:$src2), Mn, + [(set VR64:$dst, (!cast<Intrinsic>( + !strconcat("int_x86_3dnow", Ver, "_", Mn)) VR64:$src1, + (bitconvert (load_mmx addr:$src2))))]>; +} + +multiclass I3DNow_conv_rm<bits<8> opc, string Mn> { + def rr : I3DNow_conv<opc, MRMSrcReg, (ins VR64:$src1), Mn, []>; + def rm : I3DNow_conv<opc, MRMSrcMem, (ins i64mem:$src1), Mn, []>; +} + +multiclass I3DNow_conv_rm_int<bits<8> opc, string Mn, string Ver = ""> { + def rr : I3DNow_conv<opc, MRMSrcReg, (ins VR64:$src), Mn, + [(set VR64:$dst, (!cast<Intrinsic>( + !strconcat("int_x86_3dnow", Ver, "_", Mn)) VR64:$src))]>; + def rm : I3DNow_conv<opc, MRMSrcMem, (ins i64mem:$src), Mn, + [(set VR64:$dst, (!cast<Intrinsic>( + !strconcat("int_x86_3dnow", Ver, "_", Mn)) + (bitconvert (load_mmx addr:$src))))]>; +} + +defm PAVGUSB : I3DNow_binop_rm_int<0xBF, "pavgusb">; +defm PF2ID : I3DNow_conv_rm_int<0x1D, "pf2id">; +defm PFACC : I3DNow_binop_rm_int<0xAE, "pfacc">; +defm PFADD : I3DNow_binop_rm_int<0x9E, "pfadd">; +defm PFCMPEQ : I3DNow_binop_rm_int<0xB0, "pfcmpeq">; +defm PFCMPGE : I3DNow_binop_rm_int<0x90, "pfcmpge">; +defm PFCMPGT : I3DNow_binop_rm_int<0xA0, "pfcmpgt">; +defm PFMAX : I3DNow_binop_rm_int<0xA4, "pfmax">; +defm PFMIN : I3DNow_binop_rm_int<0x94, "pfmin">; +defm PFMUL : I3DNow_binop_rm_int<0xB4, "pfmul">; +defm PFRCP : I3DNow_conv_rm_int<0x96, "pfrcp">; +defm PFRCPIT1 : I3DNow_binop_rm_int<0xA6, "pfrcpit1">; +defm PFRCPIT2 : I3DNow_binop_rm_int<0xB6, "pfrcpit2">; +defm PFRSQIT1 : I3DNow_binop_rm_int<0xA7, "pfrsqit1">; +defm PFRSQRT : I3DNow_conv_rm_int<0x97, "pfrsqrt">; +defm PFSUB : I3DNow_binop_rm_int<0x9A, "pfsub">; +defm PFSUBR : I3DNow_binop_rm_int<0xAA, "pfsubr">; +defm PI2FD : I3DNow_conv_rm_int<0x0D, "pi2fd">; +defm PMULHRW : I3DNow_binop_rm_int<0xB7, "pmulhrw">; + + +def FEMMS : I3DNow<0x0E, RawFrm, (outs), (ins), "femms", + [(int_x86_mmx_femms)]>; + +def PREFETCH : I3DNow<0x0D, MRM0m, (outs), (ins i8mem:$addr), + "prefetch\t$addr", + [(prefetch addr:$addr, (i32 0), imm, (i32 1))]>; + +def PREFETCHW : I<0x0D, MRM1m, (outs), (ins i8mem:$addr), "prefetchw\t$addr", + [(prefetch addr:$addr, (i32 1), (i32 3), (i32 1))]>, TB, + Requires<[HasPrefetchW]>; + +// "3DNowA" instructions +defm PF2IW : I3DNow_conv_rm_int<0x1C, "pf2iw", "a">; +defm PI2FW : I3DNow_conv_rm_int<0x0C, "pi2fw", "a">; +defm PFNACC : I3DNow_binop_rm_int<0x8A, "pfnacc", "a">; +defm PFPNACC : I3DNow_binop_rm_int<0x8E, "pfpnacc", "a">; +defm PSWAPD : I3DNow_conv_rm_int<0xBB, "pswapd", "a">; diff --git a/contrib/llvm/lib/Target/X86/X86InstrAVX512.td b/contrib/llvm/lib/Target/X86/X86InstrAVX512.td new file mode 100644 index 0000000..0a27c33 --- /dev/null +++ 
b/contrib/llvm/lib/Target/X86/X86InstrAVX512.td @@ -0,0 +1,7519 @@ +//===-- X86InstrAVX512.td - AVX512 Instruction Set ---------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the X86 AVX512 instruction set, defining the +// instructions, and properties of the instructions which are needed for code +// generation, machine code emission, and analysis. +// +//===----------------------------------------------------------------------===// + +// Group template arguments that can be derived from the vector type (EltNum x +// EltVT). These are things like the register class for the writemask, etc. +// The idea is to pass one of these as the template argument rather than the +// individual arguments. +// The template is also used for scalar types, in this case numelts is 1. +class X86VectorVTInfo<int numelts, ValueType eltvt, RegisterClass rc, + string suffix = ""> { + RegisterClass RC = rc; + ValueType EltVT = eltvt; + int NumElts = numelts; + + // Corresponding mask register class. + RegisterClass KRC = !cast<RegisterClass>("VK" # NumElts); + + // Corresponding write-mask register class. + RegisterClass KRCWM = !cast<RegisterClass>("VK" # NumElts # "WM"); + + // The GPR register class that can hold the write mask. Use GR8 for fewer + // than 8 elements. Use shift-right and equal to work around the lack of + // !lt in tablegen. + RegisterClass MRC = + !cast<RegisterClass>("GR" # + !if (!eq (!srl(NumElts, 3), 0), 8, NumElts)); + + // Suffix used in the instruction mnemonic. + string Suffix = suffix; + + // VTName is a string name for vector VT. For vector types it will be + // v # NumElts # EltVT, so for vector of 8 elements of i32 it will be v8i32 + // It is a little bit complex for scalar types, where NumElts = 1. + // In this case we build v4f32 or v2f64 + string VTName = "v" # !if (!eq (NumElts, 1), + !if (!eq (EltVT.Size, 32), 4, + !if (!eq (EltVT.Size, 64), 2, NumElts)), NumElts) # EltVT; + + // The vector VT. + ValueType VT = !cast<ValueType>(VTName); + + string EltTypeName = !cast<string>(EltVT); + // Size of the element type in bits, e.g. 32 for v16i32. + string EltSizeName = !subst("i", "", !subst("f", "", EltTypeName)); + int EltSize = EltVT.Size; + + // "i" for integer types and "f" for floating-point types + string TypeVariantName = !subst(EltSizeName, "", EltTypeName); + + // Size of RC in bits, e.g. 512 for VR512. + int Size = VT.Size; + + // The corresponding memory operand, e.g. i512mem for VR512. + X86MemOperand MemOp = !cast<X86MemOperand>(TypeVariantName # Size # "mem"); + X86MemOperand ScalarMemOp = !cast<X86MemOperand>(EltVT # "mem"); + + // Load patterns + // Note: For 128/256-bit integer VT we choose loadv2i64/loadv4i64 + // due to load promotion during legalization + PatFrag LdFrag = !cast<PatFrag>("load" # + !if (!eq (TypeVariantName, "i"), + !if (!eq (Size, 128), "v2i64", + !if (!eq (Size, 256), "v4i64", + VTName)), VTName)); + + PatFrag AlignedLdFrag = !cast<PatFrag>("alignedload" # + !if (!eq (TypeVariantName, "i"), + !if (!eq (Size, 128), "v2i64", + !if (!eq (Size, 256), "v4i64", + !if (!eq (Size, 512), + !if (!eq (EltSize, 64), "v8i64", "v16i32"), + VTName))), VTName)); + + PatFrag ScalarLdFrag = !cast<PatFrag>("load" # EltVT); + + // The corresponding float type, e.g. 
v16f32 for v16i32 + // Note: For EltSize < 32, FloatVT is illegal and TableGen + // fails to compile, so we choose FloatVT = VT + ValueType FloatVT = !cast<ValueType>( + !if (!eq (!srl(EltSize,5),0), + VTName, + !if (!eq(TypeVariantName, "i"), + "v" # NumElts # "f" # EltSize, + VTName))); + + // The string to specify embedded broadcast in assembly. + string BroadcastStr = "{1to" # NumElts # "}"; + + // 8-bit compressed displacement tuple/subvector format. This is only + // defined for NumElts <= 8. + CD8VForm CD8TupleForm = !if (!eq (!srl(NumElts, 4), 0), + !cast<CD8VForm>("CD8VT" # NumElts), ?); + + SubRegIndex SubRegIdx = !if (!eq (Size, 128), sub_xmm, + !if (!eq (Size, 256), sub_ymm, ?)); + + Domain ExeDomain = !if (!eq (EltTypeName, "f32"), SSEPackedSingle, + !if (!eq (EltTypeName, "f64"), SSEPackedDouble, + SSEPackedInt)); + + RegisterClass FRC = !if (!eq (EltTypeName, "f32"), FR32X, FR64X); + + // A vector type of the same width with element type i32. This is used to + // create the canonical constant zero node ImmAllZerosV. + ValueType i32VT = !cast<ValueType>("v" # !srl(Size, 5) # "i32"); + dag ImmAllZerosV = (VT (bitconvert (i32VT immAllZerosV))); + + string ZSuffix = !if (!eq (Size, 128), "Z128", + !if (!eq (Size, 256), "Z256", "Z")); +} + +def v64i8_info : X86VectorVTInfo<64, i8, VR512, "b">; +def v32i16_info : X86VectorVTInfo<32, i16, VR512, "w">; +def v16i32_info : X86VectorVTInfo<16, i32, VR512, "d">; +def v8i64_info : X86VectorVTInfo<8, i64, VR512, "q">; +def v16f32_info : X86VectorVTInfo<16, f32, VR512, "ps">; +def v8f64_info : X86VectorVTInfo<8, f64, VR512, "pd">; + +// "x" in v32i8x_info means RC = VR256X +def v32i8x_info : X86VectorVTInfo<32, i8, VR256X, "b">; +def v16i16x_info : X86VectorVTInfo<16, i16, VR256X, "w">; +def v8i32x_info : X86VectorVTInfo<8, i32, VR256X, "d">; +def v4i64x_info : X86VectorVTInfo<4, i64, VR256X, "q">; +def v8f32x_info : X86VectorVTInfo<8, f32, VR256X, "ps">; +def v4f64x_info : X86VectorVTInfo<4, f64, VR256X, "pd">; + +def v16i8x_info : X86VectorVTInfo<16, i8, VR128X, "b">; +def v8i16x_info : X86VectorVTInfo<8, i16, VR128X, "w">; +def v4i32x_info : X86VectorVTInfo<4, i32, VR128X, "d">; +def v2i64x_info : X86VectorVTInfo<2, i64, VR128X, "q">; +def v4f32x_info : X86VectorVTInfo<4, f32, VR128X, "ps">; +def v2f64x_info : X86VectorVTInfo<2, f64, VR128X, "pd">; + +// We map scalar types to the smallest (128-bit) vector type +// with the appropriate element type. This allows to use the same masking logic. +def i32x_info : X86VectorVTInfo<1, i32, GR32, "si">; +def i64x_info : X86VectorVTInfo<1, i64, GR64, "sq">; +def f32x_info : X86VectorVTInfo<1, f32, VR128X, "ss">; +def f64x_info : X86VectorVTInfo<1, f64, VR128X, "sd">; + +class AVX512VLVectorVTInfo<X86VectorVTInfo i512, X86VectorVTInfo i256, + X86VectorVTInfo i128> { + X86VectorVTInfo info512 = i512; + X86VectorVTInfo info256 = i256; + X86VectorVTInfo info128 = i128; +} + +def avx512vl_i8_info : AVX512VLVectorVTInfo<v64i8_info, v32i8x_info, + v16i8x_info>; +def avx512vl_i16_info : AVX512VLVectorVTInfo<v32i16_info, v16i16x_info, + v8i16x_info>; +def avx512vl_i32_info : AVX512VLVectorVTInfo<v16i32_info, v8i32x_info, + v4i32x_info>; +def avx512vl_i64_info : AVX512VLVectorVTInfo<v8i64_info, v4i64x_info, + v2i64x_info>; +def avx512vl_f32_info : AVX512VLVectorVTInfo<v16f32_info, v8f32x_info, + v4f32x_info>; +def avx512vl_f64_info : AVX512VLVectorVTInfo<v8f64_info, v4f64x_info, + v2f64x_info>; + +// This multiclass generates the masking variants from the non-masking +// variant. 
It only provides the assembly pieces for the masking variants. +// It assumes custom ISel patterns for masking which can be provided as +// template arguments. +multiclass AVX512_maskable_custom<bits<8> O, Format F, + dag Outs, + dag Ins, dag MaskingIns, dag ZeroMaskingIns, + string OpcodeStr, + string AttSrcAsm, string IntelSrcAsm, + list<dag> Pattern, + list<dag> MaskingPattern, + list<dag> ZeroMaskingPattern, + string MaskingConstraint = "", + InstrItinClass itin = NoItinerary, + bit IsCommutable = 0> { + let isCommutable = IsCommutable in + def NAME: AVX512<O, F, Outs, Ins, + OpcodeStr#"\t{"#AttSrcAsm#", $dst|"# + "$dst , "#IntelSrcAsm#"}", + Pattern, itin>; + + // Prefer over VMOV*rrk Pat<> + let AddedComplexity = 20 in + def NAME#k: AVX512<O, F, Outs, MaskingIns, + OpcodeStr#"\t{"#AttSrcAsm#", $dst {${mask}}|"# + "$dst {${mask}}, "#IntelSrcAsm#"}", + MaskingPattern, itin>, + EVEX_K { + // In case of the 3src subclass this is overridden with a let. + string Constraints = MaskingConstraint; + } + let AddedComplexity = 30 in // Prefer over VMOV*rrkz Pat<> + def NAME#kz: AVX512<O, F, Outs, ZeroMaskingIns, + OpcodeStr#"\t{"#AttSrcAsm#", $dst {${mask}} {z}|"# + "$dst {${mask}} {z}, "#IntelSrcAsm#"}", + ZeroMaskingPattern, + itin>, + EVEX_KZ; +} + + +// Common base class of AVX512_maskable and AVX512_maskable_3src. +multiclass AVX512_maskable_common<bits<8> O, Format F, X86VectorVTInfo _, + dag Outs, + dag Ins, dag MaskingIns, dag ZeroMaskingIns, + string OpcodeStr, + string AttSrcAsm, string IntelSrcAsm, + dag RHS, dag MaskingRHS, + SDNode Select = vselect, + string MaskingConstraint = "", + InstrItinClass itin = NoItinerary, + bit IsCommutable = 0> : + AVX512_maskable_custom<O, F, Outs, Ins, MaskingIns, ZeroMaskingIns, OpcodeStr, + AttSrcAsm, IntelSrcAsm, + [(set _.RC:$dst, RHS)], + [(set _.RC:$dst, MaskingRHS)], + [(set _.RC:$dst, + (Select _.KRCWM:$mask, RHS, _.ImmAllZerosV))], + MaskingConstraint, NoItinerary, IsCommutable>; + +// This multiclass generates the unconditional/non-masking, the masking and +// the zero-masking variant of the vector instruction. In the masking case, the +// perserved vector elements come from a new dummy input operand tied to $dst. +multiclass AVX512_maskable<bits<8> O, Format F, X86VectorVTInfo _, + dag Outs, dag Ins, string OpcodeStr, + string AttSrcAsm, string IntelSrcAsm, + dag RHS, + InstrItinClass itin = NoItinerary, + bit IsCommutable = 0> : + AVX512_maskable_common<O, F, _, Outs, Ins, + !con((ins _.RC:$src0, _.KRCWM:$mask), Ins), + !con((ins _.KRCWM:$mask), Ins), + OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS, + (vselect _.KRCWM:$mask, RHS, _.RC:$src0), vselect, + "$src0 = $dst", itin, IsCommutable>; + +// This multiclass generates the unconditional/non-masking, the masking and +// the zero-masking variant of the scalar instruction. +multiclass AVX512_maskable_scalar<bits<8> O, Format F, X86VectorVTInfo _, + dag Outs, dag Ins, string OpcodeStr, + string AttSrcAsm, string IntelSrcAsm, + dag RHS, + InstrItinClass itin = NoItinerary, + bit IsCommutable = 0> : + AVX512_maskable_common<O, F, _, Outs, Ins, + !con((ins _.RC:$src0, _.KRCWM:$mask), Ins), + !con((ins _.KRCWM:$mask), Ins), + OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS, + (X86select _.KRCWM:$mask, RHS, _.RC:$src0), X86select, + "$src0 = $dst", itin, IsCommutable>; + +// Similar to AVX512_maskable but in this case one of the source operands +// ($src1) is already tied to $dst so we just use that for the preserved +// vector elements. NOTE that the NonTiedIns (the ins dag) should exclude +// $src1. 
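Before the tied three-source forms that follow, it helps to spell out what the generated variants mean at run time: the bare NAME form is unmasked, the k form (EVEX_K) merge-masks, keeping the old destination element wherever the writemask bit is clear, and the kz form (EVEX_KZ) zero-masks, writing zero there instead; in the 3src forms below, $src1 is the value that plays the pass-through role. A plain C++ sketch of those semantics (illustrative names, not TableGen output):

#include <array>
#include <cstdint>

using V16F = std::array<float, 16>;

// Element-wise writemask select: bit I of K chooses Then[I] or Else[I].
static V16F maskSelect(uint16_t K, const V16F &Then, const V16F &Else) {
  V16F R{};
  for (int I = 0; I < 16; ++I)
    R[I] = ((K >> I) & 1) ? Then[I] : Else[I];
  return R;
}

// For an element-wise result Res of some vector op:
//   unmasked      : Dst = Res
//   merge-masking : Dst = maskSelect(K, Res, DstOld)    (EVEX_K)
//   zero-masking  : Dst = maskSelect(K, Res, all-zeros) (EVEX_KZ)
int main() {
  V16F Res{}, Old{};
  Res.fill(1.0f);
  Old.fill(7.0f);
  V16F Merged = maskSelect(0x00FF, Res, Old);    // high 8 lanes keep 7.0
  V16F Zeroed = maskSelect(0x00FF, Res, V16F{}); // high 8 lanes become 0.0
  return (Merged[15] == 7.0f && Zeroed[15] == 0.0f) ? 0 : 1;
}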
+multiclass AVX512_maskable_3src<bits<8> O, Format F, X86VectorVTInfo _, + dag Outs, dag NonTiedIns, string OpcodeStr, + string AttSrcAsm, string IntelSrcAsm, + dag RHS> : + AVX512_maskable_common<O, F, _, Outs, + !con((ins _.RC:$src1), NonTiedIns), + !con((ins _.RC:$src1, _.KRCWM:$mask), NonTiedIns), + !con((ins _.RC:$src1, _.KRCWM:$mask), NonTiedIns), + OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS, + (vselect _.KRCWM:$mask, RHS, _.RC:$src1)>; + +// Similar to AVX512_maskable_3rc but in this case the input VT for the tied +// operand differs from the output VT. This requires a bitconvert on +// the preserved vector going into the vselect. +multiclass AVX512_maskable_3src_cast<bits<8> O, Format F, X86VectorVTInfo OutVT, + X86VectorVTInfo InVT, + dag Outs, dag NonTiedIns, string OpcodeStr, + string AttSrcAsm, string IntelSrcAsm, + dag RHS> : + AVX512_maskable_common<O, F, OutVT, Outs, + !con((ins InVT.RC:$src1), NonTiedIns), + !con((ins InVT.RC:$src1, InVT.KRCWM:$mask), NonTiedIns), + !con((ins InVT.RC:$src1, InVT.KRCWM:$mask), NonTiedIns), + OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS, + (vselect InVT.KRCWM:$mask, RHS, + (bitconvert InVT.RC:$src1))>; + +multiclass AVX512_maskable_3src_scalar<bits<8> O, Format F, X86VectorVTInfo _, + dag Outs, dag NonTiedIns, string OpcodeStr, + string AttSrcAsm, string IntelSrcAsm, + dag RHS> : + AVX512_maskable_common<O, F, _, Outs, + !con((ins _.RC:$src1), NonTiedIns), + !con((ins _.RC:$src1, _.KRCWM:$mask), NonTiedIns), + !con((ins _.RC:$src1, _.KRCWM:$mask), NonTiedIns), + OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS, + (X86select _.KRCWM:$mask, RHS, _.RC:$src1)>; + +multiclass AVX512_maskable_in_asm<bits<8> O, Format F, X86VectorVTInfo _, + dag Outs, dag Ins, + string OpcodeStr, + string AttSrcAsm, string IntelSrcAsm, + list<dag> Pattern> : + AVX512_maskable_custom<O, F, Outs, Ins, + !con((ins _.RC:$src0, _.KRCWM:$mask), Ins), + !con((ins _.KRCWM:$mask), Ins), + OpcodeStr, AttSrcAsm, IntelSrcAsm, Pattern, [], [], + "$src0 = $dst">; + + +// Instruction with mask that puts result in mask register, +// like "compare" and "vptest" +multiclass AVX512_maskable_custom_cmp<bits<8> O, Format F, + dag Outs, + dag Ins, dag MaskingIns, + string OpcodeStr, + string AttSrcAsm, string IntelSrcAsm, + list<dag> Pattern, + list<dag> MaskingPattern, + string Round = "", + InstrItinClass itin = NoItinerary> { + def NAME: AVX512<O, F, Outs, Ins, + OpcodeStr#"\t{"#AttSrcAsm#", $dst "#Round#"|"# + "$dst "#Round#", "#IntelSrcAsm#"}", + Pattern, itin>; + + def NAME#k: AVX512<O, F, Outs, MaskingIns, + OpcodeStr#"\t{"#Round#AttSrcAsm#", $dst {${mask}}|"# + "$dst {${mask}}, "#IntelSrcAsm#Round#"}", + MaskingPattern, itin>, EVEX_K; +} + +multiclass AVX512_maskable_common_cmp<bits<8> O, Format F, X86VectorVTInfo _, + dag Outs, + dag Ins, dag MaskingIns, + string OpcodeStr, + string AttSrcAsm, string IntelSrcAsm, + dag RHS, dag MaskingRHS, + string Round = "", + InstrItinClass itin = NoItinerary> : + AVX512_maskable_custom_cmp<O, F, Outs, Ins, MaskingIns, OpcodeStr, + AttSrcAsm, IntelSrcAsm, + [(set _.KRC:$dst, RHS)], + [(set _.KRC:$dst, MaskingRHS)], + Round, NoItinerary>; + +multiclass AVX512_maskable_cmp<bits<8> O, Format F, X86VectorVTInfo _, + dag Outs, dag Ins, string OpcodeStr, + string AttSrcAsm, string IntelSrcAsm, + dag RHS, string Round = "", + InstrItinClass itin = NoItinerary> : + AVX512_maskable_common_cmp<O, F, _, Outs, Ins, + !con((ins _.KRCWM:$mask), Ins), + OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS, + (and _.KRCWM:$mask, RHS), + Round, itin>; + +multiclass 
AVX512_maskable_cmp_alt<bits<8> O, Format F, X86VectorVTInfo _, + dag Outs, dag Ins, string OpcodeStr, + string AttSrcAsm, string IntelSrcAsm> : + AVX512_maskable_custom_cmp<O, F, Outs, + Ins, !con((ins _.KRCWM:$mask),Ins), OpcodeStr, + AttSrcAsm, IntelSrcAsm, + [],[],"", NoItinerary>; + +// Bitcasts between 512-bit vector types. Return the original type since +// no instruction is needed for the conversion +let Predicates = [HasAVX512] in { + def : Pat<(v8f64 (bitconvert (v8i64 VR512:$src))), (v8f64 VR512:$src)>; + def : Pat<(v8f64 (bitconvert (v16i32 VR512:$src))), (v8f64 VR512:$src)>; + def : Pat<(v8f64 (bitconvert (v32i16 VR512:$src))), (v8f64 VR512:$src)>; + def : Pat<(v8f64 (bitconvert (v64i8 VR512:$src))), (v8f64 VR512:$src)>; + def : Pat<(v8f64 (bitconvert (v16f32 VR512:$src))), (v8f64 VR512:$src)>; + def : Pat<(v16f32 (bitconvert (v8i64 VR512:$src))), (v16f32 VR512:$src)>; + def : Pat<(v16f32 (bitconvert (v16i32 VR512:$src))), (v16f32 VR512:$src)>; + def : Pat<(v16f32 (bitconvert (v32i16 VR512:$src))), (v16f32 VR512:$src)>; + def : Pat<(v16f32 (bitconvert (v64i8 VR512:$src))), (v16f32 VR512:$src)>; + def : Pat<(v16f32 (bitconvert (v8f64 VR512:$src))), (v16f32 VR512:$src)>; + def : Pat<(v8i64 (bitconvert (v16i32 VR512:$src))), (v8i64 VR512:$src)>; + def : Pat<(v8i64 (bitconvert (v32i16 VR512:$src))), (v8i64 VR512:$src)>; + def : Pat<(v8i64 (bitconvert (v64i8 VR512:$src))), (v8i64 VR512:$src)>; + def : Pat<(v8i64 (bitconvert (v8f64 VR512:$src))), (v8i64 VR512:$src)>; + def : Pat<(v8i64 (bitconvert (v16f32 VR512:$src))), (v8i64 VR512:$src)>; + def : Pat<(v16i32 (bitconvert (v8i64 VR512:$src))), (v16i32 VR512:$src)>; + def : Pat<(v16i32 (bitconvert (v16f32 VR512:$src))), (v16i32 VR512:$src)>; + def : Pat<(v16i32 (bitconvert (v32i16 VR512:$src))), (v16i32 VR512:$src)>; + def : Pat<(v16i32 (bitconvert (v64i8 VR512:$src))), (v16i32 VR512:$src)>; + def : Pat<(v16i32 (bitconvert (v8f64 VR512:$src))), (v16i32 VR512:$src)>; + def : Pat<(v32i16 (bitconvert (v8i64 VR512:$src))), (v32i16 VR512:$src)>; + def : Pat<(v32i16 (bitconvert (v16i32 VR512:$src))), (v32i16 VR512:$src)>; + def : Pat<(v32i16 (bitconvert (v64i8 VR512:$src))), (v32i16 VR512:$src)>; + def : Pat<(v32i16 (bitconvert (v8f64 VR512:$src))), (v32i16 VR512:$src)>; + def : Pat<(v32i16 (bitconvert (v16f32 VR512:$src))), (v32i16 VR512:$src)>; + def : Pat<(v32i16 (bitconvert (v16f32 VR512:$src))), (v32i16 VR512:$src)>; + def : Pat<(v64i8 (bitconvert (v8i64 VR512:$src))), (v64i8 VR512:$src)>; + def : Pat<(v64i8 (bitconvert (v16i32 VR512:$src))), (v64i8 VR512:$src)>; + def : Pat<(v64i8 (bitconvert (v32i16 VR512:$src))), (v64i8 VR512:$src)>; + def : Pat<(v64i8 (bitconvert (v8f64 VR512:$src))), (v64i8 VR512:$src)>; + def : Pat<(v64i8 (bitconvert (v16f32 VR512:$src))), (v64i8 VR512:$src)>; + + def : Pat<(v2i64 (bitconvert (v4i32 VR128X:$src))), (v2i64 VR128X:$src)>; + def : Pat<(v2i64 (bitconvert (v8i16 VR128X:$src))), (v2i64 VR128X:$src)>; + def : Pat<(v2i64 (bitconvert (v16i8 VR128X:$src))), (v2i64 VR128X:$src)>; + def : Pat<(v2i64 (bitconvert (v2f64 VR128X:$src))), (v2i64 VR128X:$src)>; + def : Pat<(v2i64 (bitconvert (v4f32 VR128X:$src))), (v2i64 VR128X:$src)>; + def : Pat<(v4i32 (bitconvert (v2i64 VR128X:$src))), (v4i32 VR128X:$src)>; + def : Pat<(v4i32 (bitconvert (v8i16 VR128X:$src))), (v4i32 VR128X:$src)>; + def : Pat<(v4i32 (bitconvert (v16i8 VR128X:$src))), (v4i32 VR128X:$src)>; + def : Pat<(v4i32 (bitconvert (v2f64 VR128X:$src))), (v4i32 VR128X:$src)>; + def : Pat<(v4i32 (bitconvert (v4f32 VR128X:$src))), (v4i32 VR128X:$src)>; + 
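These bitconvert patterns, both the 512-bit block and the 128-bit block that continues below, all state the same fact: a bitcast between equally sized vector types only reinterprets the bits already sitting in the register, so no instruction is selected. A short C++ model of that reinterpretation (hypothetical type aliases):

#include <array>
#include <cstdint>
#include <cstring>

using V16I32 = std::array<int32_t, 16>; // one 512-bit register as 16 x i32
using V8F64  = std::array<double, 8>;   // the same 512 bits as 8 x f64

// A bitcast is a byte-for-byte reinterpretation; nothing moves between
// registers, so codegen emits no instruction for it.
static V8F64 bitconvert(const V16I32 &Src) {
  V8F64 Dst;
  static_assert(sizeof(Dst) == sizeof(Src), "bitcast requires equal sizes");
  std::memcpy(&Dst, &Src, sizeof(Dst));
  return Dst;
}

int main() {
  V16I32 Src{};
  return bitconvert(Src)[0] == 0.0 ? 0 : 1;
}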
def : Pat<(v8i16 (bitconvert (v2i64 VR128X:$src))), (v8i16 VR128X:$src)>; + def : Pat<(v8i16 (bitconvert (v4i32 VR128X:$src))), (v8i16 VR128X:$src)>; + def : Pat<(v8i16 (bitconvert (v16i8 VR128X:$src))), (v8i16 VR128X:$src)>; + def : Pat<(v8i16 (bitconvert (v2f64 VR128X:$src))), (v8i16 VR128X:$src)>; + def : Pat<(v8i16 (bitconvert (v4f32 VR128X:$src))), (v8i16 VR128X:$src)>; + def : Pat<(v16i8 (bitconvert (v2i64 VR128X:$src))), (v16i8 VR128X:$src)>; + def : Pat<(v16i8 (bitconvert (v4i32 VR128X:$src))), (v16i8 VR128X:$src)>; + def : Pat<(v16i8 (bitconvert (v8i16 VR128X:$src))), (v16i8 VR128X:$src)>; + def : Pat<(v16i8 (bitconvert (v2f64 VR128X:$src))), (v16i8 VR128X:$src)>; + def : Pat<(v16i8 (bitconvert (v4f32 VR128X:$src))), (v16i8 VR128X:$src)>; + def : Pat<(v4f32 (bitconvert (v2i64 VR128X:$src))), (v4f32 VR128X:$src)>; + def : Pat<(v4f32 (bitconvert (v4i32 VR128X:$src))), (v4f32 VR128X:$src)>; + def : Pat<(v4f32 (bitconvert (v8i16 VR128X:$src))), (v4f32 VR128X:$src)>; + def : Pat<(v4f32 (bitconvert (v16i8 VR128X:$src))), (v4f32 VR128X:$src)>; + def : Pat<(v4f32 (bitconvert (v2f64 VR128X:$src))), (v4f32 VR128X:$src)>; + def : Pat<(v2f64 (bitconvert (v2i64 VR128X:$src))), (v2f64 VR128X:$src)>; + def : Pat<(v2f64 (bitconvert (v4i32 VR128X:$src))), (v2f64 VR128X:$src)>; + def : Pat<(v2f64 (bitconvert (v8i16 VR128X:$src))), (v2f64 VR128X:$src)>; + def : Pat<(v2f64 (bitconvert (v16i8 VR128X:$src))), (v2f64 VR128X:$src)>; + def : Pat<(v2f64 (bitconvert (v4f32 VR128X:$src))), (v2f64 VR128X:$src)>; + +// Bitcasts between 256-bit vector types. Return the original type since +// no instruction is needed for the conversion + def : Pat<(v4f64 (bitconvert (v8f32 VR256X:$src))), (v4f64 VR256X:$src)>; + def : Pat<(v4f64 (bitconvert (v8i32 VR256X:$src))), (v4f64 VR256X:$src)>; + def : Pat<(v4f64 (bitconvert (v4i64 VR256X:$src))), (v4f64 VR256X:$src)>; + def : Pat<(v4f64 (bitconvert (v16i16 VR256X:$src))), (v4f64 VR256X:$src)>; + def : Pat<(v4f64 (bitconvert (v32i8 VR256X:$src))), (v4f64 VR256X:$src)>; + def : Pat<(v8f32 (bitconvert (v8i32 VR256X:$src))), (v8f32 VR256X:$src)>; + def : Pat<(v8f32 (bitconvert (v4i64 VR256X:$src))), (v8f32 VR256X:$src)>; + def : Pat<(v8f32 (bitconvert (v4f64 VR256X:$src))), (v8f32 VR256X:$src)>; + def : Pat<(v8f32 (bitconvert (v32i8 VR256X:$src))), (v8f32 VR256X:$src)>; + def : Pat<(v8f32 (bitconvert (v16i16 VR256X:$src))), (v8f32 VR256X:$src)>; + def : Pat<(v4i64 (bitconvert (v8f32 VR256X:$src))), (v4i64 VR256X:$src)>; + def : Pat<(v4i64 (bitconvert (v8i32 VR256X:$src))), (v4i64 VR256X:$src)>; + def : Pat<(v4i64 (bitconvert (v4f64 VR256X:$src))), (v4i64 VR256X:$src)>; + def : Pat<(v4i64 (bitconvert (v32i8 VR256X:$src))), (v4i64 VR256X:$src)>; + def : Pat<(v4i64 (bitconvert (v16i16 VR256X:$src))), (v4i64 VR256X:$src)>; + def : Pat<(v32i8 (bitconvert (v4f64 VR256X:$src))), (v32i8 VR256X:$src)>; + def : Pat<(v32i8 (bitconvert (v4i64 VR256X:$src))), (v32i8 VR256X:$src)>; + def : Pat<(v32i8 (bitconvert (v8f32 VR256X:$src))), (v32i8 VR256X:$src)>; + def : Pat<(v32i8 (bitconvert (v8i32 VR256X:$src))), (v32i8 VR256X:$src)>; + def : Pat<(v32i8 (bitconvert (v16i16 VR256X:$src))), (v32i8 VR256X:$src)>; + def : Pat<(v8i32 (bitconvert (v32i8 VR256X:$src))), (v8i32 VR256X:$src)>; + def : Pat<(v8i32 (bitconvert (v16i16 VR256X:$src))), (v8i32 VR256X:$src)>; + def : Pat<(v8i32 (bitconvert (v8f32 VR256X:$src))), (v8i32 VR256X:$src)>; + def : Pat<(v8i32 (bitconvert (v4i64 VR256X:$src))), (v8i32 VR256X:$src)>; + def : Pat<(v8i32 (bitconvert (v4f64 VR256X:$src))), (v8i32 VR256X:$src)>; + def : 
Pat<(v16i16 (bitconvert (v8f32 VR256X:$src))), (v16i16 VR256X:$src)>; + def : Pat<(v16i16 (bitconvert (v8i32 VR256X:$src))), (v16i16 VR256X:$src)>; + def : Pat<(v16i16 (bitconvert (v4i64 VR256X:$src))), (v16i16 VR256X:$src)>; + def : Pat<(v16i16 (bitconvert (v4f64 VR256X:$src))), (v16i16 VR256X:$src)>; + def : Pat<(v16i16 (bitconvert (v32i8 VR256X:$src))), (v16i16 VR256X:$src)>; +} + +// +// AVX-512: VPXOR instruction writes zero to its upper part, it's safe build zeros. +// + +let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, + isPseudo = 1, Predicates = [HasAVX512] in { +def AVX512_512_SET0 : I<0, Pseudo, (outs VR512:$dst), (ins), "", + [(set VR512:$dst, (v16f32 immAllZerosV))]>; +} + +let Predicates = [HasAVX512] in { +def : Pat<(v8i64 immAllZerosV), (AVX512_512_SET0)>; +def : Pat<(v16i32 immAllZerosV), (AVX512_512_SET0)>; +def : Pat<(v8f64 immAllZerosV), (AVX512_512_SET0)>; +} + +//===----------------------------------------------------------------------===// +// AVX-512 - VECTOR INSERT +// +multiclass vinsert_for_size<int Opcode, X86VectorVTInfo From, X86VectorVTInfo To, + PatFrag vinsert_insert> { + let hasSideEffects = 0, ExeDomain = To.ExeDomain in { + defm rr : AVX512_maskable<Opcode, MRMSrcReg, To, (outs To.RC:$dst), + (ins To.RC:$src1, From.RC:$src2, i32u8imm:$src3), + "vinsert" # From.EltTypeName # "x" # From.NumElts, + "$src3, $src2, $src1", "$src1, $src2, $src3", + (vinsert_insert:$src3 (To.VT To.RC:$src1), + (From.VT From.RC:$src2), + (iPTR imm))>, AVX512AIi8Base, EVEX_4V; + + let mayLoad = 1 in + defm rm : AVX512_maskable<Opcode, MRMSrcMem, To, (outs To.RC:$dst), + (ins To.RC:$src1, From.MemOp:$src2, i32u8imm:$src3), + "vinsert" # From.EltTypeName # "x" # From.NumElts, + "$src3, $src2, $src1", "$src1, $src2, $src3", + (vinsert_insert:$src3 (To.VT To.RC:$src1), + (From.VT (bitconvert (From.LdFrag addr:$src2))), + (iPTR imm))>, AVX512AIi8Base, EVEX_4V, + EVEX_CD8<From.EltSize, From.CD8TupleForm>; + } +} + +multiclass vinsert_for_size_lowering<string InstrStr, X86VectorVTInfo From, + X86VectorVTInfo To, PatFrag vinsert_insert, + SDNodeXForm INSERT_get_vinsert_imm , list<Predicate> p> { + let Predicates = p in { + def : Pat<(vinsert_insert:$ins + (To.VT To.RC:$src1), (From.VT From.RC:$src2), (iPTR imm)), + (To.VT (!cast<Instruction>(InstrStr#"rr") + To.RC:$src1, From.RC:$src2, + (INSERT_get_vinsert_imm To.RC:$ins)))>; + + def : Pat<(vinsert_insert:$ins + (To.VT To.RC:$src1), + (From.VT (bitconvert (From.LdFrag addr:$src2))), + (iPTR imm)), + (To.VT (!cast<Instruction>(InstrStr#"rm") + To.RC:$src1, addr:$src2, + (INSERT_get_vinsert_imm To.RC:$ins)))>; + } +} + +multiclass vinsert_for_type<ValueType EltVT32, int Opcode128, + ValueType EltVT64, int Opcode256> { + + let Predicates = [HasVLX] in + defm NAME # "32x4Z256" : vinsert_for_size<Opcode128, + X86VectorVTInfo< 4, EltVT32, VR128X>, + X86VectorVTInfo< 8, EltVT32, VR256X>, + vinsert128_insert>, EVEX_V256; + + defm NAME # "32x4Z" : vinsert_for_size<Opcode128, + X86VectorVTInfo< 4, EltVT32, VR128X>, + X86VectorVTInfo<16, EltVT32, VR512>, + vinsert128_insert>, EVEX_V512; + + defm NAME # "64x4Z" : vinsert_for_size<Opcode256, + X86VectorVTInfo< 4, EltVT64, VR256X>, + X86VectorVTInfo< 8, EltVT64, VR512>, + vinsert256_insert>, VEX_W, EVEX_V512; + + let Predicates = [HasVLX, HasDQI] in + defm NAME # "64x2Z256" : vinsert_for_size<Opcode128, + X86VectorVTInfo< 2, EltVT64, VR128X>, + X86VectorVTInfo< 4, EltVT64, VR256X>, + vinsert128_insert>, VEX_W, EVEX_V256; + + let Predicates = [HasDQI] in { + defm NAME # "64x2Z" : 
vinsert_for_size<Opcode128, + X86VectorVTInfo< 2, EltVT64, VR128X>, + X86VectorVTInfo< 8, EltVT64, VR512>, + vinsert128_insert>, VEX_W, EVEX_V512; + + defm NAME # "32x8Z" : vinsert_for_size<Opcode256, + X86VectorVTInfo< 8, EltVT32, VR256X>, + X86VectorVTInfo<16, EltVT32, VR512>, + vinsert256_insert>, EVEX_V512; + } +} + +defm VINSERTF : vinsert_for_type<f32, 0x18, f64, 0x1a>; +defm VINSERTI : vinsert_for_type<i32, 0x38, i64, 0x3a>; + +// Codegen pattern with the alternative types, +// Only add this if 64x2 and its friends are not supported natively via AVX512DQ. +defm : vinsert_for_size_lowering<"VINSERTF32x4Z256", v2f64x_info, v4f64x_info, + vinsert128_insert, INSERT_get_vinsert128_imm, [HasVLX, NoDQI]>; +defm : vinsert_for_size_lowering<"VINSERTI32x4Z256", v2i64x_info, v4i64x_info, + vinsert128_insert, INSERT_get_vinsert128_imm, [HasVLX, NoDQI]>; + +defm : vinsert_for_size_lowering<"VINSERTF32x4Z", v2f64x_info, v8f64_info, + vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512, NoDQI]>; +defm : vinsert_for_size_lowering<"VINSERTI32x4Z", v2i64x_info, v8i64_info, + vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512, NoDQI]>; + +defm : vinsert_for_size_lowering<"VINSERTF64x4Z", v8f32x_info, v16f32_info, + vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512, NoDQI]>; +defm : vinsert_for_size_lowering<"VINSERTI64x4Z", v8i32x_info, v16i32_info, + vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512, NoDQI]>; + +// Codegen pattern with the alternative types insert VEC128 into VEC256 +defm : vinsert_for_size_lowering<"VINSERTI32x4Z256", v8i16x_info, v16i16x_info, + vinsert128_insert, INSERT_get_vinsert128_imm, [HasVLX]>; +defm : vinsert_for_size_lowering<"VINSERTI32x4Z256", v16i8x_info, v32i8x_info, + vinsert128_insert, INSERT_get_vinsert128_imm, [HasVLX]>; +// Codegen pattern with the alternative types insert VEC128 into VEC512 +defm : vinsert_for_size_lowering<"VINSERTI32x4Z", v8i16x_info, v32i16_info, + vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512]>; +defm : vinsert_for_size_lowering<"VINSERTI32x4Z", v16i8x_info, v64i8_info, + vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512]>; +// Codegen pattern with the alternative types insert VEC256 into VEC512 +defm : vinsert_for_size_lowering<"VINSERTI64x4Z", v16i16x_info, v32i16_info, + vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512]>; +defm : vinsert_for_size_lowering<"VINSERTI64x4Z", v32i8x_info, v64i8_info, + vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512]>; + +// vinsertps - insert f32 to XMM +def VINSERTPSzrr : AVX512AIi8<0x21, MRMSrcReg, (outs VR128X:$dst), + (ins VR128X:$src1, VR128X:$src2, u8imm:$src3), + "vinsertps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + [(set VR128X:$dst, (X86insertps VR128X:$src1, VR128X:$src2, imm:$src3))]>, + EVEX_4V; +def VINSERTPSzrm: AVX512AIi8<0x21, MRMSrcMem, (outs VR128X:$dst), + (ins VR128X:$src1, f32mem:$src2, u8imm:$src3), + "vinsertps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + [(set VR128X:$dst, (X86insertps VR128X:$src1, + (v4f32 (scalar_to_vector (loadf32 addr:$src2))), + imm:$src3))]>, EVEX_4V, EVEX_CD8<32, CD8VT1>; + +//===----------------------------------------------------------------------===// +// AVX-512 VECTOR EXTRACT +//--- + +multiclass vextract_for_size_first_position_lowering<X86VectorVTInfo From, + X86VectorVTInfo To> { + // A subvector extract from the first vector position is + // a subregister copy that needs no instruction. 
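Concretely: XMM0 aliases the low 128 bits of ZMM0 (and YMM0), so extracting the subvector at index 0 is only a register-class change handled by EXTRACT_SUBREG, whereas a nonzero index needs a real VEXTRACT* instruction to move the upper lanes down. A small sketch of that distinction (illustrative types, the register value modeled as an array):

#include <array>
#include <cstring>

using V512 = std::array<float, 16>; // a ZMM-sized value
using V128 = std::array<float, 4>;  // an XMM-sized value

// Lane 0 is simply the low bytes of the wide value, so "extracting" it needs
// no data movement; other lanes would need a shuffle/extract instruction.
static V128 extract128(const V512 &V, unsigned Lane) {
  V128 R;
  std::memcpy(R.data(), V.data() + 4 * Lane, sizeof(R));
  return R;
}

int main() {
  V512 Z{};
  Z[0] = 1.0f;
  Z[4] = 2.0f;
  return (extract128(Z, 0)[0] == 1.0f && extract128(Z, 1)[0] == 2.0f) ? 0 : 1;
}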
+ def NAME # To.NumElts: + Pat<(To.VT (extract_subvector (From.VT From.RC:$src),(iPTR 0))), + (To.VT (EXTRACT_SUBREG (From.VT From.RC:$src), To.SubRegIdx))>; +} + +multiclass vextract_for_size<int Opcode, + X86VectorVTInfo From, X86VectorVTInfo To, + PatFrag vextract_extract> : + vextract_for_size_first_position_lowering<From, To> { + + let hasSideEffects = 0, ExeDomain = To.ExeDomain in { + // use AVX512_maskable_in_asm (AVX512_maskable can't be used due to + // vextract_extract), we interesting only in patterns without mask, + // intrinsics pattern match generated bellow. + defm rr : AVX512_maskable_in_asm<Opcode, MRMDestReg, To, (outs To.RC:$dst), + (ins From.RC:$src1, i32u8imm:$idx), + "vextract" # To.EltTypeName # "x" # To.NumElts, + "$idx, $src1", "$src1, $idx", + [(set To.RC:$dst, (vextract_extract:$idx (From.VT From.RC:$src1), + (iPTR imm)))]>, + AVX512AIi8Base, EVEX; + let mayStore = 1 in { + def rm : AVX512AIi8<Opcode, MRMDestMem, (outs), + (ins To.MemOp:$dst, From.RC:$src1, i32u8imm:$src2), + "vextract" # To.EltTypeName # "x" # To.NumElts # + "\t{$src2, $src1, $dst|$dst, $src1, $src2}", + []>, EVEX; + + def rmk : AVX512AIi8<Opcode, MRMDestMem, (outs), + (ins To.MemOp:$dst, To.KRCWM:$mask, + From.RC:$src1, i32u8imm:$src2), + "vextract" # To.EltTypeName # "x" # To.NumElts # + "\t{$src2, $src1, $dst {${mask}}|" + "$dst {${mask}}, $src1, $src2}", + []>, EVEX_K, EVEX; + }//mayStore = 1 + } + + // Intrinsic call with masking. + def : Pat<(!cast<Intrinsic>("int_x86_avx512_mask_vextract" # To.EltTypeName # + "x" # To.NumElts # "_" # From.Size) + From.RC:$src1, (iPTR imm:$idx), To.RC:$src0, To.MRC:$mask), + (!cast<Instruction>(NAME # To.EltSize # "x" # To.NumElts # + From.ZSuffix # "rrk") + To.RC:$src0, + (COPY_TO_REGCLASS To.MRC:$mask, To.KRCWM), + From.RC:$src1, imm:$idx)>; + + // Intrinsic call with zero-masking. + def : Pat<(!cast<Intrinsic>("int_x86_avx512_mask_vextract" # To.EltTypeName # + "x" # To.NumElts # "_" # From.Size) + From.RC:$src1, (iPTR imm:$idx), To.ImmAllZerosV, To.MRC:$mask), + (!cast<Instruction>(NAME # To.EltSize # "x" # To.NumElts # + From.ZSuffix # "rrkz") + (COPY_TO_REGCLASS To.MRC:$mask, To.KRCWM), + From.RC:$src1, imm:$idx)>; + + // Intrinsic call without masking. 
+ def : Pat<(!cast<Intrinsic>("int_x86_avx512_mask_vextract" # To.EltTypeName # + "x" # To.NumElts # "_" # From.Size) + From.RC:$src1, (iPTR imm:$idx), To.ImmAllZerosV, (i8 -1)), + (!cast<Instruction>(NAME # To.EltSize # "x" # To.NumElts # + From.ZSuffix # "rr") + From.RC:$src1, imm:$idx)>; +} + +// Codegen pattern for the alternative types +multiclass vextract_for_size_lowering<string InstrStr, X86VectorVTInfo From, + X86VectorVTInfo To, PatFrag vextract_extract, + SDNodeXForm EXTRACT_get_vextract_imm, list<Predicate> p> : + vextract_for_size_first_position_lowering<From, To> { + + let Predicates = p in + def : Pat<(vextract_extract:$ext (From.VT From.RC:$src1), (iPTR imm)), + (To.VT (!cast<Instruction>(InstrStr#"rr") + From.RC:$src1, + (EXTRACT_get_vextract_imm To.RC:$ext)))>; +} + +multiclass vextract_for_type<ValueType EltVT32, int Opcode128, + ValueType EltVT64, int Opcode256> { + defm NAME # "32x4Z" : vextract_for_size<Opcode128, + X86VectorVTInfo<16, EltVT32, VR512>, + X86VectorVTInfo< 4, EltVT32, VR128X>, + vextract128_extract>, + EVEX_V512, EVEX_CD8<32, CD8VT4>; + defm NAME # "64x4Z" : vextract_for_size<Opcode256, + X86VectorVTInfo< 8, EltVT64, VR512>, + X86VectorVTInfo< 4, EltVT64, VR256X>, + vextract256_extract>, + VEX_W, EVEX_V512, EVEX_CD8<64, CD8VT4>; + let Predicates = [HasVLX] in + defm NAME # "32x4Z256" : vextract_for_size<Opcode128, + X86VectorVTInfo< 8, EltVT32, VR256X>, + X86VectorVTInfo< 4, EltVT32, VR128X>, + vextract128_extract>, + EVEX_V256, EVEX_CD8<32, CD8VT4>; + let Predicates = [HasVLX, HasDQI] in + defm NAME # "64x2Z256" : vextract_for_size<Opcode128, + X86VectorVTInfo< 4, EltVT64, VR256X>, + X86VectorVTInfo< 2, EltVT64, VR128X>, + vextract128_extract>, + VEX_W, EVEX_V256, EVEX_CD8<64, CD8VT2>; + let Predicates = [HasDQI] in { + defm NAME # "64x2Z" : vextract_for_size<Opcode128, + X86VectorVTInfo< 8, EltVT64, VR512>, + X86VectorVTInfo< 2, EltVT64, VR128X>, + vextract128_extract>, + VEX_W, EVEX_V512, EVEX_CD8<64, CD8VT2>; + defm NAME # "32x8Z" : vextract_for_size<Opcode256, + X86VectorVTInfo<16, EltVT32, VR512>, + X86VectorVTInfo< 8, EltVT32, VR256X>, + vextract256_extract>, + EVEX_V512, EVEX_CD8<32, CD8VT8>; + } +} + +defm VEXTRACTF : vextract_for_type<f32, 0x19, f64, 0x1b>; +defm VEXTRACTI : vextract_for_type<i32, 0x39, i64, 0x3b>; + +// extract_subvector codegen patterns with the alternative types. +// Only add this if 64x2 and its friends are not supported natively via AVX512DQ. 
+defm : vextract_for_size_lowering<"VEXTRACTF32x4Z", v8f64_info, v2f64x_info, + vextract128_extract, EXTRACT_get_vextract128_imm, [HasAVX512, NoDQI]>; +defm : vextract_for_size_lowering<"VEXTRACTI32x4Z", v8i64_info, v2i64x_info, + vextract128_extract, EXTRACT_get_vextract128_imm, [HasAVX512, NoDQI]>; + +defm : vextract_for_size_lowering<"VEXTRACTF64x4Z", v16f32_info, v8f32x_info, + vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512, NoDQI]>; +defm : vextract_for_size_lowering<"VEXTRACTI64x4Z", v16i32_info, v8i32x_info, + vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512, NoDQI]>; + +defm : vextract_for_size_lowering<"VEXTRACTF32x4Z256", v4f64x_info, v2f64x_info, + vextract128_extract, EXTRACT_get_vextract128_imm, [HasVLX, NoDQI]>; +defm : vextract_for_size_lowering<"VEXTRACTI32x4Z256", v4i64x_info, v2i64x_info, + vextract128_extract, EXTRACT_get_vextract128_imm, [HasVLX, NoDQI]>; + +// Codegen pattern with the alternative types extract VEC128 from VEC512 +defm : vextract_for_size_lowering<"VEXTRACTI32x4Z", v32i16_info, v8i16x_info, + vextract128_extract, EXTRACT_get_vextract128_imm, [HasAVX512]>; +defm : vextract_for_size_lowering<"VEXTRACTI32x4Z", v64i8_info, v16i8x_info, + vextract128_extract, EXTRACT_get_vextract128_imm, [HasAVX512]>; +// Codegen pattern with the alternative types extract VEC256 from VEC512 +defm : vextract_for_size_lowering<"VEXTRACTI64x4Z", v32i16_info, v16i16x_info, + vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512]>; +defm : vextract_for_size_lowering<"VEXTRACTI64x4Z", v64i8_info, v32i8x_info, + vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512]>; + +// A 128-bit subvector insert to the first 512-bit vector position +// is a subregister copy that needs no instruction. +def : Pat<(insert_subvector undef, (v2i64 VR128X:$src), (iPTR 0)), + (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), + (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), + sub_ymm)>; +def : Pat<(insert_subvector undef, (v2f64 VR128X:$src), (iPTR 0)), + (INSERT_SUBREG (v8f64 (IMPLICIT_DEF)), + (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), + sub_ymm)>; +def : Pat<(insert_subvector undef, (v4i32 VR128X:$src), (iPTR 0)), + (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), + (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), + sub_ymm)>; +def : Pat<(insert_subvector undef, (v4f32 VR128X:$src), (iPTR 0)), + (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), + (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), + sub_ymm)>; + +def : Pat<(insert_subvector undef, (v4i64 VR256X:$src), (iPTR 0)), + (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>; +def : Pat<(insert_subvector undef, (v4f64 VR256X:$src), (iPTR 0)), + (INSERT_SUBREG (v8f64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>; +def : Pat<(insert_subvector undef, (v8i32 VR256X:$src), (iPTR 0)), + (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>; +def : Pat<(insert_subvector undef, (v8f32 VR256X:$src), (iPTR 0)), + (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>; +def : Pat<(insert_subvector undef, (v16i16 VR256X:$src), (iPTR 0)), + (INSERT_SUBREG (v32i16 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>; +def : Pat<(insert_subvector undef, (v32i8 VR256X:$src), (iPTR 0)), + (INSERT_SUBREG (v64i8 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>; + +// vextractps - extract 32 bits from XMM +def VEXTRACTPSzrr : AVX512AIi8<0x17, MRMDestReg, (outs GR32:$dst), + (ins VR128X:$src1, u8imm:$src2), + "vextractps\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR32:$dst, (extractelt 
(bc_v4i32 (v4f32 VR128X:$src1)), imm:$src2))]>, + EVEX; + +def VEXTRACTPSzmr : AVX512AIi8<0x17, MRMDestMem, (outs), + (ins f32mem:$dst, VR128X:$src1, u8imm:$src2), + "vextractps\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(store (extractelt (bc_v4i32 (v4f32 VR128X:$src1)), imm:$src2), + addr:$dst)]>, EVEX, EVEX_CD8<32, CD8VT1>; + +//===---------------------------------------------------------------------===// +// AVX-512 BROADCAST +//--- + +multiclass avx512_broadcast_rm<bits<8> opc, string OpcodeStr, + X86VectorVTInfo DestInfo, X86VectorVTInfo SrcInfo> { + + defm r : AVX512_maskable<opc, MRMSrcReg, DestInfo, (outs DestInfo.RC:$dst), + (ins SrcInfo.RC:$src), OpcodeStr, "$src", "$src", + (DestInfo.VT (X86VBroadcast (SrcInfo.VT SrcInfo.RC:$src)))>, + T8PD, EVEX; + let mayLoad = 1 in + defm m : AVX512_maskable<opc, MRMSrcMem, DestInfo, (outs DestInfo.RC:$dst), + (ins SrcInfo.ScalarMemOp:$src), OpcodeStr, "$src", "$src", + (DestInfo.VT (X86VBroadcast + (SrcInfo.ScalarLdFrag addr:$src)))>, + T8PD, EVEX, EVEX_CD8<SrcInfo.EltSize, CD8VT1>; +} + +multiclass avx512_fp_broadcast_vl<bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo _> { + defm Z : avx512_broadcast_rm<opc, OpcodeStr, _.info512, _.info128>, + EVEX_V512; + + let Predicates = [HasVLX] in { + defm Z256 : avx512_broadcast_rm<opc, OpcodeStr, _.info256, _.info128>, + EVEX_V256; + } +} + +let ExeDomain = SSEPackedSingle in { + defm VBROADCASTSS : avx512_fp_broadcast_vl<0x18, "vbroadcastss", + avx512vl_f32_info>; + let Predicates = [HasVLX] in { + defm VBROADCASTSSZ128 : avx512_broadcast_rm<0x18, "vbroadcastss", + v4f32x_info, v4f32x_info>, EVEX_V128; + } +} + +let ExeDomain = SSEPackedDouble in { + defm VBROADCASTSD : avx512_fp_broadcast_vl<0x19, "vbroadcastsd", + avx512vl_f64_info>, VEX_W; +} + +// avx512_broadcast_pat introduces patterns for broadcast with a scalar argument. +// Later, we can canonize broadcast instructions before ISel phase and +// eliminate additional patterns on ISel. 
+// SrcRC_v and SrcRC_s are RegisterClasses for vector and scalar +// representations of source +multiclass avx512_broadcast_pat<string InstName, SDNode OpNode, + X86VectorVTInfo _, RegisterClass SrcRC_v, + RegisterClass SrcRC_s> { + def : Pat<(_.VT (OpNode (_.EltVT SrcRC_s:$src))), + (!cast<Instruction>(InstName##"r") + (COPY_TO_REGCLASS SrcRC_s:$src, SrcRC_v))>; + + let AddedComplexity = 30 in { + def : Pat<(_.VT (vselect _.KRCWM:$mask, + (OpNode (_.EltVT SrcRC_s:$src)), _.RC:$src0)), + (!cast<Instruction>(InstName##"rk") _.RC:$src0, _.KRCWM:$mask, + (COPY_TO_REGCLASS SrcRC_s:$src, SrcRC_v))>; + + def : Pat<(_.VT(vselect _.KRCWM:$mask, + (OpNode (_.EltVT SrcRC_s:$src)), _.ImmAllZerosV)), + (!cast<Instruction>(InstName##"rkz") _.KRCWM:$mask, + (COPY_TO_REGCLASS SrcRC_s:$src, SrcRC_v))>; + } +} + +defm : avx512_broadcast_pat<"VBROADCASTSSZ", X86VBroadcast, v16f32_info, + VR128X, FR32X>; +defm : avx512_broadcast_pat<"VBROADCASTSDZ", X86VBroadcast, v8f64_info, + VR128X, FR64X>; + +let Predicates = [HasVLX] in { + defm : avx512_broadcast_pat<"VBROADCASTSSZ256", X86VBroadcast, + v8f32x_info, VR128X, FR32X>; + defm : avx512_broadcast_pat<"VBROADCASTSSZ128", X86VBroadcast, + v4f32x_info, VR128X, FR32X>; + defm : avx512_broadcast_pat<"VBROADCASTSDZ256", X86VBroadcast, + v4f64x_info, VR128X, FR64X>; +} + +def : Pat<(v16f32 (X86VBroadcast (loadf32 addr:$src))), + (VBROADCASTSSZm addr:$src)>; +def : Pat<(v8f64 (X86VBroadcast (loadf64 addr:$src))), + (VBROADCASTSDZm addr:$src)>; + +def : Pat<(int_x86_avx512_vbroadcast_ss_512 addr:$src), + (VBROADCASTSSZm addr:$src)>; +def : Pat<(int_x86_avx512_vbroadcast_sd_512 addr:$src), + (VBROADCASTSDZm addr:$src)>; + +multiclass avx512_int_broadcast_reg<bits<8> opc, X86VectorVTInfo _, + RegisterClass SrcRC> { + defm r : AVX512_maskable_in_asm<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins SrcRC:$src), "vpbroadcast"##_.Suffix, + "$src", "$src", []>, T8PD, EVEX; +} + +multiclass avx512_int_broadcast_reg_vl<bits<8> opc, AVX512VLVectorVTInfo _, + RegisterClass SrcRC, Predicate prd> { + let Predicates = [prd] in + defm Z : avx512_int_broadcast_reg<opc, _.info512, SrcRC>, EVEX_V512; + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_int_broadcast_reg<opc, _.info256, SrcRC>, EVEX_V256; + defm Z128 : avx512_int_broadcast_reg<opc, _.info128, SrcRC>, EVEX_V128; + } +} + +defm VPBROADCASTBr : avx512_int_broadcast_reg_vl<0x7A, avx512vl_i8_info, GR32, + HasBWI>; +defm VPBROADCASTWr : avx512_int_broadcast_reg_vl<0x7B, avx512vl_i16_info, GR32, + HasBWI>; +defm VPBROADCASTDr : avx512_int_broadcast_reg_vl<0x7C, avx512vl_i32_info, GR32, + HasAVX512>; +defm VPBROADCASTQr : avx512_int_broadcast_reg_vl<0x7C, avx512vl_i64_info, GR64, + HasAVX512>, VEX_W; + +def : Pat <(v16i32 (X86vzext VK16WM:$mask)), + (VPBROADCASTDrZrkz VK16WM:$mask, (i32 (MOV32ri 0x1)))>; + +def : Pat <(v8i64 (X86vzext VK8WM:$mask)), + (VPBROADCASTQrZrkz VK8WM:$mask, (i64 (MOV64ri 0x1)))>; + +def : Pat<(v16i32 (X86VBroadcast (i32 GR32:$src))), + (VPBROADCASTDrZr GR32:$src)>; +def : Pat<(v8i64 (X86VBroadcast (i64 GR64:$src))), + (VPBROADCASTQrZr GR64:$src)>; + +def : Pat<(v16i32 (int_x86_avx512_pbroadcastd_i32_512 (i32 GR32:$src))), + (VPBROADCASTDrZr GR32:$src)>; +def : Pat<(v8i64 (int_x86_avx512_pbroadcastq_i64_512 (i64 GR64:$src))), + (VPBROADCASTQrZr GR64:$src)>; + +def : Pat<(v16i32 (int_x86_avx512_mask_pbroadcast_d_gpr_512 (i32 GR32:$src), + (v16i32 immAllZerosV), (i16 GR16:$mask))), + (VPBROADCASTDrZrkz (COPY_TO_REGCLASS GR16:$mask, VK16WM), GR32:$src)>; +def : Pat<(v8i64 
(int_x86_avx512_mask_pbroadcast_q_gpr_512 (i64 GR64:$src), + (bc_v8i64 (v16i32 immAllZerosV)), (i8 GR8:$mask))), + (VPBROADCASTQrZrkz (COPY_TO_REGCLASS GR8:$mask, VK8WM), GR64:$src)>; + +// Provide aliases for broadcast from the same register class that +// automatically does the extract. +multiclass avx512_int_broadcast_rm_lowering<X86VectorVTInfo DestInfo, + X86VectorVTInfo SrcInfo> { + def : Pat<(DestInfo.VT (X86VBroadcast (SrcInfo.VT SrcInfo.RC:$src))), + (!cast<Instruction>(NAME#DestInfo.ZSuffix#"r") + (EXTRACT_SUBREG (SrcInfo.VT SrcInfo.RC:$src), sub_xmm))>; +} + +multiclass avx512_int_broadcast_rm_vl<bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo _, Predicate prd> { + let Predicates = [prd] in { + defm Z : avx512_broadcast_rm<opc, OpcodeStr, _.info512, _.info128>, + avx512_int_broadcast_rm_lowering<_.info512, _.info256>, + EVEX_V512; + // Defined separately to avoid redefinition. + defm Z_Alt : avx512_int_broadcast_rm_lowering<_.info512, _.info512>; + } + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_broadcast_rm<opc, OpcodeStr, _.info256, _.info128>, + avx512_int_broadcast_rm_lowering<_.info256, _.info256>, + EVEX_V256; + defm Z128 : avx512_broadcast_rm<opc, OpcodeStr, _.info128, _.info128>, + EVEX_V128; + } +} + +defm VPBROADCASTB : avx512_int_broadcast_rm_vl<0x78, "vpbroadcastb", + avx512vl_i8_info, HasBWI>; +defm VPBROADCASTW : avx512_int_broadcast_rm_vl<0x79, "vpbroadcastw", + avx512vl_i16_info, HasBWI>; +defm VPBROADCASTD : avx512_int_broadcast_rm_vl<0x58, "vpbroadcastd", + avx512vl_i32_info, HasAVX512>; +defm VPBROADCASTQ : avx512_int_broadcast_rm_vl<0x59, "vpbroadcastq", + avx512vl_i64_info, HasAVX512>, VEX_W; + +multiclass avx512_subvec_broadcast_rm<bits<8> opc, string OpcodeStr, + X86VectorVTInfo _Dst, X86VectorVTInfo _Src> { + let mayLoad = 1 in + defm rm : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst), + (ins _Src.MemOp:$src), OpcodeStr, "$src", "$src", + (_Dst.VT (X86SubVBroadcast + (_Src.VT (bitconvert (_Src.LdFrag addr:$src)))))>, + AVX5128IBase, EVEX; +} + +defm VBROADCASTI32X4 : avx512_subvec_broadcast_rm<0x5a, "vbroadcasti32x4", + v16i32_info, v4i32x_info>, + EVEX_V512, EVEX_CD8<32, CD8VT4>; +defm VBROADCASTF32X4 : avx512_subvec_broadcast_rm<0x1a, "vbroadcastf32x4", + v16f32_info, v4f32x_info>, + EVEX_V512, EVEX_CD8<32, CD8VT4>; +defm VBROADCASTI64X4 : avx512_subvec_broadcast_rm<0x5b, "vbroadcasti64x4", + v8i64_info, v4i64x_info>, VEX_W, + EVEX_V512, EVEX_CD8<64, CD8VT4>; +defm VBROADCASTF64X4 : avx512_subvec_broadcast_rm<0x1b, "vbroadcastf64x4", + v8f64_info, v4f64x_info>, VEX_W, + EVEX_V512, EVEX_CD8<64, CD8VT4>; + +let Predicates = [HasVLX] in { +defm VBROADCASTI32X4Z256 : avx512_subvec_broadcast_rm<0x5a, "vbroadcasti32x4", + v8i32x_info, v4i32x_info>, + EVEX_V256, EVEX_CD8<32, CD8VT4>; +defm VBROADCASTF32X4Z256 : avx512_subvec_broadcast_rm<0x1a, "vbroadcastf32x4", + v8f32x_info, v4f32x_info>, + EVEX_V256, EVEX_CD8<32, CD8VT4>; +} +let Predicates = [HasVLX, HasDQI] in { +defm VBROADCASTI64X2Z128 : avx512_subvec_broadcast_rm<0x5a, "vbroadcasti64x2", + v4i64x_info, v2i64x_info>, VEX_W, + EVEX_V256, EVEX_CD8<64, CD8VT2>; +defm VBROADCASTF64X2Z128 : avx512_subvec_broadcast_rm<0x1a, "vbroadcastf64x2", + v4f64x_info, v2f64x_info>, VEX_W, + EVEX_V256, EVEX_CD8<64, CD8VT2>; +} +let Predicates = [HasDQI] in { +defm VBROADCASTI64X2 : avx512_subvec_broadcast_rm<0x5a, "vbroadcasti64x2", + v8i64_info, v2i64x_info>, VEX_W, + EVEX_V512, EVEX_CD8<64, CD8VT2>; +defm VBROADCASTI32X8 : avx512_subvec_broadcast_rm<0x5b, "vbroadcasti32x8", + v16i32_info, 
v8i32x_info>, + EVEX_V512, EVEX_CD8<32, CD8VT8>; +defm VBROADCASTF64X2 : avx512_subvec_broadcast_rm<0x1a, "vbroadcastf64x2", + v8f64_info, v2f64x_info>, VEX_W, + EVEX_V512, EVEX_CD8<64, CD8VT2>; +defm VBROADCASTF32X8 : avx512_subvec_broadcast_rm<0x1b, "vbroadcastf32x8", + v16f32_info, v8f32x_info>, + EVEX_V512, EVEX_CD8<32, CD8VT8>; +} + +multiclass avx512_broadcast_32x2<bits<8> opc, string OpcodeStr, + X86VectorVTInfo _Dst, X86VectorVTInfo _Src, + SDNode OpNode = X86SubVBroadcast> { + + defm r : AVX512_maskable<opc, MRMSrcReg, _Dst, (outs _Dst.RC:$dst), + (ins _Src.RC:$src), OpcodeStr, "$src", "$src", + (_Dst.VT (OpNode (_Src.VT _Src.RC:$src)))>, + T8PD, EVEX; + let mayLoad = 1 in + defm m : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst), + (ins _Src.ScalarMemOp:$src), OpcodeStr, "$src", "$src", + (_Dst.VT (OpNode + (_Src.VT (scalar_to_vector(loadi64 addr:$src)))))>, + T8PD, EVEX, EVEX_CD8<_Src.EltSize, CD8VT2>; +} + +multiclass avx512_common_broadcast_32x2<bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo _> { + let Predicates = [HasDQI] in + defm Z : avx512_broadcast_32x2<opc, OpcodeStr, _.info512, _.info128>, + EVEX_V512; + let Predicates = [HasDQI, HasVLX] in + defm Z256 : avx512_broadcast_32x2<opc, OpcodeStr, _.info256, _.info128>, + EVEX_V256; +} + +multiclass avx512_common_broadcast_i32x2<bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo _> : + avx512_common_broadcast_32x2<opc, OpcodeStr, _> { + + let Predicates = [HasDQI, HasVLX] in + defm Z128 : avx512_broadcast_32x2<opc, OpcodeStr, _.info128, _.info128, + X86SubV32x2Broadcast>, EVEX_V128; +} + +defm VPBROADCASTI32X2 : avx512_common_broadcast_i32x2<0x59, "vbroadcasti32x2", + avx512vl_i32_info>; +defm VPBROADCASTF32X2 : avx512_common_broadcast_32x2<0x19, "vbroadcastf32x2", + avx512vl_f32_info>; + +def : Pat<(v16f32 (X86VBroadcast (v16f32 VR512:$src))), + (VBROADCASTSSZr (EXTRACT_SUBREG (v16f32 VR512:$src), sub_xmm))>; +def : Pat<(v16f32 (X86VBroadcast (v8f32 VR256X:$src))), + (VBROADCASTSSZr (EXTRACT_SUBREG (v8f32 VR256X:$src), sub_xmm))>; + +def : Pat<(v8f64 (X86VBroadcast (v8f64 VR512:$src))), + (VBROADCASTSDZr (EXTRACT_SUBREG (v8f64 VR512:$src), sub_xmm))>; +def : Pat<(v8f64 (X86VBroadcast (v4f64 VR256X:$src))), + (VBROADCASTSDZr (EXTRACT_SUBREG (v4f64 VR256X:$src), sub_xmm))>; + +// Provide fallback in case the load node that is used in the patterns above +// is used by additional users, which prevents the pattern selection. 
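+// In that case the scalar value is still in an FR32X/FR64X register, so it is
+// first copied into a VR128X register and the register form of the broadcast
+// (VBROADCASTSSZr/VBROADCASTSDZr) is used instead of the memory form.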
+def : Pat<(v16f32 (X86VBroadcast FR32X:$src)), + (VBROADCASTSSZr (COPY_TO_REGCLASS FR32X:$src, VR128X))>; +def : Pat<(v8f64 (X86VBroadcast FR64X:$src)), + (VBROADCASTSDZr (COPY_TO_REGCLASS FR64X:$src, VR128X))>; + + +//===----------------------------------------------------------------------===// +// AVX-512 BROADCAST MASK TO VECTOR REGISTER +//--- +multiclass avx512_mask_broadcastm<bits<8> opc, string OpcodeStr, + X86VectorVTInfo _, RegisterClass KRC> { + def rr : AVX512XS8I<opc, MRMSrcReg, (outs _.RC:$dst), (ins KRC:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set _.RC:$dst, (_.VT (X86VBroadcastm KRC:$src)))]>, EVEX; +} + +multiclass avx512_mask_broadcast<bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo VTInfo, RegisterClass KRC> { + let Predicates = [HasCDI] in + defm Z : avx512_mask_broadcastm<opc, OpcodeStr, VTInfo.info512, KRC>, EVEX_V512; + let Predicates = [HasCDI, HasVLX] in { + defm Z256 : avx512_mask_broadcastm<opc, OpcodeStr, VTInfo.info256, KRC>, EVEX_V256; + defm Z128 : avx512_mask_broadcastm<opc, OpcodeStr, VTInfo.info128, KRC>, EVEX_V128; + } +} + +defm VPBROADCASTMW2D : avx512_mask_broadcast<0x3A, "vpbroadcastmw2d", + avx512vl_i32_info, VK16>; +defm VPBROADCASTMB2Q : avx512_mask_broadcast<0x2A, "vpbroadcastmb2q", + avx512vl_i64_info, VK8>, VEX_W; + +//===----------------------------------------------------------------------===// +// -- VPERMI2 - 3 source operands form -- +multiclass avx512_perm_i<bits<8> opc, string OpcodeStr, + X86VectorVTInfo _, X86VectorVTInfo IdxVT> { +let Constraints = "$src1 = $dst" in { + defm rr: AVX512_maskable_3src_cast<opc, MRMSrcReg, _, IdxVT, (outs _.RC:$dst), + (ins _.RC:$src2, _.RC:$src3), + OpcodeStr, "$src3, $src2", "$src2, $src3", + (_.VT (X86VPermi2X IdxVT.RC:$src1, _.RC:$src2, _.RC:$src3))>, EVEX_4V, + AVX5128IBase; + + let mayLoad = 1 in + defm rm: AVX512_maskable_3src_cast<opc, MRMSrcMem, _, IdxVT, (outs _.RC:$dst), + (ins _.RC:$src2, _.MemOp:$src3), + OpcodeStr, "$src3, $src2", "$src2, $src3", + (_.VT (X86VPermi2X IdxVT.RC:$src1, _.RC:$src2, + (_.VT (bitconvert (_.LdFrag addr:$src3)))))>, + EVEX_4V, AVX5128IBase; + } +} +multiclass avx512_perm_i_mb<bits<8> opc, string OpcodeStr, + X86VectorVTInfo _, X86VectorVTInfo IdxVT> { + let mayLoad = 1, Constraints = "$src1 = $dst" in + defm rmb: AVX512_maskable_3src_cast<opc, MRMSrcMem, _, IdxVT, (outs _.RC:$dst), + (ins _.RC:$src2, _.ScalarMemOp:$src3), + OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"), + !strconcat("$src2, ${src3}", _.BroadcastStr ), + (_.VT (X86VPermi2X IdxVT.RC:$src1, + _.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3)))))>, + AVX5128IBase, EVEX_4V, EVEX_B; +} + +multiclass avx512_perm_i_sizes<bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo VTInfo, + AVX512VLVectorVTInfo ShuffleMask> { + defm NAME: avx512_perm_i<opc, OpcodeStr, VTInfo.info512, + ShuffleMask.info512>, + avx512_perm_i_mb<opc, OpcodeStr, VTInfo.info512, + ShuffleMask.info512>, EVEX_V512; + let Predicates = [HasVLX] in { + defm NAME#128: avx512_perm_i<opc, OpcodeStr, VTInfo.info128, + ShuffleMask.info128>, + avx512_perm_i_mb<opc, OpcodeStr, VTInfo.info128, + ShuffleMask.info128>, EVEX_V128; + defm NAME#256: avx512_perm_i<opc, OpcodeStr, VTInfo.info256, + ShuffleMask.info256>, + avx512_perm_i_mb<opc, OpcodeStr, VTInfo.info256, + ShuffleMask.info256>, EVEX_V256; + } +} + +multiclass avx512_perm_i_sizes_w<bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo VTInfo, + AVX512VLVectorVTInfo Idx> { + let Predicates = [HasBWI] in + defm NAME: avx512_perm_i<opc, 
OpcodeStr, VTInfo.info512, + Idx.info512>, EVEX_V512; + let Predicates = [HasBWI, HasVLX] in { + defm NAME#128: avx512_perm_i<opc, OpcodeStr, VTInfo.info128, + Idx.info128>, EVEX_V128; + defm NAME#256: avx512_perm_i<opc, OpcodeStr, VTInfo.info256, + Idx.info256>, EVEX_V256; + } +} + +defm VPERMI2D : avx512_perm_i_sizes<0x76, "vpermi2d", + avx512vl_i32_info, avx512vl_i32_info>, EVEX_CD8<32, CD8VF>; +defm VPERMI2Q : avx512_perm_i_sizes<0x76, "vpermi2q", + avx512vl_i64_info, avx512vl_i64_info>, VEX_W, EVEX_CD8<64, CD8VF>; +defm VPERMI2W : avx512_perm_i_sizes_w<0x75, "vpermi2w", + avx512vl_i16_info, avx512vl_i16_info>, VEX_W, EVEX_CD8<16, CD8VF>; +defm VPERMI2PS : avx512_perm_i_sizes<0x77, "vpermi2ps", + avx512vl_f32_info, avx512vl_i32_info>, EVEX_CD8<32, CD8VF>; +defm VPERMI2PD : avx512_perm_i_sizes<0x77, "vpermi2pd", + avx512vl_f64_info, avx512vl_i64_info>, VEX_W, EVEX_CD8<64, CD8VF>; + +// VPERMT2 +multiclass avx512_perm_t<bits<8> opc, string OpcodeStr, + X86VectorVTInfo _, X86VectorVTInfo IdxVT> { +let Constraints = "$src1 = $dst" in { + defm rr: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins IdxVT.RC:$src2, _.RC:$src3), + OpcodeStr, "$src3, $src2", "$src2, $src3", + (_.VT (X86VPermt2 _.RC:$src1, IdxVT.RC:$src2, _.RC:$src3))>, EVEX_4V, + AVX5128IBase; + + let mayLoad = 1 in + defm rm: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins IdxVT.RC:$src2, _.MemOp:$src3), + OpcodeStr, "$src3, $src2", "$src2, $src3", + (_.VT (X86VPermt2 _.RC:$src1, IdxVT.RC:$src2, + (bitconvert (_.LdFrag addr:$src3))))>, + EVEX_4V, AVX5128IBase; + } +} +multiclass avx512_perm_t_mb<bits<8> opc, string OpcodeStr, + X86VectorVTInfo _, X86VectorVTInfo IdxVT> { + let mayLoad = 1, Constraints = "$src1 = $dst" in + defm rmb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins IdxVT.RC:$src2, _.ScalarMemOp:$src3), + OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"), + !strconcat("$src2, ${src3}", _.BroadcastStr ), + (_.VT (X86VPermt2 _.RC:$src1, + IdxVT.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3)))))>, + AVX5128IBase, EVEX_4V, EVEX_B; +} + +multiclass avx512_perm_t_sizes<bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo VTInfo, + AVX512VLVectorVTInfo ShuffleMask> { + defm NAME: avx512_perm_t<opc, OpcodeStr, VTInfo.info512, + ShuffleMask.info512>, + avx512_perm_t_mb<opc, OpcodeStr, VTInfo.info512, + ShuffleMask.info512>, EVEX_V512; + let Predicates = [HasVLX] in { + defm NAME#128: avx512_perm_t<opc, OpcodeStr, VTInfo.info128, + ShuffleMask.info128>, + avx512_perm_t_mb<opc, OpcodeStr, VTInfo.info128, + ShuffleMask.info128>, EVEX_V128; + defm NAME#256: avx512_perm_t<opc, OpcodeStr, VTInfo.info256, + ShuffleMask.info256>, + avx512_perm_t_mb<opc, OpcodeStr, VTInfo.info256, + ShuffleMask.info256>, EVEX_V256; + } +} + +multiclass avx512_perm_t_sizes_w<bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo VTInfo, + AVX512VLVectorVTInfo Idx> { + let Predicates = [HasBWI] in + defm NAME: avx512_perm_t<opc, OpcodeStr, VTInfo.info512, + Idx.info512>, EVEX_V512; + let Predicates = [HasBWI, HasVLX] in { + defm NAME#128: avx512_perm_t<opc, OpcodeStr, VTInfo.info128, + Idx.info128>, EVEX_V128; + defm NAME#256: avx512_perm_t<opc, OpcodeStr, VTInfo.info256, + Idx.info256>, EVEX_V256; + } +} + +defm VPERMT2D : avx512_perm_t_sizes<0x7E, "vpermt2d", + avx512vl_i32_info, avx512vl_i32_info>, EVEX_CD8<32, CD8VF>; +defm VPERMT2Q : avx512_perm_t_sizes<0x7E, "vpermt2q", + avx512vl_i64_info, avx512vl_i64_info>, VEX_W, EVEX_CD8<64, CD8VF>; +defm VPERMT2W : 
avx512_perm_t_sizes_w<0x7D, "vpermt2w", + avx512vl_i16_info, avx512vl_i16_info>, VEX_W, EVEX_CD8<16, CD8VF>; +defm VPERMT2PS : avx512_perm_t_sizes<0x7F, "vpermt2ps", + avx512vl_f32_info, avx512vl_i32_info>, EVEX_CD8<32, CD8VF>; +defm VPERMT2PD : avx512_perm_t_sizes<0x7F, "vpermt2pd", + avx512vl_f64_info, avx512vl_i64_info>, VEX_W, EVEX_CD8<64, CD8VF>; + +//===----------------------------------------------------------------------===// +// AVX-512 - BLEND using mask +// +multiclass avx512_blendmask<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> { + let ExeDomain = _.ExeDomain in { + def rr : AVX5128I<opc, MRMSrcReg, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, ${dst} |${dst}, $src1, $src2}"), + []>, EVEX_4V; + def rrk : AVX5128I<opc, MRMSrcReg, (outs _.RC:$dst), + (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, ${dst} {${mask}}|${dst} {${mask}}, $src1, $src2}"), + [(set _.RC:$dst, (X86select _.KRCWM:$mask, (_.VT _.RC:$src1), + (_.VT _.RC:$src2)))]>, EVEX_4V, EVEX_K; + def rrkz : AVX5128I<opc, MRMSrcReg, (outs _.RC:$dst), + (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src1, $src2}"), + []>, EVEX_4V, EVEX_KZ; + let mayLoad = 1 in { + def rm : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst), + (ins _.RC:$src1, _.MemOp:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, ${dst} |${dst}, $src1, $src2}"), + []>, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>; + def rmk : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst), + (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, ${dst} {${mask}}|${dst} {${mask}}, $src1, $src2}"), + [(set _.RC:$dst, (X86select _.KRCWM:$mask, (_.VT _.RC:$src1), + (_.VT (bitconvert (_.LdFrag addr:$src2)))))]>, + EVEX_4V, EVEX_K, EVEX_CD8<_.EltSize, CD8VF>; + def rmkz : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst), + (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src1, $src2}"), + []>, EVEX_4V, EVEX_KZ, EVEX_CD8<_.EltSize, CD8VF>; + } + } +} +multiclass avx512_blendmask_rmb<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> { + + def rmbk : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst), + (ins _.KRCWM:$mask, _.RC:$src1, _.ScalarMemOp:$src2), + !strconcat(OpcodeStr, + "\t{${src2}", _.BroadcastStr, ", $src1, $dst {${mask}}|", + "$dst {${mask}}, $src1, ${src2}", _.BroadcastStr, "}"), + [(set _.RC:$dst,(X86select _.KRCWM:$mask, (_.VT _.RC:$src1), + (X86VBroadcast (_.ScalarLdFrag addr:$src2))))]>, + EVEX_4V, EVEX_K, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>; + + def rmb : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst), + (ins _.RC:$src1, _.ScalarMemOp:$src2), + !strconcat(OpcodeStr, + "\t{${src2}", _.BroadcastStr, ", $src1, $dst|", + "$dst, $src1, ${src2}", _.BroadcastStr, "}"), + []>, EVEX_4V, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>; + +} + +multiclass blendmask_dq <bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo VTInfo> { + defm Z : avx512_blendmask <opc, OpcodeStr, VTInfo.info512>, + avx512_blendmask_rmb <opc, OpcodeStr, VTInfo.info512>, EVEX_V512; + + let Predicates = [HasVLX] in { + defm Z256 : avx512_blendmask<opc, OpcodeStr, VTInfo.info256>, + avx512_blendmask_rmb <opc, OpcodeStr, VTInfo.info256>, EVEX_V256; + defm Z128 : avx512_blendmask<opc, OpcodeStr, VTInfo.info128>, + avx512_blendmask_rmb <opc, OpcodeStr, VTInfo.info128>, EVEX_V128; + } +} + +multiclass blendmask_bw <bits<8> opc, string OpcodeStr, + 
AVX512VLVectorVTInfo VTInfo> { + let Predicates = [HasBWI] in + defm Z : avx512_blendmask <opc, OpcodeStr, VTInfo.info512>, EVEX_V512; + + let Predicates = [HasBWI, HasVLX] in { + defm Z256 : avx512_blendmask <opc, OpcodeStr, VTInfo.info256>, EVEX_V256; + defm Z128 : avx512_blendmask <opc, OpcodeStr, VTInfo.info128>, EVEX_V128; + } +} + + +defm VBLENDMPS : blendmask_dq <0x65, "vblendmps", avx512vl_f32_info>; +defm VBLENDMPD : blendmask_dq <0x65, "vblendmpd", avx512vl_f64_info>, VEX_W; +defm VPBLENDMD : blendmask_dq <0x64, "vpblendmd", avx512vl_i32_info>; +defm VPBLENDMQ : blendmask_dq <0x64, "vpblendmq", avx512vl_i64_info>, VEX_W; +defm VPBLENDMB : blendmask_bw <0x66, "vpblendmb", avx512vl_i8_info>; +defm VPBLENDMW : blendmask_bw <0x66, "vpblendmw", avx512vl_i16_info>, VEX_W; + + +let Predicates = [HasAVX512] in { +def : Pat<(v8f32 (vselect (v8i1 VK8WM:$mask), (v8f32 VR256X:$src1), + (v8f32 VR256X:$src2))), + (EXTRACT_SUBREG + (v16f32 (VBLENDMPSZrrk (COPY_TO_REGCLASS VK8WM:$mask, VK16WM), + (v16f32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm)), + (v16f32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)))), sub_ymm)>; + +def : Pat<(v8i32 (vselect (v8i1 VK8WM:$mask), (v8i32 VR256X:$src1), + (v8i32 VR256X:$src2))), + (EXTRACT_SUBREG + (v16i32 (VPBLENDMDZrrk (COPY_TO_REGCLASS VK8WM:$mask, VK16WM), + (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm)), + (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)))), sub_ymm)>; +} +//===----------------------------------------------------------------------===// +// Compare Instructions +//===----------------------------------------------------------------------===// + +// avx512_cmp_scalar - AVX512 CMPSS and CMPSD + +multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeRnd>{ + + defm rr_Int : AVX512_maskable_cmp<0xC2, MRMSrcReg, _, + (outs _.KRC:$dst), + (ins _.RC:$src1, _.RC:$src2, AVXCC:$cc), + "vcmp${cc}"#_.Suffix, + "$src2, $src1", "$src1, $src2", + (OpNode (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + imm:$cc)>, EVEX_4V; + let mayLoad = 1 in + defm rm_Int : AVX512_maskable_cmp<0xC2, MRMSrcMem, _, + (outs _.KRC:$dst), + (ins _.RC:$src1, _.MemOp:$src2, AVXCC:$cc), + "vcmp${cc}"#_.Suffix, + "$src2, $src1", "$src1, $src2", + (OpNode (_.VT _.RC:$src1), + (_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src2))), + imm:$cc)>, EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>; + + defm rrb_Int : AVX512_maskable_cmp<0xC2, MRMSrcReg, _, + (outs _.KRC:$dst), + (ins _.RC:$src1, _.RC:$src2, AVXCC:$cc), + "vcmp${cc}"#_.Suffix, + "{sae}, $src2, $src1", "$src1, $src2,{sae}", + (OpNodeRnd (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + imm:$cc, + (i32 FROUND_NO_EXC))>, EVEX_4V, EVEX_B; + // Accept explicit immediate argument form instead of comparison code. 
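+// For example, the assembler accepts "vcmpss $0, %xmm2, %xmm1, %k1" as an
+// equivalent spelling of "vcmpeqss %xmm2, %xmm1, %k1" (immediate 0 encodes
+// the EQ predicate).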
+ let isAsmParserOnly = 1, hasSideEffects = 0 in { + defm rri_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcReg, _, + (outs VK1:$dst), + (ins _.RC:$src1, _.RC:$src2, u8imm:$cc), + "vcmp"#_.Suffix, + "$cc, $src2, $src1", "$src1, $src2, $cc">, EVEX_4V; + defm rmi_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcMem, _, + (outs _.KRC:$dst), + (ins _.RC:$src1, _.MemOp:$src2, u8imm:$cc), + "vcmp"#_.Suffix, + "$cc, $src2, $src1", "$src1, $src2, $cc">, + EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>; + + defm rrb_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcReg, _, + (outs _.KRC:$dst), + (ins _.RC:$src1, _.RC:$src2, u8imm:$cc), + "vcmp"#_.Suffix, + "$cc,{sae}, $src2, $src1","$src1, $src2,{sae}, $cc">, + EVEX_4V, EVEX_B; + }// let isAsmParserOnly = 1, hasSideEffects = 0 + + let isCodeGenOnly = 1 in { + def rr : AVX512Ii8<0xC2, MRMSrcReg, + (outs _.KRC:$dst), (ins _.FRC:$src1, _.FRC:$src2, AVXCC:$cc), + !strconcat("vcmp${cc}", _.Suffix, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set _.KRC:$dst, (OpNode _.FRC:$src1, + _.FRC:$src2, + imm:$cc))], + IIC_SSE_ALU_F32S_RR>, EVEX_4V; + let mayLoad = 1 in + def rm : AVX512Ii8<0xC2, MRMSrcMem, + (outs _.KRC:$dst), + (ins _.FRC:$src1, _.ScalarMemOp:$src2, AVXCC:$cc), + !strconcat("vcmp${cc}", _.Suffix, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set _.KRC:$dst, (OpNode _.FRC:$src1, + (_.ScalarLdFrag addr:$src2), + imm:$cc))], + IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>; + } +} + +let Predicates = [HasAVX512] in { + defm VCMPSSZ : avx512_cmp_scalar<f32x_info, X86cmpms, X86cmpmsRnd>, + AVX512XSIi8Base; + defm VCMPSDZ : avx512_cmp_scalar<f64x_info, X86cmpms, X86cmpmsRnd>, + AVX512XDIi8Base, VEX_W; +} + +multiclass avx512_icmp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _> { + def rr : AVX512BI<opc, MRMSrcReg, + (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2)))], + IIC_SSE_ALU_F32P_RR>, EVEX_4V; + let mayLoad = 1 in + def rm : AVX512BI<opc, MRMSrcMem, + (outs _.KRC:$dst), (ins _.RC:$src1, _.MemOp:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1), + (_.VT (bitconvert (_.LdFrag addr:$src2)))))], + IIC_SSE_ALU_F32P_RM>, EVEX_4V; + def rrk : AVX512BI<opc, MRMSrcReg, + (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst {${mask}}|", + "$dst {${mask}}, $src1, $src2}"), + [(set _.KRC:$dst, (and _.KRCWM:$mask, + (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2))))], + IIC_SSE_ALU_F32P_RR>, EVEX_4V, EVEX_K; + let mayLoad = 1 in + def rmk : AVX512BI<opc, MRMSrcMem, + (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst {${mask}}|", + "$dst {${mask}}, $src1, $src2}"), + [(set _.KRC:$dst, (and _.KRCWM:$mask, + (OpNode (_.VT _.RC:$src1), + (_.VT (bitconvert + (_.LdFrag addr:$src2))))))], + IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_K; +} + +multiclass avx512_icmp_packed_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _> : + avx512_icmp_packed<opc, OpcodeStr, OpNode, _> { + let mayLoad = 1 in { + def rmb : AVX512BI<opc, MRMSrcMem, + (outs _.KRC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2), + !strconcat(OpcodeStr, "\t{${src2}", _.BroadcastStr, ", $src1, $dst", + "|$dst, $src1, ${src2}", _.BroadcastStr, "}"), + [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1), + (X86VBroadcast (_.ScalarLdFrag addr:$src2))))], + 
IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_B; + def rmbk : AVX512BI<opc, MRMSrcMem, + (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, + _.ScalarMemOp:$src2), + !strconcat(OpcodeStr, + "\t{${src2}", _.BroadcastStr, ", $src1, $dst {${mask}}|", + "$dst {${mask}}, $src1, ${src2}", _.BroadcastStr, "}"), + [(set _.KRC:$dst, (and _.KRCWM:$mask, + (OpNode (_.VT _.RC:$src1), + (X86VBroadcast + (_.ScalarLdFrag addr:$src2)))))], + IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_K, EVEX_B; + } +} + +multiclass avx512_icmp_packed_vl<bits<8> opc, string OpcodeStr, SDNode OpNode, + AVX512VLVectorVTInfo VTInfo, Predicate prd> { + let Predicates = [prd] in + defm Z : avx512_icmp_packed<opc, OpcodeStr, OpNode, VTInfo.info512>, + EVEX_V512; + + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_icmp_packed<opc, OpcodeStr, OpNode, VTInfo.info256>, + EVEX_V256; + defm Z128 : avx512_icmp_packed<opc, OpcodeStr, OpNode, VTInfo.info128>, + EVEX_V128; + } +} + +multiclass avx512_icmp_packed_rmb_vl<bits<8> opc, string OpcodeStr, + SDNode OpNode, AVX512VLVectorVTInfo VTInfo, + Predicate prd> { + let Predicates = [prd] in + defm Z : avx512_icmp_packed_rmb<opc, OpcodeStr, OpNode, VTInfo.info512>, + EVEX_V512; + + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_icmp_packed_rmb<opc, OpcodeStr, OpNode, VTInfo.info256>, + EVEX_V256; + defm Z128 : avx512_icmp_packed_rmb<opc, OpcodeStr, OpNode, VTInfo.info128>, + EVEX_V128; + } +} + +defm VPCMPEQB : avx512_icmp_packed_vl<0x74, "vpcmpeqb", X86pcmpeqm, + avx512vl_i8_info, HasBWI>, + EVEX_CD8<8, CD8VF>; + +defm VPCMPEQW : avx512_icmp_packed_vl<0x75, "vpcmpeqw", X86pcmpeqm, + avx512vl_i16_info, HasBWI>, + EVEX_CD8<16, CD8VF>; + +defm VPCMPEQD : avx512_icmp_packed_rmb_vl<0x76, "vpcmpeqd", X86pcmpeqm, + avx512vl_i32_info, HasAVX512>, + EVEX_CD8<32, CD8VF>; + +defm VPCMPEQQ : avx512_icmp_packed_rmb_vl<0x29, "vpcmpeqq", X86pcmpeqm, + avx512vl_i64_info, HasAVX512>, + T8PD, VEX_W, EVEX_CD8<64, CD8VF>; + +defm VPCMPGTB : avx512_icmp_packed_vl<0x64, "vpcmpgtb", X86pcmpgtm, + avx512vl_i8_info, HasBWI>, + EVEX_CD8<8, CD8VF>; + +defm VPCMPGTW : avx512_icmp_packed_vl<0x65, "vpcmpgtw", X86pcmpgtm, + avx512vl_i16_info, HasBWI>, + EVEX_CD8<16, CD8VF>; + +defm VPCMPGTD : avx512_icmp_packed_rmb_vl<0x66, "vpcmpgtd", X86pcmpgtm, + avx512vl_i32_info, HasAVX512>, + EVEX_CD8<32, CD8VF>; + +defm VPCMPGTQ : avx512_icmp_packed_rmb_vl<0x37, "vpcmpgtq", X86pcmpgtm, + avx512vl_i64_info, HasAVX512>, + T8PD, VEX_W, EVEX_CD8<64, CD8VF>; + +def : Pat<(v8i1 (X86pcmpgtm (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))), + (COPY_TO_REGCLASS (VPCMPGTDZrr + (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)), + (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm))), VK8)>; + +def : Pat<(v8i1 (X86pcmpeqm (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))), + (COPY_TO_REGCLASS (VPCMPEQDZrr + (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)), + (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm))), VK8)>; + +multiclass avx512_icmp_cc<bits<8> opc, string Suffix, SDNode OpNode, + X86VectorVTInfo _> { + def rri : AVX512AIi8<opc, MRMSrcReg, + (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2, AVX512ICC:$cc), + !strconcat("vpcmp${cc}", Suffix, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), + imm:$cc))], + IIC_SSE_ALU_F32P_RR>, EVEX_4V; + let mayLoad = 1 in + def rmi : AVX512AIi8<opc, MRMSrcMem, + (outs _.KRC:$dst), (ins _.RC:$src1, _.MemOp:$src2, AVX512ICC:$cc), + !strconcat("vpcmp${cc}", Suffix, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set 
_.KRC:$dst, (OpNode (_.VT _.RC:$src1), + (_.VT (bitconvert (_.LdFrag addr:$src2))), + imm:$cc))], + IIC_SSE_ALU_F32P_RM>, EVEX_4V; + def rrik : AVX512AIi8<opc, MRMSrcReg, + (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2, + AVX512ICC:$cc), + !strconcat("vpcmp${cc}", Suffix, + "\t{$src2, $src1, $dst {${mask}}|", + "$dst {${mask}}, $src1, $src2}"), + [(set _.KRC:$dst, (and _.KRCWM:$mask, + (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), + imm:$cc)))], + IIC_SSE_ALU_F32P_RR>, EVEX_4V, EVEX_K; + let mayLoad = 1 in + def rmik : AVX512AIi8<opc, MRMSrcMem, + (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2, + AVX512ICC:$cc), + !strconcat("vpcmp${cc}", Suffix, + "\t{$src2, $src1, $dst {${mask}}|", + "$dst {${mask}}, $src1, $src2}"), + [(set _.KRC:$dst, (and _.KRCWM:$mask, + (OpNode (_.VT _.RC:$src1), + (_.VT (bitconvert (_.LdFrag addr:$src2))), + imm:$cc)))], + IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_K; + + // Accept explicit immediate argument form instead of comparison code. + let isAsmParserOnly = 1, hasSideEffects = 0 in { + def rri_alt : AVX512AIi8<opc, MRMSrcReg, + (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2, u8imm:$cc), + !strconcat("vpcmp", Suffix, "\t{$cc, $src2, $src1, $dst|", + "$dst, $src1, $src2, $cc}"), + [], IIC_SSE_ALU_F32P_RR>, EVEX_4V; + let mayLoad = 1 in + def rmi_alt : AVX512AIi8<opc, MRMSrcMem, + (outs _.KRC:$dst), (ins _.RC:$src1, _.MemOp:$src2, u8imm:$cc), + !strconcat("vpcmp", Suffix, "\t{$cc, $src2, $src1, $dst|", + "$dst, $src1, $src2, $cc}"), + [], IIC_SSE_ALU_F32P_RM>, EVEX_4V; + def rrik_alt : AVX512AIi8<opc, MRMSrcReg, + (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2, + u8imm:$cc), + !strconcat("vpcmp", Suffix, + "\t{$cc, $src2, $src1, $dst {${mask}}|", + "$dst {${mask}}, $src1, $src2, $cc}"), + [], IIC_SSE_ALU_F32P_RR>, EVEX_4V, EVEX_K; + let mayLoad = 1 in + def rmik_alt : AVX512AIi8<opc, MRMSrcMem, + (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2, + u8imm:$cc), + !strconcat("vpcmp", Suffix, + "\t{$cc, $src2, $src1, $dst {${mask}}|", + "$dst {${mask}}, $src1, $src2, $cc}"), + [], IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_K; + } +} + +multiclass avx512_icmp_cc_rmb<bits<8> opc, string Suffix, SDNode OpNode, + X86VectorVTInfo _> : + avx512_icmp_cc<opc, Suffix, OpNode, _> { + def rmib : AVX512AIi8<opc, MRMSrcMem, + (outs _.KRC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2, + AVX512ICC:$cc), + !strconcat("vpcmp${cc}", Suffix, + "\t{${src2}", _.BroadcastStr, ", $src1, $dst|", + "$dst, $src1, ${src2}", _.BroadcastStr, "}"), + [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1), + (X86VBroadcast (_.ScalarLdFrag addr:$src2)), + imm:$cc))], + IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_B; + def rmibk : AVX512AIi8<opc, MRMSrcMem, + (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, + _.ScalarMemOp:$src2, AVX512ICC:$cc), + !strconcat("vpcmp${cc}", Suffix, + "\t{${src2}", _.BroadcastStr, ", $src1, $dst {${mask}}|", + "$dst {${mask}}, $src1, ${src2}", _.BroadcastStr, "}"), + [(set _.KRC:$dst, (and _.KRCWM:$mask, + (OpNode (_.VT _.RC:$src1), + (X86VBroadcast (_.ScalarLdFrag addr:$src2)), + imm:$cc)))], + IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_K, EVEX_B; + + // Accept explicit immediate argument form instead of comparison code. 
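+// For example, "vpcmpd $2, (%rax){1to16}, %zmm1, %k2" is accepted as an
+// equivalent spelling of "vpcmpled (%rax){1to16}, %zmm1, %k2" (immediate 2
+// encodes the LE predicate).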
+ let isAsmParserOnly = 1, hasSideEffects = 0, mayLoad = 1 in { + def rmib_alt : AVX512AIi8<opc, MRMSrcMem, + (outs _.KRC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2, + u8imm:$cc), + !strconcat("vpcmp", Suffix, + "\t{$cc, ${src2}", _.BroadcastStr, ", $src1, $dst|", + "$dst, $src1, ${src2}", _.BroadcastStr, ", $cc}"), + [], IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_B; + def rmibk_alt : AVX512AIi8<opc, MRMSrcMem, + (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, + _.ScalarMemOp:$src2, u8imm:$cc), + !strconcat("vpcmp", Suffix, + "\t{$cc, ${src2}", _.BroadcastStr, ", $src1, $dst {${mask}}|", + "$dst {${mask}}, $src1, ${src2}", _.BroadcastStr, ", $cc}"), + [], IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_K, EVEX_B; + } +} + +multiclass avx512_icmp_cc_vl<bits<8> opc, string Suffix, SDNode OpNode, + AVX512VLVectorVTInfo VTInfo, Predicate prd> { + let Predicates = [prd] in + defm Z : avx512_icmp_cc<opc, Suffix, OpNode, VTInfo.info512>, EVEX_V512; + + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_icmp_cc<opc, Suffix, OpNode, VTInfo.info256>, EVEX_V256; + defm Z128 : avx512_icmp_cc<opc, Suffix, OpNode, VTInfo.info128>, EVEX_V128; + } +} + +multiclass avx512_icmp_cc_rmb_vl<bits<8> opc, string Suffix, SDNode OpNode, + AVX512VLVectorVTInfo VTInfo, Predicate prd> { + let Predicates = [prd] in + defm Z : avx512_icmp_cc_rmb<opc, Suffix, OpNode, VTInfo.info512>, + EVEX_V512; + + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_icmp_cc_rmb<opc, Suffix, OpNode, VTInfo.info256>, + EVEX_V256; + defm Z128 : avx512_icmp_cc_rmb<opc, Suffix, OpNode, VTInfo.info128>, + EVEX_V128; + } +} + +defm VPCMPB : avx512_icmp_cc_vl<0x3F, "b", X86cmpm, avx512vl_i8_info, + HasBWI>, EVEX_CD8<8, CD8VF>; +defm VPCMPUB : avx512_icmp_cc_vl<0x3E, "ub", X86cmpmu, avx512vl_i8_info, + HasBWI>, EVEX_CD8<8, CD8VF>; + +defm VPCMPW : avx512_icmp_cc_vl<0x3F, "w", X86cmpm, avx512vl_i16_info, + HasBWI>, VEX_W, EVEX_CD8<16, CD8VF>; +defm VPCMPUW : avx512_icmp_cc_vl<0x3E, "uw", X86cmpmu, avx512vl_i16_info, + HasBWI>, VEX_W, EVEX_CD8<16, CD8VF>; + +defm VPCMPD : avx512_icmp_cc_rmb_vl<0x1F, "d", X86cmpm, avx512vl_i32_info, + HasAVX512>, EVEX_CD8<32, CD8VF>; +defm VPCMPUD : avx512_icmp_cc_rmb_vl<0x1E, "ud", X86cmpmu, avx512vl_i32_info, + HasAVX512>, EVEX_CD8<32, CD8VF>; + +defm VPCMPQ : avx512_icmp_cc_rmb_vl<0x1F, "q", X86cmpm, avx512vl_i64_info, + HasAVX512>, VEX_W, EVEX_CD8<64, CD8VF>; +defm VPCMPUQ : avx512_icmp_cc_rmb_vl<0x1E, "uq", X86cmpmu, avx512vl_i64_info, + HasAVX512>, VEX_W, EVEX_CD8<64, CD8VF>; + +multiclass avx512_vcmp_common<X86VectorVTInfo _> { + + defm rri : AVX512_maskable_cmp<0xC2, MRMSrcReg, _, + (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2,AVXCC:$cc), + "vcmp${cc}"#_.Suffix, + "$src2, $src1", "$src1, $src2", + (X86cmpm (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + imm:$cc)>; + + let mayLoad = 1 in { + defm rmi : AVX512_maskable_cmp<0xC2, MRMSrcMem, _, + (outs _.KRC:$dst),(ins _.RC:$src1, _.MemOp:$src2, AVXCC:$cc), + "vcmp${cc}"#_.Suffix, + "$src2, $src1", "$src1, $src2", + (X86cmpm (_.VT _.RC:$src1), + (_.VT (bitconvert (_.LdFrag addr:$src2))), + imm:$cc)>; + + defm rmbi : AVX512_maskable_cmp<0xC2, MRMSrcMem, _, + (outs _.KRC:$dst), + (ins _.RC:$src1, _.ScalarMemOp:$src2, AVXCC:$cc), + "vcmp${cc}"#_.Suffix, + "${src2}"##_.BroadcastStr##", $src1", + "$src1, ${src2}"##_.BroadcastStr, + (X86cmpm (_.VT _.RC:$src1), + (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))), + imm:$cc)>,EVEX_B; + } + // Accept explicit immediate argument form instead of comparison code. 
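+// For example, "vcmpps $1, %zmm2, %zmm1, %k1" is accepted as an equivalent
+// spelling of "vcmpltps %zmm2, %zmm1, %k1" (immediate 1 encodes the LT_OS
+// predicate).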
+ let isAsmParserOnly = 1, hasSideEffects = 0 in { + defm rri_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcReg, _, + (outs _.KRC:$dst), + (ins _.RC:$src1, _.RC:$src2, u8imm:$cc), + "vcmp"#_.Suffix, + "$cc, $src2, $src1", "$src1, $src2, $cc">; + + let mayLoad = 1 in { + defm rmi_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcMem, _, + (outs _.KRC:$dst), + (ins _.RC:$src1, _.MemOp:$src2, u8imm:$cc), + "vcmp"#_.Suffix, + "$cc, $src2, $src1", "$src1, $src2, $cc">; + + defm rmbi_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcMem, _, + (outs _.KRC:$dst), + (ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$cc), + "vcmp"#_.Suffix, + "$cc, ${src2}"##_.BroadcastStr##", $src1", + "$src1, ${src2}"##_.BroadcastStr##", $cc">,EVEX_B; + } + } +} + +multiclass avx512_vcmp_sae<X86VectorVTInfo _> { + // comparison code form (VCMP[EQ/LT/LE/...] + defm rrib : AVX512_maskable_cmp<0xC2, MRMSrcReg, _, + (outs _.KRC:$dst),(ins _.RC:$src1, _.RC:$src2, AVXCC:$cc), + "vcmp${cc}"#_.Suffix, + "{sae}, $src2, $src1", "$src1, $src2,{sae}", + (X86cmpmRnd (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + imm:$cc, + (i32 FROUND_NO_EXC))>, EVEX_B; + + let isAsmParserOnly = 1, hasSideEffects = 0 in { + defm rrib_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcReg, _, + (outs _.KRC:$dst), + (ins _.RC:$src1, _.RC:$src2, u8imm:$cc), + "vcmp"#_.Suffix, + "$cc,{sae}, $src2, $src1", + "$src1, $src2,{sae}, $cc">, EVEX_B; + } +} + +multiclass avx512_vcmp<AVX512VLVectorVTInfo _> { + let Predicates = [HasAVX512] in { + defm Z : avx512_vcmp_common<_.info512>, + avx512_vcmp_sae<_.info512>, EVEX_V512; + + } + let Predicates = [HasAVX512,HasVLX] in { + defm Z128 : avx512_vcmp_common<_.info128>, EVEX_V128; + defm Z256 : avx512_vcmp_common<_.info256>, EVEX_V256; + } +} + +defm VCMPPD : avx512_vcmp<avx512vl_f64_info>, + AVX512PDIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W; +defm VCMPPS : avx512_vcmp<avx512vl_f32_info>, + AVX512PSIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>; + +def : Pat<(v8i1 (X86cmpm (v8f32 VR256X:$src1), (v8f32 VR256X:$src2), imm:$cc)), + (COPY_TO_REGCLASS (VCMPPSZrri + (v16f32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)), + (v16f32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm)), + imm:$cc), VK8)>; +def : Pat<(v8i1 (X86cmpm (v8i32 VR256X:$src1), (v8i32 VR256X:$src2), imm:$cc)), + (COPY_TO_REGCLASS (VPCMPDZrri + (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)), + (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm)), + imm:$cc), VK8)>; +def : Pat<(v8i1 (X86cmpmu (v8i32 VR256X:$src1), (v8i32 VR256X:$src2), imm:$cc)), + (COPY_TO_REGCLASS (VPCMPUDZrri + (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)), + (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm)), + imm:$cc), VK8)>; + +// ---------------------------------------------------------------- +// FPClass +//handle fpclass instruction mask = op(reg_scalar,imm) +// op(mem_scalar,imm) +multiclass avx512_scalar_fpclass<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _, Predicate prd> { + let Predicates = [prd] in { + def rr : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst),//_.KRC:$dst), + (ins _.RC:$src1, i32u8imm:$src2), + OpcodeStr##_.Suffix#"\t{$src2, $src1, $dst | $dst, $src1, $src2}", + [(set _.KRC:$dst,(OpNode (_.VT _.RC:$src1), + (i32 imm:$src2)))], NoItinerary>; + def rrk : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst), + (ins _.KRCWM:$mask, _.RC:$src1, i32u8imm:$src2), + OpcodeStr##_.Suffix# + "\t{$src2, $src1, $dst {${mask}} | $dst {${mask}}, $src1, $src2}", + [(set _.KRC:$dst,(or _.KRCWM:$mask, + (OpNode (_.VT _.RC:$src1), + (i32 imm:$src2))))], NoItinerary>, EVEX_K; + let 
mayLoad = 1, AddedComplexity = 20 in { + def rm : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst), + (ins _.MemOp:$src1, i32u8imm:$src2), + OpcodeStr##_.Suffix## + "\t{$src2, $src1, $dst | $dst, $src1, $src2}", + [(set _.KRC:$dst, + (OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))), + (i32 imm:$src2)))], NoItinerary>; + def rmk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst), + (ins _.KRCWM:$mask, _.MemOp:$src1, i32u8imm:$src2), + OpcodeStr##_.Suffix## + "\t{$src2, $src1, $dst {${mask}} | $dst {${mask}}, $src1, $src2}", + [(set _.KRC:$dst,(or _.KRCWM:$mask, + (OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))), + (i32 imm:$src2))))], NoItinerary>, EVEX_K; + } + } +} + +//handle fpclass instruction mask = fpclass(reg_vec, reg_vec, imm) +// fpclass(reg_vec, mem_vec, imm) +// fpclass(reg_vec, broadcast(eltVt), imm) +multiclass avx512_vector_fpclass<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _, string mem, string broadcast>{ + def rr : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst), + (ins _.RC:$src1, i32u8imm:$src2), + OpcodeStr##_.Suffix#"\t{$src2, $src1, $dst | $dst, $src1, $src2}", + [(set _.KRC:$dst,(OpNode (_.VT _.RC:$src1), + (i32 imm:$src2)))], NoItinerary>; + def rrk : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst), + (ins _.KRCWM:$mask, _.RC:$src1, i32u8imm:$src2), + OpcodeStr##_.Suffix# + "\t{$src2, $src1, $dst {${mask}}| $dst {${mask}}, $src1, $src2}", + [(set _.KRC:$dst,(or _.KRCWM:$mask, + (OpNode (_.VT _.RC:$src1), + (i32 imm:$src2))))], NoItinerary>, EVEX_K; + let mayLoad = 1 in { + def rm : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst), + (ins _.MemOp:$src1, i32u8imm:$src2), + OpcodeStr##_.Suffix##mem# + "\t{$src2, $src1, $dst | $dst, $src1, $src2}", + [(set _.KRC:$dst,(OpNode + (_.VT (bitconvert (_.LdFrag addr:$src1))), + (i32 imm:$src2)))], NoItinerary>; + def rmk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst), + (ins _.KRCWM:$mask, _.MemOp:$src1, i32u8imm:$src2), + OpcodeStr##_.Suffix##mem# + "\t{$src2, $src1, $dst {${mask}} | $dst {${mask}}, $src1, $src2}", + [(set _.KRC:$dst, (or _.KRCWM:$mask, (OpNode + (_.VT (bitconvert (_.LdFrag addr:$src1))), + (i32 imm:$src2))))], NoItinerary>, EVEX_K; + def rmb : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst), + (ins _.ScalarMemOp:$src1, i32u8imm:$src2), + OpcodeStr##_.Suffix##broadcast##"\t{$src2, ${src1}"## + _.BroadcastStr##", $dst | $dst, ${src1}" + ##_.BroadcastStr##", $src2}", + [(set _.KRC:$dst,(OpNode + (_.VT (X86VBroadcast + (_.ScalarLdFrag addr:$src1))), + (i32 imm:$src2)))], NoItinerary>,EVEX_B; + def rmbk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst), + (ins _.KRCWM:$mask, _.ScalarMemOp:$src1, i32u8imm:$src2), + OpcodeStr##_.Suffix##broadcast##"\t{$src2, ${src1}"## + _.BroadcastStr##", $dst {${mask}} | $dst {${mask}}, ${src1}"## + _.BroadcastStr##", $src2}", + [(set _.KRC:$dst,(or _.KRCWM:$mask, (OpNode + (_.VT (X86VBroadcast + (_.ScalarLdFrag addr:$src1))), + (i32 imm:$src2))))], NoItinerary>, + EVEX_B, EVEX_K; + } +} + +multiclass avx512_vector_fpclass_all<string OpcodeStr, + AVX512VLVectorVTInfo _, bits<8> opc, SDNode OpNode, Predicate prd, + string broadcast>{ + let Predicates = [prd] in { + defm Z : avx512_vector_fpclass<opc, OpcodeStr, OpNode, _.info512, "{z}", + broadcast>, EVEX_V512; + } + let Predicates = [prd, HasVLX] in { + defm Z128 : avx512_vector_fpclass<opc, OpcodeStr, OpNode, _.info128, "{x}", + broadcast>, EVEX_V128; + defm Z256 : avx512_vector_fpclass<opc, OpcodeStr, OpNode, _.info256, "{y}", + broadcast>, EVEX_V256; + } +} + +multiclass avx512_fp_fpclass_all<string OpcodeStr, bits<8> opcVec, + bits<8> opcScalar, SDNode 
VecOpNode, SDNode ScalarOpNode, Predicate prd>{ + defm PS : avx512_vector_fpclass_all<OpcodeStr, avx512vl_f32_info, opcVec, + VecOpNode, prd, "{l}">, EVEX_CD8<32, CD8VF>; + defm PD : avx512_vector_fpclass_all<OpcodeStr, avx512vl_f64_info, opcVec, + VecOpNode, prd, "{q}">,EVEX_CD8<64, CD8VF> , VEX_W; + defm SS : avx512_scalar_fpclass<opcScalar, OpcodeStr, ScalarOpNode, + f32x_info, prd>, EVEX_CD8<32, CD8VT1>; + defm SD : avx512_scalar_fpclass<opcScalar, OpcodeStr, ScalarOpNode, + f64x_info, prd>, EVEX_CD8<64, CD8VT1>, VEX_W; +} + +defm VFPCLASS : avx512_fp_fpclass_all<"vfpclass", 0x66, 0x67, X86Vfpclass, + X86Vfpclasss, HasDQI>, AVX512AIi8Base,EVEX; + +//----------------------------------------------------------------- +// Mask register copy, including +// - copy between mask registers +// - load/store mask registers +// - copy from GPR to mask register and vice versa +// +multiclass avx512_mask_mov<bits<8> opc_kk, bits<8> opc_km, bits<8> opc_mk, + string OpcodeStr, RegisterClass KRC, + ValueType vvt, X86MemOperand x86memop> { + let hasSideEffects = 0 in { + def kk : I<opc_kk, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>; + let mayLoad = 1 in + def km : I<opc_km, MRMSrcMem, (outs KRC:$dst), (ins x86memop:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set KRC:$dst, (vvt (load addr:$src)))]>; + let mayStore = 1 in + def mk : I<opc_mk, MRMDestMem, (outs), (ins x86memop:$dst, KRC:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(store KRC:$src, addr:$dst)]>; + } +} + +multiclass avx512_mask_mov_gpr<bits<8> opc_kr, bits<8> opc_rk, + string OpcodeStr, + RegisterClass KRC, RegisterClass GRC> { + let hasSideEffects = 0 in { + def kr : I<opc_kr, MRMSrcReg, (outs KRC:$dst), (ins GRC:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>; + def rk : I<opc_rk, MRMSrcReg, (outs GRC:$dst), (ins KRC:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>; + } +} + +let Predicates = [HasDQI] in + defm KMOVB : avx512_mask_mov<0x90, 0x90, 0x91, "kmovb", VK8, v8i1, i8mem>, + avx512_mask_mov_gpr<0x92, 0x93, "kmovb", VK8, GR32>, + VEX, PD; + +let Predicates = [HasAVX512] in + defm KMOVW : avx512_mask_mov<0x90, 0x90, 0x91, "kmovw", VK16, v16i1, i16mem>, + avx512_mask_mov_gpr<0x92, 0x93, "kmovw", VK16, GR32>, + VEX, PS; + +let Predicates = [HasBWI] in { + defm KMOVD : avx512_mask_mov<0x90, 0x90, 0x91, "kmovd", VK32, v32i1,i32mem>, + VEX, PD, VEX_W; + defm KMOVD : avx512_mask_mov_gpr<0x92, 0x93, "kmovd", VK32, GR32>, + VEX, XD; +} + +let Predicates = [HasBWI] in { + defm KMOVQ : avx512_mask_mov<0x90, 0x90, 0x91, "kmovq", VK64, v64i1, i64mem>, + VEX, PS, VEX_W; + defm KMOVQ : avx512_mask_mov_gpr<0x92, 0x93, "kmovq", VK64, GR64>, + VEX, XD, VEX_W; +} + +// GR from/to mask register +let Predicates = [HasDQI] in { + def : Pat<(v8i1 (bitconvert (i8 GR8:$src))), + (KMOVBkr (SUBREG_TO_REG (i32 0), GR8:$src, sub_8bit))>; + def : Pat<(i8 (bitconvert (v8i1 VK8:$src))), + (EXTRACT_SUBREG (KMOVBrk VK8:$src), sub_8bit)>; +} +let Predicates = [HasAVX512] in { + def : Pat<(v16i1 (bitconvert (i16 GR16:$src))), + (KMOVWkr (SUBREG_TO_REG (i32 0), GR16:$src, sub_16bit))>; + def : Pat<(i16 (bitconvert (v16i1 VK16:$src))), + (EXTRACT_SUBREG (KMOVWrk VK16:$src), sub_16bit)>; +} +let Predicates = [HasBWI] in { + def : Pat<(v32i1 (bitconvert (i32 GR32:$src))), (KMOVDkr GR32:$src)>; + def : Pat<(i32 (bitconvert (v32i1 VK32:$src))), (KMOVDrk VK32:$src)>; +} +let Predicates = [HasBWI] in { + def : Pat<(v64i1 (bitconvert (i64 
GR64:$src))), (KMOVQkr GR64:$src)>; + def : Pat<(i64 (bitconvert (v64i1 VK64:$src))), (KMOVQrk VK64:$src)>; +} + +// Load/store kreg +let Predicates = [HasDQI] in { + def : Pat<(store (i8 (bitconvert (v8i1 VK8:$src))), addr:$dst), + (KMOVBmk addr:$dst, VK8:$src)>; + def : Pat<(v8i1 (bitconvert (i8 (load addr:$src)))), + (KMOVBkm addr:$src)>; + + def : Pat<(store VK4:$src, addr:$dst), + (KMOVBmk addr:$dst, (COPY_TO_REGCLASS VK4:$src, VK8))>; + def : Pat<(store VK2:$src, addr:$dst), + (KMOVBmk addr:$dst, (COPY_TO_REGCLASS VK2:$src, VK8))>; +} +let Predicates = [HasAVX512, NoDQI] in { + def : Pat<(store (i8 (bitconvert (v8i1 VK8:$src))), addr:$dst), + (KMOVWmk addr:$dst, (COPY_TO_REGCLASS VK8:$src, VK16))>; + def : Pat<(v8i1 (bitconvert (i8 (load addr:$src)))), + (COPY_TO_REGCLASS (KMOVWkm addr:$src), VK8)>; +} +let Predicates = [HasAVX512] in { + def : Pat<(store (i16 (bitconvert (v16i1 VK16:$src))), addr:$dst), + (KMOVWmk addr:$dst, VK16:$src)>; + def : Pat<(i1 (load addr:$src)), + (COPY_TO_REGCLASS (AND16ri (i16 (SUBREG_TO_REG (i32 0), + (MOV8rm addr:$src), sub_8bit)), + (i16 1)), VK1)>; + def : Pat<(v16i1 (bitconvert (i16 (load addr:$src)))), + (KMOVWkm addr:$src)>; +} +let Predicates = [HasBWI] in { + def : Pat<(store (i32 (bitconvert (v32i1 VK32:$src))), addr:$dst), + (KMOVDmk addr:$dst, VK32:$src)>; + def : Pat<(v32i1 (bitconvert (i32 (load addr:$src)))), + (KMOVDkm addr:$src)>; +} +let Predicates = [HasBWI] in { + def : Pat<(store (i64 (bitconvert (v64i1 VK64:$src))), addr:$dst), + (KMOVQmk addr:$dst, VK64:$src)>; + def : Pat<(v64i1 (bitconvert (i64 (load addr:$src)))), + (KMOVQkm addr:$src)>; +} + +let Predicates = [HasAVX512] in { + def : Pat<(i1 (trunc (i64 GR64:$src))), + (COPY_TO_REGCLASS (KMOVWkr (AND32ri (EXTRACT_SUBREG $src, sub_32bit), + (i32 1))), VK1)>; + + def : Pat<(i1 (trunc (i32 GR32:$src))), + (COPY_TO_REGCLASS (KMOVWkr (AND32ri $src, (i32 1))), VK1)>; + + def : Pat<(i1 (trunc (i8 GR8:$src))), + (COPY_TO_REGCLASS + (KMOVWkr (AND32ri (SUBREG_TO_REG (i32 0), GR8:$src, sub_8bit), (i32 1))), + VK1)>; + def : Pat<(i1 (trunc (i16 GR16:$src))), + (COPY_TO_REGCLASS + (KMOVWkr (AND32ri (SUBREG_TO_REG (i32 0), $src, sub_16bit), (i32 1))), + VK1)>; + + def : Pat<(i32 (zext VK1:$src)), + (AND32ri (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), (i32 1))>; + def : Pat<(i32 (anyext VK1:$src)), + (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16))>; + + def : Pat<(i8 (zext VK1:$src)), + (EXTRACT_SUBREG + (AND32ri (KMOVWrk + (COPY_TO_REGCLASS VK1:$src, VK16)), (i32 1)), sub_8bit)>; + def : Pat<(i8 (anyext VK1:$src)), + (EXTRACT_SUBREG + (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), sub_8bit)>; + + def : Pat<(i64 (zext VK1:$src)), + (AND64ri8 (SUBREG_TO_REG (i64 0), + (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), sub_32bit), (i64 1))>; + def : Pat<(i16 (zext VK1:$src)), + (EXTRACT_SUBREG + (AND32ri (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), (i32 1)), + sub_16bit)>; +} +def : Pat<(v16i1 (scalar_to_vector VK1:$src)), + (COPY_TO_REGCLASS VK1:$src, VK16)>; +def : Pat<(v8i1 (scalar_to_vector VK1:$src)), + (COPY_TO_REGCLASS VK1:$src, VK8)>; +def : Pat<(v4i1 (scalar_to_vector VK1:$src)), + (COPY_TO_REGCLASS VK1:$src, VK4)>; +def : Pat<(v2i1 (scalar_to_vector VK1:$src)), + (COPY_TO_REGCLASS VK1:$src, VK2)>; +def : Pat<(v32i1 (scalar_to_vector VK1:$src)), + (COPY_TO_REGCLASS VK1:$src, VK32)>; +def : Pat<(v64i1 (scalar_to_vector VK1:$src)), + (COPY_TO_REGCLASS VK1:$src, VK64)>; + + +// With AVX-512 only, 8-bit mask is promoted to 16-bit mask. 
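+// Without DQI there is no KMOVB, so the i8 <-> v8i1 bitconverts below go
+// through KMOVW: the GR8 value is zero-extended to GR32 before KMOVWkr, and
+// the low byte of the KMOVWrk result is extracted on the way back.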
+let Predicates = [HasAVX512, NoDQI] in { + // GR from/to 8-bit mask without native support + def : Pat<(v8i1 (bitconvert (i8 GR8:$src))), + (COPY_TO_REGCLASS + (KMOVWkr (MOVZX32rr8 GR8 :$src)), VK8)>; + def : Pat<(i8 (bitconvert (v8i1 VK8:$src))), + (EXTRACT_SUBREG + (KMOVWrk (COPY_TO_REGCLASS VK8:$src, VK16)), + sub_8bit)>; +} + +let Predicates = [HasAVX512] in { + def : Pat<(i1 (X86Vextract VK16:$src, (iPTR 0))), + (COPY_TO_REGCLASS VK16:$src, VK1)>; + def : Pat<(i1 (X86Vextract VK8:$src, (iPTR 0))), + (COPY_TO_REGCLASS VK8:$src, VK1)>; +} +let Predicates = [HasBWI] in { + def : Pat<(i1 (X86Vextract VK32:$src, (iPTR 0))), + (COPY_TO_REGCLASS VK32:$src, VK1)>; + def : Pat<(i1 (X86Vextract VK64:$src, (iPTR 0))), + (COPY_TO_REGCLASS VK64:$src, VK1)>; +} + +// Mask unary operation +// - KNOT +multiclass avx512_mask_unop<bits<8> opc, string OpcodeStr, + RegisterClass KRC, SDPatternOperator OpNode, + Predicate prd> { + let Predicates = [prd] in + def rr : I<opc, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set KRC:$dst, (OpNode KRC:$src))]>; +} + +multiclass avx512_mask_unop_all<bits<8> opc, string OpcodeStr, + SDPatternOperator OpNode> { + defm B : avx512_mask_unop<opc, !strconcat(OpcodeStr, "b"), VK8, OpNode, + HasDQI>, VEX, PD; + defm W : avx512_mask_unop<opc, !strconcat(OpcodeStr, "w"), VK16, OpNode, + HasAVX512>, VEX, PS; + defm D : avx512_mask_unop<opc, !strconcat(OpcodeStr, "d"), VK32, OpNode, + HasBWI>, VEX, PD, VEX_W; + defm Q : avx512_mask_unop<opc, !strconcat(OpcodeStr, "q"), VK64, OpNode, + HasBWI>, VEX, PS, VEX_W; +} + +defm KNOT : avx512_mask_unop_all<0x44, "knot", not>; + +multiclass avx512_mask_unop_int<string IntName, string InstName> { + let Predicates = [HasAVX512] in + def : Pat<(!cast<Intrinsic>("int_x86_avx512_"##IntName##"_w") + (i16 GR16:$src)), + (COPY_TO_REGCLASS (!cast<Instruction>(InstName##"Wrr") + (v16i1 (COPY_TO_REGCLASS GR16:$src, VK16))), GR16)>; +} +defm : avx512_mask_unop_int<"knot", "KNOT">; + +let Predicates = [HasDQI] in +def : Pat<(xor VK8:$src1, (v8i1 immAllOnesV)), (KNOTBrr VK8:$src1)>; +let Predicates = [HasAVX512] in +def : Pat<(xor VK16:$src1, (v16i1 immAllOnesV)), (KNOTWrr VK16:$src1)>; +let Predicates = [HasBWI] in +def : Pat<(xor VK32:$src1, (v32i1 immAllOnesV)), (KNOTDrr VK32:$src1)>; +let Predicates = [HasBWI] in +def : Pat<(xor VK64:$src1, (v64i1 immAllOnesV)), (KNOTQrr VK64:$src1)>; + +// KNL does not support KMOVB, 8-bit mask is promoted to 16-bit +let Predicates = [HasAVX512, NoDQI] in { +def : Pat<(xor VK8:$src1, (v8i1 immAllOnesV)), + (COPY_TO_REGCLASS (KNOTWrr (COPY_TO_REGCLASS VK8:$src1, VK16)), VK8)>; +def : Pat<(not VK8:$src), + (COPY_TO_REGCLASS + (KNOTWrr (COPY_TO_REGCLASS VK8:$src, VK16)), VK8)>; +} +def : Pat<(xor VK4:$src1, (v4i1 immAllOnesV)), + (COPY_TO_REGCLASS (KNOTWrr (COPY_TO_REGCLASS VK4:$src1, VK16)), VK4)>; +def : Pat<(xor VK2:$src1, (v2i1 immAllOnesV)), + (COPY_TO_REGCLASS (KNOTWrr (COPY_TO_REGCLASS VK2:$src1, VK16)), VK2)>; + +// Mask binary operation +// - KAND, KANDN, KOR, KXNOR, KXOR +multiclass avx512_mask_binop<bits<8> opc, string OpcodeStr, + RegisterClass KRC, SDPatternOperator OpNode, + Predicate prd, bit IsCommutable> { + let Predicates = [prd], isCommutable = IsCommutable in + def rr : I<opc, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src1, KRC:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set KRC:$dst, (OpNode KRC:$src1, KRC:$src2))]>; +} + +multiclass avx512_mask_binop_all<bits<8> opc, string OpcodeStr, + 
SDPatternOperator OpNode, bit IsCommutable, + Predicate prdW = HasAVX512> { + defm B : avx512_mask_binop<opc, !strconcat(OpcodeStr, "b"), VK8, OpNode, + HasDQI, IsCommutable>, VEX_4V, VEX_L, PD; + defm W : avx512_mask_binop<opc, !strconcat(OpcodeStr, "w"), VK16, OpNode, + prdW, IsCommutable>, VEX_4V, VEX_L, PS; + defm D : avx512_mask_binop<opc, !strconcat(OpcodeStr, "d"), VK32, OpNode, + HasBWI, IsCommutable>, VEX_4V, VEX_L, VEX_W, PD; + defm Q : avx512_mask_binop<opc, !strconcat(OpcodeStr, "q"), VK64, OpNode, + HasBWI, IsCommutable>, VEX_4V, VEX_L, VEX_W, PS; +} + +def andn : PatFrag<(ops node:$i0, node:$i1), (and (not node:$i0), node:$i1)>; +def xnor : PatFrag<(ops node:$i0, node:$i1), (not (xor node:$i0, node:$i1))>; + +defm KAND : avx512_mask_binop_all<0x41, "kand", and, 1>; +defm KOR : avx512_mask_binop_all<0x45, "kor", or, 1>; +defm KXNOR : avx512_mask_binop_all<0x46, "kxnor", xnor, 1>; +defm KXOR : avx512_mask_binop_all<0x47, "kxor", xor, 1>; +defm KANDN : avx512_mask_binop_all<0x42, "kandn", andn, 0>; +defm KADD : avx512_mask_binop_all<0x4A, "kadd", add, 1, HasDQI>; + +multiclass avx512_mask_binop_int<string IntName, string InstName> { + let Predicates = [HasAVX512] in + def : Pat<(!cast<Intrinsic>("int_x86_avx512_"##IntName##"_w") + (i16 GR16:$src1), (i16 GR16:$src2)), + (COPY_TO_REGCLASS (!cast<Instruction>(InstName##"Wrr") + (v16i1 (COPY_TO_REGCLASS GR16:$src1, VK16)), + (v16i1 (COPY_TO_REGCLASS GR16:$src2, VK16))), GR16)>; +} + +defm : avx512_mask_binop_int<"kand", "KAND">; +defm : avx512_mask_binop_int<"kandn", "KANDN">; +defm : avx512_mask_binop_int<"kor", "KOR">; +defm : avx512_mask_binop_int<"kxnor", "KXNOR">; +defm : avx512_mask_binop_int<"kxor", "KXOR">; + +multiclass avx512_binop_pat<SDPatternOperator OpNode, Instruction Inst> { + // With AVX512F, 8-bit mask is promoted to 16-bit mask, + // for the DQI set, this type is legal and KxxxB instruction is used + let Predicates = [NoDQI] in + def : Pat<(OpNode VK8:$src1, VK8:$src2), + (COPY_TO_REGCLASS + (Inst (COPY_TO_REGCLASS VK8:$src1, VK16), + (COPY_TO_REGCLASS VK8:$src2, VK16)), VK8)>; + + // All types smaller than 8 bits require conversion anyway + def : Pat<(OpNode VK1:$src1, VK1:$src2), + (COPY_TO_REGCLASS (Inst + (COPY_TO_REGCLASS VK1:$src1, VK16), + (COPY_TO_REGCLASS VK1:$src2, VK16)), VK1)>; + def : Pat<(OpNode VK2:$src1, VK2:$src2), + (COPY_TO_REGCLASS (Inst + (COPY_TO_REGCLASS VK2:$src1, VK16), + (COPY_TO_REGCLASS VK2:$src2, VK16)), VK1)>; + def : Pat<(OpNode VK4:$src1, VK4:$src2), + (COPY_TO_REGCLASS (Inst + (COPY_TO_REGCLASS VK4:$src1, VK16), + (COPY_TO_REGCLASS VK4:$src2, VK16)), VK1)>; +} + +defm : avx512_binop_pat<and, KANDWrr>; +defm : avx512_binop_pat<andn, KANDNWrr>; +defm : avx512_binop_pat<or, KORWrr>; +defm : avx512_binop_pat<xnor, KXNORWrr>; +defm : avx512_binop_pat<xor, KXORWrr>; + +def : Pat<(xor (xor VK16:$src1, VK16:$src2), (v16i1 immAllOnesV)), + (KXNORWrr VK16:$src1, VK16:$src2)>; +def : Pat<(xor (xor VK8:$src1, VK8:$src2), (v8i1 immAllOnesV)), + (KXNORBrr VK8:$src1, VK8:$src2)>, Requires<[HasDQI]>; +def : Pat<(xor (xor VK32:$src1, VK32:$src2), (v32i1 immAllOnesV)), + (KXNORDrr VK32:$src1, VK32:$src2)>, Requires<[HasBWI]>; +def : Pat<(xor (xor VK64:$src1, VK64:$src2), (v64i1 immAllOnesV)), + (KXNORQrr VK64:$src1, VK64:$src2)>, Requires<[HasBWI]>; + +let Predicates = [NoDQI] in +def : Pat<(xor (xor VK8:$src1, VK8:$src2), (v8i1 immAllOnesV)), + (COPY_TO_REGCLASS (KXNORWrr (COPY_TO_REGCLASS VK8:$src1, VK16), + (COPY_TO_REGCLASS VK8:$src2, VK16)), VK8)>; + +def : Pat<(xor (xor VK4:$src1, VK4:$src2), 
(v4i1 immAllOnesV)), + (COPY_TO_REGCLASS (KXNORWrr (COPY_TO_REGCLASS VK4:$src1, VK16), + (COPY_TO_REGCLASS VK4:$src2, VK16)), VK4)>; + +def : Pat<(xor (xor VK2:$src1, VK2:$src2), (v2i1 immAllOnesV)), + (COPY_TO_REGCLASS (KXNORWrr (COPY_TO_REGCLASS VK2:$src1, VK16), + (COPY_TO_REGCLASS VK2:$src2, VK16)), VK2)>; + +def : Pat<(xor (xor VK1:$src1, VK1:$src2), (i1 1)), + (COPY_TO_REGCLASS (KXNORWrr (COPY_TO_REGCLASS VK1:$src1, VK16), + (COPY_TO_REGCLASS VK1:$src2, VK16)), VK1)>; + +// Mask unpacking +multiclass avx512_mask_unpck<string Suffix,RegisterClass KRC, ValueType VT, + RegisterClass KRCSrc, Predicate prd> { + let Predicates = [prd] in { + let hasSideEffects = 0 in + def rr : I<0x4b, MRMSrcReg, (outs KRC:$dst), + (ins KRC:$src1, KRC:$src2), + "kunpck"#Suffix#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, + VEX_4V, VEX_L; + + def : Pat<(VT (concat_vectors KRCSrc:$src1, KRCSrc:$src2)), + (!cast<Instruction>(NAME##rr) + (COPY_TO_REGCLASS KRCSrc:$src2, KRC), + (COPY_TO_REGCLASS KRCSrc:$src1, KRC))>; + } +} + +defm KUNPCKBW : avx512_mask_unpck<"bw", VK16, v16i1, VK8, HasAVX512>, PD; +defm KUNPCKWD : avx512_mask_unpck<"wd", VK32, v32i1, VK16, HasBWI>, PS; +defm KUNPCKDQ : avx512_mask_unpck<"dq", VK64, v64i1, VK32, HasBWI>, PS, VEX_W; + +// Mask bit testing +multiclass avx512_mask_testop<bits<8> opc, string OpcodeStr, RegisterClass KRC, + SDNode OpNode, Predicate prd> { + let Predicates = [prd], Defs = [EFLAGS] in + def rr : I<opc, MRMSrcReg, (outs), (ins KRC:$src1, KRC:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"), + [(set EFLAGS, (OpNode KRC:$src1, KRC:$src2))]>; +} + +multiclass avx512_mask_testop_w<bits<8> opc, string OpcodeStr, SDNode OpNode, + Predicate prdW = HasAVX512> { + defm B : avx512_mask_testop<opc, OpcodeStr#"b", VK8, OpNode, HasDQI>, + VEX, PD; + defm W : avx512_mask_testop<opc, OpcodeStr#"w", VK16, OpNode, prdW>, + VEX, PS; + defm Q : avx512_mask_testop<opc, OpcodeStr#"q", VK64, OpNode, HasBWI>, + VEX, PS, VEX_W; + defm D : avx512_mask_testop<opc, OpcodeStr#"d", VK32, OpNode, HasBWI>, + VEX, PD, VEX_W; +} + +defm KORTEST : avx512_mask_testop_w<0x98, "kortest", X86kortest>; +defm KTEST : avx512_mask_testop_w<0x99, "ktest", X86ktest, HasDQI>; + +// Mask shift +multiclass avx512_mask_shiftop<bits<8> opc, string OpcodeStr, RegisterClass KRC, + SDNode OpNode> { + let Predicates = [HasAVX512] in + def ri : Ii8<opc, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src, u8imm:$imm), + !strconcat(OpcodeStr, + "\t{$imm, $src, $dst|$dst, $src, $imm}"), + [(set KRC:$dst, (OpNode KRC:$src, (i8 imm:$imm)))]>; +} + +multiclass avx512_mask_shiftop_w<bits<8> opc1, bits<8> opc2, string OpcodeStr, + SDNode OpNode> { + defm W : avx512_mask_shiftop<opc1, !strconcat(OpcodeStr, "w"), VK16, OpNode>, + VEX, TAPD, VEX_W; + let Predicates = [HasDQI] in + defm B : avx512_mask_shiftop<opc1, !strconcat(OpcodeStr, "b"), VK8, OpNode>, + VEX, TAPD; + let Predicates = [HasBWI] in { + defm Q : avx512_mask_shiftop<opc2, !strconcat(OpcodeStr, "q"), VK64, OpNode>, + VEX, TAPD, VEX_W; + let Predicates = [HasDQI] in + defm D : avx512_mask_shiftop<opc2, !strconcat(OpcodeStr, "d"), VK32, OpNode>, + VEX, TAPD; + } +} + +defm KSHIFTL : avx512_mask_shiftop_w<0x32, 0x33, "kshiftl", X86vshli>; +defm KSHIFTR : avx512_mask_shiftop_w<0x30, 0x31, "kshiftr", X86vsrli>; + +// Mask setting all 0s or 1s +multiclass avx512_mask_setop<RegisterClass KRC, ValueType VT, PatFrag Val> { + let Predicates = [HasAVX512] in + let isReMaterializable = 1, isAsCheapAsAMove = 1, isPseudo = 1 in + def #NAME# : I<0, Pseudo, (outs 
KRC:$dst), (ins), "", + [(set KRC:$dst, (VT Val))]>; +} + +multiclass avx512_mask_setop_w<PatFrag Val> { + defm B : avx512_mask_setop<VK8, v8i1, Val>; + defm W : avx512_mask_setop<VK16, v16i1, Val>; + defm D : avx512_mask_setop<VK32, v32i1, Val>; + defm Q : avx512_mask_setop<VK64, v64i1, Val>; +} + +defm KSET0 : avx512_mask_setop_w<immAllZerosV>; +defm KSET1 : avx512_mask_setop_w<immAllOnesV>; + +// With AVX-512 only, 8-bit mask is promoted to 16-bit mask. +let Predicates = [HasAVX512] in { + def : Pat<(v8i1 immAllZerosV), (COPY_TO_REGCLASS (KSET0W), VK8)>; + def : Pat<(v8i1 immAllOnesV), (COPY_TO_REGCLASS (KSET1W), VK8)>; + def : Pat<(v4i1 immAllOnesV), (COPY_TO_REGCLASS (KSET1W), VK4)>; + def : Pat<(v2i1 immAllOnesV), (COPY_TO_REGCLASS (KSET1W), VK2)>; + def : Pat<(i1 0), (COPY_TO_REGCLASS (KSET0W), VK1)>; + def : Pat<(i1 1), (COPY_TO_REGCLASS (KSHIFTRWri (KSET1W), (i8 15)), VK1)>; + def : Pat<(i1 -1), (COPY_TO_REGCLASS (KSHIFTRWri (KSET1W), (i8 15)), VK1)>; +} +def : Pat<(v8i1 (extract_subvector (v16i1 VK16:$src), (iPTR 0))), + (v8i1 (COPY_TO_REGCLASS VK16:$src, VK8))>; + +def : Pat<(v16i1 (insert_subvector undef, (v8i1 VK8:$src), (iPTR 0))), + (v16i1 (COPY_TO_REGCLASS VK8:$src, VK16))>; + +def : Pat<(v8i1 (extract_subvector (v16i1 VK16:$src), (iPTR 8))), + (v8i1 (COPY_TO_REGCLASS (KSHIFTRWri VK16:$src, (i8 8)), VK8))>; + +def : Pat<(v16i1 (extract_subvector (v32i1 VK32:$src), (iPTR 0))), + (v16i1 (COPY_TO_REGCLASS VK32:$src, VK16))>; + +def : Pat<(v16i1 (extract_subvector (v32i1 VK32:$src), (iPTR 16))), + (v16i1 (COPY_TO_REGCLASS (KSHIFTRDri VK32:$src, (i8 16)), VK16))>; + +def : Pat<(v32i1 (extract_subvector (v64i1 VK64:$src), (iPTR 0))), + (v32i1 (COPY_TO_REGCLASS VK64:$src, VK32))>; + +def : Pat<(v32i1 (extract_subvector (v64i1 VK64:$src), (iPTR 32))), + (v32i1 (COPY_TO_REGCLASS (KSHIFTRQri VK64:$src, (i8 32)), VK32))>; + +def : Pat<(v4i1 (extract_subvector (v8i1 VK8:$src), (iPTR 0))), + (v4i1 (COPY_TO_REGCLASS VK8:$src, VK4))>; + +def : Pat<(v2i1 (extract_subvector (v8i1 VK8:$src), (iPTR 0))), + (v2i1 (COPY_TO_REGCLASS VK8:$src, VK2))>; + +def : Pat<(v4i1 (insert_subvector undef, (v2i1 VK2:$src), (iPTR 0))), + (v4i1 (COPY_TO_REGCLASS VK2:$src, VK4))>; + +def : Pat<(v8i1 (insert_subvector undef, (v4i1 VK4:$src), (iPTR 0))), + (v8i1 (COPY_TO_REGCLASS VK4:$src, VK8))>; +def : Pat<(v8i1 (insert_subvector undef, (v2i1 VK2:$src), (iPTR 0))), + (v8i1 (COPY_TO_REGCLASS VK2:$src, VK8))>; + +def : Pat<(v32i1 (insert_subvector undef, VK2:$src, (iPTR 0))), + (v32i1 (COPY_TO_REGCLASS VK2:$src, VK32))>; +def : Pat<(v32i1 (insert_subvector undef, VK4:$src, (iPTR 0))), + (v32i1 (COPY_TO_REGCLASS VK4:$src, VK32))>; +def : Pat<(v32i1 (insert_subvector undef, VK8:$src, (iPTR 0))), + (v32i1 (COPY_TO_REGCLASS VK8:$src, VK32))>; +def : Pat<(v32i1 (insert_subvector undef, VK16:$src, (iPTR 0))), + (v32i1 (COPY_TO_REGCLASS VK16:$src, VK32))>; + +def : Pat<(v64i1 (insert_subvector undef, VK2:$src, (iPTR 0))), + (v64i1 (COPY_TO_REGCLASS VK2:$src, VK64))>; +def : Pat<(v64i1 (insert_subvector undef, VK4:$src, (iPTR 0))), + (v64i1 (COPY_TO_REGCLASS VK4:$src, VK64))>; +def : Pat<(v64i1 (insert_subvector undef, VK8:$src, (iPTR 0))), + (v64i1 (COPY_TO_REGCLASS VK8:$src, VK64))>; +def : Pat<(v64i1 (insert_subvector undef, VK16:$src, (iPTR 0))), + (v64i1 (COPY_TO_REGCLASS VK16:$src, VK64))>; +def : Pat<(v64i1 (insert_subvector undef, VK32:$src, (iPTR 0))), + (v64i1 (COPY_TO_REGCLASS VK32:$src, VK64))>; + + +def : Pat<(v8i1 (X86vshli VK8:$src, (i8 imm:$imm))), + (v8i1 (COPY_TO_REGCLASS + (KSHIFTLWri (COPY_TO_REGCLASS 
VK8:$src, VK16), + (I8Imm $imm)), VK8))>, Requires<[HasAVX512, NoDQI]>; + +def : Pat<(v8i1 (X86vsrli VK8:$src, (i8 imm:$imm))), + (v8i1 (COPY_TO_REGCLASS + (KSHIFTRWri (COPY_TO_REGCLASS VK8:$src, VK16), + (I8Imm $imm)), VK8))>, Requires<[HasAVX512, NoDQI]>; + +def : Pat<(v4i1 (X86vshli VK4:$src, (i8 imm:$imm))), + (v4i1 (COPY_TO_REGCLASS + (KSHIFTLWri (COPY_TO_REGCLASS VK4:$src, VK16), + (I8Imm $imm)), VK4))>, Requires<[HasAVX512]>; + +def : Pat<(v4i1 (X86vsrli VK4:$src, (i8 imm:$imm))), + (v4i1 (COPY_TO_REGCLASS + (KSHIFTRWri (COPY_TO_REGCLASS VK4:$src, VK16), + (I8Imm $imm)), VK4))>, Requires<[HasAVX512]>; + +//===----------------------------------------------------------------------===// +// AVX-512 - Aligned and unaligned load and store +// + + +multiclass avx512_load<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, + PatFrag ld_frag, PatFrag mload, + bit IsReMaterializable = 1> { + let hasSideEffects = 0 in { + def rr : AVX512PI<opc, MRMSrcReg, (outs _.RC:$dst), (ins _.RC:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [], + _.ExeDomain>, EVEX; + def rrkz : AVX512PI<opc, MRMSrcReg, (outs _.RC:$dst), + (ins _.KRCWM:$mask, _.RC:$src), + !strconcat(OpcodeStr, "\t{$src, ${dst} {${mask}} {z}|", + "${dst} {${mask}} {z}, $src}"), [], _.ExeDomain>, + EVEX, EVEX_KZ; + + let canFoldAsLoad = 1, isReMaterializable = IsReMaterializable, + SchedRW = [WriteLoad] in + def rm : AVX512PI<opc, MRMSrcMem, (outs _.RC:$dst), (ins _.MemOp:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set _.RC:$dst, (_.VT (bitconvert (ld_frag addr:$src))))], + _.ExeDomain>, EVEX; + + let Constraints = "$src0 = $dst" in { + def rrk : AVX512PI<opc, MRMSrcReg, (outs _.RC:$dst), + (ins _.RC:$src0, _.KRCWM:$mask, _.RC:$src1), + !strconcat(OpcodeStr, "\t{$src1, ${dst} {${mask}}|", + "${dst} {${mask}}, $src1}"), + [(set _.RC:$dst, (_.VT (vselect _.KRCWM:$mask, + (_.VT _.RC:$src1), + (_.VT _.RC:$src0))))], _.ExeDomain>, + EVEX, EVEX_K; + let mayLoad = 1, SchedRW = [WriteLoad] in + def rmk : AVX512PI<opc, MRMSrcMem, (outs _.RC:$dst), + (ins _.RC:$src0, _.KRCWM:$mask, _.MemOp:$src1), + !strconcat(OpcodeStr, "\t{$src1, ${dst} {${mask}}|", + "${dst} {${mask}}, $src1}"), + [(set _.RC:$dst, (_.VT + (vselect _.KRCWM:$mask, + (_.VT (bitconvert (ld_frag addr:$src1))), + (_.VT _.RC:$src0))))], _.ExeDomain>, EVEX, EVEX_K; + } + let mayLoad = 1, SchedRW = [WriteLoad] in + def rmkz : AVX512PI<opc, MRMSrcMem, (outs _.RC:$dst), + (ins _.KRCWM:$mask, _.MemOp:$src), + OpcodeStr #"\t{$src, ${dst} {${mask}} {z}|"# + "${dst} {${mask}} {z}, $src}", + [(set _.RC:$dst, (_.VT (vselect _.KRCWM:$mask, + (_.VT (bitconvert (ld_frag addr:$src))), _.ImmAllZerosV)))], + _.ExeDomain>, EVEX, EVEX_KZ; + } + def : Pat<(_.VT (mload addr:$ptr, _.KRCWM:$mask, undef)), + (!cast<Instruction>(NAME#_.ZSuffix##rmkz) _.KRCWM:$mask, addr:$ptr)>; + + def : Pat<(_.VT (mload addr:$ptr, _.KRCWM:$mask, _.ImmAllZerosV)), + (!cast<Instruction>(NAME#_.ZSuffix##rmkz) _.KRCWM:$mask, addr:$ptr)>; + + def : Pat<(_.VT (mload addr:$ptr, _.KRCWM:$mask, (_.VT _.RC:$src0))), + (!cast<Instruction>(NAME#_.ZSuffix##rmk) _.RC:$src0, + _.KRCWM:$mask, addr:$ptr)>; +} + +multiclass avx512_alignedload_vl<bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo _, + Predicate prd, + bit IsReMaterializable = 1> { + let Predicates = [prd] in + defm Z : avx512_load<opc, OpcodeStr, _.info512, _.info512.AlignedLdFrag, + masked_load_aligned512, IsReMaterializable>, EVEX_V512; + + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_load<opc, OpcodeStr, _.info256, 
_.info256.AlignedLdFrag, + masked_load_aligned256, IsReMaterializable>, EVEX_V256; + defm Z128 : avx512_load<opc, OpcodeStr, _.info128, _.info128.AlignedLdFrag, + masked_load_aligned128, IsReMaterializable>, EVEX_V128; + } +} + +multiclass avx512_load_vl<bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo _, + Predicate prd, + bit IsReMaterializable = 1> { + let Predicates = [prd] in + defm Z : avx512_load<opc, OpcodeStr, _.info512, _.info512.LdFrag, + masked_load_unaligned, IsReMaterializable>, EVEX_V512; + + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_load<opc, OpcodeStr, _.info256, _.info256.LdFrag, + masked_load_unaligned, IsReMaterializable>, EVEX_V256; + defm Z128 : avx512_load<opc, OpcodeStr, _.info128, _.info128.LdFrag, + masked_load_unaligned, IsReMaterializable>, EVEX_V128; + } +} + +multiclass avx512_store<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, + PatFrag st_frag, PatFrag mstore> { + + def rr_REV : AVX512PI<opc, MRMDestReg, (outs _.RC:$dst), (ins _.RC:$src), + OpcodeStr # ".s\t{$src, $dst|$dst, $src}", + [], _.ExeDomain>, EVEX; + def rrk_REV : AVX512PI<opc, MRMDestReg, (outs _.RC:$dst), + (ins _.KRCWM:$mask, _.RC:$src), + OpcodeStr # ".s\t{$src, ${dst} {${mask}}|"# + "${dst} {${mask}}, $src}", + [], _.ExeDomain>, EVEX, EVEX_K; + def rrkz_REV : AVX512PI<opc, MRMDestReg, (outs _.RC:$dst), + (ins _.KRCWM:$mask, _.RC:$src), + OpcodeStr # ".s\t{$src, ${dst} {${mask}} {z}|" # + "${dst} {${mask}} {z}, $src}", + [], _.ExeDomain>, EVEX, EVEX_KZ; + + let mayStore = 1 in { + def mr : AVX512PI<opc, MRMDestMem, (outs), (ins _.MemOp:$dst, _.RC:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(st_frag (_.VT _.RC:$src), addr:$dst)], _.ExeDomain>, EVEX; + def mrk : AVX512PI<opc, MRMDestMem, (outs), + (ins _.MemOp:$dst, _.KRCWM:$mask, _.RC:$src), + OpcodeStr # "\t{$src, ${dst} {${mask}}|${dst} {${mask}}, $src}", + [], _.ExeDomain>, EVEX, EVEX_K; + } + + def: Pat<(mstore addr:$ptr, _.KRCWM:$mask, (_.VT _.RC:$src)), + (!cast<Instruction>(NAME#_.ZSuffix##mrk) addr:$ptr, + _.KRCWM:$mask, _.RC:$src)>; +} + + +multiclass avx512_store_vl< bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo _, Predicate prd> { + let Predicates = [prd] in + defm Z : avx512_store<opc, OpcodeStr, _.info512, store, + masked_store_unaligned>, EVEX_V512; + + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_store<opc, OpcodeStr, _.info256, store, + masked_store_unaligned>, EVEX_V256; + defm Z128 : avx512_store<opc, OpcodeStr, _.info128, store, + masked_store_unaligned>, EVEX_V128; + } +} + +multiclass avx512_alignedstore_vl<bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo _, Predicate prd> { + let Predicates = [prd] in + defm Z : avx512_store<opc, OpcodeStr, _.info512, alignedstore512, + masked_store_aligned512>, EVEX_V512; + + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_store<opc, OpcodeStr, _.info256, alignedstore256, + masked_store_aligned256>, EVEX_V256; + defm Z128 : avx512_store<opc, OpcodeStr, _.info128, alignedstore, + masked_store_aligned128>, EVEX_V128; + } +} + +defm VMOVAPS : avx512_alignedload_vl<0x28, "vmovaps", avx512vl_f32_info, + HasAVX512>, + avx512_alignedstore_vl<0x29, "vmovaps", avx512vl_f32_info, + HasAVX512>, PS, EVEX_CD8<32, CD8VF>; + +defm VMOVAPD : avx512_alignedload_vl<0x28, "vmovapd", avx512vl_f64_info, + HasAVX512>, + avx512_alignedstore_vl<0x29, "vmovapd", avx512vl_f64_info, + HasAVX512>, PD, VEX_W, EVEX_CD8<64, CD8VF>; + +defm VMOVUPS : avx512_load_vl<0x10, "vmovups", avx512vl_f32_info, HasAVX512>, + avx512_store_vl<0x11, 
"vmovups", avx512vl_f32_info, HasAVX512>, + PS, EVEX_CD8<32, CD8VF>; + +defm VMOVUPD : avx512_load_vl<0x10, "vmovupd", avx512vl_f64_info, HasAVX512, 0>, + avx512_store_vl<0x11, "vmovupd", avx512vl_f64_info, HasAVX512>, + PD, VEX_W, EVEX_CD8<64, CD8VF>; + +def: Pat<(v8f64 (int_x86_avx512_mask_loadu_pd_512 addr:$ptr, + (bc_v8f64 (v16i32 immAllZerosV)), GR8:$mask)), + (VMOVUPDZrmkz (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), addr:$ptr)>; + +def: Pat<(v16f32 (int_x86_avx512_mask_loadu_ps_512 addr:$ptr, + (bc_v16f32 (v16i32 immAllZerosV)), GR16:$mask)), + (VMOVUPSZrmkz (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)), addr:$ptr)>; + +def: Pat<(v8f64 (int_x86_avx512_mask_load_pd_512 addr:$ptr, + (bc_v8f64 (v16i32 immAllZerosV)), GR8:$mask)), + (VMOVAPDZrmkz (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), addr:$ptr)>; + +def: Pat<(v16f32 (int_x86_avx512_mask_load_ps_512 addr:$ptr, + (bc_v16f32 (v16i32 immAllZerosV)), GR16:$mask)), + (VMOVAPSZrmkz (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)), addr:$ptr)>; + +def: Pat<(v8f64 (int_x86_avx512_mask_load_pd_512 addr:$ptr, + (bc_v8f64 (v16i32 immAllZerosV)), (i8 -1))), + (VMOVAPDZrm addr:$ptr)>; + +def: Pat<(v16f32 (int_x86_avx512_mask_load_ps_512 addr:$ptr, + (bc_v16f32 (v16i32 immAllZerosV)), (i16 -1))), + (VMOVAPSZrm addr:$ptr)>; + +def: Pat<(int_x86_avx512_mask_storeu_ps_512 addr:$ptr, (v16f32 VR512:$src), + GR16:$mask), + (VMOVUPSZmrk addr:$ptr, (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)), + VR512:$src)>; +def: Pat<(int_x86_avx512_mask_storeu_pd_512 addr:$ptr, (v8f64 VR512:$src), + GR8:$mask), + (VMOVUPDZmrk addr:$ptr, (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), + VR512:$src)>; + +def: Pat<(int_x86_avx512_mask_store_ps_512 addr:$ptr, (v16f32 VR512:$src), + GR16:$mask), + (VMOVAPSZmrk addr:$ptr, (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)), + VR512:$src)>; +def: Pat<(int_x86_avx512_mask_store_pd_512 addr:$ptr, (v8f64 VR512:$src), + GR8:$mask), + (VMOVAPDZmrk addr:$ptr, (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), + VR512:$src)>; + +defm VMOVDQA32 : avx512_alignedload_vl<0x6F, "vmovdqa32", avx512vl_i32_info, + HasAVX512>, + avx512_alignedstore_vl<0x7F, "vmovdqa32", avx512vl_i32_info, + HasAVX512>, PD, EVEX_CD8<32, CD8VF>; + +defm VMOVDQA64 : avx512_alignedload_vl<0x6F, "vmovdqa64", avx512vl_i64_info, + HasAVX512>, + avx512_alignedstore_vl<0x7F, "vmovdqa64", avx512vl_i64_info, + HasAVX512>, PD, VEX_W, EVEX_CD8<64, CD8VF>; + +defm VMOVDQU8 : avx512_load_vl<0x6F, "vmovdqu8", avx512vl_i8_info, HasBWI>, + avx512_store_vl<0x7F, "vmovdqu8", avx512vl_i8_info, + HasBWI>, XD, EVEX_CD8<8, CD8VF>; + +defm VMOVDQU16 : avx512_load_vl<0x6F, "vmovdqu16", avx512vl_i16_info, HasBWI>, + avx512_store_vl<0x7F, "vmovdqu16", avx512vl_i16_info, + HasBWI>, XD, VEX_W, EVEX_CD8<16, CD8VF>; + +defm VMOVDQU32 : avx512_load_vl<0x6F, "vmovdqu32", avx512vl_i32_info, HasAVX512>, + avx512_store_vl<0x7F, "vmovdqu32", avx512vl_i32_info, + HasAVX512>, XS, EVEX_CD8<32, CD8VF>; + +defm VMOVDQU64 : avx512_load_vl<0x6F, "vmovdqu64", avx512vl_i64_info, HasAVX512>, + avx512_store_vl<0x7F, "vmovdqu64", avx512vl_i64_info, + HasAVX512>, XS, VEX_W, EVEX_CD8<64, CD8VF>; + +def: Pat<(v16i32 (int_x86_avx512_mask_loadu_d_512 addr:$ptr, + (v16i32 immAllZerosV), GR16:$mask)), + (VMOVDQU32Zrmkz (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)), addr:$ptr)>; + +def: Pat<(v8i64 (int_x86_avx512_mask_loadu_q_512 addr:$ptr, + (bc_v8i64 (v16i32 immAllZerosV)), GR8:$mask)), + (VMOVDQU64Zrmkz (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), addr:$ptr)>; + +def: Pat<(int_x86_avx512_mask_storeu_d_512 addr:$ptr, (v16i32 
VR512:$src), + GR16:$mask), + (VMOVDQU32Zmrk addr:$ptr, (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)), + VR512:$src)>; +def: Pat<(int_x86_avx512_mask_storeu_q_512 addr:$ptr, (v8i64 VR512:$src), + GR8:$mask), + (VMOVDQU64Zmrk addr:$ptr, (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), + VR512:$src)>; + +let AddedComplexity = 20 in { +def : Pat<(v8i64 (vselect VK8WM:$mask, (v8i64 VR512:$src), + (bc_v8i64 (v16i32 immAllZerosV)))), + (VMOVDQU64Zrrkz VK8WM:$mask, VR512:$src)>; + +def : Pat<(v8i64 (vselect VK8WM:$mask, (bc_v8i64 (v16i32 immAllZerosV)), + (v8i64 VR512:$src))), + (VMOVDQU64Zrrkz (COPY_TO_REGCLASS (KNOTWrr (COPY_TO_REGCLASS VK8:$mask, VK16)), + VK8), VR512:$src)>; + +def : Pat<(v16i32 (vselect VK16WM:$mask, (v16i32 VR512:$src), + (v16i32 immAllZerosV))), + (VMOVDQU32Zrrkz VK16WM:$mask, VR512:$src)>; + +def : Pat<(v16i32 (vselect VK16WM:$mask, (v16i32 immAllZerosV), + (v16i32 VR512:$src))), + (VMOVDQU32Zrrkz (KNOTWrr VK16WM:$mask), VR512:$src)>; +} + +// Move Int Doubleword to Packed Double Int +// +def VMOVDI2PDIZrr : AVX512BI<0x6E, MRMSrcReg, (outs VR128X:$dst), (ins GR32:$src), + "vmovd\t{$src, $dst|$dst, $src}", + [(set VR128X:$dst, + (v4i32 (scalar_to_vector GR32:$src)))], IIC_SSE_MOVDQ>, + EVEX; +def VMOVDI2PDIZrm : AVX512BI<0x6E, MRMSrcMem, (outs VR128X:$dst), (ins i32mem:$src), + "vmovd\t{$src, $dst|$dst, $src}", + [(set VR128X:$dst, + (v4i32 (scalar_to_vector (loadi32 addr:$src))))], + IIC_SSE_MOVDQ>, EVEX, EVEX_CD8<32, CD8VT1>; +def VMOV64toPQIZrr : AVX512BI<0x6E, MRMSrcReg, (outs VR128X:$dst), (ins GR64:$src), + "vmovq\t{$src, $dst|$dst, $src}", + [(set VR128X:$dst, + (v2i64 (scalar_to_vector GR64:$src)))], + IIC_SSE_MOVDQ>, EVEX, VEX_W; +let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in +def VMOV64toPQIZrm : AVX512BI<0x6E, MRMSrcMem, (outs VR128X:$dst), + (ins i64mem:$src), + "vmovq\t{$src, $dst|$dst, $src}", []>, + EVEX, VEX_W, EVEX_CD8<64, CD8VT1>; +let isCodeGenOnly = 1 in { +def VMOV64toSDZrr : AVX512BI<0x6E, MRMSrcReg, (outs FR64X:$dst), (ins GR64:$src), + "vmovq\t{$src, $dst|$dst, $src}", + [(set FR64X:$dst, (bitconvert GR64:$src))], + IIC_SSE_MOVDQ>, EVEX, VEX_W, Sched<[WriteMove]>; +def VMOVSDto64Zrr : AVX512BI<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64X:$src), + "vmovq\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (bitconvert FR64X:$src))], + IIC_SSE_MOVDQ>, EVEX, VEX_W, Sched<[WriteMove]>; +def VMOVSDto64Zmr : AVX512BI<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64X:$src), + "vmovq\t{$src, $dst|$dst, $src}", + [(store (i64 (bitconvert FR64X:$src)), addr:$dst)], + IIC_SSE_MOVDQ>, EVEX, VEX_W, Sched<[WriteStore]>, + EVEX_CD8<64, CD8VT1>; +} + +// Move Int Doubleword to Single Scalar +// +let isCodeGenOnly = 1 in { +def VMOVDI2SSZrr : AVX512BI<0x6E, MRMSrcReg, (outs FR32X:$dst), (ins GR32:$src), + "vmovd\t{$src, $dst|$dst, $src}", + [(set FR32X:$dst, (bitconvert GR32:$src))], + IIC_SSE_MOVDQ>, EVEX; + +def VMOVDI2SSZrm : AVX512BI<0x6E, MRMSrcMem, (outs FR32X:$dst), (ins i32mem:$src), + "vmovd\t{$src, $dst|$dst, $src}", + [(set FR32X:$dst, (bitconvert (loadi32 addr:$src)))], + IIC_SSE_MOVDQ>, EVEX, EVEX_CD8<32, CD8VT1>; +} + +// Move doubleword from xmm register to r/m32 +// +def VMOVPDI2DIZrr : AVX512BI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128X:$src), + "vmovd\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (extractelt (v4i32 VR128X:$src), + (iPTR 0)))], IIC_SSE_MOVD_ToGP>, + EVEX; +def VMOVPDI2DIZmr : AVX512BI<0x7E, MRMDestMem, (outs), + (ins i32mem:$dst, VR128X:$src), + "vmovd\t{$src, $dst|$dst, $src}", + [(store (i32 
(extractelt (v4i32 VR128X:$src), + (iPTR 0))), addr:$dst)], IIC_SSE_MOVDQ>, + EVEX, EVEX_CD8<32, CD8VT1>; + +// Move quadword from xmm1 register to r/m64 +// +def VMOVPQIto64Zrr : I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128X:$src), + "vmovq\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (extractelt (v2i64 VR128X:$src), + (iPTR 0)))], + IIC_SSE_MOVD_ToGP>, PD, EVEX, VEX_W, + Requires<[HasAVX512, In64BitMode]>; + +let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in +def VMOVPQIto64Zmr : I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, VR128X:$src), + "vmovq\t{$src, $dst|$dst, $src}", + [], IIC_SSE_MOVD_ToGP>, PD, EVEX, VEX_W, + Requires<[HasAVX512, In64BitMode]>; + +def VMOVPQI2QIZmr : I<0xD6, MRMDestMem, (outs), + (ins i64mem:$dst, VR128X:$src), + "vmovq\t{$src, $dst|$dst, $src}", + [(store (extractelt (v2i64 VR128X:$src), (iPTR 0)), + addr:$dst)], IIC_SSE_MOVDQ>, + EVEX, PD, VEX_W, EVEX_CD8<64, CD8VT1>, + Sched<[WriteStore]>, Requires<[HasAVX512, In64BitMode]>; + +let hasSideEffects = 0 in +def VMOVPQI2QIZrr : AVX512BI<0xD6, MRMDestReg, (outs VR128X:$dst), + (ins VR128X:$src), + "vmovq.s\t{$src, $dst|$dst, $src}",[]>, + EVEX, VEX_W; + +// Move Scalar Single to Double Int +// +let isCodeGenOnly = 1 in { +def VMOVSS2DIZrr : AVX512BI<0x7E, MRMDestReg, (outs GR32:$dst), + (ins FR32X:$src), + "vmovd\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (bitconvert FR32X:$src))], + IIC_SSE_MOVD_ToGP>, EVEX; +def VMOVSS2DIZmr : AVX512BI<0x7E, MRMDestMem, (outs), + (ins i32mem:$dst, FR32X:$src), + "vmovd\t{$src, $dst|$dst, $src}", + [(store (i32 (bitconvert FR32X:$src)), addr:$dst)], + IIC_SSE_MOVDQ>, EVEX, EVEX_CD8<32, CD8VT1>; +} + +// Move Quadword Int to Packed Quadword Int +// +def VMOVQI2PQIZrm : AVX512XSI<0x7E, MRMSrcMem, (outs VR128X:$dst), + (ins i64mem:$src), + "vmovq\t{$src, $dst|$dst, $src}", + [(set VR128X:$dst, + (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, + EVEX, VEX_W, EVEX_CD8<8, CD8VT8>; + +//===----------------------------------------------------------------------===// +// AVX-512 MOVSS, MOVSD +//===----------------------------------------------------------------------===// + +multiclass avx512_move_scalar <string asm, SDNode OpNode, + X86VectorVTInfo _> { + defm rr_Int : AVX512_maskable_scalar<0x10, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2), + asm, "$src2, $src1","$src1, $src2", + (_.VT (OpNode (_.VT _.RC:$src1), + (_.VT _.RC:$src2))), + IIC_SSE_MOV_S_RR>, EVEX_4V; + let Constraints = "$src1 = $dst" , mayLoad = 1 in + defm rm_Int : AVX512_maskable_3src_scalar<0x10, MRMSrcMem, _, + (outs _.RC:$dst), + (ins _.ScalarMemOp:$src), + asm,"$src","$src", + (_.VT (OpNode (_.VT _.RC:$src1), + (_.VT (scalar_to_vector + (_.ScalarLdFrag addr:$src)))))>, EVEX; + let isCodeGenOnly = 1 in { + def rr : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst), + (ins _.RC:$src1, _.FRC:$src2), + !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set _.RC:$dst, (_.VT (OpNode _.RC:$src1, + (scalar_to_vector _.FRC:$src2))))], + _.ExeDomain,IIC_SSE_MOV_S_RR>, EVEX_4V; + let mayLoad = 1 in + def rm : AVX512PI<0x10, MRMSrcMem, (outs _.FRC:$dst), (ins _.ScalarMemOp:$src), + !strconcat(asm, "\t{$src, $dst|$dst, $src}"), + [(set _.FRC:$dst, (_.ScalarLdFrag addr:$src))], + _.ExeDomain, IIC_SSE_MOV_S_RM>, EVEX; + } + let mayStore = 1 in { + def mr: AVX512PI<0x11, MRMDestMem, (outs), (ins _.ScalarMemOp:$dst, _.FRC:$src), + !strconcat(asm, "\t{$src, $dst|$dst, $src}"), + [(store _.FRC:$src, addr:$dst)], _.ExeDomain, IIC_SSE_MOV_S_MR>, + EVEX; + def mrk: 
AVX512PI<0x11, MRMDestMem, (outs), + (ins _.ScalarMemOp:$dst, VK1WM:$mask, _.FRC:$src), + !strconcat(asm, "\t{$src, $dst {${mask}}|$dst {${mask}}, $src}"), + [], _.ExeDomain, IIC_SSE_MOV_S_MR>, EVEX, EVEX_K; + } // mayStore +} + +defm VMOVSSZ : avx512_move_scalar<"vmovss", X86Movss, f32x_info>, + VEX_LIG, XS, EVEX_CD8<32, CD8VT1>; + +defm VMOVSDZ : avx512_move_scalar<"vmovsd", X86Movsd, f64x_info>, + VEX_LIG, XD, VEX_W, EVEX_CD8<64, CD8VT1>; + +def : Pat<(f32 (X86select VK1WM:$mask, (f32 FR32X:$src1), (f32 FR32X:$src2))), + (COPY_TO_REGCLASS (VMOVSSZrr_Intk (COPY_TO_REGCLASS FR32X:$src2, VR128X), + VK1WM:$mask, (v4f32 (IMPLICIT_DEF)),(COPY_TO_REGCLASS FR32X:$src1, VR128X)), FR32X)>; + +def : Pat<(f64 (X86select VK1WM:$mask, (f64 FR64X:$src1), (f64 FR64X:$src2))), + (COPY_TO_REGCLASS (VMOVSDZrr_Intk (COPY_TO_REGCLASS FR64X:$src2, VR128X), + VK1WM:$mask, (v2f64 (IMPLICIT_DEF)), (COPY_TO_REGCLASS FR64X:$src1, VR128X)), FR64X)>; + +def : Pat<(int_x86_avx512_mask_store_ss addr:$dst, VR128X:$src, GR8:$mask), + (VMOVSSZmrk addr:$dst, (i1 (COPY_TO_REGCLASS GR8:$mask, VK1WM)), + (COPY_TO_REGCLASS VR128X:$src, FR32X))>; + +defm VMOVSSZrr_REV : AVX512_maskable_in_asm<0x11, MRMDestReg, f32x_info, + (outs VR128X:$dst), (ins VR128X:$src1, VR128X:$src2), + "vmovss.s", "$src2, $src1", "$src1, $src2", []>, + XS, EVEX_4V, VEX_LIG; + +defm VMOVSSDrr_REV : AVX512_maskable_in_asm<0x11, MRMDestReg, f64x_info, + (outs VR128X:$dst), (ins VR128X:$src1, VR128X:$src2), + "vmovsd.s", "$src2, $src1", "$src1, $src2", []>, + XD, EVEX_4V, VEX_LIG, VEX_W; + +let Predicates = [HasAVX512] in { + let AddedComplexity = 15 in { + // Move scalar to XMM zero-extended, zeroing a VR128X then do a + // MOVS{S,D} to the lower bits. + def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32X:$src)))), + (VMOVSSZrr (v4f32 (V_SET0)), FR32X:$src)>; + def : Pat<(v4f32 (X86vzmovl (v4f32 VR128X:$src))), + (VMOVSSZrr (v4f32 (V_SET0)), (COPY_TO_REGCLASS VR128X:$src, FR32X))>; + def : Pat<(v4i32 (X86vzmovl (v4i32 VR128X:$src))), + (VMOVSSZrr (v4i32 (V_SET0)), (COPY_TO_REGCLASS VR128X:$src, FR32X))>; + def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64X:$src)))), + (VMOVSDZrr (v2f64 (V_SET0)), FR64X:$src)>; + + // Move low f32 and clear high bits. + def : Pat<(v8f32 (X86vzmovl (v8f32 VR256X:$src))), + (SUBREG_TO_REG (i32 0), + (VMOVSSZrr (v4f32 (V_SET0)), + (EXTRACT_SUBREG (v8f32 VR256X:$src), sub_xmm)), sub_xmm)>; + def : Pat<(v8i32 (X86vzmovl (v8i32 VR256X:$src))), + (SUBREG_TO_REG (i32 0), + (VMOVSSZrr (v4i32 (V_SET0)), + (EXTRACT_SUBREG (v8i32 VR256X:$src), sub_xmm)), sub_xmm)>; + } + + let AddedComplexity = 20 in { + // MOVSSrm zeros the high parts of the register; represent this + // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0 + def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))), + (COPY_TO_REGCLASS (VMOVSSZrm addr:$src), VR128X)>; + def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))), + (COPY_TO_REGCLASS (VMOVSSZrm addr:$src), VR128X)>; + def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))), + (COPY_TO_REGCLASS (VMOVSSZrm addr:$src), VR128X)>; + + // MOVSDrm zeros the high parts of the register; represent this + // with SUBREG_TO_REG. 
The AVX versions also write: DST[255:128] <- 0 + def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))), + (COPY_TO_REGCLASS (VMOVSDZrm addr:$src), VR128X)>; + def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))), + (COPY_TO_REGCLASS (VMOVSDZrm addr:$src), VR128X)>; + def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))), + (COPY_TO_REGCLASS (VMOVSDZrm addr:$src), VR128X)>; + def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))), + (COPY_TO_REGCLASS (VMOVSDZrm addr:$src), VR128X)>; + def : Pat<(v2f64 (X86vzload addr:$src)), + (COPY_TO_REGCLASS (VMOVSDZrm addr:$src), VR128X)>; + + // Represent the same patterns above but in the form they appear for + // 256-bit types + def : Pat<(v8i32 (X86vzmovl (insert_subvector undef, + (v4i32 (scalar_to_vector (loadi32 addr:$src))), (iPTR 0)))), + (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrm addr:$src), sub_xmm)>; + def : Pat<(v8f32 (X86vzmovl (insert_subvector undef, + (v4f32 (scalar_to_vector (loadf32 addr:$src))), (iPTR 0)))), + (SUBREG_TO_REG (i32 0), (VMOVSSZrm addr:$src), sub_xmm)>; + def : Pat<(v4f64 (X86vzmovl (insert_subvector undef, + (v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))), + (SUBREG_TO_REG (i32 0), (VMOVSDZrm addr:$src), sub_xmm)>; + } + def : Pat<(v8f32 (X86vzmovl (insert_subvector undef, + (v4f32 (scalar_to_vector FR32X:$src)), (iPTR 0)))), + (SUBREG_TO_REG (i32 0), (v4f32 (VMOVSSZrr (v4f32 (V_SET0)), + FR32X:$src)), sub_xmm)>; + def : Pat<(v4f64 (X86vzmovl (insert_subvector undef, + (v2f64 (scalar_to_vector FR64X:$src)), (iPTR 0)))), + (SUBREG_TO_REG (i64 0), (v2f64 (VMOVSDZrr (v2f64 (V_SET0)), + FR64X:$src)), sub_xmm)>; + def : Pat<(v4i64 (X86vzmovl (insert_subvector undef, + (v2i64 (scalar_to_vector (loadi64 addr:$src))), (iPTR 0)))), + (SUBREG_TO_REG (i64 0), (VMOVQI2PQIZrm addr:$src), sub_xmm)>; + + // Move low f64 and clear high bits. + def : Pat<(v4f64 (X86vzmovl (v4f64 VR256X:$src))), + (SUBREG_TO_REG (i32 0), + (VMOVSDZrr (v2f64 (V_SET0)), + (EXTRACT_SUBREG (v4f64 VR256X:$src), sub_xmm)), sub_xmm)>; + + def : Pat<(v4i64 (X86vzmovl (v4i64 VR256X:$src))), + (SUBREG_TO_REG (i32 0), (VMOVSDZrr (v2i64 (V_SET0)), + (EXTRACT_SUBREG (v4i64 VR256X:$src), sub_xmm)), sub_xmm)>; + + // Extract and store. 
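+  // No separate extract instruction is needed here: the VMOVSS/VMOVSD
+  // store forms write only the low 4 or 8 bytes of the XMM register, so an
+  // extractelt of lane 0 feeding a store maps directly onto the scalar
+  // store patterns below.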
+ def : Pat<(store (f32 (extractelt (v4f32 VR128X:$src), (iPTR 0))), + addr:$dst), + (VMOVSSZmr addr:$dst, (COPY_TO_REGCLASS (v4f32 VR128X:$src), FR32X))>; + def : Pat<(store (f64 (extractelt (v2f64 VR128X:$src), (iPTR 0))), + addr:$dst), + (VMOVSDZmr addr:$dst, (COPY_TO_REGCLASS (v2f64 VR128X:$src), FR64X))>; + + // Shuffle with VMOVSS + def : Pat<(v4i32 (X86Movss VR128X:$src1, VR128X:$src2)), + (VMOVSSZrr (v4i32 VR128X:$src1), + (COPY_TO_REGCLASS (v4i32 VR128X:$src2), FR32X))>; + def : Pat<(v4f32 (X86Movss VR128X:$src1, VR128X:$src2)), + (VMOVSSZrr (v4f32 VR128X:$src1), + (COPY_TO_REGCLASS (v4f32 VR128X:$src2), FR32X))>; + + // 256-bit variants + def : Pat<(v8i32 (X86Movss VR256X:$src1, VR256X:$src2)), + (SUBREG_TO_REG (i32 0), + (VMOVSSZrr (EXTRACT_SUBREG (v8i32 VR256X:$src1), sub_xmm), + (EXTRACT_SUBREG (v8i32 VR256X:$src2), sub_xmm)), + sub_xmm)>; + def : Pat<(v8f32 (X86Movss VR256X:$src1, VR256X:$src2)), + (SUBREG_TO_REG (i32 0), + (VMOVSSZrr (EXTRACT_SUBREG (v8f32 VR256X:$src1), sub_xmm), + (EXTRACT_SUBREG (v8f32 VR256X:$src2), sub_xmm)), + sub_xmm)>; + + // Shuffle with VMOVSD + def : Pat<(v2i64 (X86Movsd VR128X:$src1, VR128X:$src2)), + (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>; + def : Pat<(v2f64 (X86Movsd VR128X:$src1, VR128X:$src2)), + (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>; + def : Pat<(v4f32 (X86Movsd VR128X:$src1, VR128X:$src2)), + (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>; + def : Pat<(v4i32 (X86Movsd VR128X:$src1, VR128X:$src2)), + (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>; + + // 256-bit variants + def : Pat<(v4i64 (X86Movsd VR256X:$src1, VR256X:$src2)), + (SUBREG_TO_REG (i32 0), + (VMOVSDZrr (EXTRACT_SUBREG (v4i64 VR256X:$src1), sub_xmm), + (EXTRACT_SUBREG (v4i64 VR256X:$src2), sub_xmm)), + sub_xmm)>; + def : Pat<(v4f64 (X86Movsd VR256X:$src1, VR256X:$src2)), + (SUBREG_TO_REG (i32 0), + (VMOVSDZrr (EXTRACT_SUBREG (v4f64 VR256X:$src1), sub_xmm), + (EXTRACT_SUBREG (v4f64 VR256X:$src2), sub_xmm)), + sub_xmm)>; + + def : Pat<(v2f64 (X86Movlpd VR128X:$src1, VR128X:$src2)), + (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>; + def : Pat<(v2i64 (X86Movlpd VR128X:$src1, VR128X:$src2)), + (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>; + def : Pat<(v4f32 (X86Movlps VR128X:$src1, VR128X:$src2)), + (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>; + def : Pat<(v4i32 (X86Movlps VR128X:$src1, VR128X:$src2)), + (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>; +} + +let AddedComplexity = 15 in +def VMOVZPQILo2PQIZrr : AVX512XSI<0x7E, MRMSrcReg, (outs VR128X:$dst), + (ins VR128X:$src), + "vmovq\t{$src, $dst|$dst, $src}", + [(set VR128X:$dst, (v2i64 (X86vzmovl + (v2i64 VR128X:$src))))], + IIC_SSE_MOVQ_RR>, EVEX, VEX_W; + +let AddedComplexity = 20 , isCodeGenOnly = 1 in +def VMOVZPQILo2PQIZrm : AVX512XSI<0x7E, MRMSrcMem, (outs VR128X:$dst), + (ins i128mem:$src), + "vmovq\t{$src, $dst|$dst, $src}", + [(set VR128X:$dst, (v2i64 (X86vzmovl + (loadv2i64 addr:$src))))], + IIC_SSE_MOVDQ>, EVEX, VEX_W, + EVEX_CD8<8, CD8VT8>; + +let Predicates = [HasAVX512] in { + // AVX 128-bit movd/movq instruction write zeros in the high 128-bit part. 
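+  // For example, "vmovd %eax, %xmm0" writes zeros to XMM0[127:32] (and,
+  // with the VEX/EVEX encoding, to the upper bits of the enclosing
+  // YMM/ZMM register as well), so an X86vzmovl of a scalar_to_vector node
+  // can be selected directly to VMOVDI2PDIZrr/rm with no extra zeroing.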
+ let AddedComplexity = 20 in { + def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))), + (VMOVDI2PDIZrm addr:$src)>; + def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))), + (VMOV64toPQIZrr GR64:$src)>; + def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))), + (VMOVDI2PDIZrr GR32:$src)>; + + def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))), + (VMOVDI2PDIZrm addr:$src)>; + def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))), + (VMOVDI2PDIZrm addr:$src)>; + def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))), + (VMOVZPQILo2PQIZrm addr:$src)>; + def : Pat<(v2f64 (X86vzmovl (v2f64 VR128X:$src))), + (VMOVZPQILo2PQIZrr VR128X:$src)>; + def : Pat<(v2i64 (X86vzload addr:$src)), + (VMOVZPQILo2PQIZrm addr:$src)>; + } + + // Use regular 128-bit instructions to match 256-bit scalar_to_vec+zext. + def : Pat<(v8i32 (X86vzmovl (insert_subvector undef, + (v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))), + (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrr GR32:$src), sub_xmm)>; + def : Pat<(v4i64 (X86vzmovl (insert_subvector undef, + (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))), + (SUBREG_TO_REG (i64 0), (VMOV64toPQIZrr GR64:$src), sub_xmm)>; +} + +def : Pat<(v16i32 (X86Vinsert (v16i32 immAllZerosV), GR32:$src2, (iPTR 0))), + (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrr GR32:$src2), sub_xmm)>; + +def : Pat<(v8i64 (X86Vinsert (bc_v8i64 (v16i32 immAllZerosV)), GR64:$src2, (iPTR 0))), + (SUBREG_TO_REG (i32 0), (VMOV64toPQIZrr GR64:$src2), sub_xmm)>; + +def : Pat<(v16i32 (X86Vinsert undef, GR32:$src2, (iPTR 0))), + (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrr GR32:$src2), sub_xmm)>; + +def : Pat<(v8i64 (X86Vinsert undef, GR64:$src2, (iPTR 0))), + (SUBREG_TO_REG (i32 0), (VMOV64toPQIZrr GR64:$src2), sub_xmm)>; + +//===----------------------------------------------------------------------===// +// AVX-512 - Non-temporals +//===----------------------------------------------------------------------===// +let SchedRW = [WriteLoad] in { + def VMOVNTDQAZrm : AVX512PI<0x2A, MRMSrcMem, (outs VR512:$dst), + (ins i512mem:$src), "vmovntdqa\t{$src, $dst|$dst, $src}", + [(set VR512:$dst, (int_x86_avx512_movntdqa addr:$src))], + SSEPackedInt>, EVEX, T8PD, EVEX_V512, + EVEX_CD8<64, CD8VF>; + + let Predicates = [HasAVX512, HasVLX] in { + def VMOVNTDQAZ256rm : AVX512PI<0x2A, MRMSrcMem, (outs VR256X:$dst), + (ins i256mem:$src), + "vmovntdqa\t{$src, $dst|$dst, $src}", [], + SSEPackedInt>, EVEX, T8PD, EVEX_V256, + EVEX_CD8<64, CD8VF>; + + def VMOVNTDQAZ128rm : AVX512PI<0x2A, MRMSrcMem, (outs VR128X:$dst), + (ins i128mem:$src), + "vmovntdqa\t{$src, $dst|$dst, $src}", [], + SSEPackedInt>, EVEX, T8PD, EVEX_V128, + EVEX_CD8<64, CD8VF>; + } +} + +multiclass avx512_movnt<bits<8> opc, string OpcodeStr, PatFrag st_frag, + ValueType OpVT, RegisterClass RC, X86MemOperand memop, + Domain d, InstrItinClass itin = IIC_SSE_MOVNT> { + let SchedRW = [WriteStore], mayStore = 1, + AddedComplexity = 400 in + def mr : AVX512PI<opc, MRMDestMem, (outs), (ins memop:$dst, RC:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(st_frag (OpVT RC:$src), addr:$dst)], d, itin>, EVEX; +} + +multiclass avx512_movnt_vl<bits<8> opc, string OpcodeStr, PatFrag st_frag, + string elty, string elsz, string vsz512, + string vsz256, string vsz128, Domain d, + Predicate prd, InstrItinClass itin = IIC_SSE_MOVNT> { + let Predicates = [prd] in + defm Z : avx512_movnt<opc, OpcodeStr, st_frag, + !cast<ValueType>("v"##vsz512##elty##elsz), VR512, + !cast<X86MemOperand>(elty##"512mem"), d, itin>, + 
EVEX_V512; + + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_movnt<opc, OpcodeStr, st_frag, + !cast<ValueType>("v"##vsz256##elty##elsz), VR256X, + !cast<X86MemOperand>(elty##"256mem"), d, itin>, + EVEX_V256; + + defm Z128 : avx512_movnt<opc, OpcodeStr, st_frag, + !cast<ValueType>("v"##vsz128##elty##elsz), VR128X, + !cast<X86MemOperand>(elty##"128mem"), d, itin>, + EVEX_V128; + } +} + +defm VMOVNTDQ : avx512_movnt_vl<0xE7, "vmovntdq", alignednontemporalstore, + "i", "64", "8", "4", "2", SSEPackedInt, + HasAVX512>, PD, EVEX_CD8<64, CD8VF>; + +defm VMOVNTPD : avx512_movnt_vl<0x2B, "vmovntpd", alignednontemporalstore, + "f", "64", "8", "4", "2", SSEPackedDouble, + HasAVX512>, PD, VEX_W, EVEX_CD8<64, CD8VF>; + +defm VMOVNTPS : avx512_movnt_vl<0x2B, "vmovntps", alignednontemporalstore, + "f", "32", "16", "8", "4", SSEPackedSingle, + HasAVX512>, PS, EVEX_CD8<32, CD8VF>; + +//===----------------------------------------------------------------------===// +// AVX-512 - Integer arithmetic +// +multiclass avx512_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _, OpndItins itins, + bit IsCommutable = 0> { + defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (_.VT (OpNode _.RC:$src1, _.RC:$src2)), + itins.rr, IsCommutable>, + AVX512BIBase, EVEX_4V; + + let mayLoad = 1 in + defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (_.VT (OpNode _.RC:$src1, + (bitconvert (_.LdFrag addr:$src2)))), + itins.rm>, + AVX512BIBase, EVEX_4V; +} + +multiclass avx512_binop_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _, OpndItins itins, + bit IsCommutable = 0> : + avx512_binop_rm<opc, OpcodeStr, OpNode, _, itins, IsCommutable> { + let mayLoad = 1 in + defm rmb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr, + "${src2}"##_.BroadcastStr##", $src1", + "$src1, ${src2}"##_.BroadcastStr, + (_.VT (OpNode _.RC:$src1, + (X86VBroadcast + (_.ScalarLdFrag addr:$src2)))), + itins.rm>, + AVX512BIBase, EVEX_4V, EVEX_B; +} + +multiclass avx512_binop_rm_vl<bits<8> opc, string OpcodeStr, SDNode OpNode, + AVX512VLVectorVTInfo VTInfo, OpndItins itins, + Predicate prd, bit IsCommutable = 0> { + let Predicates = [prd] in + defm Z : avx512_binop_rm<opc, OpcodeStr, OpNode, VTInfo.info512, itins, + IsCommutable>, EVEX_V512; + + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_binop_rm<opc, OpcodeStr, OpNode, VTInfo.info256, itins, + IsCommutable>, EVEX_V256; + defm Z128 : avx512_binop_rm<opc, OpcodeStr, OpNode, VTInfo.info128, itins, + IsCommutable>, EVEX_V128; + } +} + +multiclass avx512_binop_rmb_vl<bits<8> opc, string OpcodeStr, SDNode OpNode, + AVX512VLVectorVTInfo VTInfo, OpndItins itins, + Predicate prd, bit IsCommutable = 0> { + let Predicates = [prd] in + defm Z : avx512_binop_rmb<opc, OpcodeStr, OpNode, VTInfo.info512, itins, + IsCommutable>, EVEX_V512; + + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_binop_rmb<opc, OpcodeStr, OpNode, VTInfo.info256, itins, + IsCommutable>, EVEX_V256; + defm Z128 : avx512_binop_rmb<opc, OpcodeStr, OpNode, VTInfo.info128, itins, + IsCommutable>, EVEX_V128; + } +} + +multiclass avx512_binop_rm_vl_q<bits<8> opc, string OpcodeStr, SDNode OpNode, + OpndItins itins, Predicate prd, + bit IsCommutable = 0> { + defm NAME : avx512_binop_rmb_vl<opc, OpcodeStr, OpNode, avx512vl_i64_info, + itins, 
prd, IsCommutable>, + VEX_W, EVEX_CD8<64, CD8VF>; +} + +multiclass avx512_binop_rm_vl_d<bits<8> opc, string OpcodeStr, SDNode OpNode, + OpndItins itins, Predicate prd, + bit IsCommutable = 0> { + defm NAME : avx512_binop_rmb_vl<opc, OpcodeStr, OpNode, avx512vl_i32_info, + itins, prd, IsCommutable>, EVEX_CD8<32, CD8VF>; +} + +multiclass avx512_binop_rm_vl_w<bits<8> opc, string OpcodeStr, SDNode OpNode, + OpndItins itins, Predicate prd, + bit IsCommutable = 0> { + defm NAME : avx512_binop_rm_vl<opc, OpcodeStr, OpNode, avx512vl_i16_info, + itins, prd, IsCommutable>, EVEX_CD8<16, CD8VF>; +} + +multiclass avx512_binop_rm_vl_b<bits<8> opc, string OpcodeStr, SDNode OpNode, + OpndItins itins, Predicate prd, + bit IsCommutable = 0> { + defm NAME : avx512_binop_rm_vl<opc, OpcodeStr, OpNode, avx512vl_i8_info, + itins, prd, IsCommutable>, EVEX_CD8<8, CD8VF>; +} + +multiclass avx512_binop_rm_vl_dq<bits<8> opc_d, bits<8> opc_q, string OpcodeStr, + SDNode OpNode, OpndItins itins, Predicate prd, + bit IsCommutable = 0> { + defm Q : avx512_binop_rm_vl_q<opc_q, OpcodeStr#"q", OpNode, itins, prd, + IsCommutable>; + + defm D : avx512_binop_rm_vl_d<opc_d, OpcodeStr#"d", OpNode, itins, prd, + IsCommutable>; +} + +multiclass avx512_binop_rm_vl_bw<bits<8> opc_b, bits<8> opc_w, string OpcodeStr, + SDNode OpNode, OpndItins itins, Predicate prd, + bit IsCommutable = 0> { + defm W : avx512_binop_rm_vl_w<opc_w, OpcodeStr#"w", OpNode, itins, prd, + IsCommutable>; + + defm B : avx512_binop_rm_vl_b<opc_b, OpcodeStr#"b", OpNode, itins, prd, + IsCommutable>; +} + +multiclass avx512_binop_rm_vl_all<bits<8> opc_b, bits<8> opc_w, + bits<8> opc_d, bits<8> opc_q, + string OpcodeStr, SDNode OpNode, + OpndItins itins, bit IsCommutable = 0> { + defm NAME : avx512_binop_rm_vl_dq<opc_d, opc_q, OpcodeStr, OpNode, + itins, HasAVX512, IsCommutable>, + avx512_binop_rm_vl_bw<opc_b, opc_w, OpcodeStr, OpNode, + itins, HasBWI, IsCommutable>; +} + +multiclass avx512_binop_rm2<bits<8> opc, string OpcodeStr, OpndItins itins, + SDNode OpNode,X86VectorVTInfo _Src, + X86VectorVTInfo _Dst, bit IsCommutable = 0> { + defm rr : AVX512_maskable<opc, MRMSrcReg, _Dst, (outs _Dst.RC:$dst), + (ins _Src.RC:$src1, _Src.RC:$src2), OpcodeStr, + "$src2, $src1","$src1, $src2", + (_Dst.VT (OpNode + (_Src.VT _Src.RC:$src1), + (_Src.VT _Src.RC:$src2))), + itins.rr, IsCommutable>, + AVX512BIBase, EVEX_4V; + let mayLoad = 1 in { + defm rm : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst), + (ins _Src.RC:$src1, _Src.MemOp:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), + (bitconvert (_Src.LdFrag addr:$src2)))), + itins.rm>, + AVX512BIBase, EVEX_4V; + + defm rmb : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst), + (ins _Src.RC:$src1, _Dst.ScalarMemOp:$src2), + OpcodeStr, + "${src2}"##_Dst.BroadcastStr##", $src1", + "$src1, ${src2}"##_Dst.BroadcastStr, + (_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), (bitconvert + (_Dst.VT (X86VBroadcast + (_Dst.ScalarLdFrag addr:$src2)))))), + itins.rm>, + AVX512BIBase, EVEX_4V, EVEX_B; + } +} + +defm VPADD : avx512_binop_rm_vl_all<0xFC, 0xFD, 0xFE, 0xD4, "vpadd", add, + SSE_INTALU_ITINS_P, 1>; +defm VPSUB : avx512_binop_rm_vl_all<0xF8, 0xF9, 0xFA, 0xFB, "vpsub", sub, + SSE_INTALU_ITINS_P, 0>; +defm VPADDS : avx512_binop_rm_vl_bw<0xEC, 0xED, "vpadds", X86adds, + SSE_INTALU_ITINS_P, HasBWI, 1>; +defm VPSUBS : avx512_binop_rm_vl_bw<0xE8, 0xE9, "vpsubs", X86subs, + SSE_INTALU_ITINS_P, HasBWI, 0>; +defm VPADDUS : avx512_binop_rm_vl_bw<0xDC, 0xDD, "vpaddus", X86addus, + 
SSE_INTALU_ITINS_P, HasBWI, 1>; +defm VPSUBUS : avx512_binop_rm_vl_bw<0xD8, 0xD9, "vpsubus", X86subus, + SSE_INTALU_ITINS_P, HasBWI, 0>; +defm VPMULLD : avx512_binop_rm_vl_d<0x40, "vpmulld", mul, + SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD; +defm VPMULLW : avx512_binop_rm_vl_w<0xD5, "vpmullw", mul, + SSE_INTALU_ITINS_P, HasBWI, 1>; +defm VPMULLQ : avx512_binop_rm_vl_q<0x40, "vpmullq", mul, + SSE_INTALU_ITINS_P, HasDQI, 1>, T8PD; +defm VPMULHW : avx512_binop_rm_vl_w<0xE5, "vpmulhw", mulhs, SSE_INTALU_ITINS_P, + HasBWI, 1>; +defm VPMULHUW : avx512_binop_rm_vl_w<0xE4, "vpmulhuw", mulhu, SSE_INTMUL_ITINS_P, + HasBWI, 1>; +defm VPMULHRSW : avx512_binop_rm_vl_w<0x0B, "vpmulhrsw", X86mulhrs, SSE_INTMUL_ITINS_P, + HasBWI, 1>, T8PD; +defm VPAVG : avx512_binop_rm_vl_bw<0xE0, 0xE3, "vpavg", X86avg, + SSE_INTALU_ITINS_P, HasBWI, 1>; + +multiclass avx512_binop_all<bits<8> opc, string OpcodeStr, OpndItins itins, + SDNode OpNode, bit IsCommutable = 0> { + + defm NAME#Z : avx512_binop_rm2<opc, OpcodeStr, itins, OpNode, + v16i32_info, v8i64_info, IsCommutable>, + EVEX_V512, EVEX_CD8<64, CD8VF>, VEX_W; + let Predicates = [HasVLX] in { + defm NAME#Z256 : avx512_binop_rm2<opc, OpcodeStr, itins, OpNode, + v8i32x_info, v4i64x_info, IsCommutable>, + EVEX_V256, EVEX_CD8<64, CD8VF>, VEX_W; + defm NAME#Z128 : avx512_binop_rm2<opc, OpcodeStr, itins, OpNode, + v4i32x_info, v2i64x_info, IsCommutable>, + EVEX_V128, EVEX_CD8<64, CD8VF>, VEX_W; + } +} + +defm VPMULDQ : avx512_binop_all<0x28, "vpmuldq", SSE_INTALU_ITINS_P, + X86pmuldq, 1>,T8PD; +defm VPMULUDQ : avx512_binop_all<0xF4, "vpmuludq", SSE_INTMUL_ITINS_P, + X86pmuludq, 1>; + +multiclass avx512_packs_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _Src, X86VectorVTInfo _Dst> { + let mayLoad = 1 in { + defm rmb : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst), + (ins _Src.RC:$src1, _Src.ScalarMemOp:$src2), + OpcodeStr, + "${src2}"##_Src.BroadcastStr##", $src1", + "$src1, ${src2}"##_Src.BroadcastStr, + (_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), (bitconvert + (_Src.VT (X86VBroadcast + (_Src.ScalarLdFrag addr:$src2))))))>, + EVEX_4V, EVEX_B, EVEX_CD8<_Src.EltSize, CD8VF>; + } +} + +multiclass avx512_packs_rm<bits<8> opc, string OpcodeStr, + SDNode OpNode,X86VectorVTInfo _Src, + X86VectorVTInfo _Dst> { + defm rr : AVX512_maskable<opc, MRMSrcReg, _Dst, (outs _Dst.RC:$dst), + (ins _Src.RC:$src1, _Src.RC:$src2), OpcodeStr, + "$src2, $src1","$src1, $src2", + (_Dst.VT (OpNode + (_Src.VT _Src.RC:$src1), + (_Src.VT _Src.RC:$src2)))>, + EVEX_CD8<_Src.EltSize, CD8VF>, EVEX_4V; + let mayLoad = 1 in { + defm rm : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst), + (ins _Src.RC:$src1, _Src.MemOp:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), + (bitconvert (_Src.LdFrag addr:$src2))))>, + EVEX_4V, EVEX_CD8<_Src.EltSize, CD8VF>; + } +} + +multiclass avx512_packs_all_i32_i16<bits<8> opc, string OpcodeStr, + SDNode OpNode> { + defm NAME#Z : avx512_packs_rm<opc, OpcodeStr, OpNode, v16i32_info, + v32i16_info>, + avx512_packs_rmb<opc, OpcodeStr, OpNode, v16i32_info, + v32i16_info>, EVEX_V512; + let Predicates = [HasVLX] in { + defm NAME#Z256 : avx512_packs_rm<opc, OpcodeStr, OpNode, v8i32x_info, + v16i16x_info>, + avx512_packs_rmb<opc, OpcodeStr, OpNode, v8i32x_info, + v16i16x_info>, EVEX_V256; + defm NAME#Z128 : avx512_packs_rm<opc, OpcodeStr, OpNode, v4i32x_info, + v8i16x_info>, + avx512_packs_rmb<opc, OpcodeStr, OpNode, v4i32x_info, + v8i16x_info>, EVEX_V128; + } +} +multiclass 
avx512_packs_all_i16_i8<bits<8> opc, string OpcodeStr, + SDNode OpNode> { + defm NAME#Z : avx512_packs_rm<opc, OpcodeStr, OpNode, v32i16_info, + v64i8_info>, EVEX_V512; + let Predicates = [HasVLX] in { + defm NAME#Z256 : avx512_packs_rm<opc, OpcodeStr, OpNode, v16i16x_info, + v32i8x_info>, EVEX_V256; + defm NAME#Z128 : avx512_packs_rm<opc, OpcodeStr, OpNode, v8i16x_info, + v16i8x_info>, EVEX_V128; + } +} + +multiclass avx512_vpmadd<bits<8> opc, string OpcodeStr, + SDNode OpNode, AVX512VLVectorVTInfo _Src, + AVX512VLVectorVTInfo _Dst> { + defm NAME#Z : avx512_packs_rm<opc, OpcodeStr, OpNode, _Src.info512, + _Dst.info512>, EVEX_V512; + let Predicates = [HasVLX] in { + defm NAME#Z256 : avx512_packs_rm<opc, OpcodeStr, OpNode, _Src.info256, + _Dst.info256>, EVEX_V256; + defm NAME#Z128 : avx512_packs_rm<opc, OpcodeStr, OpNode, _Src.info128, + _Dst.info128>, EVEX_V128; + } +} + +let Predicates = [HasBWI] in { + defm VPACKSSDW : avx512_packs_all_i32_i16<0x6B, "vpackssdw", X86Packss>, PD; + defm VPACKUSDW : avx512_packs_all_i32_i16<0x2b, "vpackusdw", X86Packus>, T8PD; + defm VPACKSSWB : avx512_packs_all_i16_i8 <0x63, "vpacksswb", X86Packss>, AVX512BIBase, VEX_W; + defm VPACKUSWB : avx512_packs_all_i16_i8 <0x67, "vpackuswb", X86Packus>, AVX512BIBase, VEX_W; + + defm VPMADDUBSW : avx512_vpmadd<0x04, "vpmaddubsw", X86vpmaddubsw, + avx512vl_i8_info, avx512vl_i16_info>, AVX512BIBase, T8PD; + defm VPMADDWD : avx512_vpmadd<0xF5, "vpmaddwd", X86vpmaddwd, + avx512vl_i16_info, avx512vl_i32_info>, AVX512BIBase; +} + +defm VPMAXSB : avx512_binop_rm_vl_b<0x3C, "vpmaxsb", smax, + SSE_INTALU_ITINS_P, HasBWI, 1>, T8PD; +defm VPMAXSW : avx512_binop_rm_vl_w<0xEE, "vpmaxsw", smax, + SSE_INTALU_ITINS_P, HasBWI, 1>; +defm VPMAXS : avx512_binop_rm_vl_dq<0x3D, 0x3D, "vpmaxs", smax, + SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD; + +defm VPMAXUB : avx512_binop_rm_vl_b<0xDE, "vpmaxub", umax, + SSE_INTALU_ITINS_P, HasBWI, 1>; +defm VPMAXUW : avx512_binop_rm_vl_w<0x3E, "vpmaxuw", umax, + SSE_INTALU_ITINS_P, HasBWI, 1>, T8PD; +defm VPMAXU : avx512_binop_rm_vl_dq<0x3F, 0x3F, "vpmaxu", umax, + SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD; + +defm VPMINSB : avx512_binop_rm_vl_b<0x38, "vpminsb", smin, + SSE_INTALU_ITINS_P, HasBWI, 1>, T8PD; +defm VPMINSW : avx512_binop_rm_vl_w<0xEA, "vpminsw", smin, + SSE_INTALU_ITINS_P, HasBWI, 1>; +defm VPMINS : avx512_binop_rm_vl_dq<0x39, 0x39, "vpmins", smin, + SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD; + +defm VPMINUB : avx512_binop_rm_vl_b<0xDA, "vpminub", umin, + SSE_INTALU_ITINS_P, HasBWI, 1>; +defm VPMINUW : avx512_binop_rm_vl_w<0x3A, "vpminuw", umin, + SSE_INTALU_ITINS_P, HasBWI, 1>, T8PD; +defm VPMINU : avx512_binop_rm_vl_dq<0x3B, 0x3B, "vpminu", umin, + SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD; +//===----------------------------------------------------------------------===// +// AVX-512 Logical Instructions +//===----------------------------------------------------------------------===// + +defm VPAND : avx512_binop_rm_vl_dq<0xDB, 0xDB, "vpand", and, + SSE_INTALU_ITINS_P, HasAVX512, 1>; +defm VPOR : avx512_binop_rm_vl_dq<0xEB, 0xEB, "vpor", or, + SSE_INTALU_ITINS_P, HasAVX512, 1>; +defm VPXOR : avx512_binop_rm_vl_dq<0xEF, 0xEF, "vpxor", xor, + SSE_INTALU_ITINS_P, HasAVX512, 1>; +defm VPANDN : avx512_binop_rm_vl_dq<0xDF, 0xDF, "vpandn", X86andnp, + SSE_INTALU_ITINS_P, HasAVX512, 0>; + +//===----------------------------------------------------------------------===// +// AVX-512 FP arithmetic +//===----------------------------------------------------------------------===// +multiclass 
avx512_fp_scalar<bits<8> opc, string OpcodeStr,X86VectorVTInfo _, + SDNode OpNode, SDNode VecNode, OpndItins itins, + bit IsCommutable> { + + defm rr_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (VecNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), + (i32 FROUND_CURRENT)), + itins.rr, IsCommutable>; + + defm rm_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (VecNode (_.VT _.RC:$src1), + (_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src2))), + (i32 FROUND_CURRENT)), + itins.rm, IsCommutable>; + let isCodeGenOnly = 1, isCommutable = IsCommutable, + Predicates = [HasAVX512] in { + def rr : I< opc, MRMSrcReg, (outs _.FRC:$dst), + (ins _.FRC:$src1, _.FRC:$src2), + OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set _.FRC:$dst, (OpNode _.FRC:$src1, _.FRC:$src2))], + itins.rr>; + def rm : I< opc, MRMSrcMem, (outs _.FRC:$dst), + (ins _.FRC:$src1, _.ScalarMemOp:$src2), + OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set _.FRC:$dst, (OpNode _.FRC:$src1, + (_.ScalarLdFrag addr:$src2)))], itins.rr>; + } +} + +multiclass avx512_fp_scalar_round<bits<8> opc, string OpcodeStr,X86VectorVTInfo _, + SDNode VecNode, OpndItins itins, bit IsCommutable = 0> { + + defm rrb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2, AVX512RC:$rc), OpcodeStr, + "$rc, $src2, $src1", "$src1, $src2, $rc", + (VecNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), + (i32 imm:$rc)), itins.rr, IsCommutable>, + EVEX_B, EVEX_RC; +} +multiclass avx512_fp_scalar_sae<bits<8> opc, string OpcodeStr,X86VectorVTInfo _, + SDNode VecNode, OpndItins itins, bit IsCommutable> { + + defm rrb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2), OpcodeStr, + "{sae}, $src2, $src1", "$src1, $src2, {sae}", + (VecNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), + (i32 FROUND_NO_EXC))>, EVEX_B; +} + +multiclass avx512_binop_s_round<bits<8> opc, string OpcodeStr, SDNode OpNode, + SDNode VecNode, + SizeItins itins, bit IsCommutable> { + defm SSZ : avx512_fp_scalar<opc, OpcodeStr#"ss", f32x_info, OpNode, VecNode, + itins.s, IsCommutable>, + avx512_fp_scalar_round<opc, OpcodeStr#"ss", f32x_info, VecNode, + itins.s, IsCommutable>, + XS, EVEX_4V, VEX_LIG, EVEX_CD8<32, CD8VT1>; + defm SDZ : avx512_fp_scalar<opc, OpcodeStr#"sd", f64x_info, OpNode, VecNode, + itins.d, IsCommutable>, + avx512_fp_scalar_round<opc, OpcodeStr#"sd", f64x_info, VecNode, + itins.d, IsCommutable>, + XD, VEX_W, EVEX_4V, VEX_LIG, EVEX_CD8<64, CD8VT1>; +} + +multiclass avx512_binop_s_sae<bits<8> opc, string OpcodeStr, SDNode OpNode, + SDNode VecNode, + SizeItins itins, bit IsCommutable> { + defm SSZ : avx512_fp_scalar<opc, OpcodeStr#"ss", f32x_info, OpNode, VecNode, + itins.s, IsCommutable>, + avx512_fp_scalar_sae<opc, OpcodeStr#"ss", f32x_info, VecNode, + itins.s, IsCommutable>, + XS, EVEX_4V, VEX_LIG, EVEX_CD8<32, CD8VT1>; + defm SDZ : avx512_fp_scalar<opc, OpcodeStr#"sd", f64x_info, OpNode, VecNode, + itins.d, IsCommutable>, + avx512_fp_scalar_sae<opc, OpcodeStr#"sd", f64x_info, VecNode, + itins.d, IsCommutable>, + XD, VEX_W, EVEX_4V, VEX_LIG, EVEX_CD8<64, CD8VT1>; +} +defm VADD : avx512_binop_s_round<0x58, "vadd", fadd, X86faddRnd, SSE_ALU_ITINS_S, 1>; +defm VMUL : avx512_binop_s_round<0x59, "vmul", fmul, X86fmulRnd, SSE_ALU_ITINS_S, 1>; +defm VSUB : avx512_binop_s_round<0x5C, "vsub", fsub, X86fsubRnd, 
SSE_ALU_ITINS_S, 0>; +defm VDIV : avx512_binop_s_round<0x5E, "vdiv", fdiv, X86fdivRnd, SSE_ALU_ITINS_S, 0>; +defm VMIN : avx512_binop_s_sae <0x5D, "vmin", X86fmin, X86fminRnd, SSE_ALU_ITINS_S, 1>; +defm VMAX : avx512_binop_s_sae <0x5F, "vmax", X86fmax, X86fmaxRnd, SSE_ALU_ITINS_S, 1>; + +multiclass avx512_fp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _, bit IsCommutable> { + defm rr: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix, + "$src2, $src1", "$src1, $src2", + (_.VT (OpNode _.RC:$src1, _.RC:$src2))>, EVEX_4V; + let mayLoad = 1 in { + defm rm: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr##_.Suffix, + "$src2, $src1", "$src1, $src2", + (OpNode _.RC:$src1, (_.LdFrag addr:$src2))>, EVEX_4V; + defm rmb: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr##_.Suffix, + "${src2}"##_.BroadcastStr##", $src1", + "$src1, ${src2}"##_.BroadcastStr, + (OpNode _.RC:$src1, (_.VT (X86VBroadcast + (_.ScalarLdFrag addr:$src2))))>, + EVEX_4V, EVEX_B; + }//let mayLoad = 1 +} + +multiclass avx512_fp_round_packed<bits<8> opc, string OpcodeStr, SDNode OpNodeRnd, + X86VectorVTInfo _> { + defm rb: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2, AVX512RC:$rc), OpcodeStr##_.Suffix, + "$rc, $src2, $src1", "$src1, $src2, $rc", + (_.VT (OpNodeRnd _.RC:$src1, _.RC:$src2, (i32 imm:$rc)))>, + EVEX_4V, EVEX_B, EVEX_RC; +} + + +multiclass avx512_fp_sae_packed<bits<8> opc, string OpcodeStr, SDNode OpNodeRnd, + X86VectorVTInfo _> { + defm rb: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix, + "{sae}, $src2, $src1", "$src1, $src2, {sae}", + (_.VT (OpNodeRnd _.RC:$src1, _.RC:$src2, (i32 FROUND_NO_EXC)))>, + EVEX_4V, EVEX_B; +} + +multiclass avx512_fp_binop_p<bits<8> opc, string OpcodeStr, SDNode OpNode, + bit IsCommutable = 0> { + defm PSZ : avx512_fp_packed<opc, OpcodeStr, OpNode, v16f32_info, + IsCommutable>, EVEX_V512, PS, + EVEX_CD8<32, CD8VF>; + defm PDZ : avx512_fp_packed<opc, OpcodeStr, OpNode, v8f64_info, + IsCommutable>, EVEX_V512, PD, VEX_W, + EVEX_CD8<64, CD8VF>; + + // Define only if AVX512VL feature is present. 
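+  // AVX512VL extends these EVEX-encoded operations to 128-bit and 256-bit
+  // vectors, so the PSZ128/PSZ256/PDZ128/PDZ256 forms below are guarded by
+  // the HasVLX predicate.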
+ let Predicates = [HasVLX] in { + defm PSZ128 : avx512_fp_packed<opc, OpcodeStr, OpNode, v4f32x_info, + IsCommutable>, EVEX_V128, PS, + EVEX_CD8<32, CD8VF>; + defm PSZ256 : avx512_fp_packed<opc, OpcodeStr, OpNode, v8f32x_info, + IsCommutable>, EVEX_V256, PS, + EVEX_CD8<32, CD8VF>; + defm PDZ128 : avx512_fp_packed<opc, OpcodeStr, OpNode, v2f64x_info, + IsCommutable>, EVEX_V128, PD, VEX_W, + EVEX_CD8<64, CD8VF>; + defm PDZ256 : avx512_fp_packed<opc, OpcodeStr, OpNode, v4f64x_info, + IsCommutable>, EVEX_V256, PD, VEX_W, + EVEX_CD8<64, CD8VF>; + } +} + +multiclass avx512_fp_binop_p_round<bits<8> opc, string OpcodeStr, SDNode OpNodeRnd> { + defm PSZ : avx512_fp_round_packed<opc, OpcodeStr, OpNodeRnd, v16f32_info>, + EVEX_V512, PS, EVEX_CD8<32, CD8VF>; + defm PDZ : avx512_fp_round_packed<opc, OpcodeStr, OpNodeRnd, v8f64_info>, + EVEX_V512, PD, VEX_W,EVEX_CD8<64, CD8VF>; +} + +multiclass avx512_fp_binop_p_sae<bits<8> opc, string OpcodeStr, SDNode OpNodeRnd> { + defm PSZ : avx512_fp_sae_packed<opc, OpcodeStr, OpNodeRnd, v16f32_info>, + EVEX_V512, PS, EVEX_CD8<32, CD8VF>; + defm PDZ : avx512_fp_sae_packed<opc, OpcodeStr, OpNodeRnd, v8f64_info>, + EVEX_V512, PD, VEX_W,EVEX_CD8<64, CD8VF>; +} + +defm VADD : avx512_fp_binop_p<0x58, "vadd", fadd, 1>, + avx512_fp_binop_p_round<0x58, "vadd", X86faddRnd>; +defm VMUL : avx512_fp_binop_p<0x59, "vmul", fmul, 1>, + avx512_fp_binop_p_round<0x59, "vmul", X86fmulRnd>; +defm VSUB : avx512_fp_binop_p<0x5C, "vsub", fsub>, + avx512_fp_binop_p_round<0x5C, "vsub", X86fsubRnd>; +defm VDIV : avx512_fp_binop_p<0x5E, "vdiv", fdiv>, + avx512_fp_binop_p_round<0x5E, "vdiv", X86fdivRnd>; +defm VMIN : avx512_fp_binop_p<0x5D, "vmin", X86fmin, 1>, + avx512_fp_binop_p_sae<0x5D, "vmin", X86fminRnd>; +defm VMAX : avx512_fp_binop_p<0x5F, "vmax", X86fmax, 1>, + avx512_fp_binop_p_sae<0x5F, "vmax", X86fmaxRnd>; +let Predicates = [HasDQI] in { + defm VAND : avx512_fp_binop_p<0x54, "vand", X86fand, 1>; + defm VANDN : avx512_fp_binop_p<0x55, "vandn", X86fandn, 0>; + defm VOR : avx512_fp_binop_p<0x56, "vor", X86for, 1>; + defm VXOR : avx512_fp_binop_p<0x57, "vxor", X86fxor, 1>; +} + +multiclass avx512_fp_scalef_p<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _> { + defm rr: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix, + "$src2, $src1", "$src1, $src2", + (_.VT (OpNode _.RC:$src1, _.RC:$src2, (i32 FROUND_CURRENT)))>, EVEX_4V; + let mayLoad = 1 in { + defm rm: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr##_.Suffix, + "$src2, $src1", "$src1, $src2", + (OpNode _.RC:$src1, (_.LdFrag addr:$src2), (i32 FROUND_CURRENT))>, EVEX_4V; + defm rmb: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr##_.Suffix, + "${src2}"##_.BroadcastStr##", $src1", + "$src1, ${src2}"##_.BroadcastStr, + (OpNode _.RC:$src1, (_.VT (X86VBroadcast + (_.ScalarLdFrag addr:$src2))), (i32 FROUND_CURRENT))>, + EVEX_4V, EVEX_B; + }//let mayLoad = 1 +} + +multiclass avx512_fp_scalef_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _> { + defm rr: AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix, + "$src2, $src1", "$src1, $src2", + (_.VT (OpNode _.RC:$src1, _.RC:$src2, (i32 FROUND_CURRENT)))>; + let mayLoad = 1 in { + defm rm: AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr##_.Suffix, + "$src2, $src1", "$src1, 
$src2", + (OpNode _.RC:$src1, (_.LdFrag addr:$src2), (i32 FROUND_CURRENT))>; + }//let mayLoad = 1 +} + +multiclass avx512_fp_scalef_all<bits<8> opc, bits<8> opcScaler, string OpcodeStr, SDNode OpNode> { + defm PSZ : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, v16f32_info>, + avx512_fp_round_packed<opc, OpcodeStr, OpNode, v16f32_info>, + EVEX_V512, EVEX_CD8<32, CD8VF>; + defm PDZ : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, v8f64_info>, + avx512_fp_round_packed<opc, OpcodeStr, OpNode, v8f64_info>, + EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; + defm SSZ128 : avx512_fp_scalef_scalar<opcScaler, OpcodeStr, OpNode, f32x_info>, + avx512_fp_scalar_round<opcScaler, OpcodeStr##"ss", f32x_info, OpNode, SSE_ALU_ITINS_S.s>, + EVEX_4V,EVEX_CD8<32, CD8VT1>; + defm SDZ128 : avx512_fp_scalef_scalar<opcScaler, OpcodeStr, OpNode, f64x_info>, + avx512_fp_scalar_round<opcScaler, OpcodeStr##"sd", f64x_info, OpNode, SSE_ALU_ITINS_S.d>, + EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W; + + // Define only if AVX512VL feature is present. + let Predicates = [HasVLX] in { + defm PSZ128 : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, v4f32x_info>, + EVEX_V128, EVEX_CD8<32, CD8VF>; + defm PSZ256 : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, v8f32x_info>, + EVEX_V256, EVEX_CD8<32, CD8VF>; + defm PDZ128 : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, v2f64x_info>, + EVEX_V128, VEX_W, EVEX_CD8<64, CD8VF>; + defm PDZ256 : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, v4f64x_info>, + EVEX_V256, VEX_W, EVEX_CD8<64, CD8VF>; + } +} +defm VSCALEF : avx512_fp_scalef_all<0x2C, 0x2D, "vscalef", X86scalef>, T8PD; + +//===----------------------------------------------------------------------===// +// AVX-512 VPTESTM instructions +//===----------------------------------------------------------------------===// + +multiclass avx512_vptest<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _> { + defm rr : AVX512_maskable_cmp<opc, MRMSrcReg, _, (outs _.KRC:$dst), + (ins _.RC:$src1, _.RC:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2))>, + EVEX_4V; + let mayLoad = 1 in + defm rm : AVX512_maskable_cmp<opc, MRMSrcMem, _, (outs _.KRC:$dst), + (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (OpNode (_.VT _.RC:$src1), + (_.VT (bitconvert (_.LdFrag addr:$src2))))>, + EVEX_4V, + EVEX_CD8<_.EltSize, CD8VF>; +} + +multiclass avx512_vptest_mb<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _> { + let mayLoad = 1 in + defm rmb : AVX512_maskable_cmp<opc, MRMSrcMem, _, (outs _.KRC:$dst), + (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr, + "${src2}"##_.BroadcastStr##", $src1", + "$src1, ${src2}"##_.BroadcastStr, + (OpNode (_.VT _.RC:$src1), (_.VT (X86VBroadcast + (_.ScalarLdFrag addr:$src2))))>, + EVEX_B, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>; +} +multiclass avx512_vptest_dq_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode, + AVX512VLVectorVTInfo _> { + let Predicates = [HasAVX512] in + defm Z : avx512_vptest<opc, OpcodeStr, OpNode, _.info512>, + avx512_vptest_mb<opc, OpcodeStr, OpNode, _.info512>, EVEX_V512; + + let Predicates = [HasAVX512, HasVLX] in { + defm Z256 : avx512_vptest<opc, OpcodeStr, OpNode, _.info256>, + avx512_vptest_mb<opc, OpcodeStr, OpNode, _.info256>, EVEX_V256; + defm Z128 : avx512_vptest<opc, OpcodeStr, OpNode, _.info128>, + avx512_vptest_mb<opc, OpcodeStr, OpNode, _.info128>, EVEX_V128; + } +} + +multiclass avx512_vptest_dq<bits<8> opc, string OpcodeStr, SDNode OpNode> { + defm D : avx512_vptest_dq_sizes<opc, OpcodeStr#"d", OpNode, + 
avx512vl_i32_info>; + defm Q : avx512_vptest_dq_sizes<opc, OpcodeStr#"q", OpNode, + avx512vl_i64_info>, VEX_W; +} + +multiclass avx512_vptest_wb<bits<8> opc, string OpcodeStr, + SDNode OpNode> { + let Predicates = [HasBWI] in { + defm WZ: avx512_vptest<opc, OpcodeStr#"w", OpNode, v32i16_info>, + EVEX_V512, VEX_W; + defm BZ: avx512_vptest<opc, OpcodeStr#"b", OpNode, v64i8_info>, + EVEX_V512; + } + let Predicates = [HasVLX, HasBWI] in { + + defm WZ256: avx512_vptest<opc, OpcodeStr#"w", OpNode, v16i16x_info>, + EVEX_V256, VEX_W; + defm WZ128: avx512_vptest<opc, OpcodeStr#"w", OpNode, v8i16x_info>, + EVEX_V128, VEX_W; + defm BZ256: avx512_vptest<opc, OpcodeStr#"b", OpNode, v32i8x_info>, + EVEX_V256; + defm BZ128: avx512_vptest<opc, OpcodeStr#"b", OpNode, v16i8x_info>, + EVEX_V128; + } +} + +multiclass avx512_vptest_all_forms<bits<8> opc_wb, bits<8> opc_dq, string OpcodeStr, + SDNode OpNode> : + avx512_vptest_wb <opc_wb, OpcodeStr, OpNode>, + avx512_vptest_dq<opc_dq, OpcodeStr, OpNode>; + +defm VPTESTM : avx512_vptest_all_forms<0x26, 0x27, "vptestm", X86testm>, T8PD; +defm VPTESTNM : avx512_vptest_all_forms<0x26, 0x27, "vptestnm", X86testnm>, T8XS; + +def : Pat <(i16 (int_x86_avx512_mask_ptestm_d_512 (v16i32 VR512:$src1), + (v16i32 VR512:$src2), (i16 -1))), + (COPY_TO_REGCLASS (VPTESTMDZrr VR512:$src1, VR512:$src2), GR16)>; + +def : Pat <(i8 (int_x86_avx512_mask_ptestm_q_512 (v8i64 VR512:$src1), + (v8i64 VR512:$src2), (i8 -1))), + (COPY_TO_REGCLASS (VPTESTMQZrr VR512:$src1, VR512:$src2), GR8)>; + +//===----------------------------------------------------------------------===// +// AVX-512 Shift instructions +//===----------------------------------------------------------------------===// +multiclass avx512_shift_rmi<bits<8> opc, Format ImmFormR, Format ImmFormM, + string OpcodeStr, SDNode OpNode, X86VectorVTInfo _> { + defm ri : AVX512_maskable<opc, ImmFormR, _, (outs _.RC:$dst), + (ins _.RC:$src1, u8imm:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (_.VT (OpNode _.RC:$src1, (i8 imm:$src2))), + SSE_INTSHIFT_ITINS_P.rr>; + let mayLoad = 1 in + defm mi : AVX512_maskable<opc, ImmFormM, _, (outs _.RC:$dst), + (ins _.MemOp:$src1, u8imm:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (_.VT (OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))), + (i8 imm:$src2))), + SSE_INTSHIFT_ITINS_P.rm>; +} + +multiclass avx512_shift_rmbi<bits<8> opc, Format ImmFormM, + string OpcodeStr, SDNode OpNode, X86VectorVTInfo _> { + let mayLoad = 1 in + defm mbi : AVX512_maskable<opc, ImmFormM, _, (outs _.RC:$dst), + (ins _.ScalarMemOp:$src1, u8imm:$src2), OpcodeStr, + "$src2, ${src1}"##_.BroadcastStr, "${src1}"##_.BroadcastStr##", $src2", + (_.VT (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src1)), (i8 imm:$src2))), + SSE_INTSHIFT_ITINS_P.rm>, EVEX_B; +} + +multiclass avx512_shift_rrm<bits<8> opc, string OpcodeStr, SDNode OpNode, + ValueType SrcVT, PatFrag bc_frag, X86VectorVTInfo _> { + // src2 is always 128-bit + defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, VR128X:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (_.VT (OpNode _.RC:$src1, (SrcVT VR128X:$src2))), + SSE_INTSHIFT_ITINS_P.rr>, AVX512BIBase, EVEX_4V; + defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, i128mem:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (_.VT (OpNode _.RC:$src1, (bc_frag (loadv2i64 addr:$src2)))), + SSE_INTSHIFT_ITINS_P.rm>, AVX512BIBase, + EVEX_4V; +} + +multiclass avx512_shift_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode, + ValueType 
SrcVT, PatFrag bc_frag, + AVX512VLVectorVTInfo VTInfo, Predicate prd> { + let Predicates = [prd] in + defm Z : avx512_shift_rrm<opc, OpcodeStr, OpNode, SrcVT, bc_frag, + VTInfo.info512>, EVEX_V512, + EVEX_CD8<VTInfo.info512.EltSize, CD8VQ> ; + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_shift_rrm<opc, OpcodeStr, OpNode, SrcVT, bc_frag, + VTInfo.info256>, EVEX_V256, + EVEX_CD8<VTInfo.info256.EltSize, CD8VH>; + defm Z128 : avx512_shift_rrm<opc, OpcodeStr, OpNode, SrcVT, bc_frag, + VTInfo.info128>, EVEX_V128, + EVEX_CD8<VTInfo.info128.EltSize, CD8VF>; + } +} + +multiclass avx512_shift_types<bits<8> opcd, bits<8> opcq, bits<8> opcw, + string OpcodeStr, SDNode OpNode> { + defm D : avx512_shift_sizes<opcd, OpcodeStr#"d", OpNode, v4i32, bc_v4i32, + avx512vl_i32_info, HasAVX512>; + defm Q : avx512_shift_sizes<opcq, OpcodeStr#"q", OpNode, v2i64, bc_v2i64, + avx512vl_i64_info, HasAVX512>, VEX_W; + defm W : avx512_shift_sizes<opcw, OpcodeStr#"w", OpNode, v8i16, bc_v8i16, + avx512vl_i16_info, HasBWI>; +} + +multiclass avx512_shift_rmi_sizes<bits<8> opc, Format ImmFormR, Format ImmFormM, + string OpcodeStr, SDNode OpNode, + AVX512VLVectorVTInfo VTInfo> { + let Predicates = [HasAVX512] in + defm Z: avx512_shift_rmi<opc, ImmFormR, ImmFormM, OpcodeStr, OpNode, + VTInfo.info512>, + avx512_shift_rmbi<opc, ImmFormM, OpcodeStr, OpNode, + VTInfo.info512>, EVEX_V512; + let Predicates = [HasAVX512, HasVLX] in { + defm Z256: avx512_shift_rmi<opc, ImmFormR, ImmFormM, OpcodeStr, OpNode, + VTInfo.info256>, + avx512_shift_rmbi<opc, ImmFormM, OpcodeStr, OpNode, + VTInfo.info256>, EVEX_V256; + defm Z128: avx512_shift_rmi<opc, ImmFormR, ImmFormM, OpcodeStr, OpNode, + VTInfo.info128>, + avx512_shift_rmbi<opc, ImmFormM, OpcodeStr, OpNode, + VTInfo.info128>, EVEX_V128; + } +} + +multiclass avx512_shift_rmi_w<bits<8> opcw, + Format ImmFormR, Format ImmFormM, + string OpcodeStr, SDNode OpNode> { + let Predicates = [HasBWI] in + defm WZ: avx512_shift_rmi<opcw, ImmFormR, ImmFormM, OpcodeStr, OpNode, + v32i16_info>, EVEX_V512; + let Predicates = [HasVLX, HasBWI] in { + defm WZ256: avx512_shift_rmi<opcw, ImmFormR, ImmFormM, OpcodeStr, OpNode, + v16i16x_info>, EVEX_V256; + defm WZ128: avx512_shift_rmi<opcw, ImmFormR, ImmFormM, OpcodeStr, OpNode, + v8i16x_info>, EVEX_V128; + } +} + +multiclass avx512_shift_rmi_dq<bits<8> opcd, bits<8> opcq, + Format ImmFormR, Format ImmFormM, + string OpcodeStr, SDNode OpNode> { + defm D: avx512_shift_rmi_sizes<opcd, ImmFormR, ImmFormM, OpcodeStr#"d", OpNode, + avx512vl_i32_info>, EVEX_CD8<32, CD8VF>; + defm Q: avx512_shift_rmi_sizes<opcq, ImmFormR, ImmFormM, OpcodeStr#"q", OpNode, + avx512vl_i64_info>, EVEX_CD8<64, CD8VF>, VEX_W; +} + +defm VPSRL : avx512_shift_rmi_dq<0x72, 0x73, MRM2r, MRM2m, "vpsrl", X86vsrli>, + avx512_shift_rmi_w<0x71, MRM2r, MRM2m, "vpsrlw", X86vsrli>, AVX512BIi8Base, EVEX_4V; + +defm VPSLL : avx512_shift_rmi_dq<0x72, 0x73, MRM6r, MRM6m, "vpsll", X86vshli>, + avx512_shift_rmi_w<0x71, MRM6r, MRM6m, "vpsllw", X86vshli>, AVX512BIi8Base, EVEX_4V; + +defm VPSRA : avx512_shift_rmi_dq<0x72, 0x72, MRM4r, MRM4m, "vpsra", X86vsrai>, + avx512_shift_rmi_w<0x71, MRM4r, MRM4m, "vpsraw", X86vsrai>, AVX512BIi8Base, EVEX_4V; + +defm VPROR : avx512_shift_rmi_dq<0x72, 0x72, MRM0r, MRM0m, "vpror", rotr>, AVX512BIi8Base, EVEX_4V; +defm VPROL : avx512_shift_rmi_dq<0x72, 0x72, MRM1r, MRM1m, "vprol", rotl>, AVX512BIi8Base, EVEX_4V; + +defm VPSLL : avx512_shift_types<0xF2, 0xF3, 0xF1, "vpsll", X86vshl>; +defm VPSRA : avx512_shift_types<0xE2, 0xE2, 0xE1, "vpsra", X86vsra>; +defm VPSRL : 
avx512_shift_types<0xD2, 0xD3, 0xD1, "vpsrl", X86vsrl>; + +//===-------------------------------------------------------------------===// +// Variable Bit Shifts +//===-------------------------------------------------------------------===// +multiclass avx512_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _> { + defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (_.VT (OpNode _.RC:$src1, (_.VT _.RC:$src2))), + SSE_INTSHIFT_ITINS_P.rr>, AVX5128IBase, EVEX_4V; + let mayLoad = 1 in + defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (_.VT (OpNode _.RC:$src1, + (_.VT (bitconvert (_.LdFrag addr:$src2))))), + SSE_INTSHIFT_ITINS_P.rm>, AVX5128IBase, EVEX_4V, + EVEX_CD8<_.EltSize, CD8VF>; +} + +multiclass avx512_var_shift_mb<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _> { + let mayLoad = 1 in + defm rmb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr, + "${src2}"##_.BroadcastStr##", $src1", + "$src1, ${src2}"##_.BroadcastStr, + (_.VT (OpNode _.RC:$src1, (_.VT (X86VBroadcast + (_.ScalarLdFrag addr:$src2))))), + SSE_INTSHIFT_ITINS_P.rm>, AVX5128IBase, EVEX_B, + EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>; +} +multiclass avx512_var_shift_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode, + AVX512VLVectorVTInfo _> { + let Predicates = [HasAVX512] in + defm Z : avx512_var_shift<opc, OpcodeStr, OpNode, _.info512>, + avx512_var_shift_mb<opc, OpcodeStr, OpNode, _.info512>, EVEX_V512; + + let Predicates = [HasAVX512, HasVLX] in { + defm Z256 : avx512_var_shift<opc, OpcodeStr, OpNode, _.info256>, + avx512_var_shift_mb<opc, OpcodeStr, OpNode, _.info256>, EVEX_V256; + defm Z128 : avx512_var_shift<opc, OpcodeStr, OpNode, _.info128>, + avx512_var_shift_mb<opc, OpcodeStr, OpNode, _.info128>, EVEX_V128; + } +} + +multiclass avx512_var_shift_types<bits<8> opc, string OpcodeStr, + SDNode OpNode> { + defm D : avx512_var_shift_sizes<opc, OpcodeStr#"d", OpNode, + avx512vl_i32_info>; + defm Q : avx512_var_shift_sizes<opc, OpcodeStr#"q", OpNode, + avx512vl_i64_info>, VEX_W; +} + +// Use 512bit version to implement 128/256 bit in case NoVLX. 
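+// (Descriptive note, grounded in the patterns that follow:) the lowering widens
+// the 128/256-bit operands into a 512-bit register (INSERT_SUBREG into an
+// IMPLICIT_DEF zmm), runs the 512-bit WZrr form of the instruction, and then
+// uses EXTRACT_SUBREG to take back the original xmm/ymm part of the result.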
+multiclass avx512_var_shift_w_lowering<AVX512VLVectorVTInfo _, SDNode OpNode> { + let Predicates = [HasBWI, NoVLX] in { + def : Pat<(_.info256.VT (OpNode (_.info256.VT _.info256.RC:$src1), + (_.info256.VT _.info256.RC:$src2))), + (EXTRACT_SUBREG + (!cast<Instruction>(NAME#"WZrr") + (INSERT_SUBREG (_.info512.VT (IMPLICIT_DEF)), VR256X:$src1, sub_ymm), + (INSERT_SUBREG (_.info512.VT (IMPLICIT_DEF)), VR256X:$src2, sub_ymm)), + sub_ymm)>; + + def : Pat<(_.info128.VT (OpNode (_.info128.VT _.info128.RC:$src1), + (_.info128.VT _.info128.RC:$src2))), + (EXTRACT_SUBREG + (!cast<Instruction>(NAME#"WZrr") + (INSERT_SUBREG (_.info512.VT (IMPLICIT_DEF)), VR128X:$src1, sub_xmm), + (INSERT_SUBREG (_.info512.VT (IMPLICIT_DEF)), VR128X:$src2, sub_xmm)), + sub_xmm)>; + } +} + +multiclass avx512_var_shift_w<bits<8> opc, string OpcodeStr, + SDNode OpNode> { + let Predicates = [HasBWI] in + defm WZ: avx512_var_shift<opc, OpcodeStr, OpNode, v32i16_info>, + EVEX_V512, VEX_W; + let Predicates = [HasVLX, HasBWI] in { + + defm WZ256: avx512_var_shift<opc, OpcodeStr, OpNode, v16i16x_info>, + EVEX_V256, VEX_W; + defm WZ128: avx512_var_shift<opc, OpcodeStr, OpNode, v8i16x_info>, + EVEX_V128, VEX_W; + } +} + +defm VPSLLV : avx512_var_shift_types<0x47, "vpsllv", shl>, + avx512_var_shift_w<0x12, "vpsllvw", shl>, + avx512_var_shift_w_lowering<avx512vl_i16_info, shl>; +defm VPSRAV : avx512_var_shift_types<0x46, "vpsrav", sra>, + avx512_var_shift_w<0x11, "vpsravw", sra>, + avx512_var_shift_w_lowering<avx512vl_i16_info, sra>; +defm VPSRLV : avx512_var_shift_types<0x45, "vpsrlv", srl>, + avx512_var_shift_w<0x10, "vpsrlvw", srl>, + avx512_var_shift_w_lowering<avx512vl_i16_info, srl>; +defm VPRORV : avx512_var_shift_types<0x14, "vprorv", rotr>; +defm VPROLV : avx512_var_shift_types<0x15, "vprolv", rotl>; + +//===-------------------------------------------------------------------===// +// 1-src variable permutation VPERMW/D/Q +//===-------------------------------------------------------------------===// +multiclass avx512_vperm_dq_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode, + AVX512VLVectorVTInfo _> { + let Predicates = [HasAVX512] in + defm Z : avx512_var_shift<opc, OpcodeStr, OpNode, _.info512>, + avx512_var_shift_mb<opc, OpcodeStr, OpNode, _.info512>, EVEX_V512; + + let Predicates = [HasAVX512, HasVLX] in + defm Z256 : avx512_var_shift<opc, OpcodeStr, OpNode, _.info256>, + avx512_var_shift_mb<opc, OpcodeStr, OpNode, _.info256>, EVEX_V256; +} + +multiclass avx512_vpermi_dq_sizes<bits<8> opc, Format ImmFormR, Format ImmFormM, + string OpcodeStr, SDNode OpNode, + AVX512VLVectorVTInfo VTInfo> { + let Predicates = [HasAVX512] in + defm Z: avx512_shift_rmi<opc, ImmFormR, ImmFormM, OpcodeStr, OpNode, + VTInfo.info512>, + avx512_shift_rmbi<opc, ImmFormM, OpcodeStr, OpNode, + VTInfo.info512>, EVEX_V512; + let Predicates = [HasAVX512, HasVLX] in + defm Z256: avx512_shift_rmi<opc, ImmFormR, ImmFormM, OpcodeStr, OpNode, + VTInfo.info256>, + avx512_shift_rmbi<opc, ImmFormM, OpcodeStr, OpNode, + VTInfo.info256>, EVEX_V256; +} + + +defm VPERM : avx512_var_shift_w<0x8D, "vpermw", X86VPermv>; + +defm VPERMD : avx512_vperm_dq_sizes<0x36, "vpermd", X86VPermv, + avx512vl_i32_info>; +defm VPERMQ : avx512_vperm_dq_sizes<0x36, "vpermq", X86VPermv, + avx512vl_i64_info>, VEX_W; +defm VPERMPS : avx512_vperm_dq_sizes<0x16, "vpermps", X86VPermv, + avx512vl_f32_info>; +defm VPERMPD : avx512_vperm_dq_sizes<0x16, "vpermpd", X86VPermv, + avx512vl_f64_info>, VEX_W; + +defm VPERMQ : avx512_vpermi_dq_sizes<0x00, MRMSrcReg, MRMSrcMem, "vpermq", + 
X86VPermi, avx512vl_i64_info>, + EVEX, AVX512AIi8Base, EVEX_CD8<64, CD8VF>, VEX_W; +defm VPERMPD : avx512_vpermi_dq_sizes<0x01, MRMSrcReg, MRMSrcMem, "vpermpd", + X86VPermi, avx512vl_f64_info>, + EVEX, AVX512AIi8Base, EVEX_CD8<64, CD8VF>, VEX_W; +//===----------------------------------------------------------------------===// +// AVX-512 - VPERMIL +//===----------------------------------------------------------------------===// + +multiclass avx512_permil_vec<bits<8> OpcVar, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _, X86VectorVTInfo Ctrl> { + defm rr: AVX512_maskable<OpcVar, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, Ctrl.RC:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (_.VT (OpNode _.RC:$src1, + (Ctrl.VT Ctrl.RC:$src2)))>, + T8PD, EVEX_4V; + let mayLoad = 1 in { + defm rm: AVX512_maskable<OpcVar, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, Ctrl.MemOp:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (_.VT (OpNode + _.RC:$src1, + (Ctrl.VT (bitconvert(Ctrl.LdFrag addr:$src2)))))>, + T8PD, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>; + defm rmb: AVX512_maskable<OpcVar, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr, + "${src2}"##_.BroadcastStr##", $src1", + "$src1, ${src2}"##_.BroadcastStr, + (_.VT (OpNode + _.RC:$src1, + (Ctrl.VT (X86VBroadcast + (Ctrl.ScalarLdFrag addr:$src2)))))>, + T8PD, EVEX_4V, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>; + }//let mayLoad = 1 +} + +multiclass avx512_permil_vec_common<string OpcodeStr, bits<8> OpcVar, + AVX512VLVectorVTInfo _, AVX512VLVectorVTInfo Ctrl>{ + let Predicates = [HasAVX512] in { + defm Z : avx512_permil_vec<OpcVar, OpcodeStr, X86VPermilpv, _.info512, + Ctrl.info512>, EVEX_V512; + } + let Predicates = [HasAVX512, HasVLX] in { + defm Z128 : avx512_permil_vec<OpcVar, OpcodeStr, X86VPermilpv, _.info128, + Ctrl.info128>, EVEX_V128; + defm Z256 : avx512_permil_vec<OpcVar, OpcodeStr, X86VPermilpv, _.info256, + Ctrl.info256>, EVEX_V256; + } +} + +multiclass avx512_permil<string OpcodeStr, bits<8> OpcImm, bits<8> OpcVar, + AVX512VLVectorVTInfo _, AVX512VLVectorVTInfo Ctrl>{ + + defm NAME: avx512_permil_vec_common<OpcodeStr, OpcVar, _, Ctrl>; + defm NAME: avx512_shift_rmi_sizes<OpcImm, MRMSrcReg, MRMSrcMem, OpcodeStr, + X86VPermilpi, _>, + EVEX, AVX512AIi8Base, EVEX_CD8<_.info128.EltSize, CD8VF>; +} + +defm VPERMILPS : avx512_permil<"vpermilps", 0x04, 0x0C, avx512vl_f32_info, + avx512vl_i32_info>; +defm VPERMILPD : avx512_permil<"vpermilpd", 0x05, 0x0D, avx512vl_f64_info, + avx512vl_i64_info>, VEX_W; +//===----------------------------------------------------------------------===// +// AVX-512 - VPSHUFD, VPSHUFLW, VPSHUFHW +//===----------------------------------------------------------------------===// + +defm VPSHUFD : avx512_shift_rmi_sizes<0x70, MRMSrcReg, MRMSrcMem, "vpshufd", + X86PShufd, avx512vl_i32_info>, + EVEX, AVX512BIi8Base, EVEX_CD8<32, CD8VF>; +defm VPSHUFH : avx512_shift_rmi_w<0x70, MRMSrcReg, MRMSrcMem, "vpshufhw", + X86PShufhw>, EVEX, AVX512XSIi8Base; +defm VPSHUFL : avx512_shift_rmi_w<0x70, MRMSrcReg, MRMSrcMem, "vpshuflw", + X86PShuflw>, EVEX, AVX512XDIi8Base; + +multiclass avx512_pshufb_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode> { + let Predicates = [HasBWI] in + defm Z: avx512_var_shift<opc, OpcodeStr, OpNode, v64i8_info>, EVEX_V512; + + let Predicates = [HasVLX, HasBWI] in { + defm Z256: avx512_var_shift<opc, OpcodeStr, OpNode, v32i8x_info>, EVEX_V256; + defm Z128: avx512_var_shift<opc, OpcodeStr, OpNode, v16i8x_info>, EVEX_V128; + } +} + +defm VPSHUFB: 
avx512_pshufb_sizes<0x00, "vpshufb", X86pshufb>; + +//===----------------------------------------------------------------------===// +// Move Low to High and High to Low packed FP Instructions +//===----------------------------------------------------------------------===// +def VMOVLHPSZrr : AVX512PSI<0x16, MRMSrcReg, (outs VR128X:$dst), + (ins VR128X:$src1, VR128X:$src2), + "vmovlhps\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR128X:$dst, (v4f32 (X86Movlhps VR128X:$src1, VR128X:$src2)))], + IIC_SSE_MOV_LH>, EVEX_4V; +def VMOVHLPSZrr : AVX512PSI<0x12, MRMSrcReg, (outs VR128X:$dst), + (ins VR128X:$src1, VR128X:$src2), + "vmovhlps\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR128X:$dst, (v4f32 (X86Movhlps VR128X:$src1, VR128X:$src2)))], + IIC_SSE_MOV_LH>, EVEX_4V; + +let Predicates = [HasAVX512] in { + // MOVLHPS patterns + def : Pat<(v4i32 (X86Movlhps VR128X:$src1, VR128X:$src2)), + (VMOVLHPSZrr VR128X:$src1, VR128X:$src2)>; + def : Pat<(v2i64 (X86Movlhps VR128X:$src1, VR128X:$src2)), + (VMOVLHPSZrr (v2i64 VR128X:$src1), VR128X:$src2)>; + + // MOVHLPS patterns + def : Pat<(v4i32 (X86Movhlps VR128X:$src1, VR128X:$src2)), + (VMOVHLPSZrr VR128X:$src1, VR128X:$src2)>; +} + +//===----------------------------------------------------------------------===// +// VMOVHPS/PD VMOVLPS Instructions +// All patterns were taken from the SSE implementation. +//===----------------------------------------------------------------------===// +multiclass avx512_mov_hilo_packed<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _> { + let mayLoad = 1 in + def rm : AVX512<opc, MRMSrcMem, (outs _.RC:$dst), + (ins _.RC:$src1, f64mem:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set _.RC:$dst, + (OpNode _.RC:$src1, + (_.VT (bitconvert + (v2f64 (scalar_to_vector (loadf64 addr:$src2)))))))], + IIC_SSE_MOV_LH>, EVEX_4V; +} + +defm VMOVHPSZ128 : avx512_mov_hilo_packed<0x16, "vmovhps", X86Movlhps, + v4f32x_info>, EVEX_CD8<32, CD8VT2>, PS; +defm VMOVHPDZ128 : avx512_mov_hilo_packed<0x16, "vmovhpd", X86Movlhpd, + v2f64x_info>, EVEX_CD8<64, CD8VT1>, PD, VEX_W; +defm VMOVLPSZ128 : avx512_mov_hilo_packed<0x12, "vmovlps", X86Movlps, + v4f32x_info>, EVEX_CD8<32, CD8VT2>, PS; +defm VMOVLPDZ128 : avx512_mov_hilo_packed<0x12, "vmovlpd", X86Movlpd, + v2f64x_info>, EVEX_CD8<64, CD8VT1>, PD, VEX_W; + +let Predicates = [HasAVX512] in { + // VMOVHPS patterns + def : Pat<(X86Movlhps VR128X:$src1, + (bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))), + (VMOVHPSZ128rm VR128X:$src1, addr:$src2)>; + def : Pat<(X86Movlhps VR128X:$src1, + (bc_v4i32 (v2i64 (X86vzload addr:$src2)))), + (VMOVHPSZ128rm VR128X:$src1, addr:$src2)>; + // VMOVHPD patterns + def : Pat<(v2f64 (X86Unpckl VR128X:$src1, + (scalar_to_vector (loadf64 addr:$src2)))), + (VMOVHPDZ128rm VR128X:$src1, addr:$src2)>; + def : Pat<(v2f64 (X86Unpckl VR128X:$src1, + (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))), + (VMOVHPDZ128rm VR128X:$src1, addr:$src2)>; + // VMOVLPS patterns + def : Pat<(v4f32 (X86Movlps VR128X:$src1, (load addr:$src2))), + (VMOVLPSZ128rm VR128X:$src1, addr:$src2)>; + def : Pat<(v4i32 (X86Movlps VR128X:$src1, (load addr:$src2))), + (VMOVLPSZ128rm VR128X:$src1, addr:$src2)>; + // VMOVLPD patterns + def : Pat<(v2f64 (X86Movlpd VR128X:$src1, (load addr:$src2))), + (VMOVLPDZ128rm VR128X:$src1, addr:$src2)>; + def : Pat<(v2i64 (X86Movlpd VR128X:$src1, (load addr:$src2))), + (VMOVLPDZ128rm VR128X:$src1, addr:$src2)>; + def : Pat<(v2f64 (X86Movsd VR128X:$src1, + (v2f64 (scalar_to_vector
(loadf64 addr:$src2))))), + (VMOVLPDZ128rm VR128X:$src1, addr:$src2)>; +} + +let mayStore = 1 in { +def VMOVHPSZ128mr : AVX512PSI<0x17, MRMDestMem, (outs), + (ins f64mem:$dst, VR128X:$src), + "vmovhps\t{$src, $dst|$dst, $src}", + [(store (f64 (vector_extract + (X86Unpckh (bc_v2f64 (v4f32 VR128X:$src)), + (bc_v2f64 (v4f32 VR128X:$src))), + (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>, + EVEX, EVEX_CD8<32, CD8VT2>; +def VMOVHPDZ128mr : AVX512PDI<0x17, MRMDestMem, (outs), + (ins f64mem:$dst, VR128X:$src), + "vmovhpd\t{$src, $dst|$dst, $src}", + [(store (f64 (vector_extract + (v2f64 (X86Unpckh VR128X:$src, VR128X:$src)), + (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>, + EVEX, EVEX_CD8<64, CD8VT1>, VEX_W; +def VMOVLPSZ128mr : AVX512PSI<0x13, MRMDestMem, (outs), + (ins f64mem:$dst, VR128X:$src), + "vmovlps\t{$src, $dst|$dst, $src}", + [(store (f64 (vector_extract (bc_v2f64 (v4f32 VR128X:$src)), + (iPTR 0))), addr:$dst)], + IIC_SSE_MOV_LH>, + EVEX, EVEX_CD8<32, CD8VT2>; +def VMOVLPDZ128mr : AVX512PDI<0x13, MRMDestMem, (outs), + (ins f64mem:$dst, VR128X:$src), + "vmovlpd\t{$src, $dst|$dst, $src}", + [(store (f64 (vector_extract (v2f64 VR128X:$src), + (iPTR 0))), addr:$dst)], + IIC_SSE_MOV_LH>, + EVEX, EVEX_CD8<64, CD8VT1>, VEX_W; +} +let Predicates = [HasAVX512] in { + // VMOVHPD patterns + def : Pat<(store (f64 (vector_extract + (v2f64 (X86VPermilpi VR128X:$src, (i8 1))), + (iPTR 0))), addr:$dst), + (VMOVHPDZ128mr addr:$dst, VR128X:$src)>; + // VMOVLPS patterns + def : Pat<(store (v4f32 (X86Movlps (load addr:$src1), VR128X:$src2)), + addr:$src1), + (VMOVLPSZ128mr addr:$src1, VR128X:$src2)>; + def : Pat<(store (v4i32 (X86Movlps + (bc_v4i32 (loadv2i64 addr:$src1)), VR128X:$src2)), addr:$src1), + (VMOVLPSZ128mr addr:$src1, VR128X:$src2)>; + // VMOVLPD patterns + def : Pat<(store (v2f64 (X86Movlpd (load addr:$src1), VR128X:$src2)), + addr:$src1), + (VMOVLPDZ128mr addr:$src1, VR128X:$src2)>; + def : Pat<(store (v2i64 (X86Movlpd (load addr:$src1), VR128X:$src2)), + addr:$src1), + (VMOVLPDZ128mr addr:$src1, VR128X:$src2)>; +} +//===----------------------------------------------------------------------===// +// FMA - Fused Multiply Operations +// + +let Constraints = "$src1 = $dst" in { +multiclass avx512_fma3p_213_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _> { + defm r: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src2, _.RC:$src3), + OpcodeStr, "$src3, $src2", "$src2, $src3", + (_.VT (OpNode _.RC:$src1, _.RC:$src2, _.RC:$src3))>, + AVX512FMA3Base; + + let mayLoad = 1 in { + defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src2, _.MemOp:$src3), + OpcodeStr, "$src3, $src2", "$src2, $src3", + (_.VT (OpNode _.RC:$src1, _.RC:$src2, (_.LdFrag addr:$src3)))>, + AVX512FMA3Base; + + defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src2, _.ScalarMemOp:$src3), + OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"), + !strconcat("$src2, ${src3}", _.BroadcastStr ), + (OpNode _.RC:$src1, + _.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3))))>, + AVX512FMA3Base, EVEX_B; + } +} + +multiclass avx512_fma3_213_round<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _> { + defm rb: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc), + OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc", + (_.VT ( OpNode _.RC:$src1, _.RC:$src2, _.RC:$src3, (i32 imm:$rc)))>, + AVX512FMA3Base, EVEX_B, EVEX_RC; +} +} // Constraints = "$src1 = $dst" + 
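+// Note on the 132/213/231 forms (consistent with the patterns in these
+// multiclasses): with the destination tied to the first source, 213 computes
+// dst = src2 * dst + src3, 231 computes dst = src2 * src3 + dst, and 132
+// computes dst = dst * src3 + src2; the memory/broadcast operand is always the
+// last source, and the three encodings let it take any of the three roles.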
+multiclass avx512_fma3p_213_common<bits<8> opc, string OpcodeStr, SDNode OpNode, + SDNode OpNodeRnd, AVX512VLVectorVTInfo _> { + let Predicates = [HasAVX512] in { + defm Z : avx512_fma3p_213_rm<opc, OpcodeStr, OpNode, _.info512>, + avx512_fma3_213_round<opc, OpcodeStr, OpNodeRnd, _.info512>, + EVEX_V512, EVEX_CD8<_.info512.EltSize, CD8VF>; + } + let Predicates = [HasVLX, HasAVX512] in { + defm Z256 : avx512_fma3p_213_rm<opc, OpcodeStr, OpNode, _.info256>, + EVEX_V256, EVEX_CD8<_.info256.EltSize, CD8VF>; + defm Z128 : avx512_fma3p_213_rm<opc, OpcodeStr, OpNode, _.info128>, + EVEX_V128, EVEX_CD8<_.info128.EltSize, CD8VF>; + } +} + +multiclass avx512_fma3p_213_f<bits<8> opc, string OpcodeStr, SDNode OpNode, + SDNode OpNodeRnd > { + defm PS : avx512_fma3p_213_common<opc, OpcodeStr#"ps", OpNode, OpNodeRnd, + avx512vl_f32_info>; + defm PD : avx512_fma3p_213_common<opc, OpcodeStr#"pd", OpNode, OpNodeRnd, + avx512vl_f64_info>, VEX_W; +} + +defm VFMADD213 : avx512_fma3p_213_f<0xA8, "vfmadd213", X86Fmadd, X86FmaddRnd>; +defm VFMSUB213 : avx512_fma3p_213_f<0xAA, "vfmsub213", X86Fmsub, X86FmsubRnd>; +defm VFMADDSUB213 : avx512_fma3p_213_f<0xA6, "vfmaddsub213", X86Fmaddsub, X86FmaddsubRnd>; +defm VFMSUBADD213 : avx512_fma3p_213_f<0xA7, "vfmsubadd213", X86Fmsubadd, X86FmsubaddRnd>; +defm VFNMADD213 : avx512_fma3p_213_f<0xAC, "vfnmadd213", X86Fnmadd, X86FnmaddRnd>; +defm VFNMSUB213 : avx512_fma3p_213_f<0xAE, "vfnmsub213", X86Fnmsub, X86FnmsubRnd>; + + +let Constraints = "$src1 = $dst" in { +multiclass avx512_fma3p_231_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _> { + defm r: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src2, _.RC:$src3), + OpcodeStr, "$src3, $src2", "$src2, $src3", + (_.VT (OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1))>, + AVX512FMA3Base; + + let mayLoad = 1 in { + defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src2, _.MemOp:$src3), + OpcodeStr, "$src3, $src2", "$src2, $src3", + (_.VT (OpNode _.RC:$src2, (_.LdFrag addr:$src3), _.RC:$src1))>, + AVX512FMA3Base; + + defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src2, _.ScalarMemOp:$src3), + OpcodeStr, "${src3}"##_.BroadcastStr##", $src2", + "$src2, ${src3}"##_.BroadcastStr, + (_.VT (OpNode _.RC:$src2, + (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src3))), + _.RC:$src1))>, AVX512FMA3Base, EVEX_B; + } +} + +multiclass avx512_fma3_231_round<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _> { + defm rb: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc), + OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc", + (_.VT ( OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1, (i32 imm:$rc)))>, + AVX512FMA3Base, EVEX_B, EVEX_RC; +} +} // Constraints = "$src1 = $dst" + +multiclass avx512_fma3p_231_common<bits<8> opc, string OpcodeStr, SDNode OpNode, + SDNode OpNodeRnd, AVX512VLVectorVTInfo _> { + let Predicates = [HasAVX512] in { + defm Z : avx512_fma3p_231_rm<opc, OpcodeStr, OpNode, _.info512>, + avx512_fma3_231_round<opc, OpcodeStr, OpNodeRnd, _.info512>, + EVEX_V512, EVEX_CD8<_.info512.EltSize, CD8VF>; + } + let Predicates = [HasVLX, HasAVX512] in { + defm Z256 : avx512_fma3p_231_rm<opc, OpcodeStr, OpNode, _.info256>, + EVEX_V256, EVEX_CD8<_.info256.EltSize, CD8VF>; + defm Z128 : avx512_fma3p_231_rm<opc, OpcodeStr, OpNode, _.info128>, + EVEX_V128, EVEX_CD8<_.info128.EltSize, CD8VF>; + } +} + +multiclass avx512_fma3p_231_f<bits<8> opc, string OpcodeStr, SDNode OpNode, + SDNode 
OpNodeRnd > { + defm PS : avx512_fma3p_231_common<opc, OpcodeStr#"ps", OpNode, OpNodeRnd, + avx512vl_f32_info>; + defm PD : avx512_fma3p_231_common<opc, OpcodeStr#"pd", OpNode, OpNodeRnd, + avx512vl_f64_info>, VEX_W; +} + +defm VFMADD231 : avx512_fma3p_231_f<0xB8, "vfmadd231", X86Fmadd, X86FmaddRnd>; +defm VFMSUB231 : avx512_fma3p_231_f<0xBA, "vfmsub231", X86Fmsub, X86FmsubRnd>; +defm VFMADDSUB231 : avx512_fma3p_231_f<0xB6, "vfmaddsub231", X86Fmaddsub, X86FmaddsubRnd>; +defm VFMSUBADD231 : avx512_fma3p_231_f<0xB7, "vfmsubadd231", X86Fmsubadd, X86FmsubaddRnd>; +defm VFNMADD231 : avx512_fma3p_231_f<0xBC, "vfnmadd231", X86Fnmadd, X86FnmaddRnd>; +defm VFNMSUB231 : avx512_fma3p_231_f<0xBE, "vfnmsub231", X86Fnmsub, X86FnmsubRnd>; + +let Constraints = "$src1 = $dst" in { +multiclass avx512_fma3p_132_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _> { + defm r: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src3, _.RC:$src2), + OpcodeStr, "$src2, $src3", "$src3, $src2", + (_.VT (OpNode _.RC:$src1, _.RC:$src2, _.RC:$src3))>, + AVX512FMA3Base; + + let mayLoad = 1 in { + defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src3, _.MemOp:$src2), + OpcodeStr, "$src2, $src3", "$src3, $src2", + (_.VT (OpNode _.RC:$src1, (_.LdFrag addr:$src2), _.RC:$src3))>, + AVX512FMA3Base; + + defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src3, _.ScalarMemOp:$src2), + OpcodeStr, "${src2}"##_.BroadcastStr##", $src3", + "$src3, ${src2}"##_.BroadcastStr, + (_.VT (OpNode _.RC:$src1, + (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))), + _.RC:$src3))>, AVX512FMA3Base, EVEX_B; + } +} + +multiclass avx512_fma3_132_round<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _> { + defm rb: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src3, _.RC:$src2, AVX512RC:$rc), + OpcodeStr, "$rc, $src2, $src3", "$src3, $src2, $rc", + (_.VT ( OpNode _.RC:$src1, _.RC:$src2, _.RC:$src3, (i32 imm:$rc)))>, + AVX512FMA3Base, EVEX_B, EVEX_RC; +} +} // Constraints = "$src1 = $dst" + +multiclass avx512_fma3p_132_common<bits<8> opc, string OpcodeStr, SDNode OpNode, + SDNode OpNodeRnd, AVX512VLVectorVTInfo _> { + let Predicates = [HasAVX512] in { + defm Z : avx512_fma3p_132_rm<opc, OpcodeStr, OpNode, _.info512>, + avx512_fma3_132_round<opc, OpcodeStr, OpNodeRnd, _.info512>, + EVEX_V512, EVEX_CD8<_.info512.EltSize, CD8VF>; + } + let Predicates = [HasVLX, HasAVX512] in { + defm Z256 : avx512_fma3p_132_rm<opc, OpcodeStr, OpNode, _.info256>, + EVEX_V256, EVEX_CD8<_.info256.EltSize, CD8VF>; + defm Z128 : avx512_fma3p_132_rm<opc, OpcodeStr, OpNode, _.info128>, + EVEX_V128, EVEX_CD8<_.info128.EltSize, CD8VF>; + } +} + +multiclass avx512_fma3p_132_f<bits<8> opc, string OpcodeStr, SDNode OpNode, + SDNode OpNodeRnd > { + defm PS : avx512_fma3p_132_common<opc, OpcodeStr#"ps", OpNode, OpNodeRnd, + avx512vl_f32_info>; + defm PD : avx512_fma3p_132_common<opc, OpcodeStr#"pd", OpNode, OpNodeRnd, + avx512vl_f64_info>, VEX_W; +} + +defm VFMADD132 : avx512_fma3p_132_f<0x98, "vfmadd132", X86Fmadd, X86FmaddRnd>; +defm VFMSUB132 : avx512_fma3p_132_f<0x9A, "vfmsub132", X86Fmsub, X86FmsubRnd>; +defm VFMADDSUB132 : avx512_fma3p_132_f<0x96, "vfmaddsub132", X86Fmaddsub, X86FmaddsubRnd>; +defm VFMSUBADD132 : avx512_fma3p_132_f<0x97, "vfmsubadd132", X86Fmsubadd, X86FmsubaddRnd>; +defm VFNMADD132 : avx512_fma3p_132_f<0x9C, "vfnmadd132", X86Fnmadd, X86FnmaddRnd>; +defm VFNMSUB132 : avx512_fma3p_132_f<0x9E, "vfnmsub132", X86Fnmsub, 
X86FnmsubRnd>; + +// Scalar FMA +let Constraints = "$src1 = $dst" in { +multiclass avx512_fma3s_common<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, + dag RHS_VEC_r, dag RHS_VEC_m, dag RHS_VEC_rb, + dag RHS_r, dag RHS_m > { + defm r_Int: AVX512_maskable_3src_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src2, _.RC:$src3), OpcodeStr, + "$src3, $src2", "$src2, $src3", RHS_VEC_r>, AVX512FMA3Base; + + let mayLoad = 1 in + defm m_Int: AVX512_maskable_3src_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src2, _.MemOp:$src3), OpcodeStr, + "$src3, $src2", "$src2, $src3", RHS_VEC_m>, AVX512FMA3Base; + + defm rb_Int: AVX512_maskable_3src_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc), + OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc", RHS_VEC_rb>, + AVX512FMA3Base, EVEX_B, EVEX_RC; + + let isCodeGenOnly = 1 in { + def r : AVX512FMA3<opc, MRMSrcReg, (outs _.FRC:$dst), + (ins _.FRC:$src1, _.FRC:$src2, _.FRC:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + [RHS_r]>; + let mayLoad = 1 in + def m : AVX512FMA3<opc, MRMSrcMem, (outs _.FRC:$dst), + (ins _.FRC:$src1, _.FRC:$src2, _.ScalarMemOp:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + [RHS_m]>; + }// isCodeGenOnly = 1 +} +}// Constraints = "$src1 = $dst" + +multiclass avx512_fma3s_all<bits<8> opc213, bits<8> opc231, bits<8> opc132, + string OpcodeStr, SDNode OpNode, SDNode OpNodeRnd, X86VectorVTInfo _ , + string SUFF> { + + defm NAME#213#SUFF: avx512_fma3s_common<opc213, OpcodeStr#"213"#_.Suffix , _ , + (_.VT (OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3)), + (_.VT (OpNode _.RC:$src2, _.RC:$src1, + (_.VT (scalar_to_vector(_.ScalarLdFrag addr:$src3))))), + (_.VT ( OpNodeRnd _.RC:$src2, _.RC:$src1, _.RC:$src3, + (i32 imm:$rc))), + (set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src2, _.FRC:$src1, + _.FRC:$src3))), + (set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src2, _.FRC:$src1, + (_.ScalarLdFrag addr:$src3))))>; + + defm NAME#231#SUFF: avx512_fma3s_common<opc231, OpcodeStr#"231"#_.Suffix , _ , + (_.VT (OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1)), + (_.VT (OpNode _.RC:$src2, + (_.VT (scalar_to_vector(_.ScalarLdFrag addr:$src3))), + _.RC:$src1)), + (_.VT ( OpNodeRnd _.RC:$src2, _.RC:$src3, _.RC:$src1, + (i32 imm:$rc))), + (set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src2, _.FRC:$src3, + _.FRC:$src1))), + (set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src2, + (_.ScalarLdFrag addr:$src3), _.FRC:$src1)))>; + + defm NAME#132#SUFF: avx512_fma3s_common<opc132, OpcodeStr#"132"#_.Suffix , _ , + (_.VT (OpNode _.RC:$src1, _.RC:$src3, _.RC:$src2)), + (_.VT (OpNode _.RC:$src1, + (_.VT (scalar_to_vector(_.ScalarLdFrag addr:$src3))), + _.RC:$src2)), + (_.VT ( OpNodeRnd _.RC:$src1, _.RC:$src3, _.RC:$src2, + (i32 imm:$rc))), + (set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src1, _.FRC:$src3, + _.FRC:$src2))), + (set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src1, + (_.ScalarLdFrag addr:$src3), _.FRC:$src2)))>; +} + +multiclass avx512_fma3s<bits<8> opc213, bits<8> opc231, bits<8> opc132, + string OpcodeStr, SDNode OpNode, SDNode OpNodeRnd>{ + let Predicates = [HasAVX512] in { + defm NAME : avx512_fma3s_all<opc213, opc231, opc132, OpcodeStr, OpNode, + OpNodeRnd, f32x_info, "SS">, + EVEX_CD8<32, CD8VT1>, VEX_LIG; + defm NAME : avx512_fma3s_all<opc213, opc231, opc132, OpcodeStr, OpNode, + OpNodeRnd, f64x_info, "SD">, + EVEX_CD8<64, CD8VT1>, VEX_LIG, VEX_W; + } +} + +defm VFMADD : avx512_fma3s<0xA9, 0xB9, 0x99, "vfmadd", X86Fmadd, X86FmaddRnd>; +defm VFMSUB : 
avx512_fma3s<0xAB, 0xBB, 0x9B, "vfmsub", X86Fmsub, X86FmsubRnd>; +defm VFNMADD : avx512_fma3s<0xAD, 0xBD, 0x9D, "vfnmadd", X86Fnmadd, X86FnmaddRnd>; +defm VFNMSUB : avx512_fma3s<0xAF, 0xBF, 0x9F, "vfnmsub", X86Fnmsub, X86FnmsubRnd>; + +//===----------------------------------------------------------------------===// +// AVX-512 Scalar convert from sign integer to float/double +//===----------------------------------------------------------------------===// + +multiclass avx512_vcvtsi<bits<8> opc, SDNode OpNode, RegisterClass SrcRC, + X86VectorVTInfo DstVT, X86MemOperand x86memop, + PatFrag ld_frag, string asm> { + let hasSideEffects = 0 in { + def rr : SI<opc, MRMSrcReg, (outs DstVT.FRC:$dst), + (ins DstVT.FRC:$src1, SrcRC:$src), + !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>, + EVEX_4V; + let mayLoad = 1 in + def rm : SI<opc, MRMSrcMem, (outs DstVT.FRC:$dst), + (ins DstVT.FRC:$src1, x86memop:$src), + !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>, + EVEX_4V; + } // hasSideEffects = 0 + let isCodeGenOnly = 1 in { + def rr_Int : SI<opc, MRMSrcReg, (outs DstVT.RC:$dst), + (ins DstVT.RC:$src1, SrcRC:$src2), + !strconcat(asm,"\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set DstVT.RC:$dst, + (OpNode (DstVT.VT DstVT.RC:$src1), + SrcRC:$src2, + (i32 FROUND_CURRENT)))]>, EVEX_4V; + + def rm_Int : SI<opc, MRMSrcMem, (outs DstVT.RC:$dst), + (ins DstVT.RC:$src1, x86memop:$src2), + !strconcat(asm,"\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set DstVT.RC:$dst, + (OpNode (DstVT.VT DstVT.RC:$src1), + (ld_frag addr:$src2), + (i32 FROUND_CURRENT)))]>, EVEX_4V; + }//isCodeGenOnly = 1 +} + +multiclass avx512_vcvtsi_round<bits<8> opc, SDNode OpNode, RegisterClass SrcRC, + X86VectorVTInfo DstVT, string asm> { + def rrb_Int : SI<opc, MRMSrcReg, (outs DstVT.RC:$dst), + (ins DstVT.RC:$src1, SrcRC:$src2, AVX512RC:$rc), + !strconcat(asm, + "\t{$src2, $rc, $src1, $dst|$dst, $src1, $rc, $src2}"), + [(set DstVT.RC:$dst, + (OpNode (DstVT.VT DstVT.RC:$src1), + SrcRC:$src2, + (i32 imm:$rc)))]>, EVEX_4V, EVEX_B, EVEX_RC; +} + +multiclass avx512_vcvtsi_common<bits<8> opc, SDNode OpNode, RegisterClass SrcRC, + X86VectorVTInfo DstVT, X86MemOperand x86memop, + PatFrag ld_frag, string asm> { + defm NAME : avx512_vcvtsi_round<opc, OpNode, SrcRC, DstVT, asm>, + avx512_vcvtsi<opc, OpNode, SrcRC, DstVT, x86memop, ld_frag, asm>, + VEX_LIG; +} + +let Predicates = [HasAVX512] in { +defm VCVTSI2SSZ : avx512_vcvtsi_common<0x2A, X86SintToFpRnd, GR32, + v4f32x_info, i32mem, loadi32, "cvtsi2ss{l}">, + XS, EVEX_CD8<32, CD8VT1>; +defm VCVTSI642SSZ: avx512_vcvtsi_common<0x2A, X86SintToFpRnd, GR64, + v4f32x_info, i64mem, loadi64, "cvtsi2ss{q}">, + XS, VEX_W, EVEX_CD8<64, CD8VT1>; +defm VCVTSI2SDZ : avx512_vcvtsi_common<0x2A, X86SintToFpRnd, GR32, + v2f64x_info, i32mem, loadi32, "cvtsi2sd{l}">, + XD, EVEX_CD8<32, CD8VT1>; +defm VCVTSI642SDZ: avx512_vcvtsi_common<0x2A, X86SintToFpRnd, GR64, + v2f64x_info, i64mem, loadi64, "cvtsi2sd{q}">, + XD, VEX_W, EVEX_CD8<64, CD8VT1>; + +def : Pat<(f32 (sint_to_fp (loadi32 addr:$src))), + (VCVTSI2SSZrm (f32 (IMPLICIT_DEF)), addr:$src)>; +def : Pat<(f32 (sint_to_fp (loadi64 addr:$src))), + (VCVTSI642SSZrm (f32 (IMPLICIT_DEF)), addr:$src)>; +def : Pat<(f64 (sint_to_fp (loadi32 addr:$src))), + (VCVTSI2SDZrm (f64 (IMPLICIT_DEF)), addr:$src)>; +def : Pat<(f64 (sint_to_fp (loadi64 addr:$src))), + (VCVTSI642SDZrm (f64 (IMPLICIT_DEF)), addr:$src)>; + +def : Pat<(f32 (sint_to_fp GR32:$src)), + (VCVTSI2SSZrr (f32 (IMPLICIT_DEF)), GR32:$src)>; +def : Pat<(f32 (sint_to_fp 
GR64:$src)), + (VCVTSI642SSZrr (f32 (IMPLICIT_DEF)), GR64:$src)>; +def : Pat<(f64 (sint_to_fp GR32:$src)), + (VCVTSI2SDZrr (f64 (IMPLICIT_DEF)), GR32:$src)>; +def : Pat<(f64 (sint_to_fp GR64:$src)), + (VCVTSI642SDZrr (f64 (IMPLICIT_DEF)), GR64:$src)>; + +defm VCVTUSI2SSZ : avx512_vcvtsi_common<0x7B, X86UintToFpRnd, GR32, + v4f32x_info, i32mem, loadi32, + "cvtusi2ss{l}">, XS, EVEX_CD8<32, CD8VT1>; +defm VCVTUSI642SSZ : avx512_vcvtsi_common<0x7B, X86UintToFpRnd, GR64, + v4f32x_info, i64mem, loadi64, "cvtusi2ss{q}">, + XS, VEX_W, EVEX_CD8<64, CD8VT1>; +defm VCVTUSI2SDZ : avx512_vcvtsi<0x7B, X86UintToFpRnd, GR32, v2f64x_info, + i32mem, loadi32, "cvtusi2sd{l}">, + XD, VEX_LIG, EVEX_CD8<32, CD8VT1>; +defm VCVTUSI642SDZ : avx512_vcvtsi_common<0x7B, X86UintToFpRnd, GR64, + v2f64x_info, i64mem, loadi64, "cvtusi2sd{q}">, + XD, VEX_W, EVEX_CD8<64, CD8VT1>; + +def : Pat<(f32 (uint_to_fp (loadi32 addr:$src))), + (VCVTUSI2SSZrm (f32 (IMPLICIT_DEF)), addr:$src)>; +def : Pat<(f32 (uint_to_fp (loadi64 addr:$src))), + (VCVTUSI642SSZrm (f32 (IMPLICIT_DEF)), addr:$src)>; +def : Pat<(f64 (uint_to_fp (loadi32 addr:$src))), + (VCVTUSI2SDZrm (f64 (IMPLICIT_DEF)), addr:$src)>; +def : Pat<(f64 (uint_to_fp (loadi64 addr:$src))), + (VCVTUSI642SDZrm (f64 (IMPLICIT_DEF)), addr:$src)>; + +def : Pat<(f32 (uint_to_fp GR32:$src)), + (VCVTUSI2SSZrr (f32 (IMPLICIT_DEF)), GR32:$src)>; +def : Pat<(f32 (uint_to_fp GR64:$src)), + (VCVTUSI642SSZrr (f32 (IMPLICIT_DEF)), GR64:$src)>; +def : Pat<(f64 (uint_to_fp GR32:$src)), + (VCVTUSI2SDZrr (f64 (IMPLICIT_DEF)), GR32:$src)>; +def : Pat<(f64 (uint_to_fp GR64:$src)), + (VCVTUSI642SDZrr (f64 (IMPLICIT_DEF)), GR64:$src)>; +} + +//===----------------------------------------------------------------------===// +// AVX-512 Scalar convert from float/double to integer +//===----------------------------------------------------------------------===// +multiclass avx512_cvt_s_int_round<bits<8> opc, RegisterClass SrcRC, + RegisterClass DstRC, Intrinsic Int, + Operand memop, ComplexPattern mem_cpat, string asm> { + let hasSideEffects = 0, Predicates = [HasAVX512] in { + def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), + !strconcat(asm,"\t{$src, $dst|$dst, $src}"), + [(set DstRC:$dst, (Int SrcRC:$src))]>, EVEX, VEX_LIG; + def rb : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src, AVX512RC:$rc), + !strconcat(asm,"\t{$rc, $src, $dst|$dst, $src, $rc}"), []>, + EVEX, VEX_LIG, EVEX_B, EVEX_RC; + let mayLoad = 1 in + def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins memop:$src), + !strconcat(asm,"\t{$src, $dst|$dst, $src}"), []>, EVEX, VEX_LIG; + } // hasSideEffects = 0, Predicates = [HasAVX512] +} + +// Convert float/double to signed/unsigned int 32/64 +defm VCVTSS2SIZ: avx512_cvt_s_int_round<0x2D, VR128X, GR32, int_x86_sse_cvtss2si, + ssmem, sse_load_f32, "cvtss2si">, + XS, EVEX_CD8<32, CD8VT1>; +defm VCVTSS2SI64Z: avx512_cvt_s_int_round<0x2D, VR128X, GR64, + int_x86_sse_cvtss2si64, + ssmem, sse_load_f32, "cvtss2si">, + XS, VEX_W, EVEX_CD8<32, CD8VT1>; +defm VCVTSS2USIZ: avx512_cvt_s_int_round<0x79, VR128X, GR32, + int_x86_avx512_cvtss2usi, + ssmem, sse_load_f32, "cvtss2usi">, + XS, EVEX_CD8<32, CD8VT1>; +defm VCVTSS2USI64Z: avx512_cvt_s_int_round<0x79, VR128X, GR64, + int_x86_avx512_cvtss2usi64, ssmem, + sse_load_f32, "cvtss2usi">, XS, VEX_W, + EVEX_CD8<32, CD8VT1>; +defm VCVTSD2SIZ: avx512_cvt_s_int_round<0x2D, VR128X, GR32, int_x86_sse2_cvtsd2si, + sdmem, sse_load_f64, "cvtsd2si">, + XD, EVEX_CD8<64, CD8VT1>; +defm VCVTSD2SI64Z: avx512_cvt_s_int_round<0x2D, VR128X, GR64, + 
int_x86_sse2_cvtsd2si64, + sdmem, sse_load_f64, "cvtsd2si">, + XD, VEX_W, EVEX_CD8<64, CD8VT1>; +defm VCVTSD2USIZ: avx512_cvt_s_int_round<0x79, VR128X, GR32, + int_x86_avx512_cvtsd2usi, + sdmem, sse_load_f64, "cvtsd2usi">, + XD, EVEX_CD8<64, CD8VT1>; +defm VCVTSD2USI64Z: avx512_cvt_s_int_round<0x79, VR128X, GR64, + int_x86_avx512_cvtsd2usi64, sdmem, + sse_load_f64, "cvtsd2usi">, XD, VEX_W, + EVEX_CD8<64, CD8VT1>; + +let isCodeGenOnly = 1 , Predicates = [HasAVX512] in { + defm Int_VCVTSI2SSZ : sse12_cvt_sint_3addr<0x2A, GR32, VR128X, + int_x86_sse_cvtsi2ss, i32mem, loadi32, "cvtsi2ss{l}", + SSE_CVT_Scalar, 0>, XS, EVEX_4V; + defm Int_VCVTSI2SS64Z : sse12_cvt_sint_3addr<0x2A, GR64, VR128X, + int_x86_sse_cvtsi642ss, i64mem, loadi64, "cvtsi2ss{q}", + SSE_CVT_Scalar, 0>, XS, EVEX_4V, VEX_W; + defm Int_VCVTSI2SDZ : sse12_cvt_sint_3addr<0x2A, GR32, VR128X, + int_x86_sse2_cvtsi2sd, i32mem, loadi32, "cvtsi2sd{l}", + SSE_CVT_Scalar, 0>, XD, EVEX_4V; + defm Int_VCVTSI2SD64Z : sse12_cvt_sint_3addr<0x2A, GR64, VR128X, + int_x86_sse2_cvtsi642sd, i64mem, loadi64, "cvtsi2sd{q}", + SSE_CVT_Scalar, 0>, XD, EVEX_4V, VEX_W; + + defm Int_VCVTUSI2SDZ : sse12_cvt_sint_3addr<0x2A, GR32, VR128X, + int_x86_avx512_cvtusi2sd, i32mem, loadi32, "cvtusi2sd{l}", + SSE_CVT_Scalar, 0>, XD, EVEX_4V; +} // isCodeGenOnly = 1, Predicates = [HasAVX512] + +// Convert float/double to signed/unsigned int 32/64 with truncation +multiclass avx512_cvt_s_all<bits<8> opc, string asm, X86VectorVTInfo _SrcRC, + X86VectorVTInfo _DstRC, SDNode OpNode, + SDNode OpNodeRnd>{ +let Predicates = [HasAVX512] in { + def rr : SI<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.FRC:$src), + !strconcat(asm,"\t{$src, $dst|$dst, $src}"), + [(set _DstRC.RC:$dst, (OpNode _SrcRC.FRC:$src))]>, EVEX; + def rb : SI<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.FRC:$src), + !strconcat(asm,"\t{{sae}, $src, $dst|$dst, $src, {sae}}"), + []>, EVEX, EVEX_B; + def rm : SI<opc, MRMSrcMem, (outs _DstRC.RC:$dst), (ins _SrcRC.MemOp:$src), + !strconcat(asm,"\t{$src, $dst|$dst, $src}"), + [(set _DstRC.RC:$dst, (OpNode (_SrcRC.ScalarLdFrag addr:$src)))]>, + EVEX; + + let isCodeGenOnly = 1,hasSideEffects = 0 in { + def rr_Int : SI<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.RC:$src), + !strconcat(asm,"\t{$src, $dst|$dst, $src}"), + [(set _DstRC.RC:$dst, (OpNodeRnd _SrcRC.RC:$src, + (i32 FROUND_CURRENT)))]>, EVEX, VEX_LIG; + def rb_Int : SI<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.RC:$src), + !strconcat(asm,"\t{{sae}, $src, $dst|$dst, $src, {sae}}"), + [(set _DstRC.RC:$dst, (OpNodeRnd _SrcRC.RC:$src, + (i32 FROUND_NO_EXC)))]>, + EVEX,VEX_LIG , EVEX_B; + let mayLoad = 1 in + def rm_Int : SI<opc, MRMSrcMem, (outs _DstRC.RC:$dst), + (ins _SrcRC.MemOp:$src), + !strconcat(asm,"\t{$src, $dst|$dst, $src}"), + []>, EVEX, VEX_LIG; + + } // isCodeGenOnly = 1, hasSideEffects = 0 +} //HasAVX512 +} + + +defm VCVTTSS2SIZ: avx512_cvt_s_all<0x2C, "cvttss2si", f32x_info, i32x_info, + fp_to_sint,X86cvttss2IntRnd>, + XS, EVEX_CD8<32, CD8VT1>; +defm VCVTTSS2SI64Z: avx512_cvt_s_all<0x2C, "cvttss2si", f32x_info, i64x_info, + fp_to_sint,X86cvttss2IntRnd>, + VEX_W, XS, EVEX_CD8<32, CD8VT1>; +defm VCVTTSD2SIZ: avx512_cvt_s_all<0x2C, "cvttsd2si", f64x_info, i32x_info, + fp_to_sint,X86cvttsd2IntRnd>, + XD, EVEX_CD8<64, CD8VT1>; +defm VCVTTSD2SI64Z: avx512_cvt_s_all<0x2C, "cvttsd2si", f64x_info, i64x_info, + fp_to_sint,X86cvttsd2IntRnd>, + VEX_W, XD, EVEX_CD8<64, CD8VT1>; + +defm VCVTTSS2USIZ: avx512_cvt_s_all<0x78, "cvttss2usi", f32x_info, i32x_info, + 
fp_to_uint,X86cvttss2UIntRnd>, + XS, EVEX_CD8<32, CD8VT1>; +defm VCVTTSS2USI64Z: avx512_cvt_s_all<0x78, "cvttss2usi", f32x_info, i64x_info, + fp_to_uint,X86cvttss2UIntRnd>, + XS,VEX_W, EVEX_CD8<32, CD8VT1>; +defm VCVTTSD2USIZ: avx512_cvt_s_all<0x78, "cvttsd2usi", f64x_info, i32x_info, + fp_to_uint,X86cvttsd2UIntRnd>, + XD, EVEX_CD8<64, CD8VT1>; +defm VCVTTSD2USI64Z: avx512_cvt_s_all<0x78, "cvttsd2usi", f64x_info, i64x_info, + fp_to_uint,X86cvttsd2UIntRnd>, + XD, VEX_W, EVEX_CD8<64, CD8VT1>; +let Predicates = [HasAVX512] in { + def : Pat<(i32 (int_x86_sse_cvttss2si (v4f32 VR128X:$src))), + (VCVTTSS2SIZrr_Int (COPY_TO_REGCLASS VR128X:$src, FR32X))>; + def : Pat<(i64 (int_x86_sse_cvttss2si64 (v4f32 VR128X:$src))), + (VCVTTSS2SI64Zrr_Int (COPY_TO_REGCLASS VR128X:$src, FR32X))>; + def : Pat<(i32 (int_x86_sse2_cvttsd2si (v2f64 VR128X:$src))), + (VCVTTSD2SIZrr_Int (COPY_TO_REGCLASS VR128X:$src, FR64X))>; + def : Pat<(i64 (int_x86_sse2_cvttsd2si64 (v2f64 VR128X:$src))), + (VCVTTSD2SI64Zrr_Int (COPY_TO_REGCLASS VR128X:$src, FR64X))>; + +} // HasAVX512 +//===----------------------------------------------------------------------===// +// AVX-512 Convert from float to double and back +//===----------------------------------------------------------------------===// +multiclass avx512_cvt_fp_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, + X86VectorVTInfo _Src, SDNode OpNode> { + defm rr : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _Src.RC:$src1, _Src.RC:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (_.VT (OpNode (_Src.VT _Src.RC:$src1), + (_Src.VT _Src.RC:$src2)))>, + EVEX_4V, VEX_LIG, Sched<[WriteCvtF2F]>; + defm rm : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _Src.RC:$src1, _Src.MemOp:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (_.VT (OpNode (_Src.VT _Src.RC:$src1), + (_Src.VT (scalar_to_vector + (_Src.ScalarLdFrag addr:$src2)))))>, + EVEX_4V, VEX_LIG, Sched<[WriteCvtF2FLd, ReadAfterLd]>; +} + +// Scalar Conversion with SAE - suppress all exceptions +multiclass avx512_cvt_fp_sae_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, + X86VectorVTInfo _Src, SDNode OpNodeRnd> { + defm rrb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _Src.RC:$src1, _Src.RC:$src2), OpcodeStr, + "{sae}, $src2, $src1", "$src1, $src2, {sae}", + (_.VT (OpNodeRnd (_Src.VT _Src.RC:$src1), + (_Src.VT _Src.RC:$src2), + (i32 FROUND_NO_EXC)))>, + EVEX_4V, VEX_LIG, EVEX_B; +} + +// Scalar Conversion with rounding control (RC) +multiclass avx512_cvt_fp_rc_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, + X86VectorVTInfo _Src, SDNode OpNodeRnd> { + defm rrb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _Src.RC:$src1, _Src.RC:$src2, AVX512RC:$rc), OpcodeStr, + "$rc, $src2, $src1", "$src1, $src2, $rc", + (_.VT (OpNodeRnd (_Src.VT _Src.RC:$src1), + (_Src.VT _Src.RC:$src2), (i32 imm:$rc)))>, + EVEX_4V, VEX_LIG, Sched<[WriteCvtF2FLd, ReadAfterLd]>, + EVEX_B, EVEX_RC; +} +multiclass avx512_cvt_fp_scalar_sd2ss<bits<8> opc, string OpcodeStr, SDNode OpNode, + SDNode OpNodeRnd, X86VectorVTInfo _src, + X86VectorVTInfo _dst> { + let Predicates = [HasAVX512] in { + defm Z : avx512_cvt_fp_scalar<opc, OpcodeStr, _dst, _src, OpNode>, + avx512_cvt_fp_rc_scalar<opc, OpcodeStr, _dst, _src, + OpNodeRnd>, VEX_W, EVEX_CD8<64, CD8VT1>, + EVEX_V512, XD; + } +} + +multiclass avx512_cvt_fp_scalar_ss2sd<bits<8> opc, string OpcodeStr, SDNode OpNode, + SDNode OpNodeRnd, X86VectorVTInfo _src, + X86VectorVTInfo _dst> { + let
Predicates = [HasAVX512] in { + defm Z : avx512_cvt_fp_scalar<opc, OpcodeStr, _dst, _src, OpNode>, + avx512_cvt_fp_sae_scalar<opc, OpcodeStr, _dst, _src, OpNodeRnd>, + EVEX_CD8<32, CD8VT1>, XS, EVEX_V512; + } +} +defm VCVTSD2SS : avx512_cvt_fp_scalar_sd2ss<0x5A, "vcvtsd2ss", X86fround, + X86froundRnd, f64x_info, f32x_info>; +defm VCVTSS2SD : avx512_cvt_fp_scalar_ss2sd<0x5A, "vcvtss2sd", X86fpext, + X86fpextRnd,f32x_info, f64x_info >; + +def : Pat<(f64 (fextend FR32X:$src)), + (COPY_TO_REGCLASS (VCVTSS2SDZrr (COPY_TO_REGCLASS FR32X:$src, VR128X), + (COPY_TO_REGCLASS FR32X:$src, VR128X)), VR128X)>, + Requires<[HasAVX512]>; +def : Pat<(f64 (fextend (loadf32 addr:$src))), + (COPY_TO_REGCLASS (VCVTSS2SDZrm (v4f32 (IMPLICIT_DEF)), addr:$src), VR128X)>, + Requires<[HasAVX512]>; + +def : Pat<(f64 (extloadf32 addr:$src)), + (COPY_TO_REGCLASS (VCVTSS2SDZrm (v4f32 (IMPLICIT_DEF)), addr:$src), VR128X)>, + Requires<[HasAVX512, OptForSize]>; + +def : Pat<(f64 (extloadf32 addr:$src)), + (COPY_TO_REGCLASS (VCVTSS2SDZrr (v4f32 (IMPLICIT_DEF)), + (COPY_TO_REGCLASS (VMOVSSZrm addr:$src), VR128X)), VR128X)>, + Requires<[HasAVX512, OptForSpeed]>; + +def : Pat<(f32 (fround FR64X:$src)), + (COPY_TO_REGCLASS (VCVTSD2SSZrr (COPY_TO_REGCLASS FR64X:$src, VR128X), + (COPY_TO_REGCLASS FR64X:$src, VR128X)), VR128X)>, + Requires<[HasAVX512]>; +//===----------------------------------------------------------------------===// +// AVX-512 Vector convert from signed/unsigned integer to float/double +// and from float/double to signed/unsigned integer +//===----------------------------------------------------------------------===// + +multiclass avx512_vcvt_fp<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, + X86VectorVTInfo _Src, SDNode OpNode, + string Broadcast = _.BroadcastStr, + string Alias = ""> { + + defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _Src.RC:$src), OpcodeStr, "$src", "$src", + (_.VT (OpNode (_Src.VT _Src.RC:$src)))>, EVEX; + + defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _Src.MemOp:$src), OpcodeStr#Alias, "$src", "$src", + (_.VT (OpNode (_Src.VT + (bitconvert (_Src.LdFrag addr:$src)))))>, EVEX; + + defm rmb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _Src.MemOp:$src), OpcodeStr, + "${src}"##Broadcast, "${src}"##Broadcast, + (_.VT (OpNode (_Src.VT + (X86VBroadcast (_Src.ScalarLdFrag addr:$src))) + ))>, EVEX, EVEX_B; +} +// Conversion with SAE - suppress all exceptions +multiclass avx512_vcvt_fp_sae<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, + X86VectorVTInfo _Src, SDNode OpNodeRnd> { + defm rrb : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _Src.RC:$src), OpcodeStr, + "{sae}, $src", "$src, {sae}", + (_.VT (OpNodeRnd (_Src.VT _Src.RC:$src), + (i32 FROUND_NO_EXC)))>, + EVEX, EVEX_B; +} + +// Conversion with rounding control (RC) +multiclass avx512_vcvt_fp_rc<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, + X86VectorVTInfo _Src, SDNode OpNodeRnd> { + defm rrb : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _Src.RC:$src, AVX512RC:$rc), OpcodeStr, + "$rc, $src", "$src, $rc", + (_.VT (OpNodeRnd (_Src.VT _Src.RC:$src), (i32 imm:$rc)))>, + EVEX, EVEX_B, EVEX_RC; +} + +// Extend Float to Double +multiclass avx512_cvtps2pd<bits<8> opc, string OpcodeStr> { + let Predicates = [HasAVX512] in { + defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f64_info, v8f32x_info, fextend>, + avx512_vcvt_fp_sae<opc, OpcodeStr, v8f64_info, v8f32x_info, + X86vfpextRnd>, EVEX_V512; + } + let Predicates = [HasVLX] in { + defm Z128 :
avx512_vcvt_fp<opc, OpcodeStr, v2f64x_info, v4f32x_info, + X86vfpext, "{1to2}">, EVEX_V128; + defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f64x_info, v4f32x_info, fextend>, + EVEX_V256; + } +} + +// Truncate Double to Float +multiclass avx512_cvtpd2ps<bits<8> opc, string OpcodeStr> { + let Predicates = [HasAVX512] in { + defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f32x_info, v8f64_info, fround>, + avx512_vcvt_fp_rc<opc, OpcodeStr, v8f32x_info, v8f64_info, + X86vfproundRnd>, EVEX_V512; + } + let Predicates = [HasVLX] in { + defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v2f64x_info, + X86vfpround, "{1to2}", "{x}">, EVEX_V128; + defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v4f64x_info, fround, + "{1to4}", "{y}">, EVEX_V256; + } +} + +defm VCVTPD2PS : avx512_cvtpd2ps<0x5A, "vcvtpd2ps">, + VEX_W, PD, EVEX_CD8<64, CD8VF>; +defm VCVTPS2PD : avx512_cvtps2pd<0x5A, "vcvtps2pd">, + PS, EVEX_CD8<32, CD8VH>; + +def : Pat<(v8f64 (extloadv8f32 addr:$src)), + (VCVTPS2PDZrm addr:$src)>; + +let Predicates = [HasVLX] in { + def : Pat<(v4f64 (extloadv4f32 addr:$src)), + (VCVTPS2PDZ256rm addr:$src)>; +} + +// Convert Signed/Unsigned Doubleword to Double +multiclass avx512_cvtdq2pd<bits<8> opc, string OpcodeStr, SDNode OpNode, + SDNode OpNode128> { + // No rounding in this op + let Predicates = [HasAVX512] in + defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f64_info, v8i32x_info, OpNode>, + EVEX_V512; + + let Predicates = [HasVLX] in { + defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2f64x_info, v4i32x_info, + OpNode128, "{1to2}">, EVEX_V128; + defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f64x_info, v4i32x_info, OpNode>, + EVEX_V256; + } +} + +// Convert Signed/Unsigned Doubleword to Float +multiclass avx512_cvtdq2ps<bits<8> opc, string OpcodeStr, SDNode OpNode, + SDNode OpNodeRnd> { + let Predicates = [HasAVX512] in + defm Z : avx512_vcvt_fp<opc, OpcodeStr, v16f32_info, v16i32_info, OpNode>, + avx512_vcvt_fp_rc<opc, OpcodeStr, v16f32_info, v16i32_info, + OpNodeRnd>, EVEX_V512; + + let Predicates = [HasVLX] in { + defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v4i32x_info, OpNode>, + EVEX_V128; + defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v8f32x_info, v8i32x_info, OpNode>, + EVEX_V256; + } +} + +// Convert Float to Signed/Unsigned Doubleword with truncation +multiclass avx512_cvttps2dq<bits<8> opc, string OpcodeStr, + SDNode OpNode, SDNode OpNodeRnd> { + let Predicates = [HasAVX512] in { + defm Z : avx512_vcvt_fp<opc, OpcodeStr, v16i32_info, v16f32_info, OpNode>, + avx512_vcvt_fp_sae<opc, OpcodeStr, v16i32_info, v16f32_info, + OpNodeRnd>, EVEX_V512; + } + let Predicates = [HasVLX] in { + defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v4f32x_info, OpNode>, + EVEX_V128; + defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v8i32x_info, v8f32x_info, OpNode>, + EVEX_V256; + } +} + +// Convert Float to Signed/Unsigned Doubleword +multiclass avx512_cvtps2dq<bits<8> opc, string OpcodeStr, + SDNode OpNode, SDNode OpNodeRnd> { + let Predicates = [HasAVX512] in { + defm Z : avx512_vcvt_fp<opc, OpcodeStr, v16i32_info, v16f32_info, OpNode>, + avx512_vcvt_fp_rc<opc, OpcodeStr, v16i32_info, v16f32_info, + OpNodeRnd>, EVEX_V512; + } + let Predicates = [HasVLX] in { + defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v4f32x_info, OpNode>, + EVEX_V128; + defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v8i32x_info, v8f32x_info, OpNode>, + EVEX_V256; + } +} + +// Convert Double to Signed/Unsigned Doubleword with truncation +multiclass avx512_cvttpd2dq<bits<8> opc, string OpcodeStr, + SDNode OpNode, SDNode 
OpNodeRnd> { + let Predicates = [HasAVX512] in { + defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i32x_info, v8f64_info, OpNode>, + avx512_vcvt_fp_sae<opc, OpcodeStr, v8i32x_info, v8f64_info, + OpNodeRnd>, EVEX_V512; + } + let Predicates = [HasVLX] in { + // We need "x"/"y" suffixes in order to distinguish between 128 and 256 + // memory forms of these instructions in the asm parser. They have the same + // dest type - 'v4i32x_info'. We also specify the broadcast string explicitly + // for the same reason. + defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v2f64x_info, OpNode, + "{1to2}", "{x}">, EVEX_V128; + defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v4f64x_info, OpNode, + "{1to4}", "{y}">, EVEX_V256; + } +} + +// Convert Double to Signed/Unsigned Doubleword +multiclass avx512_cvtpd2dq<bits<8> opc, string OpcodeStr, + SDNode OpNode, SDNode OpNodeRnd> { + let Predicates = [HasAVX512] in { + defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i32x_info, v8f64_info, OpNode>, + avx512_vcvt_fp_rc<opc, OpcodeStr, v8i32x_info, v8f64_info, + OpNodeRnd>, EVEX_V512; + } + let Predicates = [HasVLX] in { + // We need "x"/"y" suffixes in order to distinguish between 128 and 256 + // memory forms of these instructions in the asm parser. They have the same + // dest type - 'v4i32x_info'. We also specify the broadcast string explicitly + // for the same reason. + defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v2f64x_info, OpNode, + "{1to2}", "{x}">, EVEX_V128; + defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v4f64x_info, OpNode, + "{1to4}", "{y}">, EVEX_V256; + } +} + +// Convert Double to Signed/Unsigned Quadword +multiclass avx512_cvtpd2qq<bits<8> opc, string OpcodeStr, + SDNode OpNode, SDNode OpNodeRnd> { + let Predicates = [HasDQI] in { + defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i64_info, v8f64_info, OpNode>, + avx512_vcvt_fp_rc<opc, OpcodeStr, v8i64_info, v8f64_info, + OpNodeRnd>, EVEX_V512; + } + let Predicates = [HasDQI, HasVLX] in { + defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v2f64x_info, OpNode>, + EVEX_V128; + defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i64x_info, v4f64x_info, OpNode>, + EVEX_V256; + } +} + +// Convert Double to Signed/Unsigned Quadword with truncation +multiclass avx512_cvttpd2qq<bits<8> opc, string OpcodeStr, + SDNode OpNode, SDNode OpNodeRnd> { + let Predicates = [HasDQI] in { + defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i64_info, v8f64_info, OpNode>, + avx512_vcvt_fp_sae<opc, OpcodeStr, v8i64_info, v8f64_info, + OpNodeRnd>, EVEX_V512; + } + let Predicates = [HasDQI, HasVLX] in { + defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v2f64x_info, OpNode>, + EVEX_V128; + defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i64x_info, v4f64x_info, OpNode>, + EVEX_V256; + } +} + +// Convert Signed/Unsigned Quadword to Double +multiclass avx512_cvtqq2pd<bits<8> opc, string OpcodeStr, + SDNode OpNode, SDNode OpNodeRnd> { + let Predicates = [HasDQI] in { + defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f64_info, v8i64_info, OpNode>, + avx512_vcvt_fp_rc<opc, OpcodeStr, v8f64_info, v8i64_info, + OpNodeRnd>, EVEX_V512; + } + let Predicates = [HasDQI, HasVLX] in { + defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2f64x_info, v2i64x_info, OpNode>, + EVEX_V128; + defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f64x_info, v4i64x_info, OpNode>, + EVEX_V256; + } +} + +// Convert Float to Signed/Unsigned Quadword +multiclass avx512_cvtps2qq<bits<8> opc, string OpcodeStr, + SDNode OpNode, SDNode OpNodeRnd> { + let Predicates = [HasDQI] in { + defm Z :
avx512_vcvt_fp<opc, OpcodeStr, v8i64_info, v8f32x_info, OpNode>, + avx512_vcvt_fp_rc<opc, OpcodeStr, v8i64_info, v8f32x_info, + OpNodeRnd>, EVEX_V512; + } + let Predicates = [HasDQI, HasVLX] in { + // Explicitly specified broadcast string, since we take only 2 elements + // from v4f32x_info source + defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v4f32x_info, OpNode, + "{1to2}">, EVEX_V128; + defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i64x_info, v4f32x_info, OpNode>, + EVEX_V256; + } +} + +// Convert Float to Signed/Unsigned Quardword with truncation +multiclass avx512_cvttps2qq<bits<8> opc, string OpcodeStr, + SDNode OpNode, SDNode OpNodeRnd> { + let Predicates = [HasDQI] in { + defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i64_info, v8f32x_info, OpNode>, + avx512_vcvt_fp_sae<opc, OpcodeStr, v8i64_info, v8f32x_info, + OpNodeRnd>, EVEX_V512; + } + let Predicates = [HasDQI, HasVLX] in { + // Explicitly specified broadcast string, since we take only 2 elements + // from v4f32x_info source + defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v4f32x_info, OpNode, + "{1to2}">, EVEX_V128; + defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i64x_info, v4f32x_info, OpNode>, + EVEX_V256; + } +} + +// Convert Signed/Unsigned Quardword to Float +multiclass avx512_cvtqq2ps<bits<8> opc, string OpcodeStr, + SDNode OpNode, SDNode OpNodeRnd> { + let Predicates = [HasDQI] in { + defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f32x_info, v8i64_info, OpNode>, + avx512_vcvt_fp_rc<opc, OpcodeStr, v8f32x_info, v8i64_info, + OpNodeRnd>, EVEX_V512; + } + let Predicates = [HasDQI, HasVLX] in { + // we need "x"/"y" suffixes in order to distinguish between 128 and 256 + // memory forms of these instructions in Asm Parcer. They have the same + // dest type - 'v4i32x_info'. We also specify the broadcast string explicitly + // due to the same reason. 
+ defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v2i64x_info, OpNode, + "{1to2}", "{x}">, EVEX_V128; + defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v4i64x_info, OpNode, + "{1to4}", "{y}">, EVEX_V256; + } +} + +defm VCVTDQ2PD : avx512_cvtdq2pd<0xE6, "vcvtdq2pd", sint_to_fp, X86cvtdq2pd>, XS, + EVEX_CD8<32, CD8VH>; + +defm VCVTDQ2PS : avx512_cvtdq2ps<0x5B, "vcvtdq2ps", sint_to_fp, + X86VSintToFpRnd>, + PS, EVEX_CD8<32, CD8VF>; + +defm VCVTTPS2DQ : avx512_cvttps2dq<0x5B, "vcvttps2dq", fp_to_sint, + X86VFpToSintRnd>, + XS, EVEX_CD8<32, CD8VF>; + +defm VCVTTPD2DQ : avx512_cvttpd2dq<0xE6, "vcvttpd2dq", fp_to_sint, + X86VFpToSintRnd>, + PD, VEX_W, EVEX_CD8<64, CD8VF>; + +defm VCVTTPS2UDQ : avx512_cvttps2dq<0x78, "vcvttps2udq", fp_to_uint, + X86VFpToUintRnd>, PS, + EVEX_CD8<32, CD8VF>; + +defm VCVTTPD2UDQ : avx512_cvttpd2dq<0x78, "vcvttpd2udq", fp_to_uint, + X86VFpToUintRnd>, PS, VEX_W, + EVEX_CD8<64, CD8VF>; + +defm VCVTUDQ2PD : avx512_cvtdq2pd<0x7A, "vcvtudq2pd", uint_to_fp, X86cvtudq2pd>, + XS, EVEX_CD8<32, CD8VH>; + +defm VCVTUDQ2PS : avx512_cvtdq2ps<0x7A, "vcvtudq2ps", uint_to_fp, + X86VUintToFpRnd>, XD, + EVEX_CD8<32, CD8VF>; + +defm VCVTPS2DQ : avx512_cvtps2dq<0x5B, "vcvtps2dq", X86cvtps2Int, + X86cvtps2IntRnd>, PD, EVEX_CD8<32, CD8VF>; + +defm VCVTPD2DQ : avx512_cvtpd2dq<0xE6, "vcvtpd2dq", X86cvtpd2Int, + X86cvtpd2IntRnd>, XD, VEX_W, + EVEX_CD8<64, CD8VF>; + +defm VCVTPS2UDQ : avx512_cvtps2dq<0x79, "vcvtps2udq", X86cvtps2UInt, + X86cvtps2UIntRnd>, + PS, EVEX_CD8<32, CD8VF>; +defm VCVTPD2UDQ : avx512_cvtpd2dq<0x79, "vcvtpd2udq", X86cvtpd2UInt, + X86cvtpd2UIntRnd>, VEX_W, + PS, EVEX_CD8<64, CD8VF>; + +defm VCVTPD2QQ : avx512_cvtpd2qq<0x7B, "vcvtpd2qq", X86cvtpd2Int, + X86cvtpd2IntRnd>, VEX_W, + PD, EVEX_CD8<64, CD8VF>; + +defm VCVTPS2QQ : avx512_cvtps2qq<0x7B, "vcvtps2qq", X86cvtps2Int, + X86cvtps2IntRnd>, PD, EVEX_CD8<32, CD8VH>; + +defm VCVTPD2UQQ : avx512_cvtpd2qq<0x79, "vcvtpd2uqq", X86cvtpd2UInt, + X86cvtpd2UIntRnd>, VEX_W, + PD, EVEX_CD8<64, CD8VF>; + +defm VCVTPS2UQQ : avx512_cvtps2qq<0x79, "vcvtps2uqq", X86cvtps2UInt, + X86cvtps2UIntRnd>, PD, EVEX_CD8<32, CD8VH>; + +defm VCVTTPD2QQ : avx512_cvttpd2qq<0x7A, "vcvttpd2qq", fp_to_sint, + X86VFpToSlongRnd>, VEX_W, + PD, EVEX_CD8<64, CD8VF>; + +defm VCVTTPS2QQ : avx512_cvttps2qq<0x7A, "vcvttps2qq", fp_to_sint, + X86VFpToSlongRnd>, PD, EVEX_CD8<32, CD8VH>; + +defm VCVTTPD2UQQ : avx512_cvttpd2qq<0x78, "vcvttpd2uqq", fp_to_uint, + X86VFpToUlongRnd>, VEX_W, + PD, EVEX_CD8<64, CD8VF>; + +defm VCVTTPS2UQQ : avx512_cvttps2qq<0x78, "vcvttps2uqq", fp_to_uint, + X86VFpToUlongRnd>, PD, EVEX_CD8<32, CD8VH>; + +defm VCVTQQ2PD : avx512_cvtqq2pd<0xE6, "vcvtqq2pd", sint_to_fp, + X86VSlongToFpRnd>, VEX_W, XS, EVEX_CD8<64, CD8VF>; + +defm VCVTUQQ2PD : avx512_cvtqq2pd<0x7A, "vcvtuqq2pd", uint_to_fp, + X86VUlongToFpRnd>, VEX_W, XS, EVEX_CD8<64, CD8VF>; + +defm VCVTQQ2PS : avx512_cvtqq2ps<0x5B, "vcvtqq2ps", sint_to_fp, + X86VSlongToFpRnd>, VEX_W, PS, EVEX_CD8<64, CD8VF>; + +defm VCVTUQQ2PS : avx512_cvtqq2ps<0x7A, "vcvtuqq2ps", uint_to_fp, + X86VUlongToFpRnd>, VEX_W, XD, EVEX_CD8<64, CD8VF>; + +let Predicates = [HasAVX512, NoVLX] in { +def : Pat<(v8i32 (fp_to_uint (v8f32 VR256X:$src1))), + (EXTRACT_SUBREG (v16i32 (VCVTTPS2UDQZrr + (v16f32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)))), sub_ymm)>; + +def : Pat<(v4i32 (fp_to_uint (v4f32 VR128X:$src1))), + (EXTRACT_SUBREG (v16i32 (VCVTTPS2UDQZrr + (v16f32 (SUBREG_TO_REG (i32 0), VR128X:$src1, sub_xmm)))), sub_xmm)>; + +def : Pat<(v8f32 (uint_to_fp (v8i32 VR256X:$src1))), + (EXTRACT_SUBREG 
(v16f32 (VCVTUDQ2PSZrr + (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)))), sub_ymm)>; + +def : Pat<(v4f32 (uint_to_fp (v4i32 VR128X:$src1))), + (EXTRACT_SUBREG (v16f32 (VCVTUDQ2PSZrr + (v16i32 (SUBREG_TO_REG (i32 0), VR128X:$src1, sub_xmm)))), sub_xmm)>; + +def : Pat<(v4f64 (uint_to_fp (v4i32 VR128X:$src1))), + (EXTRACT_SUBREG (v8f64 (VCVTUDQ2PDZrr + (v8i32 (SUBREG_TO_REG (i32 0), VR128X:$src1, sub_xmm)))), sub_ymm)>; +} + +let Predicates = [HasAVX512] in { + def : Pat<(v8f32 (fround (loadv8f64 addr:$src))), + (VCVTPD2PSZrm addr:$src)>; + def : Pat<(v8f64 (extloadv8f32 addr:$src)), + (VCVTPS2PDZrm addr:$src)>; +} + +//===----------------------------------------------------------------------===// +// Half precision conversion instructions +//===----------------------------------------------------------------------===// +multiclass avx512_cvtph2ps<X86VectorVTInfo _dest, X86VectorVTInfo _src, + X86MemOperand x86memop, PatFrag ld_frag> { + defm rr : AVX512_maskable<0x13, MRMSrcReg, _dest ,(outs _dest.RC:$dst), (ins _src.RC:$src), + "vcvtph2ps", "$src", "$src", + (X86cvtph2ps (_src.VT _src.RC:$src), + (i32 FROUND_CURRENT))>, T8PD; + let hasSideEffects = 0, mayLoad = 1 in { + defm rm : AVX512_maskable<0x13, MRMSrcMem, _dest, (outs _dest.RC:$dst), (ins x86memop:$src), + "vcvtph2ps", "$src", "$src", + (X86cvtph2ps (_src.VT (bitconvert (ld_frag addr:$src))), + (i32 FROUND_CURRENT))>, T8PD; + } +} + +multiclass avx512_cvtph2ps_sae<X86VectorVTInfo _dest, X86VectorVTInfo _src> { + defm rb : AVX512_maskable<0x13, MRMSrcReg, _dest ,(outs _dest.RC:$dst), (ins _src.RC:$src), + "vcvtph2ps", "{sae}, $src", "$src, {sae}", + (X86cvtph2ps (_src.VT _src.RC:$src), + (i32 FROUND_NO_EXC))>, T8PD, EVEX_B; + +} + +let Predicates = [HasAVX512] in { + defm VCVTPH2PSZ : avx512_cvtph2ps<v16f32_info, v16i16x_info, f256mem, loadv4i64>, + avx512_cvtph2ps_sae<v16f32_info, v16i16x_info>, + EVEX, EVEX_V512, EVEX_CD8<32, CD8VH>; + let Predicates = [HasVLX] in { + defm VCVTPH2PSZ256 : avx512_cvtph2ps<v8f32x_info, v8i16x_info, f128mem, + loadv2i64>,EVEX, EVEX_V256, EVEX_CD8<32, CD8VH>; + defm VCVTPH2PSZ128 : avx512_cvtph2ps<v4f32x_info, v8i16x_info, f64mem, + loadv2i64>, EVEX, EVEX_V128, EVEX_CD8<32, CD8VH>; + } +} + +multiclass avx512_cvtps2ph<X86VectorVTInfo _dest, X86VectorVTInfo _src, + X86MemOperand x86memop> { + defm rr : AVX512_maskable<0x1D, MRMDestReg, _dest ,(outs _dest.RC:$dst), + (ins _src.RC:$src1, i32u8imm:$src2), + "vcvtps2ph", "$src2, $src1", "$src1, $src2", + (X86cvtps2ph (_src.VT _src.RC:$src1), + (i32 imm:$src2), + (i32 FROUND_CURRENT))>, AVX512AIi8Base; + let hasSideEffects = 0, mayStore = 1 in { + def mr : AVX512AIi8<0x1D, MRMDestMem, (outs), + (ins x86memop:$dst, _src.RC:$src1, i32u8imm:$src2), + "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(store (_dest.VT (X86cvtps2ph (_src.VT _src.RC:$src1), + (i32 imm:$src2), (i32 FROUND_CURRENT) )), + addr:$dst)]>; + def mrk : AVX512AIi8<0x1D, MRMDestMem, (outs), + (ins x86memop:$dst, _dest.KRCWM:$mask, _src.RC:$src1, i32u8imm:$src2), + "vcvtps2ph\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}", + []>, EVEX_K; + } +} +multiclass avx512_cvtps2ph_sae<X86VectorVTInfo _dest, X86VectorVTInfo _src> { + defm rb : AVX512_maskable<0x1D, MRMDestReg, _dest ,(outs _dest.RC:$dst), + (ins _src.RC:$src1, i32u8imm:$src2), + "vcvtps2ph", "$src2, {sae}, $src1", "$src1, $src2, {sae}", + (X86cvtps2ph (_src.VT _src.RC:$src1), + (i32 imm:$src2), + (i32 FROUND_NO_EXC))>, EVEX_B, AVX512AIi8Base; +} +let Predicates = [HasAVX512] in { + defm VCVTPS2PHZ : 
avx512_cvtps2ph<v16i16x_info, v16f32_info, f256mem>, + avx512_cvtps2ph_sae<v16i16x_info, v16f32_info>, + EVEX, EVEX_V512, EVEX_CD8<32, CD8VH>; + let Predicates = [HasVLX] in { + defm VCVTPS2PHZ256 : avx512_cvtps2ph<v8i16x_info, v8f32x_info, f128mem>, + EVEX, EVEX_V256, EVEX_CD8<32, CD8VH>; + defm VCVTPS2PHZ128 : avx512_cvtps2ph<v8i16x_info, v4f32x_info, f128mem>, + EVEX, EVEX_V128, EVEX_CD8<32, CD8VH>; + } +} + +// Unordered/Ordered scalar fp compare with SAE and set EFLAGS +multiclass avx512_ord_cmp_sae<bits<8> opc, X86VectorVTInfo _, SDNode OpNode, + string OpcodeStr> { + def rb: AVX512<opc, MRMSrcReg, (outs), (ins _.RC:$src1, _.RC:$src2), + !strconcat(OpcodeStr, "\t{{sae}, $src2, $src1|$src1, $src2, {sae}}"), + [(set EFLAGS, (OpNode (_.VT _.RC:$src1), _.RC:$src2, + (i32 FROUND_NO_EXC)))], + IIC_SSE_COMIS_RR>, EVEX, EVEX_B, VEX_LIG, EVEX_V128, + Sched<[WriteFAdd]>; +} + +let Defs = [EFLAGS], Predicates = [HasAVX512] in { + defm VUCOMISSZ : avx512_ord_cmp_sae<0x2E, v4f32x_info, X86ucomiSae, "vucomiss">, + AVX512PSIi8Base, EVEX_CD8<32, CD8VT1>; + defm VUCOMISDZ : avx512_ord_cmp_sae<0x2E, v2f64x_info, X86ucomiSae, "vucomisd">, + AVX512PDIi8Base, VEX_W, EVEX_CD8<64, CD8VT1>; + defm VCOMISSZ : avx512_ord_cmp_sae<0x2F, v4f32x_info, X86comiSae, "vcomiss">, + AVX512PSIi8Base, EVEX_CD8<32, CD8VT1>; + defm VCOMISDZ : avx512_ord_cmp_sae<0x2F, v2f64x_info, X86comiSae, "vcomisd">, + AVX512PDIi8Base, VEX_W, EVEX_CD8<64, CD8VT1>; +} + +let Defs = [EFLAGS], Predicates = [HasAVX512] in { + defm VUCOMISSZ : sse12_ord_cmp<0x2E, FR32X, X86cmp, f32, f32mem, loadf32, + "ucomiss">, PS, EVEX, VEX_LIG, + EVEX_CD8<32, CD8VT1>; + defm VUCOMISDZ : sse12_ord_cmp<0x2E, FR64X, X86cmp, f64, f64mem, loadf64, + "ucomisd">, PD, EVEX, + VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>; + let Pattern = []<dag> in { + defm VCOMISSZ : sse12_ord_cmp<0x2F, FR32X, undef, f32, f32mem, loadf32, + "comiss">, PS, EVEX, VEX_LIG, + EVEX_CD8<32, CD8VT1>; + defm VCOMISDZ : sse12_ord_cmp<0x2F, FR64X, undef, f64, f64mem, loadf64, + "comisd">, PD, EVEX, + VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>; + } + let isCodeGenOnly = 1 in { + defm Int_VUCOMISSZ : sse12_ord_cmp<0x2E, VR128X, X86ucomi, v4f32, f128mem, + load, "ucomiss">, PS, EVEX, VEX_LIG, + EVEX_CD8<32, CD8VT1>; + defm Int_VUCOMISDZ : sse12_ord_cmp<0x2E, VR128X, X86ucomi, v2f64, f128mem, + load, "ucomisd">, PD, EVEX, + VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>; + + defm Int_VCOMISSZ : sse12_ord_cmp<0x2F, VR128X, X86comi, v4f32, f128mem, + load, "comiss">, PS, EVEX, VEX_LIG, + EVEX_CD8<32, CD8VT1>; + defm Int_VCOMISDZ : sse12_ord_cmp<0x2F, VR128X, X86comi, v2f64, f128mem, + load, "comisd">, PD, EVEX, + VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>; + } +} + +/// avx512_fp14_s rcp14ss, rcp14sd, rsqrt14ss, rsqrt14sd +multiclass avx512_fp14_s<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _> { + let hasSideEffects = 0, AddedComplexity = 20 , Predicates = [HasAVX512] in { + defm rr : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2))>, EVEX_4V; + let mayLoad = 1 in { + defm rm : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (OpNode (_.VT _.RC:$src1), + (_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src2))))>, EVEX_4V; + } +} +} + +defm VRCP14SS : avx512_fp14_s<0x4D, "vrcp14ss", X86frcp14s, f32x_info>, + EVEX_CD8<32, CD8VT1>, T8PD; +defm VRCP14SD : avx512_fp14_s<0x4D, "vrcp14sd", 
X86frcp14s, f64x_info>, + VEX_W, EVEX_CD8<64, CD8VT1>, T8PD; +defm VRSQRT14SS : avx512_fp14_s<0x4F, "vrsqrt14ss", X86frsqrt14s, f32x_info>, + EVEX_CD8<32, CD8VT1>, T8PD; +defm VRSQRT14SD : avx512_fp14_s<0x4F, "vrsqrt14sd", X86frsqrt14s, f64x_info>, + VEX_W, EVEX_CD8<64, CD8VT1>, T8PD; + +/// avx512_fp14_p rcp14ps, rcp14pd, rsqrt14ps, rsqrt14pd +multiclass avx512_fp14_p<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _> { + defm r: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src), OpcodeStr, "$src", "$src", + (_.FloatVT (OpNode _.RC:$src))>, EVEX, T8PD; + let mayLoad = 1 in { + defm m: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.MemOp:$src), OpcodeStr, "$src", "$src", + (OpNode (_.FloatVT + (bitconvert (_.LdFrag addr:$src))))>, EVEX, T8PD; + defm mb: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.ScalarMemOp:$src), OpcodeStr, + "${src}"##_.BroadcastStr, "${src}"##_.BroadcastStr, + (OpNode (_.FloatVT + (X86VBroadcast (_.ScalarLdFrag addr:$src))))>, + EVEX, T8PD, EVEX_B; + } +} + +multiclass avx512_fp14_p_vl_all<bits<8> opc, string OpcodeStr, SDNode OpNode> { + defm PSZ : avx512_fp14_p<opc, !strconcat(OpcodeStr, "ps"), OpNode, v16f32_info>, + EVEX_V512, EVEX_CD8<32, CD8VF>; + defm PDZ : avx512_fp14_p<opc, !strconcat(OpcodeStr, "pd"), OpNode, v8f64_info>, + EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; + + // Define only if AVX512VL feature is present. + let Predicates = [HasVLX] in { + defm PSZ128 : avx512_fp14_p<opc, !strconcat(OpcodeStr, "ps"), + OpNode, v4f32x_info>, + EVEX_V128, EVEX_CD8<32, CD8VF>; + defm PSZ256 : avx512_fp14_p<opc, !strconcat(OpcodeStr, "ps"), + OpNode, v8f32x_info>, + EVEX_V256, EVEX_CD8<32, CD8VF>; + defm PDZ128 : avx512_fp14_p<opc, !strconcat(OpcodeStr, "pd"), + OpNode, v2f64x_info>, + EVEX_V128, VEX_W, EVEX_CD8<64, CD8VF>; + defm PDZ256 : avx512_fp14_p<opc, !strconcat(OpcodeStr, "pd"), + OpNode, v4f64x_info>, + EVEX_V256, VEX_W, EVEX_CD8<64, CD8VF>; + } +} + +defm VRSQRT14 : avx512_fp14_p_vl_all<0x4E, "vrsqrt14", X86frsqrt>; +defm VRCP14 : avx512_fp14_p_vl_all<0x4C, "vrcp14", X86frcp>; + +/// avx512_fp28_s rcp28ss, rcp28sd, rsqrt28ss, rsqrt28sd +multiclass avx512_fp28_s<bits<8> opc, string OpcodeStr,X86VectorVTInfo _, + SDNode OpNode> { + + defm r : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), + (i32 FROUND_CURRENT))>; + + defm rb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2), OpcodeStr, + "{sae}, $src2, $src1", "$src1, $src2, {sae}", + (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), + (i32 FROUND_NO_EXC))>, EVEX_B; + + defm m : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (OpNode (_.VT _.RC:$src1), + (_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src2))), + (i32 FROUND_CURRENT))>; +} + +multiclass avx512_eri_s<bits<8> opc, string OpcodeStr, SDNode OpNode> { + defm SS : avx512_fp28_s<opc, OpcodeStr#"ss", f32x_info, OpNode>, + EVEX_CD8<32, CD8VT1>; + defm SD : avx512_fp28_s<opc, OpcodeStr#"sd", f64x_info, OpNode>, + EVEX_CD8<64, CD8VT1>, VEX_W; +} + +let hasSideEffects = 0, Predicates = [HasERI] in { + defm VRCP28 : avx512_eri_s<0xCB, "vrcp28", X86rcp28s>, T8PD, EVEX_4V; + defm VRSQRT28 : avx512_eri_s<0xCD, "vrsqrt28", X86rsqrt28s>, T8PD, EVEX_4V; +} + +defm VGETEXP : avx512_eri_s<0x43, "vgetexp", X86fgetexpRnds>, T8PD, EVEX_4V; 
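The VRCP14* / VRSQRT14* forms defined above only guarantee roughly 2^-14 relative error, so the usual pattern is to treat them as a seed and refine it with one Newton-Raphson step rather than use the raw result. A minimal, hypothetical C++ sketch of that usage, assuming an AVX512F target and the standard <immintrin.h> intrinsic names (_mm512_rcp14_ps and friends), not anything declared in this file:

#include <immintrin.h>

// One Newton-Raphson step on top of the ~2^-14 accurate vrcp14ps seed:
//   x1 = x0 * (2 - a * x0)
static inline __m512 recip_refined(__m512 a) {
  __m512 x0  = _mm512_rcp14_ps(a);            // vrcp14ps approximation seed
  __m512 two = _mm512_set1_ps(2.0f);
  __m512 t   = _mm512_fnmadd_ps(a, x0, two);  // 2 - a*x0
  return _mm512_mul_ps(x0, t);                // x0 * (2 - a*x0)
}

One such step roughly squares the relative error (to about 2^-28), which is also the accuracy that the vrcp28/vrsqrt28 ERI forms handled next provide directly.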
+/// avx512_fp28_p rcp28ps, rcp28pd, rsqrt28ps, rsqrt28pd + +multiclass avx512_fp28_p<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, + SDNode OpNode> { + + defm r : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src), OpcodeStr, "$src", "$src", + (OpNode (_.VT _.RC:$src), (i32 FROUND_CURRENT))>; + + defm m : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.MemOp:$src), OpcodeStr, "$src", "$src", + (OpNode (_.FloatVT + (bitconvert (_.LdFrag addr:$src))), + (i32 FROUND_CURRENT))>; + + defm mb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.MemOp:$src), OpcodeStr, + "${src}"##_.BroadcastStr, "${src}"##_.BroadcastStr, + (OpNode (_.FloatVT + (X86VBroadcast (_.ScalarLdFrag addr:$src))), + (i32 FROUND_CURRENT))>, EVEX_B; +} +multiclass avx512_fp28_p_round<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, + SDNode OpNode> { + defm rb : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src), OpcodeStr, + "{sae}, $src", "$src, {sae}", + (OpNode (_.VT _.RC:$src), (i32 FROUND_NO_EXC))>, EVEX_B; +} + +multiclass avx512_eri<bits<8> opc, string OpcodeStr, SDNode OpNode> { + defm PS : avx512_fp28_p<opc, OpcodeStr#"ps", v16f32_info, OpNode>, + avx512_fp28_p_round<opc, OpcodeStr#"ps", v16f32_info, OpNode>, + T8PD, EVEX_V512, EVEX_CD8<32, CD8VF>; + defm PD : avx512_fp28_p<opc, OpcodeStr#"pd", v8f64_info, OpNode>, + avx512_fp28_p_round<opc, OpcodeStr#"pd", v8f64_info, OpNode>, + T8PD, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; +} + +multiclass avx512_fp_unaryop_packed<bits<8> opc, string OpcodeStr, + SDNode OpNode> { + // Define only if AVX512VL feature is present. + let Predicates = [HasVLX] in { + defm PSZ128 : avx512_fp28_p<opc, OpcodeStr#"ps", v4f32x_info, OpNode>, + EVEX_V128, T8PD, EVEX_CD8<32, CD8VF>; + defm PSZ256 : avx512_fp28_p<opc, OpcodeStr#"ps", v8f32x_info, OpNode>, + EVEX_V256, T8PD, EVEX_CD8<32, CD8VF>; + defm PDZ128 : avx512_fp28_p<opc, OpcodeStr#"pd", v2f64x_info, OpNode>, + EVEX_V128, VEX_W, T8PD, EVEX_CD8<64, CD8VF>; + defm PDZ256 : avx512_fp28_p<opc, OpcodeStr#"pd", v4f64x_info, OpNode>, + EVEX_V256, VEX_W, T8PD, EVEX_CD8<64, CD8VF>; + } +} +let Predicates = [HasERI], hasSideEffects = 0 in { + + defm VRSQRT28 : avx512_eri<0xCC, "vrsqrt28", X86rsqrt28>, EVEX; + defm VRCP28 : avx512_eri<0xCA, "vrcp28", X86rcp28>, EVEX; + defm VEXP2 : avx512_eri<0xC8, "vexp2", X86exp2>, EVEX; +} +defm VGETEXP : avx512_eri<0x42, "vgetexp", X86fgetexpRnd>, + avx512_fp_unaryop_packed<0x42, "vgetexp", X86fgetexpRnd> , EVEX; + +multiclass avx512_sqrt_packed_round<bits<8> opc, string OpcodeStr, + SDNode OpNodeRnd, X86VectorVTInfo _>{ + defm rb: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src, AVX512RC:$rc), OpcodeStr, "$rc, $src", "$src, $rc", + (_.VT (OpNodeRnd _.RC:$src, (i32 imm:$rc)))>, + EVEX, EVEX_B, EVEX_RC; +} + +multiclass avx512_sqrt_packed<bits<8> opc, string OpcodeStr, + SDNode OpNode, X86VectorVTInfo _>{ + defm r: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src), OpcodeStr, "$src", "$src", + (_.FloatVT (OpNode _.RC:$src))>, EVEX; + let mayLoad = 1 in { + defm m: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.MemOp:$src), OpcodeStr, "$src", "$src", + (OpNode (_.FloatVT + (bitconvert (_.LdFrag addr:$src))))>, EVEX; + + defm mb: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.ScalarMemOp:$src), OpcodeStr, + "${src}"##_.BroadcastStr, "${src}"##_.BroadcastStr, + (OpNode (_.FloatVT + (X86VBroadcast (_.ScalarLdFrag addr:$src))))>, + EVEX, EVEX_B; + } +} + +multiclass 
avx512_sqrt_packed_all<bits<8> opc, string OpcodeStr, + SDNode OpNode> { + defm PSZ : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, + v16f32_info>, + EVEX_V512, PS, EVEX_CD8<32, CD8VF>; + defm PDZ : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, + v8f64_info>, + EVEX_V512, VEX_W, PD, EVEX_CD8<64, CD8VF>; + // Define only if AVX512VL feature is present. + let Predicates = [HasVLX] in { + defm PSZ128 : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "ps"), + OpNode, v4f32x_info>, + EVEX_V128, PS, EVEX_CD8<32, CD8VF>; + defm PSZ256 : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "ps"), + OpNode, v8f32x_info>, + EVEX_V256, PS, EVEX_CD8<32, CD8VF>; + defm PDZ128 : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "pd"), + OpNode, v2f64x_info>, + EVEX_V128, VEX_W, PD, EVEX_CD8<64, CD8VF>; + defm PDZ256 : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "pd"), + OpNode, v4f64x_info>, + EVEX_V256, VEX_W, PD, EVEX_CD8<64, CD8VF>; + } +} + +multiclass avx512_sqrt_packed_all_round<bits<8> opc, string OpcodeStr, + SDNode OpNodeRnd> { + defm PSZ : avx512_sqrt_packed_round<opc, !strconcat(OpcodeStr, "ps"), OpNodeRnd, + v16f32_info>, EVEX_V512, PS, EVEX_CD8<32, CD8VF>; + defm PDZ : avx512_sqrt_packed_round<opc, !strconcat(OpcodeStr, "pd"), OpNodeRnd, + v8f64_info>, EVEX_V512, VEX_W, PD, EVEX_CD8<64, CD8VF>; +} + +multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr,X86VectorVTInfo _, + string SUFF, SDNode OpNode, SDNode OpNodeRnd> { + + defm r_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (OpNodeRnd (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + (i32 FROUND_CURRENT))>; + let mayLoad = 1 in + defm m_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (OpNodeRnd (_.VT _.RC:$src1), + (_.VT (scalar_to_vector + (_.ScalarLdFrag addr:$src2))), + (i32 FROUND_CURRENT))>; + + defm rb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2, AVX512RC:$rc), OpcodeStr, + "$rc, $src2, $src1", "$src1, $src2, $rc", + (OpNodeRnd (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + (i32 imm:$rc))>, + EVEX_B, EVEX_RC; + + let isCodeGenOnly = 1 in { + def r : I<opc, MRMSrcReg, (outs _.FRC:$dst), + (ins _.FRC:$src1, _.FRC:$src2), + OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>; + + let mayLoad = 1 in + def m : I<opc, MRMSrcMem, (outs _.FRC:$dst), + (ins _.FRC:$src1, _.ScalarMemOp:$src2), + OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>; + } + + def : Pat<(_.EltVT (OpNode _.FRC:$src)), + (!cast<Instruction>(NAME#SUFF#Zr) + (_.EltVT (IMPLICIT_DEF)), _.FRC:$src)>; + + def : Pat<(_.EltVT (OpNode (load addr:$src))), + (!cast<Instruction>(NAME#SUFF#Zm) + (_.EltVT (IMPLICIT_DEF)), addr:$src)>, Requires<[OptForSize]>; +} + +multiclass avx512_sqrt_scalar_all<bits<8> opc, string OpcodeStr> { + defm SSZ : avx512_sqrt_scalar<opc, OpcodeStr#"ss", f32x_info, "SS", fsqrt, + X86fsqrtRnds>, EVEX_CD8<32, CD8VT1>, EVEX_4V, XS; + defm SDZ : avx512_sqrt_scalar<opc, OpcodeStr#"sd", f64x_info, "SD", fsqrt, + X86fsqrtRnds>, EVEX_CD8<64, CD8VT1>, EVEX_4V, XD, VEX_W; +} + +defm VSQRT : avx512_sqrt_packed_all<0x51, "vsqrt", fsqrt>, + avx512_sqrt_packed_all_round<0x51, "vsqrt", X86fsqrtRnd>; + +defm VSQRT : avx512_sqrt_scalar_all<0x51, "vsqrt">, VEX_LIG; + +let Predicates = [HasAVX512] in { + def : Pat<(f32 (X86frsqrt FR32X:$src)), + (COPY_TO_REGCLASS (VRSQRT14SSrr (v4f32 
(IMPLICIT_DEF)), (COPY_TO_REGCLASS FR32X:$src, VR128X)), VR128X)>; + def : Pat<(f32 (X86frsqrt (load addr:$src))), + (COPY_TO_REGCLASS (VRSQRT14SSrm (v4f32 (IMPLICIT_DEF)), addr:$src), VR128X)>, + Requires<[OptForSize]>; + def : Pat<(f32 (X86frcp FR32X:$src)), + (COPY_TO_REGCLASS (VRCP14SSrr (v4f32 (IMPLICIT_DEF)), (COPY_TO_REGCLASS FR32X:$src, VR128X)), VR128X )>; + def : Pat<(f32 (X86frcp (load addr:$src))), + (COPY_TO_REGCLASS (VRCP14SSrm (v4f32 (IMPLICIT_DEF)), addr:$src), VR128X)>, + Requires<[OptForSize]>; +} + +multiclass +avx512_rndscale_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> { + + let ExeDomain = _.ExeDomain in { + defm r : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3), OpcodeStr, + "$src3, $src2, $src1", "$src1, $src2, $src3", + (_.VT (X86RndScales (_.VT _.RC:$src1), (_.VT _.RC:$src2), + (i32 imm:$src3), (i32 FROUND_CURRENT)))>; + + defm rb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3), OpcodeStr, + "$src3, {sae}, $src2, $src1", "$src1, $src2, {sae}, $src3", + (_.VT (X86RndScales (_.VT _.RC:$src1), (_.VT _.RC:$src2), + (i32 imm:$src3), (i32 FROUND_NO_EXC)))>, EVEX_B; + + let mayLoad = 1 in + defm m : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.MemOp:$src2, i32u8imm:$src3), OpcodeStr, + "$src3, $src2, $src1", "$src1, $src2, $src3", + (_.VT (X86RndScales (_.VT _.RC:$src1), + (_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src2))), + (i32 imm:$src3), (i32 FROUND_CURRENT)))>; + } + let Predicates = [HasAVX512] in { + def : Pat<(ffloor _.FRC:$src), (COPY_TO_REGCLASS + (_.VT (!cast<Instruction>(NAME##r) (_.VT (IMPLICIT_DEF)), + (_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 0x1))), _.FRC)>; + def : Pat<(fceil _.FRC:$src), (COPY_TO_REGCLASS + (_.VT (!cast<Instruction>(NAME##r) (_.VT (IMPLICIT_DEF)), + (_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 0x2))), _.FRC)>; + def : Pat<(ftrunc _.FRC:$src), (COPY_TO_REGCLASS + (_.VT (!cast<Instruction>(NAME##r) (_.VT (IMPLICIT_DEF)), + (_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 0x3))), _.FRC)>; + def : Pat<(frint _.FRC:$src), (COPY_TO_REGCLASS + (_.VT (!cast<Instruction>(NAME##r) (_.VT (IMPLICIT_DEF)), + (_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 0x4))), _.FRC)>; + def : Pat<(fnearbyint _.FRC:$src), (COPY_TO_REGCLASS + (_.VT (!cast<Instruction>(NAME##r) (_.VT (IMPLICIT_DEF)), + (_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 0xc))), _.FRC)>; + + def : Pat<(ffloor (_.ScalarLdFrag addr:$src)), (COPY_TO_REGCLASS + (_.VT (!cast<Instruction>(NAME##m) (_.VT (IMPLICIT_DEF)), + addr:$src, (i32 0x1))), _.FRC)>; + def : Pat<(fceil (_.ScalarLdFrag addr:$src)), (COPY_TO_REGCLASS + (_.VT (!cast<Instruction>(NAME##m) (_.VT (IMPLICIT_DEF)), + addr:$src, (i32 0x2))), _.FRC)>; + def : Pat<(ftrunc (_.ScalarLdFrag addr:$src)), (COPY_TO_REGCLASS + (_.VT (!cast<Instruction>(NAME##m) (_.VT (IMPLICIT_DEF)), + addr:$src, (i32 0x3))), _.FRC)>; + def : Pat<(frint (_.ScalarLdFrag addr:$src)), (COPY_TO_REGCLASS + (_.VT (!cast<Instruction>(NAME##m) (_.VT (IMPLICIT_DEF)), + addr:$src, (i32 0x4))), _.FRC)>; + def : Pat<(fnearbyint (_.ScalarLdFrag addr:$src)), (COPY_TO_REGCLASS + (_.VT (!cast<Instruction>(NAME##m) (_.VT (IMPLICIT_DEF)), + addr:$src, (i32 0xc))), _.FRC)>; + } +} + +defm VRNDSCALESS : avx512_rndscale_scalar<0x0A, "vrndscaless", f32x_info>, + AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VT1>; + +defm VRNDSCALESD : avx512_rndscale_scalar<0x0B, "vrndscalesd", f64x_info>, VEX_W, + 
AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VT1>; + +//------------------------------------------------- +// Integer truncate and extend operations +//------------------------------------------------- + +multiclass avx512_trunc_common<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo SrcInfo, X86VectorVTInfo DestInfo, + X86MemOperand x86memop> { + + defm rr : AVX512_maskable<opc, MRMDestReg, DestInfo, (outs DestInfo.RC:$dst), + (ins SrcInfo.RC:$src1), OpcodeStr ,"$src1", "$src1", + (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1)))>, + EVEX, T8XS; + + // for intrinsic pattern match + def : Pat<(DestInfo.VT (X86select DestInfo.KRCWM:$mask, + (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1))), + undef)), + (!cast<Instruction>(NAME#SrcInfo.ZSuffix##rrkz) DestInfo.KRCWM:$mask , + SrcInfo.RC:$src1)>; + + def : Pat<(DestInfo.VT (X86select DestInfo.KRCWM:$mask, + (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1))), + DestInfo.ImmAllZerosV)), + (!cast<Instruction>(NAME#SrcInfo.ZSuffix##rrkz) DestInfo.KRCWM:$mask , + SrcInfo.RC:$src1)>; + + def : Pat<(DestInfo.VT (X86select DestInfo.KRCWM:$mask, + (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1))), + DestInfo.RC:$src0)), + (!cast<Instruction>(NAME#SrcInfo.ZSuffix##rrk) DestInfo.RC:$src0, + DestInfo.KRCWM:$mask , + SrcInfo.RC:$src1)>; + + let mayStore = 1 in { + def mr : AVX512XS8I<opc, MRMDestMem, (outs), + (ins x86memop:$dst, SrcInfo.RC:$src), + OpcodeStr # "\t{$src, $dst |$dst, $src}", + []>, EVEX; + + def mrk : AVX512XS8I<opc, MRMDestMem, (outs), + (ins x86memop:$dst, SrcInfo.KRCWM:$mask, SrcInfo.RC:$src), + OpcodeStr # "\t{$src, $dst {${mask}} |$dst {${mask}}, $src}", + []>, EVEX, EVEX_K; + }//mayStore = 1 +} + +multiclass avx512_trunc_mr_lowering<X86VectorVTInfo SrcInfo, + X86VectorVTInfo DestInfo, + PatFrag truncFrag, PatFrag mtruncFrag > { + + def : Pat<(truncFrag (SrcInfo.VT SrcInfo.RC:$src), addr:$dst), + (!cast<Instruction>(NAME#SrcInfo.ZSuffix##mr) + addr:$dst, SrcInfo.RC:$src)>; + + def : Pat<(mtruncFrag addr:$dst, SrcInfo.KRCWM:$mask, + (SrcInfo.VT SrcInfo.RC:$src)), + (!cast<Instruction>(NAME#SrcInfo.ZSuffix##mrk) + addr:$dst, SrcInfo.KRCWM:$mask, SrcInfo.RC:$src)>; +} + +multiclass avx512_trunc_sat_mr_lowering<X86VectorVTInfo SrcInfo, + X86VectorVTInfo DestInfo, string sat > { + + def: Pat<(!cast<Intrinsic>("int_x86_avx512_mask_pmov"#sat#"_"#SrcInfo.Suffix# + DestInfo.Suffix#"_mem_"#SrcInfo.Size) + addr:$ptr, (SrcInfo.VT SrcInfo.RC:$src), SrcInfo.MRC:$mask), + (!cast<Instruction>(NAME#SrcInfo.ZSuffix##mrk) addr:$ptr, + (COPY_TO_REGCLASS SrcInfo.MRC:$mask, SrcInfo.KRCWM), + (SrcInfo.VT SrcInfo.RC:$src))>; + + def: Pat<(!cast<Intrinsic>("int_x86_avx512_mask_pmov"#sat#"_"#SrcInfo.Suffix# + DestInfo.Suffix#"_mem_"#SrcInfo.Size) + addr:$ptr, (SrcInfo.VT SrcInfo.RC:$src), -1), + (!cast<Instruction>(NAME#SrcInfo.ZSuffix##mr) addr:$ptr, + (SrcInfo.VT SrcInfo.RC:$src))>; +} + +multiclass avx512_trunc<bits<8> opc, string OpcodeStr, SDNode OpNode, + AVX512VLVectorVTInfo VTSrcInfo, X86VectorVTInfo DestInfoZ128, + X86VectorVTInfo DestInfoZ256, X86VectorVTInfo DestInfoZ, + X86MemOperand x86memopZ128, X86MemOperand x86memopZ256, + X86MemOperand x86memopZ, PatFrag truncFrag, PatFrag mtruncFrag, + Predicate prd = HasAVX512>{ + + let Predicates = [HasVLX, prd] in { + defm Z128: avx512_trunc_common<opc, OpcodeStr, OpNode, VTSrcInfo.info128, + DestInfoZ128, x86memopZ128>, + avx512_trunc_mr_lowering<VTSrcInfo.info128, DestInfoZ128, + truncFrag, mtruncFrag>, EVEX_V128; + + defm Z256: avx512_trunc_common<opc, OpcodeStr, OpNode, 
VTSrcInfo.info256, + DestInfoZ256, x86memopZ256>, + avx512_trunc_mr_lowering<VTSrcInfo.info256, DestInfoZ256, + truncFrag, mtruncFrag>, EVEX_V256; + } + let Predicates = [prd] in + defm Z: avx512_trunc_common<opc, OpcodeStr, OpNode, VTSrcInfo.info512, + DestInfoZ, x86memopZ>, + avx512_trunc_mr_lowering<VTSrcInfo.info512, DestInfoZ, + truncFrag, mtruncFrag>, EVEX_V512; +} + +multiclass avx512_trunc_sat<bits<8> opc, string OpcodeStr, SDNode OpNode, + AVX512VLVectorVTInfo VTSrcInfo, X86VectorVTInfo DestInfoZ128, + X86VectorVTInfo DestInfoZ256, X86VectorVTInfo DestInfoZ, + X86MemOperand x86memopZ128, X86MemOperand x86memopZ256, + X86MemOperand x86memopZ, string sat, Predicate prd = HasAVX512>{ + + let Predicates = [HasVLX, prd] in { + defm Z128: avx512_trunc_common<opc, OpcodeStr, OpNode, VTSrcInfo.info128, + DestInfoZ128, x86memopZ128>, + avx512_trunc_sat_mr_lowering<VTSrcInfo.info128, DestInfoZ128, + sat>, EVEX_V128; + + defm Z256: avx512_trunc_common<opc, OpcodeStr, OpNode, VTSrcInfo.info256, + DestInfoZ256, x86memopZ256>, + avx512_trunc_sat_mr_lowering<VTSrcInfo.info256, DestInfoZ256, + sat>, EVEX_V256; + } + let Predicates = [prd] in + defm Z: avx512_trunc_common<opc, OpcodeStr, OpNode, VTSrcInfo.info512, + DestInfoZ, x86memopZ>, + avx512_trunc_sat_mr_lowering<VTSrcInfo.info512, DestInfoZ, + sat>, EVEX_V512; +} + +multiclass avx512_trunc_qb<bits<8> opc, string OpcodeStr, SDNode OpNode> { + defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, avx512vl_i64_info, + v16i8x_info, v16i8x_info, v16i8x_info, i16mem, i32mem, i64mem, + truncstorevi8, masked_truncstorevi8>, EVEX_CD8<8, CD8VO>; +} +multiclass avx512_trunc_sat_qb<bits<8> opc, string sat, SDNode OpNode> { + defm NAME: avx512_trunc_sat<opc, "vpmov"##sat##"qb", OpNode, avx512vl_i64_info, + v16i8x_info, v16i8x_info, v16i8x_info, i16mem, i32mem, i64mem, + sat>, EVEX_CD8<8, CD8VO>; +} + +multiclass avx512_trunc_qw<bits<8> opc, string OpcodeStr, SDNode OpNode> { + defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, avx512vl_i64_info, + v8i16x_info, v8i16x_info, v8i16x_info, i32mem, i64mem, i128mem, + truncstorevi16, masked_truncstorevi16>, EVEX_CD8<16, CD8VQ>; +} +multiclass avx512_trunc_sat_qw<bits<8> opc, string sat, SDNode OpNode> { + defm NAME: avx512_trunc_sat<opc, "vpmov"##sat##"qw", OpNode, avx512vl_i64_info, + v8i16x_info, v8i16x_info, v8i16x_info, i32mem, i64mem, i128mem, + sat>, EVEX_CD8<16, CD8VQ>; +} + +multiclass avx512_trunc_qd<bits<8> opc, string OpcodeStr, SDNode OpNode> { + defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, avx512vl_i64_info, + v4i32x_info, v4i32x_info, v8i32x_info, i64mem, i128mem, i256mem, + truncstorevi32, masked_truncstorevi32>, EVEX_CD8<32, CD8VH>; +} +multiclass avx512_trunc_sat_qd<bits<8> opc, string sat, SDNode OpNode> { + defm NAME: avx512_trunc_sat<opc, "vpmov"##sat##"qd", OpNode, avx512vl_i64_info, + v4i32x_info, v4i32x_info, v8i32x_info, i64mem, i128mem, i256mem, + sat>, EVEX_CD8<32, CD8VH>; +} + +multiclass avx512_trunc_db<bits<8> opc, string OpcodeStr, SDNode OpNode> { + defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, avx512vl_i32_info, + v16i8x_info, v16i8x_info, v16i8x_info, i32mem, i64mem, i128mem, + truncstorevi8, masked_truncstorevi8>, EVEX_CD8<8, CD8VQ>; +} +multiclass avx512_trunc_sat_db<bits<8> opc, string sat, SDNode OpNode> { + defm NAME: avx512_trunc_sat<opc, "vpmov"##sat##"db", OpNode, avx512vl_i32_info, + v16i8x_info, v16i8x_info, v16i8x_info, i32mem, i64mem, i128mem, + sat>, EVEX_CD8<8, CD8VQ>; +} + +multiclass avx512_trunc_dw<bits<8> opc, string OpcodeStr, SDNode OpNode> { + defm NAME: 
avx512_trunc<opc, OpcodeStr, OpNode, avx512vl_i32_info, + v8i16x_info, v8i16x_info, v16i16x_info, i64mem, i128mem, i256mem, + truncstorevi16, masked_truncstorevi16>, EVEX_CD8<16, CD8VH>; +} +multiclass avx512_trunc_sat_dw<bits<8> opc, string sat, SDNode OpNode> { + defm NAME: avx512_trunc_sat<opc, "vpmov"##sat##"dw", OpNode, avx512vl_i32_info, + v8i16x_info, v8i16x_info, v16i16x_info, i64mem, i128mem, i256mem, + sat>, EVEX_CD8<16, CD8VH>; +} + +multiclass avx512_trunc_wb<bits<8> opc, string OpcodeStr, SDNode OpNode> { + defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, avx512vl_i16_info, + v16i8x_info, v16i8x_info, v32i8x_info, i64mem, i128mem, i256mem, + truncstorevi8, masked_truncstorevi8,HasBWI>, EVEX_CD8<16, CD8VH>; +} +multiclass avx512_trunc_sat_wb<bits<8> opc, string sat, SDNode OpNode> { + defm NAME: avx512_trunc_sat<opc, "vpmov"##sat##"wb", OpNode, avx512vl_i16_info, + v16i8x_info, v16i8x_info, v32i8x_info, i64mem, i128mem, i256mem, + sat, HasBWI>, EVEX_CD8<16, CD8VH>; +} + +defm VPMOVQB : avx512_trunc_qb<0x32, "vpmovqb", X86vtrunc>; +defm VPMOVSQB : avx512_trunc_sat_qb<0x22, "s", X86vtruncs>; +defm VPMOVUSQB : avx512_trunc_sat_qb<0x12, "us", X86vtruncus>; + +defm VPMOVQW : avx512_trunc_qw<0x34, "vpmovqw", X86vtrunc>; +defm VPMOVSQW : avx512_trunc_sat_qw<0x24, "s", X86vtruncs>; +defm VPMOVUSQW : avx512_trunc_sat_qw<0x14, "us", X86vtruncus>; + +defm VPMOVQD : avx512_trunc_qd<0x35, "vpmovqd", X86vtrunc>; +defm VPMOVSQD : avx512_trunc_sat_qd<0x25, "s", X86vtruncs>; +defm VPMOVUSQD : avx512_trunc_sat_qd<0x15, "us", X86vtruncus>; + +defm VPMOVDB : avx512_trunc_db<0x31, "vpmovdb", X86vtrunc>; +defm VPMOVSDB : avx512_trunc_sat_db<0x21, "s", X86vtruncs>; +defm VPMOVUSDB : avx512_trunc_sat_db<0x11, "us", X86vtruncus>; + +defm VPMOVDW : avx512_trunc_dw<0x33, "vpmovdw", X86vtrunc>; +defm VPMOVSDW : avx512_trunc_sat_dw<0x23, "s", X86vtruncs>; +defm VPMOVUSDW : avx512_trunc_sat_dw<0x13, "us", X86vtruncus>; + +defm VPMOVWB : avx512_trunc_wb<0x30, "vpmovwb", X86vtrunc>; +defm VPMOVSWB : avx512_trunc_sat_wb<0x20, "s", X86vtruncs>; +defm VPMOVUSWB : avx512_trunc_sat_wb<0x10, "us", X86vtruncus>; + +let Predicates = [HasAVX512, NoVLX] in { +def: Pat<(v8i16 (X86vtrunc (v8i32 VR256X:$src))), + (v8i16 (EXTRACT_SUBREG + (v16i16 (VPMOVDWZrr (v16i32 (SUBREG_TO_REG (i32 0), + VR256X:$src, sub_ymm)))), sub_xmm))>; +def: Pat<(v4i32 (X86vtrunc (v4i64 VR256X:$src))), + (v4i32 (EXTRACT_SUBREG + (v8i32 (VPMOVQDZrr (v8i64 (SUBREG_TO_REG (i32 0), + VR256X:$src, sub_ymm)))), sub_xmm))>; +} + +let Predicates = [HasBWI, NoVLX] in { +def: Pat<(v16i8 (X86vtrunc (v16i16 VR256X:$src))), + (v16i8 (EXTRACT_SUBREG (VPMOVWBZrr (v32i16 (SUBREG_TO_REG (i32 0), + VR256X:$src, sub_ymm))), sub_xmm))>; +} + +multiclass avx512_extend_common<bits<8> opc, string OpcodeStr, + X86VectorVTInfo DestInfo, X86VectorVTInfo SrcInfo, + X86MemOperand x86memop, PatFrag LdFrag, SDNode OpNode>{ + + defm rr : AVX512_maskable<opc, MRMSrcReg, DestInfo, (outs DestInfo.RC:$dst), + (ins SrcInfo.RC:$src), OpcodeStr ,"$src", "$src", + (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src)))>, + EVEX; + + let mayLoad = 1 in { + defm rm : AVX512_maskable<opc, MRMSrcMem, DestInfo, (outs DestInfo.RC:$dst), + (ins x86memop:$src), OpcodeStr ,"$src", "$src", + (DestInfo.VT (LdFrag addr:$src))>, + EVEX; + } +} + +multiclass avx512_extend_BW<bits<8> opc, string OpcodeStr, SDNode OpNode, + string ExtTy,PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi8")> { + let Predicates = [HasVLX, HasBWI] in { + defm Z128: avx512_extend_common<opc, OpcodeStr, v8i16x_info, + 
v16i8x_info, i64mem, LdFrag, OpNode>, + EVEX_CD8<8, CD8VH>, T8PD, EVEX_V128; + + defm Z256: avx512_extend_common<opc, OpcodeStr, v16i16x_info, + v16i8x_info, i128mem, LdFrag, OpNode>, + EVEX_CD8<8, CD8VH>, T8PD, EVEX_V256; + } + let Predicates = [HasBWI] in { + defm Z : avx512_extend_common<opc, OpcodeStr, v32i16_info, + v32i8x_info, i256mem, LdFrag, OpNode>, + EVEX_CD8<8, CD8VH>, T8PD, EVEX_V512; + } +} + +multiclass avx512_extend_BD<bits<8> opc, string OpcodeStr, SDNode OpNode, + string ExtTy,PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi8")> { + let Predicates = [HasVLX, HasAVX512] in { + defm Z128: avx512_extend_common<opc, OpcodeStr, v4i32x_info, + v16i8x_info, i32mem, LdFrag, OpNode>, + EVEX_CD8<8, CD8VQ>, T8PD, EVEX_V128; + + defm Z256: avx512_extend_common<opc, OpcodeStr, v8i32x_info, + v16i8x_info, i64mem, LdFrag, OpNode>, + EVEX_CD8<8, CD8VQ>, T8PD, EVEX_V256; + } + let Predicates = [HasAVX512] in { + defm Z : avx512_extend_common<opc, OpcodeStr, v16i32_info, + v16i8x_info, i128mem, LdFrag, OpNode>, + EVEX_CD8<8, CD8VQ>, T8PD, EVEX_V512; + } +} + +multiclass avx512_extend_BQ<bits<8> opc, string OpcodeStr, SDNode OpNode, + string ExtTy,PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi8")> { + let Predicates = [HasVLX, HasAVX512] in { + defm Z128: avx512_extend_common<opc, OpcodeStr, v2i64x_info, + v16i8x_info, i16mem, LdFrag, OpNode>, + EVEX_CD8<8, CD8VO>, T8PD, EVEX_V128; + + defm Z256: avx512_extend_common<opc, OpcodeStr, v4i64x_info, + v16i8x_info, i32mem, LdFrag, OpNode>, + EVEX_CD8<8, CD8VO>, T8PD, EVEX_V256; + } + let Predicates = [HasAVX512] in { + defm Z : avx512_extend_common<opc, OpcodeStr, v8i64_info, + v16i8x_info, i64mem, LdFrag, OpNode>, + EVEX_CD8<8, CD8VO>, T8PD, EVEX_V512; + } +} + +multiclass avx512_extend_WD<bits<8> opc, string OpcodeStr, SDNode OpNode, + string ExtTy,PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi16")> { + let Predicates = [HasVLX, HasAVX512] in { + defm Z128: avx512_extend_common<opc, OpcodeStr, v4i32x_info, + v8i16x_info, i64mem, LdFrag, OpNode>, + EVEX_CD8<16, CD8VH>, T8PD, EVEX_V128; + + defm Z256: avx512_extend_common<opc, OpcodeStr, v8i32x_info, + v8i16x_info, i128mem, LdFrag, OpNode>, + EVEX_CD8<16, CD8VH>, T8PD, EVEX_V256; + } + let Predicates = [HasAVX512] in { + defm Z : avx512_extend_common<opc, OpcodeStr, v16i32_info, + v16i16x_info, i256mem, LdFrag, OpNode>, + EVEX_CD8<16, CD8VH>, T8PD, EVEX_V512; + } +} + +multiclass avx512_extend_WQ<bits<8> opc, string OpcodeStr, SDNode OpNode, + string ExtTy,PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi16")> { + let Predicates = [HasVLX, HasAVX512] in { + defm Z128: avx512_extend_common<opc, OpcodeStr, v2i64x_info, + v8i16x_info, i32mem, LdFrag, OpNode>, + EVEX_CD8<16, CD8VQ>, T8PD, EVEX_V128; + + defm Z256: avx512_extend_common<opc, OpcodeStr, v4i64x_info, + v8i16x_info, i64mem, LdFrag, OpNode>, + EVEX_CD8<16, CD8VQ>, T8PD, EVEX_V256; + } + let Predicates = [HasAVX512] in { + defm Z : avx512_extend_common<opc, OpcodeStr, v8i64_info, + v8i16x_info, i128mem, LdFrag, OpNode>, + EVEX_CD8<16, CD8VQ>, T8PD, EVEX_V512; + } +} + +multiclass avx512_extend_DQ<bits<8> opc, string OpcodeStr, SDNode OpNode, + string ExtTy,PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi32")> { + + let Predicates = [HasVLX, HasAVX512] in { + defm Z128: avx512_extend_common<opc, OpcodeStr, v2i64x_info, + v4i32x_info, i64mem, LdFrag, OpNode>, + EVEX_CD8<32, CD8VH>, T8PD, EVEX_V128; + + defm Z256: avx512_extend_common<opc, OpcodeStr, v4i64x_info, + v4i32x_info, i128mem, LdFrag, OpNode>, + EVEX_CD8<32, CD8VH>, T8PD, 
EVEX_V256; + } + let Predicates = [HasAVX512] in { + defm Z : avx512_extend_common<opc, OpcodeStr, v8i64_info, + v8i32x_info, i256mem, LdFrag, OpNode>, + EVEX_CD8<32, CD8VH>, T8PD, EVEX_V512; + } +} + +defm VPMOVZXBW : avx512_extend_BW<0x30, "vpmovzxbw", X86vzext, "z">; +defm VPMOVZXBD : avx512_extend_BD<0x31, "vpmovzxbd", X86vzext, "z">; +defm VPMOVZXBQ : avx512_extend_BQ<0x32, "vpmovzxbq", X86vzext, "z">; +defm VPMOVZXWD : avx512_extend_WD<0x33, "vpmovzxwd", X86vzext, "z">; +defm VPMOVZXWQ : avx512_extend_WQ<0x34, "vpmovzxwq", X86vzext, "z">; +defm VPMOVZXDQ : avx512_extend_DQ<0x35, "vpmovzxdq", X86vzext, "z">; + + +defm VPMOVSXBW: avx512_extend_BW<0x20, "vpmovsxbw", X86vsext, "s">; +defm VPMOVSXBD: avx512_extend_BD<0x21, "vpmovsxbd", X86vsext, "s">; +defm VPMOVSXBQ: avx512_extend_BQ<0x22, "vpmovsxbq", X86vsext, "s">; +defm VPMOVSXWD: avx512_extend_WD<0x23, "vpmovsxwd", X86vsext, "s">; +defm VPMOVSXWQ: avx512_extend_WQ<0x24, "vpmovsxwq", X86vsext, "s">; +defm VPMOVSXDQ: avx512_extend_DQ<0x25, "vpmovsxdq", X86vsext, "s">; + +//===----------------------------------------------------------------------===// +// GATHER - SCATTER Operations + +multiclass avx512_gather<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, + X86MemOperand memop, PatFrag GatherNode> { + let Constraints = "@earlyclobber $dst, $src1 = $dst, $mask = $mask_wb", + ExeDomain = _.ExeDomain in + def rm : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst, _.KRCWM:$mask_wb), + (ins _.RC:$src1, _.KRCWM:$mask, memop:$src2), + !strconcat(OpcodeStr#_.Suffix, + "\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"), + [(set _.RC:$dst, _.KRCWM:$mask_wb, + (GatherNode (_.VT _.RC:$src1), _.KRCWM:$mask, + vectoraddr:$src2))]>, EVEX, EVEX_K, + EVEX_CD8<_.EltSize, CD8VT1>; +} + +multiclass avx512_gather_q_pd<bits<8> dopc, bits<8> qopc, + AVX512VLVectorVTInfo _, string OpcodeStr, string SUFF> { + defm NAME##D##SUFF##Z: avx512_gather<dopc, OpcodeStr##"d", _.info512, + vy32xmem, mgatherv8i32>, EVEX_V512, VEX_W; + defm NAME##Q##SUFF##Z: avx512_gather<qopc, OpcodeStr##"q", _.info512, + vz64mem, mgatherv8i64>, EVEX_V512, VEX_W; +let Predicates = [HasVLX] in { + defm NAME##D##SUFF##Z256: avx512_gather<dopc, OpcodeStr##"d", _.info256, + vx32xmem, mgatherv4i32>, EVEX_V256, VEX_W; + defm NAME##Q##SUFF##Z256: avx512_gather<qopc, OpcodeStr##"q", _.info256, + vy64xmem, mgatherv4i64>, EVEX_V256, VEX_W; + defm NAME##D##SUFF##Z128: avx512_gather<dopc, OpcodeStr##"d", _.info128, + vx32xmem, mgatherv4i32>, EVEX_V128, VEX_W; + defm NAME##Q##SUFF##Z128: avx512_gather<qopc, OpcodeStr##"q", _.info128, + vx64xmem, mgatherv2i64>, EVEX_V128, VEX_W; +} +} + +multiclass avx512_gather_d_ps<bits<8> dopc, bits<8> qopc, + AVX512VLVectorVTInfo _, string OpcodeStr, string SUFF> { + defm NAME##D##SUFF##Z: avx512_gather<dopc, OpcodeStr##"d", _.info512, vz32mem, + mgatherv16i32>, EVEX_V512; + defm NAME##Q##SUFF##Z: avx512_gather<qopc, OpcodeStr##"q", _.info256, vz64mem, + mgatherv8i64>, EVEX_V512; +let Predicates = [HasVLX] in { + defm NAME##D##SUFF##Z256: avx512_gather<dopc, OpcodeStr##"d", _.info256, + vy32xmem, mgatherv8i32>, EVEX_V256; + defm NAME##Q##SUFF##Z256: avx512_gather<qopc, OpcodeStr##"q", _.info128, + vy64xmem, mgatherv4i64>, EVEX_V256; + defm NAME##D##SUFF##Z128: avx512_gather<dopc, OpcodeStr##"d", _.info128, + vx32xmem, mgatherv4i32>, EVEX_V128; + defm NAME##Q##SUFF##Z128: avx512_gather<qopc, OpcodeStr##"q", _.info128, + vx64xmem, mgatherv2i64>, EVEX_V128; +} +} + + +defm VGATHER : avx512_gather_q_pd<0x92, 0x93, avx512vl_f64_info, "vgather", "PD">, + 
avx512_gather_d_ps<0x92, 0x93, avx512vl_f32_info, "vgather", "PS">; + +defm VPGATHER : avx512_gather_q_pd<0x90, 0x91, avx512vl_i64_info, "vpgather", "Q">, + avx512_gather_d_ps<0x90, 0x91, avx512vl_i32_info, "vpgather", "D">; + +multiclass avx512_scatter<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, + X86MemOperand memop, PatFrag ScatterNode> { + +let mayStore = 1, Constraints = "$mask = $mask_wb", ExeDomain = _.ExeDomain in + + def mr : AVX5128I<opc, MRMDestMem, (outs _.KRCWM:$mask_wb), + (ins memop:$dst, _.KRCWM:$mask, _.RC:$src), + !strconcat(OpcodeStr#_.Suffix, + "\t{$src, ${dst} {${mask}}|${dst} {${mask}}, $src}"), + [(set _.KRCWM:$mask_wb, (ScatterNode (_.VT _.RC:$src), + _.KRCWM:$mask, vectoraddr:$dst))]>, + EVEX, EVEX_K, EVEX_CD8<_.EltSize, CD8VT1>; +} + +multiclass avx512_scatter_q_pd<bits<8> dopc, bits<8> qopc, + AVX512VLVectorVTInfo _, string OpcodeStr, string SUFF> { + defm NAME##D##SUFF##Z: avx512_scatter<dopc, OpcodeStr##"d", _.info512, + vy32xmem, mscatterv8i32>, EVEX_V512, VEX_W; + defm NAME##Q##SUFF##Z: avx512_scatter<qopc, OpcodeStr##"q", _.info512, + vz64mem, mscatterv8i64>, EVEX_V512, VEX_W; +let Predicates = [HasVLX] in { + defm NAME##D##SUFF##Z256: avx512_scatter<dopc, OpcodeStr##"d", _.info256, + vx32xmem, mscatterv4i32>, EVEX_V256, VEX_W; + defm NAME##Q##SUFF##Z256: avx512_scatter<qopc, OpcodeStr##"q", _.info256, + vy64xmem, mscatterv4i64>, EVEX_V256, VEX_W; + defm NAME##D##SUFF##Z128: avx512_scatter<dopc, OpcodeStr##"d", _.info128, + vx32xmem, mscatterv4i32>, EVEX_V128, VEX_W; + defm NAME##Q##SUFF##Z128: avx512_scatter<qopc, OpcodeStr##"q", _.info128, + vx64xmem, mscatterv2i64>, EVEX_V128, VEX_W; +} +} + +multiclass avx512_scatter_d_ps<bits<8> dopc, bits<8> qopc, + AVX512VLVectorVTInfo _, string OpcodeStr, string SUFF> { + defm NAME##D##SUFF##Z: avx512_scatter<dopc, OpcodeStr##"d", _.info512, vz32mem, + mscatterv16i32>, EVEX_V512; + defm NAME##Q##SUFF##Z: avx512_scatter<qopc, OpcodeStr##"q", _.info256, vz64mem, + mscatterv8i64>, EVEX_V512; +let Predicates = [HasVLX] in { + defm NAME##D##SUFF##Z256: avx512_scatter<dopc, OpcodeStr##"d", _.info256, + vy32xmem, mscatterv8i32>, EVEX_V256; + defm NAME##Q##SUFF##Z256: avx512_scatter<qopc, OpcodeStr##"q", _.info128, + vy64xmem, mscatterv4i64>, EVEX_V256; + defm NAME##D##SUFF##Z128: avx512_scatter<dopc, OpcodeStr##"d", _.info128, + vx32xmem, mscatterv4i32>, EVEX_V128; + defm NAME##Q##SUFF##Z128: avx512_scatter<qopc, OpcodeStr##"q", _.info128, + vx64xmem, mscatterv2i64>, EVEX_V128; +} +} + +defm VSCATTER : avx512_scatter_q_pd<0xA2, 0xA3, avx512vl_f64_info, "vscatter", "PD">, + avx512_scatter_d_ps<0xA2, 0xA3, avx512vl_f32_info, "vscatter", "PS">; + +defm VPSCATTER : avx512_scatter_q_pd<0xA0, 0xA1, avx512vl_i64_info, "vpscatter", "Q">, + avx512_scatter_d_ps<0xA0, 0xA1, avx512vl_i32_info, "vpscatter", "D">; + +// prefetch +multiclass avx512_gather_scatter_prefetch<bits<8> opc, Format F, string OpcodeStr, + RegisterClass KRC, X86MemOperand memop> { + let Predicates = [HasPFI], hasSideEffects = 1 in + def m : AVX5128I<opc, F, (outs), (ins KRC:$mask, memop:$src), + !strconcat(OpcodeStr, "\t{$src {${mask}}|{${mask}}, $src}"), + []>, EVEX, EVEX_K; +} + +defm VGATHERPF0DPS: avx512_gather_scatter_prefetch<0xC6, MRM1m, "vgatherpf0dps", + VK16WM, vz32mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>; + +defm VGATHERPF0QPS: avx512_gather_scatter_prefetch<0xC7, MRM1m, "vgatherpf0qps", + VK8WM, vz64mem>, EVEX_V512, EVEX_CD8<64, CD8VT1>; + +defm VGATHERPF0DPD: avx512_gather_scatter_prefetch<0xC6, MRM1m, "vgatherpf0dpd", + VK8WM, vy32mem>, EVEX_V512, 
VEX_W, EVEX_CD8<32, CD8VT1>; + +defm VGATHERPF0QPD: avx512_gather_scatter_prefetch<0xC7, MRM1m, "vgatherpf0qpd", + VK8WM, vz64mem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>; + +defm VGATHERPF1DPS: avx512_gather_scatter_prefetch<0xC6, MRM2m, "vgatherpf1dps", + VK16WM, vz32mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>; + +defm VGATHERPF1QPS: avx512_gather_scatter_prefetch<0xC7, MRM2m, "vgatherpf1qps", + VK8WM, vz64mem>, EVEX_V512, EVEX_CD8<64, CD8VT1>; + +defm VGATHERPF1DPD: avx512_gather_scatter_prefetch<0xC6, MRM2m, "vgatherpf1dpd", + VK8WM, vy32mem>, EVEX_V512, VEX_W, EVEX_CD8<32, CD8VT1>; + +defm VGATHERPF1QPD: avx512_gather_scatter_prefetch<0xC7, MRM2m, "vgatherpf1qpd", + VK8WM, vz64mem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>; + +defm VSCATTERPF0DPS: avx512_gather_scatter_prefetch<0xC6, MRM5m, "vscatterpf0dps", + VK16WM, vz32mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>; + +defm VSCATTERPF0QPS: avx512_gather_scatter_prefetch<0xC7, MRM5m, "vscatterpf0qps", + VK8WM, vz64mem>, EVEX_V512, EVEX_CD8<64, CD8VT1>; + +defm VSCATTERPF0DPD: avx512_gather_scatter_prefetch<0xC6, MRM5m, "vscatterpf0dpd", + VK8WM, vy32mem>, EVEX_V512, VEX_W, EVEX_CD8<32, CD8VT1>; + +defm VSCATTERPF0QPD: avx512_gather_scatter_prefetch<0xC7, MRM5m, "vscatterpf0qpd", + VK8WM, vz64mem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>; + +defm VSCATTERPF1DPS: avx512_gather_scatter_prefetch<0xC6, MRM6m, "vscatterpf1dps", + VK16WM, vz32mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>; + +defm VSCATTERPF1QPS: avx512_gather_scatter_prefetch<0xC7, MRM6m, "vscatterpf1qps", + VK8WM, vz64mem>, EVEX_V512, EVEX_CD8<64, CD8VT1>; + +defm VSCATTERPF1DPD: avx512_gather_scatter_prefetch<0xC6, MRM6m, "vscatterpf1dpd", + VK8WM, vy32mem>, EVEX_V512, VEX_W, EVEX_CD8<32, CD8VT1>; + +defm VSCATTERPF1QPD: avx512_gather_scatter_prefetch<0xC7, MRM6m, "vscatterpf1qpd", + VK8WM, vz64mem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>; + +// Helper fragments to match sext vXi1 to vXiY. 
+def v16i1sextv16i32 : PatLeaf<(v16i32 (X86vsrai VR512:$src, (i8 31)))>; +def v8i1sextv8i64 : PatLeaf<(v8i64 (X86vsrai VR512:$src, (i8 63)))>; + +def : Pat<(store (i1 -1), addr:$dst), (MOV8mi addr:$dst, (i8 1))>; +def : Pat<(store (i1 1), addr:$dst), (MOV8mi addr:$dst, (i8 1))>; +def : Pat<(store (i1 0), addr:$dst), (MOV8mi addr:$dst, (i8 0))>; + +def : Pat<(store VK1:$src, addr:$dst), + (MOV8mr addr:$dst, + (EXTRACT_SUBREG (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), + sub_8bit))>, Requires<[HasAVX512, NoDQI]>; + +def : Pat<(store VK8:$src, addr:$dst), + (MOV8mr addr:$dst, + (EXTRACT_SUBREG (KMOVWrk (COPY_TO_REGCLASS VK8:$src, VK16)), + sub_8bit))>, Requires<[HasAVX512, NoDQI]>; + +def truncstorei1 : PatFrag<(ops node:$val, node:$ptr), + (truncstore node:$val, node:$ptr), [{ + return cast<StoreSDNode>(N)->getMemoryVT() == MVT::i1; +}]>; + +def : Pat<(truncstorei1 GR8:$src, addr:$dst), + (MOV8mr addr:$dst, GR8:$src)>; + +multiclass cvt_by_vec_width<bits<8> opc, X86VectorVTInfo Vec, string OpcodeStr > { +def rr : AVX512XS8I<opc, MRMSrcReg, (outs Vec.RC:$dst), (ins Vec.KRC:$src), + !strconcat(OpcodeStr##Vec.Suffix, "\t{$src, $dst|$dst, $src}"), + [(set Vec.RC:$dst, (Vec.VT (X86vsext Vec.KRC:$src)))]>, EVEX; +} + +multiclass cvt_mask_by_elt_width<bits<8> opc, AVX512VLVectorVTInfo VTInfo, + string OpcodeStr, Predicate prd> { +let Predicates = [prd] in + defm Z : cvt_by_vec_width<opc, VTInfo.info512, OpcodeStr>, EVEX_V512; + + let Predicates = [prd, HasVLX] in { + defm Z256 : cvt_by_vec_width<opc, VTInfo.info256, OpcodeStr>, EVEX_V256; + defm Z128 : cvt_by_vec_width<opc, VTInfo.info128, OpcodeStr>, EVEX_V128; + } +} + +multiclass avx512_convert_mask_to_vector<string OpcodeStr> { + defm NAME##B : cvt_mask_by_elt_width<0x28, avx512vl_i8_info, OpcodeStr, + HasBWI>; + defm NAME##W : cvt_mask_by_elt_width<0x28, avx512vl_i16_info, OpcodeStr, + HasBWI>, VEX_W; + defm NAME##D : cvt_mask_by_elt_width<0x38, avx512vl_i32_info, OpcodeStr, + HasDQI>; + defm NAME##Q : cvt_mask_by_elt_width<0x38, avx512vl_i64_info, OpcodeStr, + HasDQI>, VEX_W; +} + +defm VPMOVM2 : avx512_convert_mask_to_vector<"vpmovm2">; + +multiclass convert_vector_to_mask_common<bits<8> opc, X86VectorVTInfo _, string OpcodeStr > { +def rr : AVX512XS8I<opc, MRMSrcReg, (outs _.KRC:$dst), (ins _.RC:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set _.KRC:$dst, (X86cvt2mask (_.VT _.RC:$src)))]>, EVEX; +} + +multiclass avx512_convert_vector_to_mask<bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo VTInfo, Predicate prd> { +let Predicates = [prd] in + defm Z : convert_vector_to_mask_common <opc, VTInfo.info512, OpcodeStr>, + EVEX_V512; + + let Predicates = [prd, HasVLX] in { + defm Z256 : convert_vector_to_mask_common<opc, VTInfo.info256, OpcodeStr>, + EVEX_V256; + defm Z128 : convert_vector_to_mask_common<opc, VTInfo.info128, OpcodeStr>, + EVEX_V128; + } +} + +defm VPMOVB2M : avx512_convert_vector_to_mask<0x29, "vpmovb2m", + avx512vl_i8_info, HasBWI>; +defm VPMOVW2M : avx512_convert_vector_to_mask<0x29, "vpmovw2m", + avx512vl_i16_info, HasBWI>, VEX_W; +defm VPMOVD2M : avx512_convert_vector_to_mask<0x39, "vpmovd2m", + avx512vl_i32_info, HasDQI>; +defm VPMOVQ2M : avx512_convert_vector_to_mask<0x39, "vpmovq2m", + avx512vl_i64_info, HasDQI>, VEX_W; + +//===----------------------------------------------------------------------===// +// AVX-512 - COMPRESS and EXPAND +// + +multiclass compress_by_vec_width<bits<8> opc, X86VectorVTInfo _, + string OpcodeStr> { + defm rr : AVX512_maskable<opc, MRMDestReg, _, (outs _.RC:$dst), + 
(ins _.RC:$src1), OpcodeStr, "$src1", "$src1", + (_.VT (X86compress _.RC:$src1))>, AVX5128IBase; + + let mayStore = 1 in { + def mr : AVX5128I<opc, MRMDestMem, (outs), + (ins _.MemOp:$dst, _.RC:$src), + OpcodeStr # "\t{$src, $dst |$dst, $src}", + []>, EVEX_CD8<_.EltSize, CD8VT1>; + + def mrk : AVX5128I<opc, MRMDestMem, (outs), + (ins _.MemOp:$dst, _.KRCWM:$mask, _.RC:$src), + OpcodeStr # "\t{$src, $dst {${mask}} |$dst {${mask}}, $src}", + [(store (_.VT (vselect _.KRCWM:$mask, + (_.VT (X86compress _.RC:$src)), _.ImmAllZerosV)), + addr:$dst)]>, + EVEX_K, EVEX_CD8<_.EltSize, CD8VT1>; + } +} + +multiclass compress_by_elt_width<bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo VTInfo> { + defm Z : compress_by_vec_width<opc, VTInfo.info512, OpcodeStr>, EVEX_V512; + + let Predicates = [HasVLX] in { + defm Z256 : compress_by_vec_width<opc, VTInfo.info256, OpcodeStr>, EVEX_V256; + defm Z128 : compress_by_vec_width<opc, VTInfo.info128, OpcodeStr>, EVEX_V128; + } +} + +defm VPCOMPRESSD : compress_by_elt_width <0x8B, "vpcompressd", avx512vl_i32_info>, + EVEX; +defm VPCOMPRESSQ : compress_by_elt_width <0x8B, "vpcompressq", avx512vl_i64_info>, + EVEX, VEX_W; +defm VCOMPRESSPS : compress_by_elt_width <0x8A, "vcompressps", avx512vl_f32_info>, + EVEX; +defm VCOMPRESSPD : compress_by_elt_width <0x8A, "vcompresspd", avx512vl_f64_info>, + EVEX, VEX_W; + +// expand +multiclass expand_by_vec_width<bits<8> opc, X86VectorVTInfo _, + string OpcodeStr> { + defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1), OpcodeStr, "$src1", "$src1", + (_.VT (X86expand _.RC:$src1))>, AVX5128IBase; + + let mayLoad = 1 in + defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.MemOp:$src1), OpcodeStr, "$src1", "$src1", + (_.VT (X86expand (_.VT (bitconvert + (_.LdFrag addr:$src1)))))>, + AVX5128IBase, EVEX_CD8<_.EltSize, CD8VT1>; +} + +multiclass expand_by_elt_width<bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo VTInfo> { + defm Z : expand_by_vec_width<opc, VTInfo.info512, OpcodeStr>, EVEX_V512; + + let Predicates = [HasVLX] in { + defm Z256 : expand_by_vec_width<opc, VTInfo.info256, OpcodeStr>, EVEX_V256; + defm Z128 : expand_by_vec_width<opc, VTInfo.info128, OpcodeStr>, EVEX_V128; + } +} + +defm VPEXPANDD : expand_by_elt_width <0x89, "vpexpandd", avx512vl_i32_info>, + EVEX; +defm VPEXPANDQ : expand_by_elt_width <0x89, "vpexpandq", avx512vl_i64_info>, + EVEX, VEX_W; +defm VEXPANDPS : expand_by_elt_width <0x88, "vexpandps", avx512vl_f32_info>, + EVEX; +defm VEXPANDPD : expand_by_elt_width <0x88, "vexpandpd", avx512vl_f64_info>, + EVEX, VEX_W; + +//handle instruction reg_vec1 = op(reg_vec,imm) +// op(mem_vec,imm) +// op(broadcast(eltVt),imm) +//all instruction created with FROUND_CURRENT +multiclass avx512_unary_fp_packed_imm<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _>{ + defm rri : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, i32u8imm:$src2), + OpcodeStr##_.Suffix, "$src2, $src1", "$src2, $src2", + (OpNode (_.VT _.RC:$src1), + (i32 imm:$src2), + (i32 FROUND_CURRENT))>; + let mayLoad = 1 in { + defm rmi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.MemOp:$src1, i32u8imm:$src2), + OpcodeStr##_.Suffix, "$src2, $src1", "$src1, $src2", + (OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))), + (i32 imm:$src2), + (i32 FROUND_CURRENT))>; + defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.ScalarMemOp:$src1, i32u8imm:$src2), + OpcodeStr##_.Suffix, "$src2, ${src1}"##_.BroadcastStr, + 
"${src1}"##_.BroadcastStr##", $src2", + (OpNode (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src1))), + (i32 imm:$src2), + (i32 FROUND_CURRENT))>, EVEX_B; + } +} + +//handle instruction reg_vec1 = op(reg_vec2,reg_vec3,imm),{sae} +multiclass avx512_unary_fp_sae_packed_imm<bits<8> opc, string OpcodeStr, + SDNode OpNode, X86VectorVTInfo _>{ + defm rrib : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, i32u8imm:$src2), + OpcodeStr##_.Suffix, "$src2,{sae}, $src1", + "$src1, {sae}, $src2", + (OpNode (_.VT _.RC:$src1), + (i32 imm:$src2), + (i32 FROUND_NO_EXC))>, EVEX_B; +} + +multiclass avx512_common_unary_fp_sae_packed_imm<string OpcodeStr, + AVX512VLVectorVTInfo _, bits<8> opc, SDNode OpNode, Predicate prd>{ + let Predicates = [prd] in { + defm Z : avx512_unary_fp_packed_imm<opc, OpcodeStr, OpNode, _.info512>, + avx512_unary_fp_sae_packed_imm<opc, OpcodeStr, OpNode, _.info512>, + EVEX_V512; + } + let Predicates = [prd, HasVLX] in { + defm Z128 : avx512_unary_fp_packed_imm<opc, OpcodeStr, OpNode, _.info128>, + EVEX_V128; + defm Z256 : avx512_unary_fp_packed_imm<opc, OpcodeStr, OpNode, _.info256>, + EVEX_V256; + } +} + +//handle instruction reg_vec1 = op(reg_vec2,reg_vec3,imm) +// op(reg_vec2,mem_vec,imm) +// op(reg_vec2,broadcast(eltVt),imm) +//all instruction created with FROUND_CURRENT +multiclass avx512_fp_packed_imm<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _>{ + defm rri : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3), + OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", + (OpNode (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + (i32 imm:$src3), + (i32 FROUND_CURRENT))>; + let mayLoad = 1 in { + defm rmi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.MemOp:$src2, i32u8imm:$src3), + OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", + (OpNode (_.VT _.RC:$src1), + (_.VT (bitconvert (_.LdFrag addr:$src2))), + (i32 imm:$src3), + (i32 FROUND_CURRENT))>; + defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.ScalarMemOp:$src2, i32u8imm:$src3), + OpcodeStr, "$src3, ${src2}"##_.BroadcastStr##", $src1", + "$src1, ${src2}"##_.BroadcastStr##", $src3", + (OpNode (_.VT _.RC:$src1), + (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))), + (i32 imm:$src3), + (i32 FROUND_CURRENT))>, EVEX_B; + } +} + +//handle instruction reg_vec1 = op(reg_vec2,reg_vec3,imm) +// op(reg_vec2,mem_vec,imm) +multiclass avx512_3Op_rm_imm8<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo DestInfo, X86VectorVTInfo SrcInfo>{ + + defm rri : AVX512_maskable<opc, MRMSrcReg, DestInfo, (outs DestInfo.RC:$dst), + (ins SrcInfo.RC:$src1, SrcInfo.RC:$src2, u8imm:$src3), + OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", + (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1), + (SrcInfo.VT SrcInfo.RC:$src2), + (i8 imm:$src3)))>; + let mayLoad = 1 in + defm rmi : AVX512_maskable<opc, MRMSrcMem, DestInfo, (outs DestInfo.RC:$dst), + (ins SrcInfo.RC:$src1, SrcInfo.MemOp:$src2, u8imm:$src3), + OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", + (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1), + (SrcInfo.VT (bitconvert + (SrcInfo.LdFrag addr:$src2))), + (i8 imm:$src3)))>; +} + +//handle instruction reg_vec1 = op(reg_vec2,reg_vec3,imm) +// op(reg_vec2,mem_vec,imm) +// op(reg_vec2,broadcast(eltVt),imm) +multiclass avx512_3Op_imm8<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _>: + avx512_3Op_rm_imm8<opc, OpcodeStr, OpNode, _, _>{ + + let 
mayLoad = 1 in + defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$src3), + OpcodeStr, "$src3, ${src2}"##_.BroadcastStr##", $src1", + "$src1, ${src2}"##_.BroadcastStr##", $src3", + (OpNode (_.VT _.RC:$src1), + (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))), + (i8 imm:$src3))>, EVEX_B; +} + +//handle scalar instruction reg_vec1 = op(reg_vec2,reg_vec3,imm) +// op(reg_vec2,mem_scalar,imm) +//all instruction created with FROUND_CURRENT +multiclass avx512_fp_scalar_imm<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _> { + + defm rri : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3), + OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", + (OpNode (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + (i32 imm:$src3), + (i32 FROUND_CURRENT))>; + let mayLoad = 1 in { + defm rmi : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.MemOp:$src2, i32u8imm:$src3), + OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", + (OpNode (_.VT _.RC:$src1), + (_.VT (scalar_to_vector + (_.ScalarLdFrag addr:$src2))), + (i32 imm:$src3), + (i32 FROUND_CURRENT))>; + + let isAsmParserOnly = 1 in { + defm rmi_alt :AVX512_maskable_in_asm<opc, MRMSrcMem, _, (outs _.FRC:$dst), + (ins _.FRC:$src1, _.ScalarMemOp:$src2, u8imm:$src3), + OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", + []>; + } + } +} + +//handle instruction reg_vec1 = op(reg_vec2,reg_vec3,imm),{sae} +multiclass avx512_fp_sae_packed_imm<bits<8> opc, string OpcodeStr, + SDNode OpNode, X86VectorVTInfo _>{ + defm rrib : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3), + OpcodeStr, "$src3,{sae}, $src2, $src1", + "$src1, $src2,{sae}, $src3", + (OpNode (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + (i32 imm:$src3), + (i32 FROUND_NO_EXC))>, EVEX_B; +} +//handle scalar instruction reg_vec1 = op(reg_vec2,reg_vec3,imm),{sae} +multiclass avx512_fp_sae_scalar_imm<bits<8> opc, string OpcodeStr, + SDNode OpNode, X86VectorVTInfo _> { + defm NAME#rrib : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3), + OpcodeStr, "$src3,{sae}, $src2, $src1", + "$src1, $src2,{sae}, $src3", + (OpNode (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + (i32 imm:$src3), + (i32 FROUND_NO_EXC))>, EVEX_B; +} + +multiclass avx512_common_fp_sae_packed_imm<string OpcodeStr, + AVX512VLVectorVTInfo _, bits<8> opc, SDNode OpNode, Predicate prd>{ + let Predicates = [prd] in { + defm Z : avx512_fp_packed_imm<opc, OpcodeStr, OpNode, _.info512>, + avx512_fp_sae_packed_imm<opc, OpcodeStr, OpNode, _.info512>, + EVEX_V512; + + } + let Predicates = [prd, HasVLX] in { + defm Z128 : avx512_fp_packed_imm<opc, OpcodeStr, OpNode, _.info128>, + EVEX_V128; + defm Z256 : avx512_fp_packed_imm<opc, OpcodeStr, OpNode, _.info256>, + EVEX_V256; + } +} + +multiclass avx512_common_3Op_rm_imm8<bits<8> opc, SDNode OpNode, string OpStr, + AVX512VLVectorVTInfo DestInfo, AVX512VLVectorVTInfo SrcInfo>{ + let Predicates = [HasBWI] in { + defm Z : avx512_3Op_rm_imm8<opc, OpStr, OpNode, DestInfo.info512, + SrcInfo.info512>, EVEX_V512, AVX512AIi8Base, EVEX_4V; + } + let Predicates = [HasBWI, HasVLX] in { + defm Z128 : avx512_3Op_rm_imm8<opc, OpStr, OpNode, DestInfo.info128, + SrcInfo.info128>, EVEX_V128, AVX512AIi8Base, EVEX_4V; + defm Z256 : avx512_3Op_rm_imm8<opc, OpStr, OpNode, DestInfo.info256, + SrcInfo.info256>, EVEX_V256, AVX512AIi8Base, EVEX_4V; + } +} + 
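The rri/rmi/rmbi variants generated by the multiclasses above are the three EVEX source forms of an immediate instruction: register, full-width memory, and embedded broadcast (EVEX_B), the last spelled with a {1to8}/{1to16}-style suffix in assembly and loading a single scalar that is replicated into every lane before the packed operation. A minimal scalar sketch of that broadcast semantics; packed_op_broadcast, Op and NLanes are illustrative placeholders, not names defined in this file:

    #include <array>
    #include <cstddef>

    // Models "op dst, src1, mem{1toN}, imm": one scalar load, reused per lane.
    template <typename T, std::size_t NLanes, typename Op>
    std::array<T, NLanes> packed_op_broadcast(const std::array<T, NLanes> &src1,
                                              const T *mem, int imm, Op op) {
      std::array<T, NLanes> dst{};
      const T splat = *mem;                   // single scalar load ...
      for (std::size_t i = 0; i < NLanes; ++i)
        dst[i] = op(src1[i], splat, imm);     // ... used for every lane
      return dst;
    }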
+multiclass avx512_common_3Op_imm8<string OpcodeStr, AVX512VLVectorVTInfo _, + bits<8> opc, SDNode OpNode>{ + let Predicates = [HasAVX512] in { + defm Z : avx512_3Op_imm8<opc, OpcodeStr, OpNode, _.info512>, EVEX_V512; + } + let Predicates = [HasAVX512, HasVLX] in { + defm Z128 : avx512_3Op_imm8<opc, OpcodeStr, OpNode, _.info128>, EVEX_V128; + defm Z256 : avx512_3Op_imm8<opc, OpcodeStr, OpNode, _.info256>, EVEX_V256; + } +} + +multiclass avx512_common_fp_sae_scalar_imm<string OpcodeStr, + X86VectorVTInfo _, bits<8> opc, SDNode OpNode, Predicate prd>{ + let Predicates = [prd] in { + defm Z128 : avx512_fp_scalar_imm<opc, OpcodeStr, OpNode, _>, + avx512_fp_sae_scalar_imm<opc, OpcodeStr, OpNode, _>; + } +} + +multiclass avx512_common_unary_fp_sae_packed_imm_all<string OpcodeStr, + bits<8> opcPs, bits<8> opcPd, SDNode OpNode, Predicate prd>{ + defm PS : avx512_common_unary_fp_sae_packed_imm<OpcodeStr, avx512vl_f32_info, + opcPs, OpNode, prd>, EVEX_CD8<32, CD8VF>; + defm PD : avx512_common_unary_fp_sae_packed_imm<OpcodeStr, avx512vl_f64_info, + opcPd, OpNode, prd>, EVEX_CD8<64, CD8VF>, VEX_W; +} + +defm VFIXUPIMMPD : avx512_common_fp_sae_packed_imm<"vfixupimmpd", + avx512vl_f64_info, 0x54, X86VFixupimm, HasAVX512>, + AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W; +defm VFIXUPIMMPS : avx512_common_fp_sae_packed_imm<"vfixupimmps", + avx512vl_f32_info, 0x54, X86VFixupimm, HasAVX512>, + AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>; + +defm VFIXUPIMMSD: avx512_common_fp_sae_scalar_imm<"vfixupimmsd", f64x_info, + 0x55, X86VFixupimm, HasAVX512>, + AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W; +defm VFIXUPIMMSS: avx512_common_fp_sae_scalar_imm<"vfixupimmss", f32x_info, + 0x55, X86VFixupimm, HasAVX512>, + AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>; + +defm VREDUCE : avx512_common_unary_fp_sae_packed_imm_all<"vreduce", 0x56, 0x56, + X86VReduce, HasDQI>, AVX512AIi8Base, EVEX; +defm VRNDSCALE : avx512_common_unary_fp_sae_packed_imm_all<"vrndscale", 0x08, 0x09, + X86VRndScale, HasAVX512>, AVX512AIi8Base, EVEX; +defm VGETMANT : avx512_common_unary_fp_sae_packed_imm_all<"vgetmant", 0x26, 0x26, + X86VGetMant, HasAVX512>, AVX512AIi8Base, EVEX; + + +defm VRANGEPD : avx512_common_fp_sae_packed_imm<"vrangepd", avx512vl_f64_info, + 0x50, X86VRange, HasDQI>, + AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W; +defm VRANGEPS : avx512_common_fp_sae_packed_imm<"vrangeps", avx512vl_f32_info, + 0x50, X86VRange, HasDQI>, + AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>; + +defm VRANGESD: avx512_common_fp_sae_scalar_imm<"vrangesd", f64x_info, + 0x51, X86VRange, HasDQI>, + AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W; +defm VRANGESS: avx512_common_fp_sae_scalar_imm<"vrangess", f32x_info, + 0x51, X86VRange, HasDQI>, + AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>; + +defm VREDUCESD: avx512_common_fp_sae_scalar_imm<"vreducesd", f64x_info, + 0x57, X86Reduces, HasDQI>, + AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W; +defm VREDUCESS: avx512_common_fp_sae_scalar_imm<"vreducess", f32x_info, + 0x57, X86Reduces, HasDQI>, + AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>; + +defm VGETMANTSD: avx512_common_fp_sae_scalar_imm<"vgetmantsd", f64x_info, + 0x27, X86GetMants, HasAVX512>, + AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W; +defm VGETMANTSS: avx512_common_fp_sae_scalar_imm<"vgetmantss", f32x_info, + 0x27, X86GetMants, HasAVX512>, + AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>; + +multiclass avx512_shuff_packed_128<string 
OpcodeStr, AVX512VLVectorVTInfo _, + bits<8> opc, SDNode OpNode = X86Shuf128>{ + let Predicates = [HasAVX512] in { + defm Z : avx512_3Op_imm8<opc, OpcodeStr, OpNode, _.info512>, EVEX_V512; + + } + let Predicates = [HasAVX512, HasVLX] in { + defm Z256 : avx512_3Op_imm8<opc, OpcodeStr, OpNode, _.info256>, EVEX_V256; + } +} +let Predicates = [HasAVX512] in { +def : Pat<(v16f32 (ffloor VR512:$src)), + (VRNDSCALEPSZrri VR512:$src, (i32 0x1))>; +def : Pat<(v16f32 (fnearbyint VR512:$src)), + (VRNDSCALEPSZrri VR512:$src, (i32 0xC))>; +def : Pat<(v16f32 (fceil VR512:$src)), + (VRNDSCALEPSZrri VR512:$src, (i32 0x2))>; +def : Pat<(v16f32 (frint VR512:$src)), + (VRNDSCALEPSZrri VR512:$src, (i32 0x4))>; +def : Pat<(v16f32 (ftrunc VR512:$src)), + (VRNDSCALEPSZrri VR512:$src, (i32 0x3))>; + +def : Pat<(v8f64 (ffloor VR512:$src)), + (VRNDSCALEPDZrri VR512:$src, (i32 0x1))>; +def : Pat<(v8f64 (fnearbyint VR512:$src)), + (VRNDSCALEPDZrri VR512:$src, (i32 0xC))>; +def : Pat<(v8f64 (fceil VR512:$src)), + (VRNDSCALEPDZrri VR512:$src, (i32 0x2))>; +def : Pat<(v8f64 (frint VR512:$src)), + (VRNDSCALEPDZrri VR512:$src, (i32 0x4))>; +def : Pat<(v8f64 (ftrunc VR512:$src)), + (VRNDSCALEPDZrri VR512:$src, (i32 0x3))>; +} + +defm VSHUFF32X4 : avx512_shuff_packed_128<"vshuff32x4",avx512vl_f32_info, 0x23>, + AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>; +defm VSHUFF64X2 : avx512_shuff_packed_128<"vshuff64x2",avx512vl_f64_info, 0x23>, + AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W; +defm VSHUFI32X4 : avx512_shuff_packed_128<"vshufi32x4",avx512vl_i32_info, 0x43>, + AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>; +defm VSHUFI64X2 : avx512_shuff_packed_128<"vshufi64x2",avx512vl_i64_info, 0x43>, + AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W; + +multiclass avx512_valign<string OpcodeStr, AVX512VLVectorVTInfo VTInfo_I> { + defm NAME: avx512_common_3Op_imm8<OpcodeStr, VTInfo_I, 0x03, X86VAlign>, + AVX512AIi8Base, EVEX_4V; +} + +defm VALIGND: avx512_valign<"valignd", avx512vl_i32_info>, + EVEX_CD8<32, CD8VF>; +defm VALIGNQ: avx512_valign<"valignq", avx512vl_i64_info>, + EVEX_CD8<64, CD8VF>, VEX_W; + +multiclass avx512_vpalign_lowering<X86VectorVTInfo _ , list<Predicate> p>{ + let Predicates = p in + def NAME#_.VTName#rri: + Pat<(_.VT (X86PAlignr _.RC:$src1, _.RC:$src2, (i8 imm:$imm))), + (!cast<Instruction>(NAME#_.ZSuffix#rri) + _.RC:$src1, _.RC:$src2, imm:$imm)>; +} + +multiclass avx512_vpalign_lowering_common<AVX512VLVectorVTInfo _>: + avx512_vpalign_lowering<_.info512, [HasBWI]>, + avx512_vpalign_lowering<_.info128, [HasBWI, HasVLX]>, + avx512_vpalign_lowering<_.info256, [HasBWI, HasVLX]>; + +defm VPALIGN: avx512_common_3Op_rm_imm8<0x0F, X86PAlignr, "vpalignr" , + avx512vl_i8_info, avx512vl_i8_info>, + avx512_vpalign_lowering_common<avx512vl_i16_info>, + avx512_vpalign_lowering_common<avx512vl_i32_info>, + avx512_vpalign_lowering_common<avx512vl_f32_info>, + avx512_vpalign_lowering_common<avx512vl_i64_info>, + avx512_vpalign_lowering_common<avx512vl_f64_info>, + EVEX_CD8<8, CD8VF>; + +defm VDBPSADBW: avx512_common_3Op_rm_imm8<0x42, X86dbpsadbw, "vdbpsadbw" , + avx512vl_i16_info, avx512vl_i8_info>, EVEX_CD8<8, CD8VF>; + +multiclass avx512_unary_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _> { + defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1), OpcodeStr, + "$src1", "$src1", + (_.VT (OpNode _.RC:$src1))>, EVEX, AVX5128IBase; + + let mayLoad = 1 in + defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.MemOp:$src1), OpcodeStr, + "$src1", 
"$src1", + (_.VT (OpNode (bitconvert (_.LdFrag addr:$src1))))>, + EVEX, AVX5128IBase, EVEX_CD8<_.EltSize, CD8VF>; +} + +multiclass avx512_unary_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _> : + avx512_unary_rm<opc, OpcodeStr, OpNode, _> { + let mayLoad = 1 in + defm rmb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.ScalarMemOp:$src1), OpcodeStr, + "${src1}"##_.BroadcastStr, + "${src1}"##_.BroadcastStr, + (_.VT (OpNode (X86VBroadcast + (_.ScalarLdFrag addr:$src1))))>, + EVEX, AVX5128IBase, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>; +} + +multiclass avx512_unary_rm_vl<bits<8> opc, string OpcodeStr, SDNode OpNode, + AVX512VLVectorVTInfo VTInfo, Predicate prd> { + let Predicates = [prd] in + defm Z : avx512_unary_rm<opc, OpcodeStr, OpNode, VTInfo.info512>, EVEX_V512; + + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_unary_rm<opc, OpcodeStr, OpNode, VTInfo.info256>, + EVEX_V256; + defm Z128 : avx512_unary_rm<opc, OpcodeStr, OpNode, VTInfo.info128>, + EVEX_V128; + } +} + +multiclass avx512_unary_rmb_vl<bits<8> opc, string OpcodeStr, SDNode OpNode, + AVX512VLVectorVTInfo VTInfo, Predicate prd> { + let Predicates = [prd] in + defm Z : avx512_unary_rmb<opc, OpcodeStr, OpNode, VTInfo.info512>, + EVEX_V512; + + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_unary_rmb<opc, OpcodeStr, OpNode, VTInfo.info256>, + EVEX_V256; + defm Z128 : avx512_unary_rmb<opc, OpcodeStr, OpNode, VTInfo.info128>, + EVEX_V128; + } +} + +multiclass avx512_unary_rm_vl_dq<bits<8> opc_d, bits<8> opc_q, string OpcodeStr, + SDNode OpNode, Predicate prd> { + defm Q : avx512_unary_rmb_vl<opc_q, OpcodeStr#"q", OpNode, avx512vl_i64_info, + prd>, VEX_W; + defm D : avx512_unary_rmb_vl<opc_d, OpcodeStr#"d", OpNode, avx512vl_i32_info, + prd>; +} + +multiclass avx512_unary_rm_vl_bw<bits<8> opc_b, bits<8> opc_w, string OpcodeStr, + SDNode OpNode, Predicate prd> { + defm W : avx512_unary_rm_vl<opc_w, OpcodeStr#"w", OpNode, avx512vl_i16_info, prd>; + defm B : avx512_unary_rm_vl<opc_b, OpcodeStr#"b", OpNode, avx512vl_i8_info, prd>; +} + +multiclass avx512_unary_rm_vl_all<bits<8> opc_b, bits<8> opc_w, + bits<8> opc_d, bits<8> opc_q, + string OpcodeStr, SDNode OpNode> { + defm NAME : avx512_unary_rm_vl_dq<opc_d, opc_q, OpcodeStr, OpNode, + HasAVX512>, + avx512_unary_rm_vl_bw<opc_b, opc_w, OpcodeStr, OpNode, + HasBWI>; +} + +defm VPABS : avx512_unary_rm_vl_all<0x1C, 0x1D, 0x1E, 0x1F, "vpabs", X86Abs>; + +def : Pat<(xor + (bc_v16i32 (v16i1sextv16i32)), + (bc_v16i32 (add (v16i32 VR512:$src), (v16i1sextv16i32)))), + (VPABSDZrr VR512:$src)>; +def : Pat<(xor + (bc_v8i64 (v8i1sextv8i64)), + (bc_v8i64 (add (v8i64 VR512:$src), (v8i1sextv8i64)))), + (VPABSQZrr VR512:$src)>; + +multiclass avx512_ctlz<bits<8> opc, string OpcodeStr, Predicate prd>{ + + defm NAME : avx512_unary_rm_vl_dq<opc, opc, OpcodeStr, ctlz, prd>; +} + +defm VPLZCNT : avx512_ctlz<0x44, "vplzcnt", HasCDI>; +defm VPCONFLICT : avx512_unary_rm_vl_dq<0xC4, 0xC4, "vpconflict", X86Conflict, HasCDI>; + +//===---------------------------------------------------------------------===// +// Replicate Single FP - MOVSHDUP and MOVSLDUP +//===---------------------------------------------------------------------===// +multiclass avx512_replicate<bits<8> opc, string OpcodeStr, SDNode OpNode>{ + defm NAME: avx512_unary_rm_vl<opc, OpcodeStr, OpNode, avx512vl_f32_info, + HasAVX512>, XS; +} + +defm VMOVSHDUP : avx512_replicate<0x16, "vmovshdup", X86Movshdup>; +defm VMOVSLDUP : avx512_replicate<0x12, "vmovsldup", X86Movsldup>; + 
+//===----------------------------------------------------------------------===// +// AVX-512 - MOVDDUP +//===----------------------------------------------------------------------===// + +multiclass avx512_movddup_128<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _> { + defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src), OpcodeStr, "$src", "$src", + (_.VT (OpNode (_.VT _.RC:$src)))>, EVEX; + let mayLoad = 1 in + defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.ScalarMemOp:$src), OpcodeStr, "$src", "$src", + (_.VT (OpNode (_.VT (scalar_to_vector + (_.ScalarLdFrag addr:$src)))))>, + EVEX, EVEX_CD8<_.EltSize, CD8VH>; +} + +multiclass avx512_movddup_common<bits<8> opc, string OpcodeStr, SDNode OpNode, + AVX512VLVectorVTInfo VTInfo> { + + defm Z : avx512_unary_rm<opc, OpcodeStr, OpNode, VTInfo.info512>, EVEX_V512; + + let Predicates = [HasAVX512, HasVLX] in { + defm Z256 : avx512_unary_rm<opc, OpcodeStr, OpNode, VTInfo.info256>, + EVEX_V256; + defm Z128 : avx512_movddup_128<opc, OpcodeStr, OpNode, VTInfo.info128>, + EVEX_V128; + } +} + +multiclass avx512_movddup<bits<8> opc, string OpcodeStr, SDNode OpNode>{ + defm NAME: avx512_movddup_common<opc, OpcodeStr, OpNode, + avx512vl_f64_info>, XD, VEX_W; +} + +defm VMOVDDUP : avx512_movddup<0x12, "vmovddup", X86Movddup>; + +def : Pat<(X86Movddup (loadv2f64 addr:$src)), + (VMOVDDUPZ128rm addr:$src)>, Requires<[HasAVX512, HasVLX]>; +def : Pat<(v2f64 (X86VBroadcast (loadf64 addr:$src))), + (VMOVDDUPZ128rm addr:$src)>, Requires<[HasAVX512, HasVLX]>; + +//===----------------------------------------------------------------------===// +// AVX-512 - Unpack Instructions +//===----------------------------------------------------------------------===// +defm VUNPCKH : avx512_fp_binop_p<0x15, "vunpckh", X86Unpckh>; +defm VUNPCKL : avx512_fp_binop_p<0x14, "vunpckl", X86Unpckl>; + +defm VPUNPCKLBW : avx512_binop_rm_vl_b<0x60, "vpunpcklbw", X86Unpckl, + SSE_INTALU_ITINS_P, HasBWI>; +defm VPUNPCKHBW : avx512_binop_rm_vl_b<0x68, "vpunpckhbw", X86Unpckh, + SSE_INTALU_ITINS_P, HasBWI>; +defm VPUNPCKLWD : avx512_binop_rm_vl_w<0x61, "vpunpcklwd", X86Unpckl, + SSE_INTALU_ITINS_P, HasBWI>; +defm VPUNPCKHWD : avx512_binop_rm_vl_w<0x69, "vpunpckhwd", X86Unpckh, + SSE_INTALU_ITINS_P, HasBWI>; + +defm VPUNPCKLDQ : avx512_binop_rm_vl_d<0x62, "vpunpckldq", X86Unpckl, + SSE_INTALU_ITINS_P, HasAVX512>; +defm VPUNPCKHDQ : avx512_binop_rm_vl_d<0x6A, "vpunpckhdq", X86Unpckh, + SSE_INTALU_ITINS_P, HasAVX512>; +defm VPUNPCKLQDQ : avx512_binop_rm_vl_q<0x6C, "vpunpcklqdq", X86Unpckl, + SSE_INTALU_ITINS_P, HasAVX512>; +defm VPUNPCKHQDQ : avx512_binop_rm_vl_q<0x6D, "vpunpckhqdq", X86Unpckh, + SSE_INTALU_ITINS_P, HasAVX512>; + +//===----------------------------------------------------------------------===// +// AVX-512 - Extract & Insert Integer Instructions +//===----------------------------------------------------------------------===// + +multiclass avx512_extract_elt_bw_m<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _> { + let mayStore = 1 in + def mr : AVX512Ii8<opc, MRMDestMem, (outs), + (ins _.ScalarMemOp:$dst, _.RC:$src1, u8imm:$src2), + OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(store (_.EltVT (trunc (assertzext (OpNode (_.VT _.RC:$src1), + imm:$src2)))), + addr:$dst)]>, + EVEX, EVEX_CD8<_.EltSize, CD8VT1>; +} + +multiclass avx512_extract_elt_b<string OpcodeStr, X86VectorVTInfo _> { + let Predicates = [HasBWI] in { + def rr : AVX512Ii8<0x14, MRMDestReg, (outs GR32orGR64:$dst), 
+ (ins _.RC:$src1, u8imm:$src2), + OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR32orGR64:$dst, + (X86pextrb (_.VT _.RC:$src1), imm:$src2))]>, + EVEX, TAPD; + + defm NAME : avx512_extract_elt_bw_m<0x14, OpcodeStr, X86pextrb, _>, TAPD; + } +} + +multiclass avx512_extract_elt_w<string OpcodeStr, X86VectorVTInfo _> { + let Predicates = [HasBWI] in { + def rr : AVX512Ii8<0xC5, MRMSrcReg, (outs GR32orGR64:$dst), + (ins _.RC:$src1, u8imm:$src2), + OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR32orGR64:$dst, + (X86pextrw (_.VT _.RC:$src1), imm:$src2))]>, + EVEX, PD; + + def rr_REV : AVX512Ii8<0x15, MRMDestReg, (outs GR32orGR64:$dst), + (ins _.RC:$src1, u8imm:$src2), + OpcodeStr#".s\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, + EVEX, TAPD; + + defm NAME : avx512_extract_elt_bw_m<0x15, OpcodeStr, X86pextrw, _>, TAPD; + } +} + +multiclass avx512_extract_elt_dq<string OpcodeStr, X86VectorVTInfo _, + RegisterClass GRC> { + let Predicates = [HasDQI] in { + def rr : AVX512Ii8<0x16, MRMDestReg, (outs GRC:$dst), + (ins _.RC:$src1, u8imm:$src2), + OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GRC:$dst, + (extractelt (_.VT _.RC:$src1), imm:$src2))]>, + EVEX, TAPD; + + let mayStore = 1 in + def mr : AVX512Ii8<0x16, MRMDestMem, (outs), + (ins _.ScalarMemOp:$dst, _.RC:$src1, u8imm:$src2), + OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(store (extractelt (_.VT _.RC:$src1), + imm:$src2),addr:$dst)]>, + EVEX, EVEX_CD8<_.EltSize, CD8VT1>, TAPD; + } +} + +defm VPEXTRBZ : avx512_extract_elt_b<"vpextrb", v16i8x_info>; +defm VPEXTRWZ : avx512_extract_elt_w<"vpextrw", v8i16x_info>; +defm VPEXTRDZ : avx512_extract_elt_dq<"vpextrd", v4i32x_info, GR32>; +defm VPEXTRQZ : avx512_extract_elt_dq<"vpextrq", v2i64x_info, GR64>, VEX_W; + +multiclass avx512_insert_elt_m<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _, PatFrag LdFrag> { + def rm : AVX512Ii8<opc, MRMSrcMem, (outs _.RC:$dst), + (ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$src3), + OpcodeStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + [(set _.RC:$dst, + (_.VT (OpNode _.RC:$src1, (LdFrag addr:$src2), imm:$src3)))]>, + EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>; +} + +multiclass avx512_insert_elt_bw<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _, PatFrag LdFrag> { + let Predicates = [HasBWI] in { + def rr : AVX512Ii8<opc, MRMSrcReg, (outs _.RC:$dst), + (ins _.RC:$src1, GR32orGR64:$src2, u8imm:$src3), + OpcodeStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + [(set _.RC:$dst, + (OpNode _.RC:$src1, GR32orGR64:$src2, imm:$src3))]>, EVEX_4V; + + defm NAME : avx512_insert_elt_m<opc, OpcodeStr, OpNode, _, LdFrag>; + } +} + +multiclass avx512_insert_elt_dq<bits<8> opc, string OpcodeStr, + X86VectorVTInfo _, RegisterClass GRC> { + let Predicates = [HasDQI] in { + def rr : AVX512Ii8<opc, MRMSrcReg, (outs _.RC:$dst), + (ins _.RC:$src1, GRC:$src2, u8imm:$src3), + OpcodeStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + [(set _.RC:$dst, + (_.VT (insertelt _.RC:$src1, GRC:$src2, imm:$src3)))]>, + EVEX_4V, TAPD; + + defm NAME : avx512_insert_elt_m<opc, OpcodeStr, insertelt, _, + _.ScalarLdFrag>, TAPD; + } +} + +defm VPINSRBZ : avx512_insert_elt_bw<0x20, "vpinsrb", X86pinsrb, v16i8x_info, + extloadi8>, TAPD; +defm VPINSRWZ : avx512_insert_elt_bw<0xC4, "vpinsrw", X86pinsrw, v8i16x_info, + extloadi16>, PD; +defm VPINSRDZ : avx512_insert_elt_dq<0x22, "vpinsrd", v4i32x_info, GR32>; +defm VPINSRQZ : avx512_insert_elt_dq<0x22, "vpinsrq", 
v2i64x_info, GR64>, VEX_W; +//===----------------------------------------------------------------------===// +// VSHUFPS - VSHUFPD Operations +//===----------------------------------------------------------------------===// +multiclass avx512_shufp<string OpcodeStr, AVX512VLVectorVTInfo VTInfo_I, + AVX512VLVectorVTInfo VTInfo_FP>{ + defm NAME: avx512_common_3Op_imm8<OpcodeStr, VTInfo_FP, 0xC6, X86Shufp>, + EVEX_CD8<VTInfo_FP.info512.EltSize, CD8VF>, + AVX512AIi8Base, EVEX_4V; +} + +defm VSHUFPS: avx512_shufp<"vshufps", avx512vl_i32_info, avx512vl_f32_info>, PS; +defm VSHUFPD: avx512_shufp<"vshufpd", avx512vl_i64_info, avx512vl_f64_info>, PD, VEX_W; +//===----------------------------------------------------------------------===// +// AVX-512 - Byte shift Left/Right +//===----------------------------------------------------------------------===// + +multiclass avx512_shift_packed<bits<8> opc, SDNode OpNode, Format MRMr, + Format MRMm, string OpcodeStr, X86VectorVTInfo _>{ + def rr : AVX512<opc, MRMr, + (outs _.RC:$dst), (ins _.RC:$src1, u8imm:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set _.RC:$dst,(_.VT (OpNode _.RC:$src1, (i8 imm:$src2))))]>; + let mayLoad = 1 in + def rm : AVX512<opc, MRMm, + (outs _.RC:$dst), (ins _.MemOp:$src1, u8imm:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set _.RC:$dst,(_.VT (OpNode + (_.LdFrag addr:$src1), (i8 imm:$src2))))]>; +} + +multiclass avx512_shift_packed_all<bits<8> opc, SDNode OpNode, Format MRMr, + Format MRMm, string OpcodeStr, Predicate prd>{ + let Predicates = [prd] in + defm Z512 : avx512_shift_packed<opc, OpNode, MRMr, MRMm, + OpcodeStr, v8i64_info>, EVEX_V512; + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_shift_packed<opc, OpNode, MRMr, MRMm, + OpcodeStr, v4i64x_info>, EVEX_V256; + defm Z128 : avx512_shift_packed<opc, OpNode, MRMr, MRMm, + OpcodeStr, v2i64x_info>, EVEX_V128; + } +} +defm VPSLLDQ : avx512_shift_packed_all<0x73, X86vshldq, MRM7r, MRM7m, "vpslldq", + HasBWI>, AVX512PDIi8Base, EVEX_4V; +defm VPSRLDQ : avx512_shift_packed_all<0x73, X86vshrdq, MRM3r, MRM3m, "vpsrldq", + HasBWI>, AVX512PDIi8Base, EVEX_4V; + + +multiclass avx512_psadbw_packed<bits<8> opc, SDNode OpNode, + string OpcodeStr, X86VectorVTInfo _dst, + X86VectorVTInfo _src>{ + def rr : AVX512BI<opc, MRMSrcReg, + (outs _dst.RC:$dst), (ins _src.RC:$src1, _src.RC:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set _dst.RC:$dst,(_dst.VT + (OpNode (_src.VT _src.RC:$src1), + (_src.VT _src.RC:$src2))))]>; + let mayLoad = 1 in + def rm : AVX512BI<opc, MRMSrcMem, + (outs _dst.RC:$dst), (ins _src.RC:$src1, _src.MemOp:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set _dst.RC:$dst,(_dst.VT + (OpNode (_src.VT _src.RC:$src1), + (_src.VT (bitconvert + (_src.LdFrag addr:$src2))))))]>; +} + +multiclass avx512_psadbw_packed_all<bits<8> opc, SDNode OpNode, + string OpcodeStr, Predicate prd> { + let Predicates = [prd] in + defm Z512 : avx512_psadbw_packed<opc, OpNode, OpcodeStr, v8i64_info, + v64i8_info>, EVEX_V512; + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_psadbw_packed<opc, OpNode, OpcodeStr, v4i64x_info, + v32i8x_info>, EVEX_V256; + defm Z128 : avx512_psadbw_packed<opc, OpNode, OpcodeStr, v2i64x_info, + v16i8x_info>, EVEX_V128; + } +} + +defm VPSADBW : avx512_psadbw_packed_all<0xf6, X86psadbw, "vpsadbw", + HasBWI>, EVEX_4V; + +multiclass avx512_ternlog<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _>{ + 
let Constraints = "$src1 = $dst" in { + defm rri : AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src2, _.RC:$src3, u8imm:$src4), + OpcodeStr, "$src4, $src3, $src2", "$src2, $src3, $src3", + (OpNode (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + (_.VT _.RC:$src3), + (i8 imm:$src4))>, AVX512AIi8Base, EVEX_4V; + let mayLoad = 1 in { + defm rmi : AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src2, _.MemOp:$src3, u8imm:$src4), + OpcodeStr, "$src4, $src3, $src2", "$src2, $src3, $src3", + (OpNode (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + (_.VT (bitconvert (_.LdFrag addr:$src3))), + (i8 imm:$src4))>, + AVX512AIi8Base, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>; + defm rmbi : AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src2, _.ScalarMemOp:$src3, u8imm:$src4), + OpcodeStr, "$src4, ${src3}"##_.BroadcastStr##", $src2", + "$src2, ${src3}"##_.BroadcastStr##", $src4", + (OpNode (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src3))), + (i8 imm:$src4))>, EVEX_B, + AVX512AIi8Base, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>; + } + }// Constraints = "$src1 = $dst" +} + +multiclass avx512_common_ternlog<string OpcodeStr, AVX512VLVectorVTInfo _>{ + let Predicates = [HasAVX512] in + defm Z : avx512_ternlog<0x25, OpcodeStr, X86vpternlog, _.info512>, EVEX_V512; + let Predicates = [HasAVX512, HasVLX] in { + defm Z128 : avx512_ternlog<0x25, OpcodeStr, X86vpternlog, _.info128>, EVEX_V128; + defm Z256 : avx512_ternlog<0x25, OpcodeStr, X86vpternlog, _.info256>, EVEX_V256; + } +} + +defm VPTERNLOGD : avx512_common_ternlog<"vpternlogd", avx512vl_i32_info>; +defm VPTERNLOGQ : avx512_common_ternlog<"vpternlogq", avx512vl_i64_info>, VEX_W; + diff --git a/contrib/llvm/lib/Target/X86/X86InstrArithmetic.td b/contrib/llvm/lib/Target/X86/X86InstrArithmetic.td new file mode 100644 index 0000000..1a2e786 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86InstrArithmetic.td @@ -0,0 +1,1375 @@ +//===-- X86InstrArithmetic.td - Integer Arithmetic Instrs --*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the integer arithmetic instructions in the X86 +// architecture. 
+// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// LEA - Load Effective Address +let SchedRW = [WriteLEA] in { +let hasSideEffects = 0 in +def LEA16r : I<0x8D, MRMSrcMem, + (outs GR16:$dst), (ins anymem:$src), + "lea{w}\t{$src|$dst}, {$dst|$src}", [], IIC_LEA_16>, OpSize16; +let isReMaterializable = 1 in +def LEA32r : I<0x8D, MRMSrcMem, + (outs GR32:$dst), (ins anymem:$src), + "lea{l}\t{$src|$dst}, {$dst|$src}", + [(set GR32:$dst, lea32addr:$src)], IIC_LEA>, + OpSize32, Requires<[Not64BitMode]>; + +def LEA64_32r : I<0x8D, MRMSrcMem, + (outs GR32:$dst), (ins lea64_32mem:$src), + "lea{l}\t{$src|$dst}, {$dst|$src}", + [(set GR32:$dst, lea64_32addr:$src)], IIC_LEA>, + OpSize32, Requires<[In64BitMode]>; + +let isReMaterializable = 1 in +def LEA64r : RI<0x8D, MRMSrcMem, (outs GR64:$dst), (ins lea64mem:$src), + "lea{q}\t{$src|$dst}, {$dst|$src}", + [(set GR64:$dst, lea64addr:$src)], IIC_LEA>; +} // SchedRW + +//===----------------------------------------------------------------------===// +// Fixed-Register Multiplication and Division Instructions. +// + +// SchedModel info for instruction that loads one value and gets the second +// (and possibly third) value from a register. +// This is used for instructions that put the memory operands before other +// uses. +class SchedLoadReg<SchedWrite SW> : Sched<[SW, + // Memory operand. + ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault, + // Register reads (implicit or explicit). + ReadAfterLd, ReadAfterLd]>; + +// Extra precision multiplication + +// AL is really implied by AX, but the registers in Defs must match the +// SDNode results (i8, i32). +// AL,AH = AL*GR8 +let Defs = [AL,EFLAGS,AX], Uses = [AL] in +def MUL8r : I<0xF6, MRM4r, (outs), (ins GR8:$src), "mul{b}\t$src", + // FIXME: Used for 8-bit mul, ignore result upper 8 bits. + // This probably ought to be moved to a def : Pat<> if the + // syntax can be accepted. + [(set AL, (mul AL, GR8:$src)), + (implicit EFLAGS)], IIC_MUL8>, Sched<[WriteIMul]>; +// AX,DX = AX*GR16 +let Defs = [AX,DX,EFLAGS], Uses = [AX], hasSideEffects = 0 in +def MUL16r : I<0xF7, MRM4r, (outs), (ins GR16:$src), + "mul{w}\t$src", + [], IIC_MUL16_REG>, OpSize16, Sched<[WriteIMul]>; +// EAX,EDX = EAX*GR32 +let Defs = [EAX,EDX,EFLAGS], Uses = [EAX], hasSideEffects = 0 in +def MUL32r : I<0xF7, MRM4r, (outs), (ins GR32:$src), + "mul{l}\t$src", + [/*(set EAX, EDX, EFLAGS, (X86umul_flag EAX, GR32:$src))*/], + IIC_MUL32_REG>, OpSize32, Sched<[WriteIMul]>; +// RAX,RDX = RAX*GR64 +let Defs = [RAX,RDX,EFLAGS], Uses = [RAX], hasSideEffects = 0 in +def MUL64r : RI<0xF7, MRM4r, (outs), (ins GR64:$src), + "mul{q}\t$src", + [/*(set RAX, RDX, EFLAGS, (X86umul_flag RAX, GR64:$src))*/], + IIC_MUL64>, Sched<[WriteIMul]>; +// AL,AH = AL*[mem8] +let Defs = [AL,EFLAGS,AX], Uses = [AL] in +def MUL8m : I<0xF6, MRM4m, (outs), (ins i8mem :$src), + "mul{b}\t$src", + // FIXME: Used for 8-bit mul, ignore result upper 8 bits. + // This probably ought to be moved to a def : Pat<> if the + // syntax can be accepted. 
+ [(set AL, (mul AL, (loadi8 addr:$src))), + (implicit EFLAGS)], IIC_MUL8>, SchedLoadReg<WriteIMulLd>; +// AX,DX = AX*[mem16] +let mayLoad = 1, hasSideEffects = 0 in { +let Defs = [AX,DX,EFLAGS], Uses = [AX] in +def MUL16m : I<0xF7, MRM4m, (outs), (ins i16mem:$src), + "mul{w}\t$src", + [], IIC_MUL16_MEM>, OpSize16, SchedLoadReg<WriteIMulLd>; +// EAX,EDX = EAX*[mem32] +let Defs = [EAX,EDX,EFLAGS], Uses = [EAX] in +def MUL32m : I<0xF7, MRM4m, (outs), (ins i32mem:$src), + "mul{l}\t$src", + [], IIC_MUL32_MEM>, OpSize32, SchedLoadReg<WriteIMulLd>; +// RAX,RDX = RAX*[mem64] +let Defs = [RAX,RDX,EFLAGS], Uses = [RAX] in +def MUL64m : RI<0xF7, MRM4m, (outs), (ins i64mem:$src), + "mul{q}\t$src", [], IIC_MUL64>, SchedLoadReg<WriteIMulLd>; +} + +let hasSideEffects = 0 in { +// AL,AH = AL*GR8 +let Defs = [AL,EFLAGS,AX], Uses = [AL] in +def IMUL8r : I<0xF6, MRM5r, (outs), (ins GR8:$src), "imul{b}\t$src", [], + IIC_IMUL8>, Sched<[WriteIMul]>; +// AX,DX = AX*GR16 +let Defs = [AX,DX,EFLAGS], Uses = [AX] in +def IMUL16r : I<0xF7, MRM5r, (outs), (ins GR16:$src), "imul{w}\t$src", [], + IIC_IMUL16_RR>, OpSize16, Sched<[WriteIMul]>; +// EAX,EDX = EAX*GR32 +let Defs = [EAX,EDX,EFLAGS], Uses = [EAX] in +def IMUL32r : I<0xF7, MRM5r, (outs), (ins GR32:$src), "imul{l}\t$src", [], + IIC_IMUL32_RR>, OpSize32, Sched<[WriteIMul]>; +// RAX,RDX = RAX*GR64 +let Defs = [RAX,RDX,EFLAGS], Uses = [RAX] in +def IMUL64r : RI<0xF7, MRM5r, (outs), (ins GR64:$src), "imul{q}\t$src", [], + IIC_IMUL64_RR>, Sched<[WriteIMul]>; + +let mayLoad = 1 in { +// AL,AH = AL*[mem8] +let Defs = [AL,EFLAGS,AX], Uses = [AL] in +def IMUL8m : I<0xF6, MRM5m, (outs), (ins i8mem :$src), + "imul{b}\t$src", [], IIC_IMUL8>, SchedLoadReg<WriteIMulLd>; +// AX,DX = AX*[mem16] +let Defs = [AX,DX,EFLAGS], Uses = [AX] in +def IMUL16m : I<0xF7, MRM5m, (outs), (ins i16mem:$src), + "imul{w}\t$src", [], IIC_IMUL16_MEM>, OpSize16, + SchedLoadReg<WriteIMulLd>; +// EAX,EDX = EAX*[mem32] +let Defs = [EAX,EDX,EFLAGS], Uses = [EAX] in +def IMUL32m : I<0xF7, MRM5m, (outs), (ins i32mem:$src), + "imul{l}\t$src", [], IIC_IMUL32_MEM>, OpSize32, + SchedLoadReg<WriteIMulLd>; +// RAX,RDX = RAX*[mem64] +let Defs = [RAX,RDX,EFLAGS], Uses = [RAX] in +def IMUL64m : RI<0xF7, MRM5m, (outs), (ins i64mem:$src), + "imul{q}\t$src", [], IIC_IMUL64>, SchedLoadReg<WriteIMulLd>; +} +} // hasSideEffects + + +let Defs = [EFLAGS] in { +let Constraints = "$src1 = $dst" in { + +let isCommutable = 1, SchedRW = [WriteIMul] in { +// X = IMUL Y, Z --> X = IMUL Z, Y +// Register-Register Signed Integer Multiply +def IMUL16rr : I<0xAF, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src1,GR16:$src2), + "imul{w}\t{$src2, $dst|$dst, $src2}", + [(set GR16:$dst, EFLAGS, + (X86smul_flag GR16:$src1, GR16:$src2))], IIC_IMUL16_RR>, + TB, OpSize16; +def IMUL32rr : I<0xAF, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src1,GR32:$src2), + "imul{l}\t{$src2, $dst|$dst, $src2}", + [(set GR32:$dst, EFLAGS, + (X86smul_flag GR32:$src1, GR32:$src2))], IIC_IMUL32_RR>, + TB, OpSize32; +def IMUL64rr : RI<0xAF, MRMSrcReg, (outs GR64:$dst), + (ins GR64:$src1, GR64:$src2), + "imul{q}\t{$src2, $dst|$dst, $src2}", + [(set GR64:$dst, EFLAGS, + (X86smul_flag GR64:$src1, GR64:$src2))], IIC_IMUL64_RR>, + TB; +} // isCommutable, SchedRW + +// Register-Memory Signed Integer Multiply +let SchedRW = [WriteIMulLd, ReadAfterLd] in { +def IMUL16rm : I<0xAF, MRMSrcMem, (outs GR16:$dst), + (ins GR16:$src1, i16mem:$src2), + "imul{w}\t{$src2, $dst|$dst, $src2}", + [(set GR16:$dst, EFLAGS, + (X86smul_flag GR16:$src1, (load addr:$src2)))], + IIC_IMUL16_RM>, + 
TB, OpSize16; +def IMUL32rm : I<0xAF, MRMSrcMem, (outs GR32:$dst), + (ins GR32:$src1, i32mem:$src2), + "imul{l}\t{$src2, $dst|$dst, $src2}", + [(set GR32:$dst, EFLAGS, + (X86smul_flag GR32:$src1, (load addr:$src2)))], + IIC_IMUL32_RM>, + TB, OpSize32; +def IMUL64rm : RI<0xAF, MRMSrcMem, (outs GR64:$dst), + (ins GR64:$src1, i64mem:$src2), + "imul{q}\t{$src2, $dst|$dst, $src2}", + [(set GR64:$dst, EFLAGS, + (X86smul_flag GR64:$src1, (load addr:$src2)))], + IIC_IMUL64_RM>, + TB; +} // SchedRW +} // Constraints = "$src1 = $dst" + +} // Defs = [EFLAGS] + +// Surprisingly enough, these are not two address instructions! +let Defs = [EFLAGS] in { +let SchedRW = [WriteIMul] in { +// Register-Integer Signed Integer Multiply +def IMUL16rri : Ii16<0x69, MRMSrcReg, // GR16 = GR16*I16 + (outs GR16:$dst), (ins GR16:$src1, i16imm:$src2), + "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR16:$dst, EFLAGS, + (X86smul_flag GR16:$src1, imm:$src2))], + IIC_IMUL16_RRI>, OpSize16; +def IMUL16rri8 : Ii8<0x6B, MRMSrcReg, // GR16 = GR16*I8 + (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2), + "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR16:$dst, EFLAGS, + (X86smul_flag GR16:$src1, i16immSExt8:$src2))], + IIC_IMUL16_RRI>, OpSize16; +def IMUL32rri : Ii32<0x69, MRMSrcReg, // GR32 = GR32*I32 + (outs GR32:$dst), (ins GR32:$src1, i32imm:$src2), + "imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR32:$dst, EFLAGS, + (X86smul_flag GR32:$src1, imm:$src2))], + IIC_IMUL32_RRI>, OpSize32; +def IMUL32rri8 : Ii8<0x6B, MRMSrcReg, // GR32 = GR32*I8 + (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2), + "imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR32:$dst, EFLAGS, + (X86smul_flag GR32:$src1, i32immSExt8:$src2))], + IIC_IMUL32_RRI>, OpSize32; +def IMUL64rri32 : RIi32S<0x69, MRMSrcReg, // GR64 = GR64*I32 + (outs GR64:$dst), (ins GR64:$src1, i64i32imm:$src2), + "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR64:$dst, EFLAGS, + (X86smul_flag GR64:$src1, i64immSExt32:$src2))], + IIC_IMUL64_RRI>; +def IMUL64rri8 : RIi8<0x6B, MRMSrcReg, // GR64 = GR64*I8 + (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2), + "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR64:$dst, EFLAGS, + (X86smul_flag GR64:$src1, i64immSExt8:$src2))], + IIC_IMUL64_RRI>; +} // SchedRW + +// Memory-Integer Signed Integer Multiply +let SchedRW = [WriteIMulLd] in { +def IMUL16rmi : Ii16<0x69, MRMSrcMem, // GR16 = [mem16]*I16 + (outs GR16:$dst), (ins i16mem:$src1, i16imm:$src2), + "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR16:$dst, EFLAGS, + (X86smul_flag (load addr:$src1), imm:$src2))], + IIC_IMUL16_RMI>, + OpSize16; +def IMUL16rmi8 : Ii8<0x6B, MRMSrcMem, // GR16 = [mem16]*I8 + (outs GR16:$dst), (ins i16mem:$src1, i16i8imm :$src2), + "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR16:$dst, EFLAGS, + (X86smul_flag (load addr:$src1), + i16immSExt8:$src2))], IIC_IMUL16_RMI>, + OpSize16; +def IMUL32rmi : Ii32<0x69, MRMSrcMem, // GR32 = [mem32]*I32 + (outs GR32:$dst), (ins i32mem:$src1, i32imm:$src2), + "imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR32:$dst, EFLAGS, + (X86smul_flag (load addr:$src1), imm:$src2))], + IIC_IMUL32_RMI>, OpSize32; +def IMUL32rmi8 : Ii8<0x6B, MRMSrcMem, // GR32 = [mem32]*I8 + (outs GR32:$dst), (ins i32mem:$src1, i32i8imm: $src2), + "imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR32:$dst, EFLAGS, + (X86smul_flag (load addr:$src1), + i32immSExt8:$src2))], + IIC_IMUL32_RMI>, OpSize32; +def IMUL64rmi32 : 
RIi32S<0x69, MRMSrcMem, // GR64 = [mem64]*I32 + (outs GR64:$dst), (ins i64mem:$src1, i64i32imm:$src2), + "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR64:$dst, EFLAGS, + (X86smul_flag (load addr:$src1), + i64immSExt32:$src2))], + IIC_IMUL64_RMI>; +def IMUL64rmi8 : RIi8<0x6B, MRMSrcMem, // GR64 = [mem64]*I8 + (outs GR64:$dst), (ins i64mem:$src1, i64i8imm: $src2), + "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR64:$dst, EFLAGS, + (X86smul_flag (load addr:$src1), + i64immSExt8:$src2))], + IIC_IMUL64_RMI>; +} // SchedRW +} // Defs = [EFLAGS] + + + + +// unsigned division/remainder +let hasSideEffects = 1 in { // so that we don't speculatively execute +let SchedRW = [WriteIDiv] in { +let Defs = [AL,AH,EFLAGS], Uses = [AX] in +def DIV8r : I<0xF6, MRM6r, (outs), (ins GR8:$src), // AX/r8 = AL,AH + "div{b}\t$src", [], IIC_DIV8_REG>; +let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in +def DIV16r : I<0xF7, MRM6r, (outs), (ins GR16:$src), // DX:AX/r16 = AX,DX + "div{w}\t$src", [], IIC_DIV16>, OpSize16; +let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in +def DIV32r : I<0xF7, MRM6r, (outs), (ins GR32:$src), // EDX:EAX/r32 = EAX,EDX + "div{l}\t$src", [], IIC_DIV32>, OpSize32; +// RDX:RAX/r64 = RAX,RDX +let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in +def DIV64r : RI<0xF7, MRM6r, (outs), (ins GR64:$src), + "div{q}\t$src", [], IIC_DIV64>; +} // SchedRW + +let mayLoad = 1 in { +let Defs = [AL,AH,EFLAGS], Uses = [AX] in +def DIV8m : I<0xF6, MRM6m, (outs), (ins i8mem:$src), // AX/[mem8] = AL,AH + "div{b}\t$src", [], IIC_DIV8_MEM>, + SchedLoadReg<WriteIDivLd>; +let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in +def DIV16m : I<0xF7, MRM6m, (outs), (ins i16mem:$src), // DX:AX/[mem16] = AX,DX + "div{w}\t$src", [], IIC_DIV16>, OpSize16, + SchedLoadReg<WriteIDivLd>; +let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in // EDX:EAX/[mem32] = EAX,EDX +def DIV32m : I<0xF7, MRM6m, (outs), (ins i32mem:$src), + "div{l}\t$src", [], IIC_DIV32>, + SchedLoadReg<WriteIDivLd>, OpSize32; +// RDX:RAX/[mem64] = RAX,RDX +let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in +def DIV64m : RI<0xF7, MRM6m, (outs), (ins i64mem:$src), + "div{q}\t$src", [], IIC_DIV64>, + SchedLoadReg<WriteIDivLd>; +} + +// Signed division/remainder. 
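Like the unsigned forms above, these rely on x86's implicit register convention: the dividend lives in AX, DX:AX, EDX:EAX or RDX:RAX, and a single instruction leaves the quotient in AL/AX/EAX/RAX and the remainder in AH/DX/EDX/RDX, which is why both halves appear in Defs. Division can fault (for example on a zero divisor), hence the enclosing hasSideEffects = 1 that keeps these instructions from being speculated.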
+let SchedRW = [WriteIDiv] in { +let Defs = [AL,AH,EFLAGS], Uses = [AX] in +def IDIV8r : I<0xF6, MRM7r, (outs), (ins GR8:$src), // AX/r8 = AL,AH + "idiv{b}\t$src", [], IIC_IDIV8>; +let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in +def IDIV16r: I<0xF7, MRM7r, (outs), (ins GR16:$src), // DX:AX/r16 = AX,DX + "idiv{w}\t$src", [], IIC_IDIV16>, OpSize16; +let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in +def IDIV32r: I<0xF7, MRM7r, (outs), (ins GR32:$src), // EDX:EAX/r32 = EAX,EDX + "idiv{l}\t$src", [], IIC_IDIV32>, OpSize32; +// RDX:RAX/r64 = RAX,RDX +let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in +def IDIV64r: RI<0xF7, MRM7r, (outs), (ins GR64:$src), + "idiv{q}\t$src", [], IIC_IDIV64>; +} // SchedRW + +let mayLoad = 1 in { +let Defs = [AL,AH,EFLAGS], Uses = [AX] in +def IDIV8m : I<0xF6, MRM7m, (outs), (ins i8mem:$src), // AX/[mem8] = AL,AH + "idiv{b}\t$src", [], IIC_IDIV8>, + SchedLoadReg<WriteIDivLd>; +let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in +def IDIV16m: I<0xF7, MRM7m, (outs), (ins i16mem:$src), // DX:AX/[mem16] = AX,DX + "idiv{w}\t$src", [], IIC_IDIV16>, OpSize16, + SchedLoadReg<WriteIDivLd>; +let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in // EDX:EAX/[mem32] = EAX,EDX +def IDIV32m: I<0xF7, MRM7m, (outs), (ins i32mem:$src), + "idiv{l}\t$src", [], IIC_IDIV32>, OpSize32, + SchedLoadReg<WriteIDivLd>; +let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in // RDX:RAX/[mem64] = RAX,RDX +def IDIV64m: RI<0xF7, MRM7m, (outs), (ins i64mem:$src), + "idiv{q}\t$src", [], IIC_IDIV64>, + SchedLoadReg<WriteIDivLd>; +} +} // hasSideEffects = 0 + +//===----------------------------------------------------------------------===// +// Two address Instructions. +// + +// unary instructions +let CodeSize = 2 in { +let Defs = [EFLAGS] in { +let Constraints = "$src1 = $dst", SchedRW = [WriteALU] in { +def NEG8r : I<0xF6, MRM3r, (outs GR8 :$dst), (ins GR8 :$src1), + "neg{b}\t$dst", + [(set GR8:$dst, (ineg GR8:$src1)), + (implicit EFLAGS)], IIC_UNARY_REG>; +def NEG16r : I<0xF7, MRM3r, (outs GR16:$dst), (ins GR16:$src1), + "neg{w}\t$dst", + [(set GR16:$dst, (ineg GR16:$src1)), + (implicit EFLAGS)], IIC_UNARY_REG>, OpSize16; +def NEG32r : I<0xF7, MRM3r, (outs GR32:$dst), (ins GR32:$src1), + "neg{l}\t$dst", + [(set GR32:$dst, (ineg GR32:$src1)), + (implicit EFLAGS)], IIC_UNARY_REG>, OpSize32; +def NEG64r : RI<0xF7, MRM3r, (outs GR64:$dst), (ins GR64:$src1), "neg{q}\t$dst", + [(set GR64:$dst, (ineg GR64:$src1)), + (implicit EFLAGS)], IIC_UNARY_REG>; +} // Constraints = "$src1 = $dst", SchedRW + +// Read-modify-write negate. +let SchedRW = [WriteALULd, WriteRMW] in { +def NEG8m : I<0xF6, MRM3m, (outs), (ins i8mem :$dst), + "neg{b}\t$dst", + [(store (ineg (loadi8 addr:$dst)), addr:$dst), + (implicit EFLAGS)], IIC_UNARY_MEM>; +def NEG16m : I<0xF7, MRM3m, (outs), (ins i16mem:$dst), + "neg{w}\t$dst", + [(store (ineg (loadi16 addr:$dst)), addr:$dst), + (implicit EFLAGS)], IIC_UNARY_MEM>, OpSize16; +def NEG32m : I<0xF7, MRM3m, (outs), (ins i32mem:$dst), + "neg{l}\t$dst", + [(store (ineg (loadi32 addr:$dst)), addr:$dst), + (implicit EFLAGS)], IIC_UNARY_MEM>, OpSize32; +def NEG64m : RI<0xF7, MRM3m, (outs), (ins i64mem:$dst), "neg{q}\t$dst", + [(store (ineg (loadi64 addr:$dst)), addr:$dst), + (implicit EFLAGS)], IIC_UNARY_MEM>; +} // SchedRW +} // Defs = [EFLAGS] + + +// Note: NOT does not set EFLAGS! + +let Constraints = "$src1 = $dst", SchedRW = [WriteALU] in { +// Match xor -1 to not. Favors these over a move imm + xor to save code size. 
+let AddedComplexity = 15 in { +def NOT8r : I<0xF6, MRM2r, (outs GR8 :$dst), (ins GR8 :$src1), + "not{b}\t$dst", + [(set GR8:$dst, (not GR8:$src1))], IIC_UNARY_REG>; +def NOT16r : I<0xF7, MRM2r, (outs GR16:$dst), (ins GR16:$src1), + "not{w}\t$dst", + [(set GR16:$dst, (not GR16:$src1))], IIC_UNARY_REG>, OpSize16; +def NOT32r : I<0xF7, MRM2r, (outs GR32:$dst), (ins GR32:$src1), + "not{l}\t$dst", + [(set GR32:$dst, (not GR32:$src1))], IIC_UNARY_REG>, OpSize32; +def NOT64r : RI<0xF7, MRM2r, (outs GR64:$dst), (ins GR64:$src1), "not{q}\t$dst", + [(set GR64:$dst, (not GR64:$src1))], IIC_UNARY_REG>; +} +} // Constraints = "$src1 = $dst", SchedRW + +let SchedRW = [WriteALULd, WriteRMW] in { +def NOT8m : I<0xF6, MRM2m, (outs), (ins i8mem :$dst), + "not{b}\t$dst", + [(store (not (loadi8 addr:$dst)), addr:$dst)], IIC_UNARY_MEM>; +def NOT16m : I<0xF7, MRM2m, (outs), (ins i16mem:$dst), + "not{w}\t$dst", + [(store (not (loadi16 addr:$dst)), addr:$dst)], IIC_UNARY_MEM>, + OpSize16; +def NOT32m : I<0xF7, MRM2m, (outs), (ins i32mem:$dst), + "not{l}\t$dst", + [(store (not (loadi32 addr:$dst)), addr:$dst)], IIC_UNARY_MEM>, + OpSize32; +def NOT64m : RI<0xF7, MRM2m, (outs), (ins i64mem:$dst), "not{q}\t$dst", + [(store (not (loadi64 addr:$dst)), addr:$dst)], IIC_UNARY_MEM>; +} // SchedRW +} // CodeSize + +// TODO: inc/dec is slow for P4, but fast for Pentium-M. +let Defs = [EFLAGS] in { +let Constraints = "$src1 = $dst", SchedRW = [WriteALU] in { +let CodeSize = 2 in +def INC8r : I<0xFE, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1), + "inc{b}\t$dst", + [(set GR8:$dst, EFLAGS, (X86inc_flag GR8:$src1))], + IIC_UNARY_REG>; +let isConvertibleToThreeAddress = 1, CodeSize = 2 in { // Can xform into LEA. +def INC16r : I<0xFF, MRM0r, (outs GR16:$dst), (ins GR16:$src1), + "inc{w}\t$dst", + [(set GR16:$dst, EFLAGS, (X86inc_flag GR16:$src1))], + IIC_UNARY_REG>, OpSize16; +def INC32r : I<0xFF, MRM0r, (outs GR32:$dst), (ins GR32:$src1), + "inc{l}\t$dst", + [(set GR32:$dst, EFLAGS, (X86inc_flag GR32:$src1))], + IIC_UNARY_REG>, OpSize32; +def INC64r : RI<0xFF, MRM0r, (outs GR64:$dst), (ins GR64:$src1), "inc{q}\t$dst", + [(set GR64:$dst, EFLAGS, (X86inc_flag GR64:$src1))], + IIC_UNARY_REG>; +} // isConvertibleToThreeAddress = 1, CodeSize = 2 + +// Short forms only valid in 32-bit mode. Selected during MCInst lowering. 
+let CodeSize = 1, hasSideEffects = 0 in { +def INC16r_alt : I<0x40, AddRegFrm, (outs GR16:$dst), (ins GR16:$src1), + "inc{w}\t$dst", [], IIC_UNARY_REG>, + OpSize16, Requires<[Not64BitMode]>; +def INC32r_alt : I<0x40, AddRegFrm, (outs GR32:$dst), (ins GR32:$src1), + "inc{l}\t$dst", [], IIC_UNARY_REG>, + OpSize32, Requires<[Not64BitMode]>; +} // CodeSize = 1, hasSideEffects = 0 +} // Constraints = "$src1 = $dst", SchedRW + +let CodeSize = 2, SchedRW = [WriteALULd, WriteRMW] in { + def INC8m : I<0xFE, MRM0m, (outs), (ins i8mem :$dst), "inc{b}\t$dst", + [(store (add (loadi8 addr:$dst), 1), addr:$dst), + (implicit EFLAGS)], IIC_UNARY_MEM>; + def INC16m : I<0xFF, MRM0m, (outs), (ins i16mem:$dst), "inc{w}\t$dst", + [(store (add (loadi16 addr:$dst), 1), addr:$dst), + (implicit EFLAGS)], IIC_UNARY_MEM>, OpSize16; + def INC32m : I<0xFF, MRM0m, (outs), (ins i32mem:$dst), "inc{l}\t$dst", + [(store (add (loadi32 addr:$dst), 1), addr:$dst), + (implicit EFLAGS)], IIC_UNARY_MEM>, OpSize32; + def INC64m : RI<0xFF, MRM0m, (outs), (ins i64mem:$dst), "inc{q}\t$dst", + [(store (add (loadi64 addr:$dst), 1), addr:$dst), + (implicit EFLAGS)], IIC_UNARY_MEM>; +} // CodeSize = 2, SchedRW + +let Constraints = "$src1 = $dst", SchedRW = [WriteALU] in { +let CodeSize = 2 in +def DEC8r : I<0xFE, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1), + "dec{b}\t$dst", + [(set GR8:$dst, EFLAGS, (X86dec_flag GR8:$src1))], + IIC_UNARY_REG>; +let isConvertibleToThreeAddress = 1, CodeSize = 2 in { // Can xform into LEA. +def DEC16r : I<0xFF, MRM1r, (outs GR16:$dst), (ins GR16:$src1), + "dec{w}\t$dst", + [(set GR16:$dst, EFLAGS, (X86dec_flag GR16:$src1))], + IIC_UNARY_REG>, OpSize16; +def DEC32r : I<0xFF, MRM1r, (outs GR32:$dst), (ins GR32:$src1), + "dec{l}\t$dst", + [(set GR32:$dst, EFLAGS, (X86dec_flag GR32:$src1))], + IIC_UNARY_REG>, OpSize32; +def DEC64r : RI<0xFF, MRM1r, (outs GR64:$dst), (ins GR64:$src1), "dec{q}\t$dst", + [(set GR64:$dst, EFLAGS, (X86dec_flag GR64:$src1))], + IIC_UNARY_REG>; +} // isConvertibleToThreeAddress = 1, CodeSize = 2 + +// Short forms only valid in 32-bit mode. Selected during MCInst lowering. +let CodeSize = 1, hasSideEffects = 0 in { +def DEC16r_alt : I<0x48, AddRegFrm, (outs GR16:$dst), (ins GR16:$src1), + "dec{w}\t$dst", [], IIC_UNARY_REG>, + OpSize16, Requires<[Not64BitMode]>; +def DEC32r_alt : I<0x48, AddRegFrm, (outs GR32:$dst), (ins GR32:$src1), + "dec{l}\t$dst", [], IIC_UNARY_REG>, + OpSize32, Requires<[Not64BitMode]>; +} // CodeSize = 1, hasSideEffects = 0 +} // Constraints = "$src1 = $dst", SchedRW + + +let CodeSize = 2, SchedRW = [WriteALULd, WriteRMW] in { + def DEC8m : I<0xFE, MRM1m, (outs), (ins i8mem :$dst), "dec{b}\t$dst", + [(store (add (loadi8 addr:$dst), -1), addr:$dst), + (implicit EFLAGS)], IIC_UNARY_MEM>; + def DEC16m : I<0xFF, MRM1m, (outs), (ins i16mem:$dst), "dec{w}\t$dst", + [(store (add (loadi16 addr:$dst), -1), addr:$dst), + (implicit EFLAGS)], IIC_UNARY_MEM>, OpSize16; + def DEC32m : I<0xFF, MRM1m, (outs), (ins i32mem:$dst), "dec{l}\t$dst", + [(store (add (loadi32 addr:$dst), -1), addr:$dst), + (implicit EFLAGS)], IIC_UNARY_MEM>, OpSize32; + def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst", + [(store (add (loadi64 addr:$dst), -1), addr:$dst), + (implicit EFLAGS)], IIC_UNARY_MEM>; +} // CodeSize = 2, SchedRW +} // Defs = [EFLAGS] + +/// X86TypeInfo - This is a bunch of information that describes relevant X86 +/// information about value types. For example, it can tell you what the +/// register class and preferred load to use. 
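Concretely, the Xi8/Xi16/Xi32/Xi64 instances defined after this class each bundle the register class, load fragment, memory operand, immediate operand and encoding, and prefix bits for one integer width, so a single multiclass (such as the BinOp* hierarchy below) can stamp out the b/w/l/q variants of an instruction from one description.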
+class X86TypeInfo<ValueType vt, string instrsuffix, RegisterClass regclass, + PatFrag loadnode, X86MemOperand memoperand, ImmType immkind, + Operand immoperand, SDPatternOperator immoperator, + Operand imm8operand, SDPatternOperator imm8operator, + bit hasOddOpcode, OperandSize opSize, + bit hasREX_WPrefix> { + /// VT - This is the value type itself. + ValueType VT = vt; + + /// InstrSuffix - This is the suffix used on instructions with this type. For + /// example, i8 -> "b", i16 -> "w", i32 -> "l", i64 -> "q". + string InstrSuffix = instrsuffix; + + /// RegClass - This is the register class associated with this type. For + /// example, i8 -> GR8, i16 -> GR16, i32 -> GR32, i64 -> GR64. + RegisterClass RegClass = regclass; + + /// LoadNode - This is the load node associated with this type. For + /// example, i8 -> loadi8, i16 -> loadi16, i32 -> loadi32, i64 -> loadi64. + PatFrag LoadNode = loadnode; + + /// MemOperand - This is the memory operand associated with this type. For + /// example, i8 -> i8mem, i16 -> i16mem, i32 -> i32mem, i64 -> i64mem. + X86MemOperand MemOperand = memoperand; + + /// ImmEncoding - This is the encoding of an immediate of this type. For + /// example, i8 -> Imm8, i16 -> Imm16, i32 -> Imm32. Note that i64 -> Imm32 + /// since the immediate fields of i64 instructions is a 32-bit sign extended + /// value. + ImmType ImmEncoding = immkind; + + /// ImmOperand - This is the operand kind of an immediate of this type. For + /// example, i8 -> i8imm, i16 -> i16imm, i32 -> i32imm. Note that i64 -> + /// i64i32imm since the immediate fields of i64 instructions is a 32-bit sign + /// extended value. + Operand ImmOperand = immoperand; + + /// ImmOperator - This is the operator that should be used to match an + /// immediate of this kind in a pattern (e.g. imm, or i64immSExt32). + SDPatternOperator ImmOperator = immoperator; + + /// Imm8Operand - This is the operand kind to use for an imm8 of this type. + /// For example, i8 -> <invalid>, i16 -> i16i8imm, i32 -> i32i8imm. This is + /// only used for instructions that have a sign-extended imm8 field form. + Operand Imm8Operand = imm8operand; + + /// Imm8Operator - This is the operator that should be used to match an 8-bit + /// sign extended immediate of this kind in a pattern (e.g. imm16immSExt8). + SDPatternOperator Imm8Operator = imm8operator; + + /// HasOddOpcode - This bit is true if the instruction should have an odd (as + /// opposed to even) opcode. Operations on i8 are usually even, operations on + /// other datatypes are odd. + bit HasOddOpcode = hasOddOpcode; + + /// OpSize - Selects whether the instruction needs a 0x66 prefix based on + /// 16-bit vs 32-bit mode. i8/i64 set this to OpSizeFixed. i16 sets this + /// to Opsize16. i32 sets this to OpSize32. + OperandSize OpSize = opSize; + + /// HasREX_WPrefix - This bit is set to true if the instruction should have + /// the 0x40 REX prefix. This is set for i64 types. 
+ bit HasREX_WPrefix = hasREX_WPrefix; +} + +def invalid_node : SDNode<"<<invalid_node>>", SDTIntLeaf,[],"<<invalid_node>>">; + + +def Xi8 : X86TypeInfo<i8, "b", GR8, loadi8, i8mem, + Imm8, i8imm, imm8_su, i8imm, invalid_node, + 0, OpSizeFixed, 0>; +def Xi16 : X86TypeInfo<i16, "w", GR16, loadi16, i16mem, + Imm16, i16imm, imm16_su, i16i8imm, i16immSExt8_su, + 1, OpSize16, 0>; +def Xi32 : X86TypeInfo<i32, "l", GR32, loadi32, i32mem, + Imm32, i32imm, imm32_su, i32i8imm, i32immSExt8_su, + 1, OpSize32, 0>; +def Xi64 : X86TypeInfo<i64, "q", GR64, loadi64, i64mem, + Imm32S, i64i32imm, i64immSExt32, i64i8imm, i64immSExt8, + 1, OpSizeFixed, 1>; + +/// ITy - This instruction base class takes the type info for the instruction. +/// Using this, it: +/// 1. Concatenates together the instruction mnemonic with the appropriate +/// suffix letter, a tab, and the arguments. +/// 2. Infers whether the instruction should have a 0x66 prefix byte. +/// 3. Infers whether the instruction should have a 0x40 REX_W prefix. +/// 4. Infers whether the low bit of the opcode should be 0 (for i8 operations) +/// or 1 (for i16,i32,i64 operations). +class ITy<bits<8> opcode, Format f, X86TypeInfo typeinfo, dag outs, dag ins, + string mnemonic, string args, list<dag> pattern, + InstrItinClass itin = IIC_BIN_NONMEM> + : I<{opcode{7}, opcode{6}, opcode{5}, opcode{4}, + opcode{3}, opcode{2}, opcode{1}, typeinfo.HasOddOpcode }, + f, outs, ins, + !strconcat(mnemonic, "{", typeinfo.InstrSuffix, "}\t", args), pattern, + itin> { + + // Infer instruction prefixes from type info. + let OpSize = typeinfo.OpSize; + let hasREX_WPrefix = typeinfo.HasREX_WPrefix; +} + +// BinOpRR - Instructions like "add reg, reg, reg". +class BinOpRR<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + dag outlist, list<dag> pattern, InstrItinClass itin, + Format f = MRMDestReg> + : ITy<opcode, f, typeinfo, outlist, + (ins typeinfo.RegClass:$src1, typeinfo.RegClass:$src2), + mnemonic, "{$src2, $src1|$src1, $src2}", pattern, itin>, + Sched<[WriteALU]>; + +// BinOpRR_F - Instructions like "cmp reg, Reg", where the pattern has +// just a EFLAGS as a result. +class BinOpRR_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + SDPatternOperator opnode, Format f = MRMDestReg> + : BinOpRR<opcode, mnemonic, typeinfo, (outs), + [(set EFLAGS, + (opnode typeinfo.RegClass:$src1, typeinfo.RegClass:$src2))], + IIC_BIN_NONMEM, f>; + +// BinOpRR_RF - Instructions like "add reg, reg, reg", where the pattern has +// both a regclass and EFLAGS as a result. +class BinOpRR_RF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + SDNode opnode> + : BinOpRR<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst), + [(set typeinfo.RegClass:$dst, EFLAGS, + (opnode typeinfo.RegClass:$src1, typeinfo.RegClass:$src2))], + IIC_BIN_NONMEM>; + +// BinOpRR_RFF - Instructions like "adc reg, reg, reg", where the pattern has +// both a regclass and EFLAGS as a result, and has EFLAGS as input. +class BinOpRR_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + SDNode opnode> + : BinOpRR<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst), + [(set typeinfo.RegClass:$dst, EFLAGS, + (opnode typeinfo.RegClass:$src1, typeinfo.RegClass:$src2, + EFLAGS))], IIC_BIN_CARRY_NONMEM>; + +// BinOpRR_Rev - Instructions like "add reg, reg, reg" (reversed encoding). 
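A register-register ALU operation has two legal encodings on x86; "addl %ecx, %eax", for instance, can use either the MR-form opcode 0x01 or the RM-form opcode 0x03. The canonical BinOpRR classes above default to the MR form (MRMDestReg), and these *_Rev classes describe only the alternate RM encoding so the disassembler can round-trip it, which is why they set isCodeGenOnly and ForceDisassemble and carry no patterns.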
+class BinOpRR_Rev<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + InstrItinClass itin = IIC_BIN_NONMEM> + : ITy<opcode, MRMSrcReg, typeinfo, + (outs typeinfo.RegClass:$dst), + (ins typeinfo.RegClass:$src1, typeinfo.RegClass:$src2), + mnemonic, "{$src2, $dst|$dst, $src2}", [], itin>, + Sched<[WriteALU]> { + // The disassembler should know about this, but not the asmparser. + let isCodeGenOnly = 1; + let ForceDisassemble = 1; + let hasSideEffects = 0; +} + +// BinOpRR_RDD_Rev - Instructions like "adc reg, reg, reg" (reversed encoding). +class BinOpRR_RFF_Rev<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo> + : BinOpRR_Rev<opcode, mnemonic, typeinfo, IIC_BIN_CARRY_NONMEM>; + +// BinOpRR_F_Rev - Instructions like "cmp reg, reg" (reversed encoding). +class BinOpRR_F_Rev<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo> + : ITy<opcode, MRMSrcReg, typeinfo, (outs), + (ins typeinfo.RegClass:$src1, typeinfo.RegClass:$src2), + mnemonic, "{$src2, $src1|$src1, $src2}", [], IIC_BIN_NONMEM>, + Sched<[WriteALU]> { + // The disassembler should know about this, but not the asmparser. + let isCodeGenOnly = 1; + let ForceDisassemble = 1; + let hasSideEffects = 0; +} + +// BinOpRM - Instructions like "add reg, reg, [mem]". +class BinOpRM<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + dag outlist, list<dag> pattern, + InstrItinClass itin = IIC_BIN_MEM> + : ITy<opcode, MRMSrcMem, typeinfo, outlist, + (ins typeinfo.RegClass:$src1, typeinfo.MemOperand:$src2), + mnemonic, "{$src2, $src1|$src1, $src2}", pattern, itin>, + Sched<[WriteALULd, ReadAfterLd]>; + +// BinOpRM_R - Instructions like "add reg, reg, [mem]". +class BinOpRM_R<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + SDNode opnode> + : BinOpRM<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst), + [(set typeinfo.RegClass:$dst, + (opnode typeinfo.RegClass:$src1, (typeinfo.LoadNode addr:$src2)))]>; + +// BinOpRM_F - Instructions like "cmp reg, [mem]". +class BinOpRM_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + SDPatternOperator opnode> + : BinOpRM<opcode, mnemonic, typeinfo, (outs), + [(set EFLAGS, + (opnode typeinfo.RegClass:$src1, (typeinfo.LoadNode addr:$src2)))]>; + +// BinOpRM_RF - Instructions like "add reg, reg, [mem]". +class BinOpRM_RF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + SDNode opnode> + : BinOpRM<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst), + [(set typeinfo.RegClass:$dst, EFLAGS, + (opnode typeinfo.RegClass:$src1, (typeinfo.LoadNode addr:$src2)))]>; + +// BinOpRM_RFF - Instructions like "adc reg, reg, [mem]". +class BinOpRM_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + SDNode opnode> + : BinOpRM<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst), + [(set typeinfo.RegClass:$dst, EFLAGS, + (opnode typeinfo.RegClass:$src1, (typeinfo.LoadNode addr:$src2), + EFLAGS))], IIC_BIN_CARRY_MEM>; + +// BinOpRI - Instructions like "add reg, reg, imm". +class BinOpRI<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + Format f, dag outlist, list<dag> pattern, + InstrItinClass itin = IIC_BIN_NONMEM> + : ITy<opcode, f, typeinfo, outlist, + (ins typeinfo.RegClass:$src1, typeinfo.ImmOperand:$src2), + mnemonic, "{$src2, $src1|$src1, $src2}", pattern, itin>, + Sched<[WriteALU]> { + let ImmT = typeinfo.ImmEncoding; +} + +// BinOpRI_F - Instructions like "cmp reg, imm". 
+class BinOpRI_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + SDPatternOperator opnode, Format f> + : BinOpRI<opcode, mnemonic, typeinfo, f, (outs), + [(set EFLAGS, + (opnode typeinfo.RegClass:$src1, typeinfo.ImmOperator:$src2))]>; + +// BinOpRI_RF - Instructions like "add reg, reg, imm". +class BinOpRI_RF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + SDNode opnode, Format f> + : BinOpRI<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst), + [(set typeinfo.RegClass:$dst, EFLAGS, + (opnode typeinfo.RegClass:$src1, typeinfo.ImmOperator:$src2))]>; +// BinOpRI_RFF - Instructions like "adc reg, reg, imm". +class BinOpRI_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + SDNode opnode, Format f> + : BinOpRI<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst), + [(set typeinfo.RegClass:$dst, EFLAGS, + (opnode typeinfo.RegClass:$src1, typeinfo.ImmOperator:$src2, + EFLAGS))], IIC_BIN_CARRY_NONMEM>; + +// BinOpRI8 - Instructions like "add reg, reg, imm8". +class BinOpRI8<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + Format f, dag outlist, list<dag> pattern, + InstrItinClass itin = IIC_BIN_NONMEM> + : ITy<opcode, f, typeinfo, outlist, + (ins typeinfo.RegClass:$src1, typeinfo.Imm8Operand:$src2), + mnemonic, "{$src2, $src1|$src1, $src2}", pattern, itin>, + Sched<[WriteALU]> { + let ImmT = Imm8; // Always 8-bit immediate. +} + +// BinOpRI8_F - Instructions like "cmp reg, imm8". +class BinOpRI8_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + SDPatternOperator opnode, Format f> + : BinOpRI8<opcode, mnemonic, typeinfo, f, (outs), + [(set EFLAGS, + (opnode typeinfo.RegClass:$src1, typeinfo.Imm8Operator:$src2))]>; + +// BinOpRI8_RF - Instructions like "add reg, reg, imm8". +class BinOpRI8_RF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + SDPatternOperator opnode, Format f> + : BinOpRI8<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst), + [(set typeinfo.RegClass:$dst, EFLAGS, + (opnode typeinfo.RegClass:$src1, typeinfo.Imm8Operator:$src2))]>; + +// BinOpRI8_RFF - Instructions like "adc reg, reg, imm8". +class BinOpRI8_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + SDPatternOperator opnode, Format f> + : BinOpRI8<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst), + [(set typeinfo.RegClass:$dst, EFLAGS, + (opnode typeinfo.RegClass:$src1, typeinfo.Imm8Operator:$src2, + EFLAGS))], IIC_BIN_CARRY_NONMEM>; + +// BinOpMR - Instructions like "add [mem], reg". +class BinOpMR<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + list<dag> pattern, InstrItinClass itin = IIC_BIN_MEM> + : ITy<opcode, MRMDestMem, typeinfo, + (outs), (ins typeinfo.MemOperand:$dst, typeinfo.RegClass:$src), + mnemonic, "{$src, $dst|$dst, $src}", pattern, itin>, + Sched<[WriteALULd, WriteRMW]>; + +// BinOpMR_RMW - Instructions like "add [mem], reg". +class BinOpMR_RMW<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + SDNode opnode> + : BinOpMR<opcode, mnemonic, typeinfo, + [(store (opnode (load addr:$dst), typeinfo.RegClass:$src), addr:$dst), + (implicit EFLAGS)]>; + +// BinOpMR_RMW_FF - Instructions like "adc [mem], reg". +class BinOpMR_RMW_FF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + SDNode opnode> + : BinOpMR<opcode, mnemonic, typeinfo, + [(store (opnode (load addr:$dst), typeinfo.RegClass:$src, EFLAGS), + addr:$dst), + (implicit EFLAGS)], IIC_BIN_CARRY_MEM>; + +// BinOpMR_F - Instructions like "cmp [mem], reg". 
+class BinOpMR_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + SDNode opnode> + : BinOpMR<opcode, mnemonic, typeinfo, + [(set EFLAGS, (opnode (load addr:$dst), typeinfo.RegClass:$src))]>; + +// BinOpMI - Instructions like "add [mem], imm". +class BinOpMI<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + Format f, list<dag> pattern, + InstrItinClass itin = IIC_BIN_MEM> + : ITy<opcode, f, typeinfo, + (outs), (ins typeinfo.MemOperand:$dst, typeinfo.ImmOperand:$src), + mnemonic, "{$src, $dst|$dst, $src}", pattern, itin>, + Sched<[WriteALULd, WriteRMW]> { + let ImmT = typeinfo.ImmEncoding; +} + +// BinOpMI_RMW - Instructions like "add [mem], imm". +class BinOpMI_RMW<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + SDNode opnode, Format f> + : BinOpMI<opcode, mnemonic, typeinfo, f, + [(store (opnode (typeinfo.VT (load addr:$dst)), + typeinfo.ImmOperator:$src), addr:$dst), + (implicit EFLAGS)]>; +// BinOpMI_RMW_FF - Instructions like "adc [mem], imm". +class BinOpMI_RMW_FF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + SDNode opnode, Format f> + : BinOpMI<opcode, mnemonic, typeinfo, f, + [(store (opnode (typeinfo.VT (load addr:$dst)), + typeinfo.ImmOperator:$src, EFLAGS), addr:$dst), + (implicit EFLAGS)], IIC_BIN_CARRY_MEM>; + +// BinOpMI_F - Instructions like "cmp [mem], imm". +class BinOpMI_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + SDPatternOperator opnode, Format f> + : BinOpMI<opcode, mnemonic, typeinfo, f, + [(set EFLAGS, (opnode (typeinfo.VT (load addr:$dst)), + typeinfo.ImmOperator:$src))]>; + +// BinOpMI8 - Instructions like "add [mem], imm8". +class BinOpMI8<string mnemonic, X86TypeInfo typeinfo, + Format f, list<dag> pattern, + InstrItinClass itin = IIC_BIN_MEM> + : ITy<0x82, f, typeinfo, + (outs), (ins typeinfo.MemOperand:$dst, typeinfo.Imm8Operand:$src), + mnemonic, "{$src, $dst|$dst, $src}", pattern, itin>, + Sched<[WriteALULd, WriteRMW]> { + let ImmT = Imm8; // Always 8-bit immediate. +} + +// BinOpMI8_RMW - Instructions like "add [mem], imm8". +class BinOpMI8_RMW<string mnemonic, X86TypeInfo typeinfo, + SDPatternOperator opnode, Format f> + : BinOpMI8<mnemonic, typeinfo, f, + [(store (opnode (load addr:$dst), + typeinfo.Imm8Operator:$src), addr:$dst), + (implicit EFLAGS)]>; + +// BinOpMI8_RMW_FF - Instructions like "adc [mem], imm8". +class BinOpMI8_RMW_FF<string mnemonic, X86TypeInfo typeinfo, + SDPatternOperator opnode, Format f> + : BinOpMI8<mnemonic, typeinfo, f, + [(store (opnode (load addr:$dst), + typeinfo.Imm8Operator:$src, EFLAGS), addr:$dst), + (implicit EFLAGS)], IIC_BIN_CARRY_MEM>; + +// BinOpMI8_F - Instructions like "cmp [mem], imm8". +class BinOpMI8_F<string mnemonic, X86TypeInfo typeinfo, + SDPatternOperator opnode, Format f> + : BinOpMI8<mnemonic, typeinfo, f, + [(set EFLAGS, (opnode (load addr:$dst), + typeinfo.Imm8Operator:$src))]>; + +// BinOpAI - Instructions like "add %eax, %eax, imm", that imp-def EFLAGS. +class BinOpAI<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + Register areg, string operands, + InstrItinClass itin = IIC_BIN_NONMEM> + : ITy<opcode, RawFrm, typeinfo, + (outs), (ins typeinfo.ImmOperand:$src), + mnemonic, operands, [], itin>, Sched<[WriteALU]> { + let ImmT = typeinfo.ImmEncoding; + let Uses = [areg]; + let Defs = [areg, EFLAGS]; + let hasSideEffects = 0; +} + +// BinOpAI_RFF - Instructions like "adc %eax, %eax, imm", that implicitly define +// and use EFLAGS. 
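The BinOpRI8/BinOpAI split above is about encoding size: an ALU immediate that fits in a sign-extended 8 bits can use the short 0x83-style form, and the accumulator (AL/AX/EAX/RAX) forms drop the ModRM byte entirely. A C++ sketch of that size trade-off (the chooseAddImmForm helper is illustrative, not LLVM code; the byte counts are the standard encodings of a 32-bit "add"). Checking the imm8 form first mirrors the ordering notes in the multiclasses that follow, where the *ri8 defs are listed before the *ri defs.

#include <cstdint>
#include <cstdio>

enum Form { RI8, AI, RI };

static Form chooseAddImmForm(unsigned Reg /*0 = EAX*/, int32_t Imm, unsigned &Len) {
  if (Imm >= -128 && Imm <= 127) { Len = 3; return RI8; } // 83 /0 ib: op+ModRM+imm8
  if (Reg == 0)                  { Len = 5; return AI;  } // 05 id:    op+imm32, EAX only
  Len = 6; return RI;                                     // 81 /0 id: op+ModRM+imm32
}

int main() {
  const char *Names[] = {"add r32, imm8 (0x83)", "add eax, imm32 (0x05)",
                         "add r32, imm32 (0x81)"};
  unsigned Len;
  Form F = chooseAddImmForm(/*EAX*/ 0, 5, Len);
  std::printf("%s -> %u bytes\n", Names[F], Len);   // imm8 form, 3 bytes
  F = chooseAddImmForm(/*EAX*/ 0, 100000, Len);
  std::printf("%s -> %u bytes\n", Names[F], Len);   // accumulator form, 5 bytes
  F = chooseAddImmForm(/*ECX*/ 1, 100000, Len);
  std::printf("%s -> %u bytes\n", Names[F], Len);   // full imm32 form, 6 bytes
  return 0;
}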
+class BinOpAI_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + Register areg, string operands> + : BinOpAI<opcode, mnemonic, typeinfo, areg, operands, + IIC_BIN_CARRY_NONMEM> { + let Uses = [areg, EFLAGS]; +} + +// BinOpAI_F - Instructions like "cmp %eax, %eax, imm", that imp-def EFLAGS. +class BinOpAI_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + Register areg, string operands> + : BinOpAI<opcode, mnemonic, typeinfo, areg, operands> { + let Defs = [EFLAGS]; +} + +/// ArithBinOp_RF - This is an arithmetic binary operator where the pattern is +/// defined with "(set GPR:$dst, EFLAGS, (...". +/// +/// It would be nice to get rid of the second and third argument here, but +/// tblgen can't handle dependent type references aggressively enough: PR8330 +multiclass ArithBinOp_RF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, + string mnemonic, Format RegMRM, Format MemMRM, + SDNode opnodeflag, SDNode opnode, + bit CommutableRR, bit ConvertibleToThreeAddress> { + let Defs = [EFLAGS] in { + let Constraints = "$src1 = $dst" in { + let isCommutable = CommutableRR in { + def NAME#8rr : BinOpRR_RF<BaseOpc, mnemonic, Xi8 , opnodeflag>; + let isConvertibleToThreeAddress = ConvertibleToThreeAddress in { + def NAME#16rr : BinOpRR_RF<BaseOpc, mnemonic, Xi16, opnodeflag>; + def NAME#32rr : BinOpRR_RF<BaseOpc, mnemonic, Xi32, opnodeflag>; + def NAME#64rr : BinOpRR_RF<BaseOpc, mnemonic, Xi64, opnodeflag>; + } // isConvertibleToThreeAddress + } // isCommutable + + def NAME#8rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi8>; + def NAME#16rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi16>; + def NAME#32rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi32>; + def NAME#64rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi64>; + + def NAME#8rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi8 , opnodeflag>; + def NAME#16rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi16, opnodeflag>; + def NAME#32rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi32, opnodeflag>; + def NAME#64rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi64, opnodeflag>; + + def NAME#8ri : BinOpRI_RF<0x80, mnemonic, Xi8 , opnodeflag, RegMRM>; + + let isConvertibleToThreeAddress = ConvertibleToThreeAddress in { + // NOTE: These are order specific, we want the ri8 forms to be listed + // first so that they are slightly preferred to the ri forms. + def NAME#16ri8 : BinOpRI8_RF<0x82, mnemonic, Xi16, opnodeflag, RegMRM>; + def NAME#32ri8 : BinOpRI8_RF<0x82, mnemonic, Xi32, opnodeflag, RegMRM>; + def NAME#64ri8 : BinOpRI8_RF<0x82, mnemonic, Xi64, opnodeflag, RegMRM>; + + def NAME#16ri : BinOpRI_RF<0x80, mnemonic, Xi16, opnodeflag, RegMRM>; + def NAME#32ri : BinOpRI_RF<0x80, mnemonic, Xi32, opnodeflag, RegMRM>; + def NAME#64ri32: BinOpRI_RF<0x80, mnemonic, Xi64, opnodeflag, RegMRM>; + } + } // Constraints = "$src1 = $dst" + + def NAME#8mr : BinOpMR_RMW<BaseOpc, mnemonic, Xi8 , opnode>; + def NAME#16mr : BinOpMR_RMW<BaseOpc, mnemonic, Xi16, opnode>; + def NAME#32mr : BinOpMR_RMW<BaseOpc, mnemonic, Xi32, opnode>; + def NAME#64mr : BinOpMR_RMW<BaseOpc, mnemonic, Xi64, opnode>; + + // NOTE: These are order specific, we want the mi8 forms to be listed + // first so that they are slightly preferred to the mi forms. 
+ def NAME#16mi8 : BinOpMI8_RMW<mnemonic, Xi16, opnode, MemMRM>; + def NAME#32mi8 : BinOpMI8_RMW<mnemonic, Xi32, opnode, MemMRM>; + def NAME#64mi8 : BinOpMI8_RMW<mnemonic, Xi64, opnode, MemMRM>; + + def NAME#8mi : BinOpMI_RMW<0x80, mnemonic, Xi8 , opnode, MemMRM>; + def NAME#16mi : BinOpMI_RMW<0x80, mnemonic, Xi16, opnode, MemMRM>; + def NAME#32mi : BinOpMI_RMW<0x80, mnemonic, Xi32, opnode, MemMRM>; + def NAME#64mi32 : BinOpMI_RMW<0x80, mnemonic, Xi64, opnode, MemMRM>; + + // These are for the disassembler since 0x82 opcode behaves like 0x80, but + // not in 64-bit mode. + let Predicates = [Not64BitMode], isCodeGenOnly = 1, ForceDisassemble = 1, + hasSideEffects = 0 in { + let Constraints = "$src1 = $dst" in + def NAME#8ri8 : BinOpRI8_RF<0x82, mnemonic, Xi8, null_frag, RegMRM>; + let mayLoad = 1, mayStore = 1 in + def NAME#8mi8 : BinOpMI8_RMW<mnemonic, Xi8, null_frag, MemMRM>; + } + } // Defs = [EFLAGS] + + def NAME#8i8 : BinOpAI<BaseOpc4, mnemonic, Xi8 , AL, + "{$src, %al|al, $src}">; + def NAME#16i16 : BinOpAI<BaseOpc4, mnemonic, Xi16, AX, + "{$src, %ax|ax, $src}">; + def NAME#32i32 : BinOpAI<BaseOpc4, mnemonic, Xi32, EAX, + "{$src, %eax|eax, $src}">; + def NAME#64i32 : BinOpAI<BaseOpc4, mnemonic, Xi64, RAX, + "{$src, %rax|rax, $src}">; +} + +/// ArithBinOp_RFF - This is an arithmetic binary operator where the pattern is +/// defined with "(set GPR:$dst, EFLAGS, (node LHS, RHS, EFLAGS))" like ADC and +/// SBB. +/// +/// It would be nice to get rid of the second and third argument here, but +/// tblgen can't handle dependent type references aggressively enough: PR8330 +multiclass ArithBinOp_RFF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, + string mnemonic, Format RegMRM, Format MemMRM, + SDNode opnode, bit CommutableRR, + bit ConvertibleToThreeAddress> { + let Uses = [EFLAGS], Defs = [EFLAGS] in { + let Constraints = "$src1 = $dst" in { + let isCommutable = CommutableRR in { + def NAME#8rr : BinOpRR_RFF<BaseOpc, mnemonic, Xi8 , opnode>; + let isConvertibleToThreeAddress = ConvertibleToThreeAddress in { + def NAME#16rr : BinOpRR_RFF<BaseOpc, mnemonic, Xi16, opnode>; + def NAME#32rr : BinOpRR_RFF<BaseOpc, mnemonic, Xi32, opnode>; + def NAME#64rr : BinOpRR_RFF<BaseOpc, mnemonic, Xi64, opnode>; + } // isConvertibleToThreeAddress + } // isCommutable + + def NAME#8rr_REV : BinOpRR_RFF_Rev<BaseOpc2, mnemonic, Xi8>; + def NAME#16rr_REV : BinOpRR_RFF_Rev<BaseOpc2, mnemonic, Xi16>; + def NAME#32rr_REV : BinOpRR_RFF_Rev<BaseOpc2, mnemonic, Xi32>; + def NAME#64rr_REV : BinOpRR_RFF_Rev<BaseOpc2, mnemonic, Xi64>; + + def NAME#8rm : BinOpRM_RFF<BaseOpc2, mnemonic, Xi8 , opnode>; + def NAME#16rm : BinOpRM_RFF<BaseOpc2, mnemonic, Xi16, opnode>; + def NAME#32rm : BinOpRM_RFF<BaseOpc2, mnemonic, Xi32, opnode>; + def NAME#64rm : BinOpRM_RFF<BaseOpc2, mnemonic, Xi64, opnode>; + + def NAME#8ri : BinOpRI_RFF<0x80, mnemonic, Xi8 , opnode, RegMRM>; + + let isConvertibleToThreeAddress = ConvertibleToThreeAddress in { + // NOTE: These are order specific, we want the ri8 forms to be listed + // first so that they are slightly preferred to the ri forms. 
+ def NAME#16ri8 : BinOpRI8_RFF<0x82, mnemonic, Xi16, opnode, RegMRM>; + def NAME#32ri8 : BinOpRI8_RFF<0x82, mnemonic, Xi32, opnode, RegMRM>; + def NAME#64ri8 : BinOpRI8_RFF<0x82, mnemonic, Xi64, opnode, RegMRM>; + + def NAME#16ri : BinOpRI_RFF<0x80, mnemonic, Xi16, opnode, RegMRM>; + def NAME#32ri : BinOpRI_RFF<0x80, mnemonic, Xi32, opnode, RegMRM>; + def NAME#64ri32: BinOpRI_RFF<0x80, mnemonic, Xi64, opnode, RegMRM>; + } + } // Constraints = "$src1 = $dst" + + def NAME#8mr : BinOpMR_RMW_FF<BaseOpc, mnemonic, Xi8 , opnode>; + def NAME#16mr : BinOpMR_RMW_FF<BaseOpc, mnemonic, Xi16, opnode>; + def NAME#32mr : BinOpMR_RMW_FF<BaseOpc, mnemonic, Xi32, opnode>; + def NAME#64mr : BinOpMR_RMW_FF<BaseOpc, mnemonic, Xi64, opnode>; + + // NOTE: These are order specific, we want the mi8 forms to be listed + // first so that they are slightly preferred to the mi forms. + def NAME#16mi8 : BinOpMI8_RMW_FF<mnemonic, Xi16, opnode, MemMRM>; + def NAME#32mi8 : BinOpMI8_RMW_FF<mnemonic, Xi32, opnode, MemMRM>; + def NAME#64mi8 : BinOpMI8_RMW_FF<mnemonic, Xi64, opnode, MemMRM>; + + def NAME#8mi : BinOpMI_RMW_FF<0x80, mnemonic, Xi8 , opnode, MemMRM>; + def NAME#16mi : BinOpMI_RMW_FF<0x80, mnemonic, Xi16, opnode, MemMRM>; + def NAME#32mi : BinOpMI_RMW_FF<0x80, mnemonic, Xi32, opnode, MemMRM>; + def NAME#64mi32 : BinOpMI_RMW_FF<0x80, mnemonic, Xi64, opnode, MemMRM>; + + // These are for the disassembler since 0x82 opcode behaves like 0x80, but + // not in 64-bit mode. + let Predicates = [Not64BitMode], isCodeGenOnly = 1, ForceDisassemble = 1, + hasSideEffects = 0 in { + let Constraints = "$src1 = $dst" in + def NAME#8ri8 : BinOpRI8_RFF<0x82, mnemonic, Xi8, null_frag, RegMRM>; + let mayLoad = 1, mayStore = 1 in + def NAME#8mi8 : BinOpMI8_RMW_FF<mnemonic, Xi8, null_frag, MemMRM>; + } + } // Uses = [EFLAGS], Defs = [EFLAGS] + + def NAME#8i8 : BinOpAI_RFF<BaseOpc4, mnemonic, Xi8 , AL, + "{$src, %al|al, $src}">; + def NAME#16i16 : BinOpAI_RFF<BaseOpc4, mnemonic, Xi16, AX, + "{$src, %ax|ax, $src}">; + def NAME#32i32 : BinOpAI_RFF<BaseOpc4, mnemonic, Xi32, EAX, + "{$src, %eax|eax, $src}">; + def NAME#64i32 : BinOpAI_RFF<BaseOpc4, mnemonic, Xi64, RAX, + "{$src, %rax|rax, $src}">; +} + +/// ArithBinOp_F - This is an arithmetic binary operator where the pattern is +/// defined with "(set EFLAGS, (...". It would be really nice to find a way +/// to factor this with the other ArithBinOp_*. 
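The _RFF classes used here for ADC and SBB take EFLAGS as both an input and an output because the carry produced by a preceding add or sub must flow into them. A portable C++ sketch (illustrative, not LLVM code) of the double-word addition that an add/adc pair implements:

#include <cstdint>
#include <cstdio>

struct U128 { uint64_t Lo, Hi; };

static U128 add128(U128 A, U128 B) {
  U128 R;
  R.Lo = A.Lo + B.Lo;                 // add: sets CF on unsigned overflow
  uint64_t Carry = (R.Lo < A.Lo);     // CF: the result wrapped below an operand
  R.Hi = A.Hi + B.Hi + Carry;         // adc: consumes CF from EFLAGS
  return R;
}

int main() {
  U128 A{~0ull, 0}, B{1, 0};          // (2^64 - 1) + 1 = 2^64
  U128 R = add128(A, B);
  std::printf("lo=%llu hi=%llu\n", (unsigned long long)R.Lo,
              (unsigned long long)R.Hi);   // lo=0 hi=1
  return 0;
}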
+/// +multiclass ArithBinOp_F<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, + string mnemonic, Format RegMRM, Format MemMRM, + SDNode opnode, + bit CommutableRR, bit ConvertibleToThreeAddress> { + let Defs = [EFLAGS] in { + let isCommutable = CommutableRR in { + def NAME#8rr : BinOpRR_F<BaseOpc, mnemonic, Xi8 , opnode>; + let isConvertibleToThreeAddress = ConvertibleToThreeAddress in { + def NAME#16rr : BinOpRR_F<BaseOpc, mnemonic, Xi16, opnode>; + def NAME#32rr : BinOpRR_F<BaseOpc, mnemonic, Xi32, opnode>; + def NAME#64rr : BinOpRR_F<BaseOpc, mnemonic, Xi64, opnode>; + } + } // isCommutable + + def NAME#8rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi8>; + def NAME#16rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi16>; + def NAME#32rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi32>; + def NAME#64rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi64>; + + def NAME#8rm : BinOpRM_F<BaseOpc2, mnemonic, Xi8 , opnode>; + def NAME#16rm : BinOpRM_F<BaseOpc2, mnemonic, Xi16, opnode>; + def NAME#32rm : BinOpRM_F<BaseOpc2, mnemonic, Xi32, opnode>; + def NAME#64rm : BinOpRM_F<BaseOpc2, mnemonic, Xi64, opnode>; + + def NAME#8ri : BinOpRI_F<0x80, mnemonic, Xi8 , opnode, RegMRM>; + + let isConvertibleToThreeAddress = ConvertibleToThreeAddress in { + // NOTE: These are order specific, we want the ri8 forms to be listed + // first so that they are slightly preferred to the ri forms. + def NAME#16ri8 : BinOpRI8_F<0x82, mnemonic, Xi16, opnode, RegMRM>; + def NAME#32ri8 : BinOpRI8_F<0x82, mnemonic, Xi32, opnode, RegMRM>; + def NAME#64ri8 : BinOpRI8_F<0x82, mnemonic, Xi64, opnode, RegMRM>; + + def NAME#16ri : BinOpRI_F<0x80, mnemonic, Xi16, opnode, RegMRM>; + def NAME#32ri : BinOpRI_F<0x80, mnemonic, Xi32, opnode, RegMRM>; + def NAME#64ri32: BinOpRI_F<0x80, mnemonic, Xi64, opnode, RegMRM>; + } + + def NAME#8mr : BinOpMR_F<BaseOpc, mnemonic, Xi8 , opnode>; + def NAME#16mr : BinOpMR_F<BaseOpc, mnemonic, Xi16, opnode>; + def NAME#32mr : BinOpMR_F<BaseOpc, mnemonic, Xi32, opnode>; + def NAME#64mr : BinOpMR_F<BaseOpc, mnemonic, Xi64, opnode>; + + // NOTE: These are order specific, we want the mi8 forms to be listed + // first so that they are slightly preferred to the mi forms. + def NAME#16mi8 : BinOpMI8_F<mnemonic, Xi16, opnode, MemMRM>; + def NAME#32mi8 : BinOpMI8_F<mnemonic, Xi32, opnode, MemMRM>; + def NAME#64mi8 : BinOpMI8_F<mnemonic, Xi64, opnode, MemMRM>; + + def NAME#8mi : BinOpMI_F<0x80, mnemonic, Xi8 , opnode, MemMRM>; + def NAME#16mi : BinOpMI_F<0x80, mnemonic, Xi16, opnode, MemMRM>; + def NAME#32mi : BinOpMI_F<0x80, mnemonic, Xi32, opnode, MemMRM>; + def NAME#64mi32 : BinOpMI_F<0x80, mnemonic, Xi64, opnode, MemMRM>; + + // These are for the disassembler since 0x82 opcode behaves like 0x80, but + // not in 64-bit mode. 
+ let Predicates = [Not64BitMode], isCodeGenOnly = 1, ForceDisassemble = 1, + hasSideEffects = 0 in { + def NAME#8ri8 : BinOpRI8_F<0x82, mnemonic, Xi8, null_frag, RegMRM>; + let mayLoad = 1 in + def NAME#8mi8 : BinOpMI8_F<mnemonic, Xi8, null_frag, MemMRM>; + } + } // Defs = [EFLAGS] + + def NAME#8i8 : BinOpAI_F<BaseOpc4, mnemonic, Xi8 , AL, + "{$src, %al|al, $src}">; + def NAME#16i16 : BinOpAI_F<BaseOpc4, mnemonic, Xi16, AX, + "{$src, %ax|ax, $src}">; + def NAME#32i32 : BinOpAI_F<BaseOpc4, mnemonic, Xi32, EAX, + "{$src, %eax|eax, $src}">; + def NAME#64i32 : BinOpAI_F<BaseOpc4, mnemonic, Xi64, RAX, + "{$src, %rax|rax, $src}">; +} + + +defm AND : ArithBinOp_RF<0x20, 0x22, 0x24, "and", MRM4r, MRM4m, + X86and_flag, and, 1, 0>; +defm OR : ArithBinOp_RF<0x08, 0x0A, 0x0C, "or", MRM1r, MRM1m, + X86or_flag, or, 1, 0>; +defm XOR : ArithBinOp_RF<0x30, 0x32, 0x34, "xor", MRM6r, MRM6m, + X86xor_flag, xor, 1, 0>; +defm ADD : ArithBinOp_RF<0x00, 0x02, 0x04, "add", MRM0r, MRM0m, + X86add_flag, add, 1, 1>; +let isCompare = 1 in { +defm SUB : ArithBinOp_RF<0x28, 0x2A, 0x2C, "sub", MRM5r, MRM5m, + X86sub_flag, sub, 0, 0>; +} + +// Arithmetic. +defm ADC : ArithBinOp_RFF<0x10, 0x12, 0x14, "adc", MRM2r, MRM2m, X86adc_flag, + 1, 0>; +defm SBB : ArithBinOp_RFF<0x18, 0x1A, 0x1C, "sbb", MRM3r, MRM3m, X86sbb_flag, + 0, 0>; + +let isCompare = 1 in { +defm CMP : ArithBinOp_F<0x38, 0x3A, 0x3C, "cmp", MRM7r, MRM7m, X86cmp, 0, 0>; +} + + +//===----------------------------------------------------------------------===// +// Semantically, test instructions are similar like AND, except they don't +// generate a result. From an encoding perspective, they are very different: +// they don't have all the usual imm8 and REV forms, and are encoded into a +// different space. +def X86testpat : PatFrag<(ops node:$lhs, node:$rhs), + (X86cmp (and_su node:$lhs, node:$rhs), 0)>; + +let isCompare = 1 in { + let Defs = [EFLAGS] in { + let isCommutable = 1 in { + def TEST8rr : BinOpRR_F<0x84, "test", Xi8 , X86testpat>; + def TEST16rr : BinOpRR_F<0x84, "test", Xi16, X86testpat>; + def TEST32rr : BinOpRR_F<0x84, "test", Xi32, X86testpat>; + def TEST64rr : BinOpRR_F<0x84, "test", Xi64, X86testpat>; + } // isCommutable + + def TEST8rm : BinOpRM_F<0x84, "test", Xi8 , X86testpat>; + def TEST16rm : BinOpRM_F<0x84, "test", Xi16, X86testpat>; + def TEST32rm : BinOpRM_F<0x84, "test", Xi32, X86testpat>; + def TEST64rm : BinOpRM_F<0x84, "test", Xi64, X86testpat>; + + def TEST8ri : BinOpRI_F<0xF6, "test", Xi8 , X86testpat, MRM0r>; + def TEST16ri : BinOpRI_F<0xF6, "test", Xi16, X86testpat, MRM0r>; + def TEST32ri : BinOpRI_F<0xF6, "test", Xi32, X86testpat, MRM0r>; + def TEST64ri32 : BinOpRI_F<0xF6, "test", Xi64, X86testpat, MRM0r>; + + def TEST8mi : BinOpMI_F<0xF6, "test", Xi8 , X86testpat, MRM0m>; + def TEST16mi : BinOpMI_F<0xF6, "test", Xi16, X86testpat, MRM0m>; + def TEST32mi : BinOpMI_F<0xF6, "test", Xi32, X86testpat, MRM0m>; + def TEST64mi32 : BinOpMI_F<0xF6, "test", Xi64, X86testpat, MRM0m>; + + // When testing the result of EXTRACT_SUBREG sub_8bit_hi, make sure the + // register class is constrained to GR8_NOREX. This pseudo is explicitly + // marked side-effect free, since it doesn't have an isel pattern like + // other test instructions. 
+ let isPseudo = 1, hasSideEffects = 0 in + def TEST8ri_NOREX : I<0, Pseudo, (outs), (ins GR8_NOREX:$src, i8imm:$mask), + "", [], IIC_BIN_NONMEM>, Sched<[WriteALU]>; + } // Defs = [EFLAGS] + + def TEST8i8 : BinOpAI_F<0xA8, "test", Xi8 , AL, + "{$src, %al|al, $src}">; + def TEST16i16 : BinOpAI_F<0xA8, "test", Xi16, AX, + "{$src, %ax|ax, $src}">; + def TEST32i32 : BinOpAI_F<0xA8, "test", Xi32, EAX, + "{$src, %eax|eax, $src}">; + def TEST64i32 : BinOpAI_F<0xA8, "test", Xi64, RAX, + "{$src, %rax|rax, $src}">; +} // isCompare + +//===----------------------------------------------------------------------===// +// ANDN Instruction +// +multiclass bmi_andn<string mnemonic, RegisterClass RC, X86MemOperand x86memop, + PatFrag ld_frag> { + def rr : I<0xF2, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), + !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, EFLAGS, (X86and_flag (not RC:$src1), RC:$src2))], + IIC_BIN_NONMEM>, Sched<[WriteALU]>; + def rm : I<0xF2, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), + !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, EFLAGS, + (X86and_flag (not RC:$src1), (ld_frag addr:$src2)))], IIC_BIN_MEM>, + Sched<[WriteALULd, ReadAfterLd]>; +} + +let Predicates = [HasBMI], Defs = [EFLAGS] in { + defm ANDN32 : bmi_andn<"andn{l}", GR32, i32mem, loadi32>, T8PS, VEX_4V; + defm ANDN64 : bmi_andn<"andn{q}", GR64, i64mem, loadi64>, T8PS, VEX_4V, VEX_W; +} + +let Predicates = [HasBMI] in { + def : Pat<(and (not GR32:$src1), GR32:$src2), + (ANDN32rr GR32:$src1, GR32:$src2)>; + def : Pat<(and (not GR64:$src1), GR64:$src2), + (ANDN64rr GR64:$src1, GR64:$src2)>; + def : Pat<(and (not GR32:$src1), (loadi32 addr:$src2)), + (ANDN32rm GR32:$src1, addr:$src2)>; + def : Pat<(and (not GR64:$src1), (loadi64 addr:$src2)), + (ANDN64rm GR64:$src1, addr:$src2)>; +} + +//===----------------------------------------------------------------------===// +// MULX Instruction +// +multiclass bmi_mulx<string mnemonic, RegisterClass RC, X86MemOperand x86memop> { +let hasSideEffects = 0 in { + let isCommutable = 1 in + def rr : I<0xF6, MRMSrcReg, (outs RC:$dst1, RC:$dst2), (ins RC:$src), + !strconcat(mnemonic, "\t{$src, $dst2, $dst1|$dst1, $dst2, $src}"), + [], IIC_MUL8>, T8XD, VEX_4V, Sched<[WriteIMul, WriteIMulH]>; + + let mayLoad = 1 in + def rm : I<0xF6, MRMSrcMem, (outs RC:$dst1, RC:$dst2), (ins x86memop:$src), + !strconcat(mnemonic, "\t{$src, $dst2, $dst1|$dst1, $dst2, $src}"), + [], IIC_MUL8>, T8XD, VEX_4V, Sched<[WriteIMulLd, WriteIMulH]>; +} +} + +let Predicates = [HasBMI2] in { + let Uses = [EDX] in + defm MULX32 : bmi_mulx<"mulx{l}", GR32, i32mem>; + let Uses = [RDX] in + defm MULX64 : bmi_mulx<"mulx{q}", GR64, i64mem>, VEX_W; +} + +//===----------------------------------------------------------------------===// +// ADCX Instruction +// +let Predicates = [HasADX], Defs = [EFLAGS], Uses = [EFLAGS], + Constraints = "$src0 = $dst", AddedComplexity = 10 in { + let SchedRW = [WriteALU] in { + def ADCX32rr : I<0xF6, MRMSrcReg, (outs GR32:$dst), + (ins GR32:$src0, GR32:$src), "adcx{l}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, EFLAGS, + (X86adc_flag GR32:$src0, GR32:$src, EFLAGS))], + IIC_BIN_CARRY_NONMEM>, T8PD; + def ADCX64rr : RI<0xF6, MRMSrcReg, (outs GR64:$dst), + (ins GR64:$src0, GR64:$src), "adcx{q}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, EFLAGS, + (X86adc_flag GR64:$src0, GR64:$src, EFLAGS))], + IIC_BIN_CARRY_NONMEM>, T8PD; + } // SchedRW + + let mayLoad = 1, SchedRW = [WriteALULd] in { + def ADCX32rm : 
I<0xF6, MRMSrcMem, (outs GR32:$dst), + (ins GR32:$src0, i32mem:$src), "adcx{l}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, EFLAGS, + (X86adc_flag GR32:$src0, (loadi32 addr:$src), EFLAGS))], + IIC_BIN_CARRY_MEM>, T8PD; + + def ADCX64rm : RI<0xF6, MRMSrcMem, (outs GR64:$dst), + (ins GR64:$src0, i64mem:$src), "adcx{q}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, EFLAGS, + (X86adc_flag GR64:$src0, (loadi64 addr:$src), EFLAGS))], + IIC_BIN_CARRY_MEM>, T8PD; + } +} + +//===----------------------------------------------------------------------===// +// ADOX Instruction +// +let Predicates = [HasADX], hasSideEffects = 0, Defs = [EFLAGS], + Uses = [EFLAGS] in { + let SchedRW = [WriteALU] in { + def ADOX32rr : I<0xF6, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), + "adox{l}\t{$src, $dst|$dst, $src}", [], IIC_BIN_NONMEM>, T8XS; + + def ADOX64rr : RI<0xF6, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), + "adox{q}\t{$src, $dst|$dst, $src}", [], IIC_BIN_NONMEM>, T8XS; + } // SchedRW + + let mayLoad = 1, SchedRW = [WriteALULd] in { + def ADOX32rm : I<0xF6, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), + "adox{l}\t{$src, $dst|$dst, $src}", [], IIC_BIN_MEM>, T8XS; + + def ADOX64rm : RI<0xF6, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), + "adox{q}\t{$src, $dst|$dst, $src}", [], IIC_BIN_MEM>, T8XS; + } +} diff --git a/contrib/llvm/lib/Target/X86/X86InstrBuilder.h b/contrib/llvm/lib/Target/X86/X86InstrBuilder.h new file mode 100644 index 0000000..787f15b --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86InstrBuilder.h @@ -0,0 +1,183 @@ +//===-- X86InstrBuilder.h - Functions to aid building x86 insts -*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file exposes functions that may be used with BuildMI from the +// MachineInstrBuilder.h file to handle X86'isms in a clean way. +// +// The BuildMem function may be used with the BuildMI function to add entire +// memory references in a single, typed, function call. X86 memory references +// can be very complex expressions (described in the README), so wrapping them +// up behind an easier to use interface makes sense. Descriptions of the +// functions are included below. +// +// For reference, the order of operands for memory references is: +// (Operand), Base, Scale, Index, Displacement. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_X86_X86INSTRBUILDER_H +#define LLVM_LIB_TARGET_X86_X86INSTRBUILDER_H + +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineMemOperand.h" + +namespace llvm { + +/// X86AddressMode - This struct holds a generalized full x86 address mode. +/// The base register can be a frame index, which will eventually be replaced +/// with BP or SP and Disp being offsetted accordingly. The displacement may +/// also include the offset of a global value. 
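The helpers in this header all expand a memory reference into the same five operands, Base, Scale, Index, Displacement and a trailing segment-register slot, with the effective address computed as Base + Scale*Index + Disp. A standalone C++ sketch of that convention (the MemRef struct and effectiveAddress helper are illustrative stand-ins, not the LLVM types):

#include <cassert>
#include <cstdint>
#include <cstdio>

struct MemRef {
  uint64_t Base;    // value held by the base register (0 if none)
  unsigned Scale;   // 1, 2, 4 or 8
  uint64_t Index;   // value held by the index register (0 if none)
  int64_t Disp;     // signed displacement
};                  // (the fifth operand, the segment register, is omitted here)

static uint64_t effectiveAddress(const MemRef &M) {
  assert(M.Scale == 1 || M.Scale == 2 || M.Scale == 4 || M.Scale == 8);
  return M.Base + (uint64_t)M.Scale * M.Index + (uint64_t)M.Disp;
}

int main() {
  // Roughly "movq 16(%rdi,%rcx,8), %rax" with %rdi = 0x1000 and %rcx = 3.
  MemRef M = {0x1000, 8, 3, 16};
  std::printf("EA = %#llx\n", (unsigned long long)effectiveAddress(M)); // 0x1028
  return 0;
}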
+struct X86AddressMode { + enum { + RegBase, + FrameIndexBase + } BaseType; + + union { + unsigned Reg; + int FrameIndex; + } Base; + + unsigned Scale; + unsigned IndexReg; + int Disp; + const GlobalValue *GV; + unsigned GVOpFlags; + + X86AddressMode() + : BaseType(RegBase), Scale(1), IndexReg(0), Disp(0), GV(nullptr), + GVOpFlags(0) { + Base.Reg = 0; + } + + + void getFullAddress(SmallVectorImpl<MachineOperand> &MO) { + assert(Scale == 1 || Scale == 2 || Scale == 4 || Scale == 8); + + if (BaseType == X86AddressMode::RegBase) + MO.push_back(MachineOperand::CreateReg(Base.Reg, false, false, + false, false, false, 0, false)); + else { + assert(BaseType == X86AddressMode::FrameIndexBase); + MO.push_back(MachineOperand::CreateFI(Base.FrameIndex)); + } + + MO.push_back(MachineOperand::CreateImm(Scale)); + MO.push_back(MachineOperand::CreateReg(IndexReg, false, false, + false, false, false, 0, false)); + + if (GV) + MO.push_back(MachineOperand::CreateGA(GV, Disp, GVOpFlags)); + else + MO.push_back(MachineOperand::CreateImm(Disp)); + + MO.push_back(MachineOperand::CreateReg(0, false, false, + false, false, false, 0, false)); + } +}; + +/// addDirectMem - This function is used to add a direct memory reference to the +/// current instruction -- that is, a dereference of an address in a register, +/// with no scale, index or displacement. An example is: DWORD PTR [EAX]. +/// +static inline const MachineInstrBuilder & +addDirectMem(const MachineInstrBuilder &MIB, unsigned Reg) { + // Because memory references are always represented with five + // values, this adds: Reg, 1, NoReg, 0, NoReg to the instruction. + return MIB.addReg(Reg).addImm(1).addReg(0).addImm(0).addReg(0); +} + + +static inline const MachineInstrBuilder & +addOffset(const MachineInstrBuilder &MIB, int Offset) { + return MIB.addImm(1).addReg(0).addImm(Offset).addReg(0); +} + +/// addRegOffset - This function is used to add a memory reference of the form +/// [Reg + Offset], i.e., one with no scale or index, but with a +/// displacement. An example is: DWORD PTR [EAX + 4]. +/// +static inline const MachineInstrBuilder & +addRegOffset(const MachineInstrBuilder &MIB, + unsigned Reg, bool isKill, int Offset) { + return addOffset(MIB.addReg(Reg, getKillRegState(isKill)), Offset); +} + +/// addRegReg - This function is used to add a memory reference of the form: +/// [Reg + Reg]. +static inline const MachineInstrBuilder &addRegReg(const MachineInstrBuilder &MIB, + unsigned Reg1, bool isKill1, + unsigned Reg2, bool isKill2) { + return MIB.addReg(Reg1, getKillRegState(isKill1)).addImm(1) + .addReg(Reg2, getKillRegState(isKill2)).addImm(0).addReg(0); +} + +static inline const MachineInstrBuilder & +addFullAddress(const MachineInstrBuilder &MIB, + const X86AddressMode &AM) { + assert(AM.Scale == 1 || AM.Scale == 2 || AM.Scale == 4 || AM.Scale == 8); + + if (AM.BaseType == X86AddressMode::RegBase) + MIB.addReg(AM.Base.Reg); + else { + assert(AM.BaseType == X86AddressMode::FrameIndexBase); + MIB.addFrameIndex(AM.Base.FrameIndex); + } + + MIB.addImm(AM.Scale).addReg(AM.IndexReg); + if (AM.GV) + MIB.addGlobalAddress(AM.GV, AM.Disp, AM.GVOpFlags); + else + MIB.addImm(AM.Disp); + + return MIB.addReg(0); +} + +/// addFrameReference - This function is used to add a reference to the base of +/// an abstract object on the stack frame of the current function. This +/// reference has base register as the FrameIndex offset until it is resolved. +/// This allows a constant offset to be specified as well... 
+/// +static inline const MachineInstrBuilder & +addFrameReference(const MachineInstrBuilder &MIB, int FI, int Offset = 0) { + MachineInstr *MI = MIB; + MachineFunction &MF = *MI->getParent()->getParent(); + MachineFrameInfo &MFI = *MF.getFrameInfo(); + const MCInstrDesc &MCID = MI->getDesc(); + unsigned Flags = 0; + if (MCID.mayLoad()) + Flags |= MachineMemOperand::MOLoad; + if (MCID.mayStore()) + Flags |= MachineMemOperand::MOStore; + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo::getFixedStack(MF, FI, Offset), Flags, + MFI.getObjectSize(FI), MFI.getObjectAlignment(FI)); + return addOffset(MIB.addFrameIndex(FI), Offset) + .addMemOperand(MMO); +} + +/// addConstantPoolReference - This function is used to add a reference to the +/// base of a constant value spilled to the per-function constant pool. The +/// reference uses the abstract ConstantPoolIndex which is retained until +/// either machine code emission or assembly output. In PIC mode on x86-32, +/// the GlobalBaseReg parameter can be used to make this a +/// GlobalBaseReg-relative reference. +/// +static inline const MachineInstrBuilder & +addConstantPoolReference(const MachineInstrBuilder &MIB, unsigned CPI, + unsigned GlobalBaseReg, unsigned char OpFlags) { + //FIXME: factor this + return MIB.addReg(GlobalBaseReg).addImm(1).addReg(0) + .addConstantPoolIndex(CPI, 0, OpFlags).addReg(0); +} + +} // End llvm namespace + +#endif diff --git a/contrib/llvm/lib/Target/X86/X86InstrCMovSetCC.td b/contrib/llvm/lib/Target/X86/X86InstrCMovSetCC.td new file mode 100644 index 0000000..c73c950 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86InstrCMovSetCC.td @@ -0,0 +1,112 @@ +//===-- X86InstrCMovSetCC.td - Conditional Move and SetCC --*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the X86 conditional move and set on condition +// instructions. +// +//===----------------------------------------------------------------------===// + + +// CMOV instructions. 
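X86cmov is a plain select over EFLAGS, and the "$src1 = $dst" constraint in the multiclass below means the false value already occupies the destination and is conditionally overwritten. A minimal C++ sketch of that shape (illustrative only; smax is an assumed example, not LLVM code):

#include <cstdint>
#include <cstdio>

static int64_t smax(int64_t A, int64_t B) {
  int64_t Dst = A;        // $src1 = $dst: destination starts as the false value
  if (B > A)              // cmp: the comparison only produces flags
    Dst = B;              // cmovg-like conditional overwrite of the destination
  return Dst;
}

int main() {
  std::printf("%lld\n", (long long)smax(-3, 7));   // 7
  return 0;
}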
+multiclass CMOV<bits<8> opc, string Mnemonic, PatLeaf CondNode> { + let Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst", + isCommutable = 1, SchedRW = [WriteALU] in { + def NAME#16rr + : I<opc, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2), + !strconcat(Mnemonic, "{w}\t{$src2, $dst|$dst, $src2}"), + [(set GR16:$dst, + (X86cmov GR16:$src1, GR16:$src2, CondNode, EFLAGS))], + IIC_CMOV16_RR>, TB, OpSize16; + def NAME#32rr + : I<opc, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2), + !strconcat(Mnemonic, "{l}\t{$src2, $dst|$dst, $src2}"), + [(set GR32:$dst, + (X86cmov GR32:$src1, GR32:$src2, CondNode, EFLAGS))], + IIC_CMOV32_RR>, TB, OpSize32; + def NAME#64rr + :RI<opc, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2), + !strconcat(Mnemonic, "{q}\t{$src2, $dst|$dst, $src2}"), + [(set GR64:$dst, + (X86cmov GR64:$src1, GR64:$src2, CondNode, EFLAGS))], + IIC_CMOV32_RR>, TB; + } + + let Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst", + SchedRW = [WriteALULd, ReadAfterLd] in { + def NAME#16rm + : I<opc, MRMSrcMem, (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2), + !strconcat(Mnemonic, "{w}\t{$src2, $dst|$dst, $src2}"), + [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2), + CondNode, EFLAGS))], IIC_CMOV16_RM>, + TB, OpSize16; + def NAME#32rm + : I<opc, MRMSrcMem, (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2), + !strconcat(Mnemonic, "{l}\t{$src2, $dst|$dst, $src2}"), + [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2), + CondNode, EFLAGS))], IIC_CMOV32_RM>, + TB, OpSize32; + def NAME#64rm + :RI<opc, MRMSrcMem, (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2), + !strconcat(Mnemonic, "{q}\t{$src2, $dst|$dst, $src2}"), + [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2), + CondNode, EFLAGS))], IIC_CMOV32_RM>, TB; + } // Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst" +} // end multiclass + + +// Conditional Moves. +defm CMOVO : CMOV<0x40, "cmovo" , X86_COND_O>; +defm CMOVNO : CMOV<0x41, "cmovno", X86_COND_NO>; +defm CMOVB : CMOV<0x42, "cmovb" , X86_COND_B>; +defm CMOVAE : CMOV<0x43, "cmovae", X86_COND_AE>; +defm CMOVE : CMOV<0x44, "cmove" , X86_COND_E>; +defm CMOVNE : CMOV<0x45, "cmovne", X86_COND_NE>; +defm CMOVBE : CMOV<0x46, "cmovbe", X86_COND_BE>; +defm CMOVA : CMOV<0x47, "cmova" , X86_COND_A>; +defm CMOVS : CMOV<0x48, "cmovs" , X86_COND_S>; +defm CMOVNS : CMOV<0x49, "cmovns", X86_COND_NS>; +defm CMOVP : CMOV<0x4A, "cmovp" , X86_COND_P>; +defm CMOVNP : CMOV<0x4B, "cmovnp", X86_COND_NP>; +defm CMOVL : CMOV<0x4C, "cmovl" , X86_COND_L>; +defm CMOVGE : CMOV<0x4D, "cmovge", X86_COND_GE>; +defm CMOVLE : CMOV<0x4E, "cmovle", X86_COND_LE>; +defm CMOVG : CMOV<0x4F, "cmovg" , X86_COND_G>; + + +// SetCC instructions. 
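The SETcc (and CMOVcc) condition codes below are fixed predicates over EFLAGS. A C++ sketch of those predicates, evaluated from the individual flag bits (the Flags struct and eval helper are illustrative, not LLVM API):

#include <cstdio>

struct Flags { bool CF, ZF, SF, OF, PF; };

enum Cond { O, NO, B, AE, E, NE, BE, A, S, NS, P, NP, L, GE, LE, G };

static bool eval(Cond C, Flags F) {
  switch (C) {
  case O:  return F.OF;                 case NO: return !F.OF;
  case B:  return F.CF;                 case AE: return !F.CF;
  case E:  return F.ZF;                 case NE: return !F.ZF;
  case BE: return F.CF || F.ZF;         case A:  return !F.CF && !F.ZF;
  case S:  return F.SF;                 case NS: return !F.SF;
  case P:  return F.PF;                 case NP: return !F.PF;
  case L:  return F.SF != F.OF;         case GE: return F.SF == F.OF;
  case LE: return F.ZF || F.SF != F.OF; case G:  return !F.ZF && F.SF == F.OF;
  }
  return false;
}

int main() {
  // Flags as produced by "cmp $7, %eax" with %eax = 3: the subtraction 3 - 7
  // borrows and is negative, so CF=1, ZF=0, SF=1, OF=0, PF=1 (0xFC has an
  // even number of set bits).
  Flags F{true, false, true, false, true};
  std::printf("setb -> %d, setl -> %d, setge -> %d\n",
              eval(B, F), eval(L, F), eval(GE, F));   // 1, 1, 0
  return 0;
}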
+multiclass SETCC<bits<8> opc, string Mnemonic, PatLeaf OpNode> { + let Uses = [EFLAGS] in { + def r : I<opc, MRMXr, (outs GR8:$dst), (ins), + !strconcat(Mnemonic, "\t$dst"), + [(set GR8:$dst, (X86setcc OpNode, EFLAGS))], + IIC_SET_R>, TB, Sched<[WriteALU]>; + def m : I<opc, MRMXm, (outs), (ins i8mem:$dst), + !strconcat(Mnemonic, "\t$dst"), + [(store (X86setcc OpNode, EFLAGS), addr:$dst)], + IIC_SET_M>, TB, Sched<[WriteALU, WriteStore]>; + } // Uses = [EFLAGS] +} + +defm SETO : SETCC<0x90, "seto", X86_COND_O>; // is overflow bit set +defm SETNO : SETCC<0x91, "setno", X86_COND_NO>; // is overflow bit not set +defm SETB : SETCC<0x92, "setb", X86_COND_B>; // unsigned less than +defm SETAE : SETCC<0x93, "setae", X86_COND_AE>; // unsigned greater or equal +defm SETE : SETCC<0x94, "sete", X86_COND_E>; // equal to +defm SETNE : SETCC<0x95, "setne", X86_COND_NE>; // not equal to +defm SETBE : SETCC<0x96, "setbe", X86_COND_BE>; // unsigned less than or equal +defm SETA : SETCC<0x97, "seta", X86_COND_A>; // unsigned greater than +defm SETS : SETCC<0x98, "sets", X86_COND_S>; // is signed bit set +defm SETNS : SETCC<0x99, "setns", X86_COND_NS>; // is not signed +defm SETP : SETCC<0x9A, "setp", X86_COND_P>; // is parity bit set +defm SETNP : SETCC<0x9B, "setnp", X86_COND_NP>; // is parity bit not set +defm SETL : SETCC<0x9C, "setl", X86_COND_L>; // signed less than +defm SETGE : SETCC<0x9D, "setge", X86_COND_GE>; // signed greater or equal +defm SETLE : SETCC<0x9E, "setle", X86_COND_LE>; // signed less than or equal +defm SETG : SETCC<0x9F, "setg", X86_COND_G>; // signed greater than + diff --git a/contrib/llvm/lib/Target/X86/X86InstrCompiler.td b/contrib/llvm/lib/Target/X86/X86InstrCompiler.td new file mode 100644 index 0000000..96a29ca --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86InstrCompiler.td @@ -0,0 +1,1864 @@ +//===- X86InstrCompiler.td - Compiler Pseudos and Patterns -*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the various pseudo instructions used by the compiler, +// as well as Pat patterns used during instruction selection. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Pattern Matching Support + +def GetLo32XForm : SDNodeXForm<imm, [{ + // Transformation function: get the low 32 bits. + return getI32Imm((unsigned)N->getZExtValue(), SDLoc(N)); +}]>; + +def GetLo8XForm : SDNodeXForm<imm, [{ + // Transformation function: get the low 8 bits. + return getI8Imm((uint8_t)N->getZExtValue(), SDLoc(N)); +}]>; + + +//===----------------------------------------------------------------------===// +// Random Pseudo Instructions. + +// PIC base construction. This expands to code that looks like this: +// call $next_inst +// popl %destreg" +let hasSideEffects = 0, isNotDuplicable = 1, Uses = [ESP] in + def MOVPC32r : Ii32<0xE8, Pseudo, (outs GR32:$reg), (ins i32imm:$label), + "", []>; + + +// ADJCALLSTACKDOWN/UP implicitly use/def ESP because they may be expanded into +// a stack adjustment and the codegen must know that they may modify the stack +// pointer before prolog-epilog rewriting occurs. +// Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become +// sub / add which can clobber EFLAGS. 
+let Defs = [ESP, EFLAGS], Uses = [ESP] in { +def ADJCALLSTACKDOWN32 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2), + "#ADJCALLSTACKDOWN", + []>, + Requires<[NotLP64]>; +def ADJCALLSTACKUP32 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2), + "#ADJCALLSTACKUP", + [(X86callseq_end timm:$amt1, timm:$amt2)]>, + Requires<[NotLP64]>; +} +def : Pat<(X86callseq_start timm:$amt1), + (ADJCALLSTACKDOWN32 i32imm:$amt1, 0)>, Requires<[NotLP64]>; + + +// ADJCALLSTACKDOWN/UP implicitly use/def RSP because they may be expanded into +// a stack adjustment and the codegen must know that they may modify the stack +// pointer before prolog-epilog rewriting occurs. +// Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become +// sub / add which can clobber EFLAGS. +let Defs = [RSP, EFLAGS], Uses = [RSP] in { +def ADJCALLSTACKDOWN64 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2), + "#ADJCALLSTACKDOWN", + []>, + Requires<[IsLP64]>; +def ADJCALLSTACKUP64 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2), + "#ADJCALLSTACKUP", + [(X86callseq_end timm:$amt1, timm:$amt2)]>, + Requires<[IsLP64]>; +} +def : Pat<(X86callseq_start timm:$amt1), + (ADJCALLSTACKDOWN64 i32imm:$amt1, 0)>, Requires<[IsLP64]>; + + +// x86-64 va_start lowering magic. +let usesCustomInserter = 1, Defs = [EFLAGS] in { +def VASTART_SAVE_XMM_REGS : I<0, Pseudo, + (outs), + (ins GR8:$al, + i64imm:$regsavefi, i64imm:$offset, + variable_ops), + "#VASTART_SAVE_XMM_REGS $al, $regsavefi, $offset", + [(X86vastart_save_xmm_regs GR8:$al, + imm:$regsavefi, + imm:$offset), + (implicit EFLAGS)]>; + +// The VAARG_64 pseudo-instruction takes the address of the va_list, +// and places the address of the next argument into a register. +let Defs = [EFLAGS] in +def VAARG_64 : I<0, Pseudo, + (outs GR64:$dst), + (ins i8mem:$ap, i32imm:$size, i8imm:$mode, i32imm:$align), + "#VAARG_64 $dst, $ap, $size, $mode, $align", + [(set GR64:$dst, + (X86vaarg64 addr:$ap, imm:$size, imm:$mode, imm:$align)), + (implicit EFLAGS)]>; + +// Dynamic stack allocation yields a _chkstk or _alloca call for all Windows +// targets. These calls are needed to probe the stack when allocating more than +// 4k bytes in one go. Touching the stack at 4K increments is necessary to +// ensure that the guard pages used by the OS virtual memory manager are +// allocated in correct sequence. +// The main point of having separate instruction are extra unmodelled effects +// (compared to ordinary calls) like stack pointer change. + +let Defs = [EAX, ESP, EFLAGS], Uses = [ESP] in + def WIN_ALLOCA : I<0, Pseudo, (outs), (ins), + "# dynamic stack allocation", + [(X86WinAlloca)]>; + +// When using segmented stacks these are lowered into instructions which first +// check if the current stacklet has enough free memory. If it does, memory is +// allocated by bumping the stack pointer. Otherwise memory is allocated from +// the heap. 
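The 4K-increment probing mentioned above exists because the OS grows the stack one guard page at a time; touching every page keeps the guard moving ahead of the allocation instead of jumping past it. A conceptual C++ sketch only (probeRange is an assumed helper, not the real _chkstk, and a heap buffer stands in for the stack):

#include <cstddef>
#include <cstdio>
#include <vector>

static const std::size_t PageSize = 4096;

static unsigned probeRange(unsigned char *Base, std::size_t Size) {
  unsigned Touches = 0;
  std::size_t Off = Size;
  while (Off > PageSize) {    // walk downward, one touch per 4K page, like the
    Off -= PageSize;          // probe loop emitted for allocations over a page
    Base[Off] = 0;
    ++Touches;
  }
  Base[0] = 0;                // final (possibly partial) page
  return Touches + 1;
}

int main() {
  std::size_t AllocSize = 5 * PageSize + 123;       // > 4K, so probing is needed
  std::vector<unsigned char> FakeStack(AllocSize);
  unsigned N = probeRange(FakeStack.data(), FakeStack.size());
  std::printf("%u probe touches for %zu bytes\n", N, AllocSize);   // 6 touches
  return 0;
}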
+ +let Defs = [EAX, ESP, EFLAGS], Uses = [ESP] in +def SEG_ALLOCA_32 : I<0, Pseudo, (outs GR32:$dst), (ins GR32:$size), + "# variable sized alloca for segmented stacks", + [(set GR32:$dst, + (X86SegAlloca GR32:$size))]>, + Requires<[NotLP64]>; + +let Defs = [RAX, RSP, EFLAGS], Uses = [RSP] in +def SEG_ALLOCA_64 : I<0, Pseudo, (outs GR64:$dst), (ins GR64:$size), + "# variable sized alloca for segmented stacks", + [(set GR64:$dst, + (X86SegAlloca GR64:$size))]>, + Requires<[In64BitMode]>; +} + +//===----------------------------------------------------------------------===// +// EH Pseudo Instructions +// +let SchedRW = [WriteSystem] in { +let isTerminator = 1, isReturn = 1, isBarrier = 1, + hasCtrlDep = 1, isCodeGenOnly = 1 in { +def EH_RETURN : I<0xC3, RawFrm, (outs), (ins GR32:$addr), + "ret\t#eh_return, addr: $addr", + [(X86ehret GR32:$addr)], IIC_RET>, Sched<[WriteJumpLd]>; + +} + +let isTerminator = 1, isReturn = 1, isBarrier = 1, + hasCtrlDep = 1, isCodeGenOnly = 1 in { +def EH_RETURN64 : I<0xC3, RawFrm, (outs), (ins GR64:$addr), + "ret\t#eh_return, addr: $addr", + [(X86ehret GR64:$addr)], IIC_RET>, Sched<[WriteJumpLd]>; + +} + +let isTerminator = 1, hasSideEffects = 1, isBarrier = 1, hasCtrlDep = 1, + isCodeGenOnly = 1, isReturn = 1 in { + def CLEANUPRET : I<0, Pseudo, (outs), (ins), "# CLEANUPRET", [(cleanupret)]>; + + // CATCHRET needs a custom inserter for SEH. + let usesCustomInserter = 1 in + def CATCHRET : I<0, Pseudo, (outs), (ins brtarget32:$dst, brtarget32:$from), + "# CATCHRET", + [(catchret bb:$dst, bb:$from)]>; +} + +let hasSideEffects = 1, hasCtrlDep = 1, isCodeGenOnly = 1, + usesCustomInserter = 1 in +def CATCHPAD : I<0, Pseudo, (outs), (ins), "# CATCHPAD", [(catchpad)]>; + +// This instruction is responsible for re-establishing stack pointers after an +// exception has been caught and we are rejoining normal control flow in the +// parent function or funclet. It generally sets ESP and EBP, and optionally +// ESI. It is only needed for 32-bit WinEH, as the runtime restores CSRs for us +// elsewhere. +let hasSideEffects = 1, hasCtrlDep = 1, isCodeGenOnly = 1 in +def EH_RESTORE : I<0, Pseudo, (outs), (ins), "# EH_RESTORE", []>; + +let hasSideEffects = 1, isBarrier = 1, isCodeGenOnly = 1, + usesCustomInserter = 1 in { + def EH_SjLj_SetJmp32 : I<0, Pseudo, (outs GR32:$dst), (ins i32mem:$buf), + "#EH_SJLJ_SETJMP32", + [(set GR32:$dst, (X86eh_sjlj_setjmp addr:$buf))]>, + Requires<[Not64BitMode]>; + def EH_SjLj_SetJmp64 : I<0, Pseudo, (outs GR32:$dst), (ins i64mem:$buf), + "#EH_SJLJ_SETJMP64", + [(set GR32:$dst, (X86eh_sjlj_setjmp addr:$buf))]>, + Requires<[In64BitMode]>; + let isTerminator = 1 in { + def EH_SjLj_LongJmp32 : I<0, Pseudo, (outs), (ins i32mem:$buf), + "#EH_SJLJ_LONGJMP32", + [(X86eh_sjlj_longjmp addr:$buf)]>, + Requires<[Not64BitMode]>; + def EH_SjLj_LongJmp64 : I<0, Pseudo, (outs), (ins i64mem:$buf), + "#EH_SJLJ_LONGJMP64", + [(X86eh_sjlj_longjmp addr:$buf)]>, + Requires<[In64BitMode]>; + } +} +} // SchedRW + +let isBranch = 1, isTerminator = 1, isCodeGenOnly = 1 in { + def EH_SjLj_Setup : I<0, Pseudo, (outs), (ins brtarget:$dst), + "#EH_SjLj_Setup\t$dst", []>; +} + +//===----------------------------------------------------------------------===// +// Pseudo instructions used by unwind info. 
+// +let isPseudo = 1 in { + def SEH_PushReg : I<0, Pseudo, (outs), (ins i32imm:$reg), + "#SEH_PushReg $reg", []>; + def SEH_SaveReg : I<0, Pseudo, (outs), (ins i32imm:$reg, i32imm:$dst), + "#SEH_SaveReg $reg, $dst", []>; + def SEH_SaveXMM : I<0, Pseudo, (outs), (ins i32imm:$reg, i32imm:$dst), + "#SEH_SaveXMM $reg, $dst", []>; + def SEH_StackAlloc : I<0, Pseudo, (outs), (ins i32imm:$size), + "#SEH_StackAlloc $size", []>; + def SEH_SetFrame : I<0, Pseudo, (outs), (ins i32imm:$reg, i32imm:$offset), + "#SEH_SetFrame $reg, $offset", []>; + def SEH_PushFrame : I<0, Pseudo, (outs), (ins i1imm:$mode), + "#SEH_PushFrame $mode", []>; + def SEH_EndPrologue : I<0, Pseudo, (outs), (ins), + "#SEH_EndPrologue", []>; + def SEH_Epilogue : I<0, Pseudo, (outs), (ins), + "#SEH_Epilogue", []>; +} + +//===----------------------------------------------------------------------===// +// Pseudo instructions used by segmented stacks. +// + +// This is lowered into a RET instruction by MCInstLower. We need +// this so that we don't have to have a MachineBasicBlock which ends +// with a RET and also has successors. +let isPseudo = 1 in { +def MORESTACK_RET: I<0, Pseudo, (outs), (ins), + "", []>; + +// This instruction is lowered to a RET followed by a MOV. The two +// instructions are not generated on a higher level since then the +// verifier sees a MachineBasicBlock ending with a non-terminator. +def MORESTACK_RET_RESTORE_R10 : I<0, Pseudo, (outs), (ins), + "", []>; +} + +//===----------------------------------------------------------------------===// +// Alias Instructions +//===----------------------------------------------------------------------===// + +// Alias instruction mapping movr0 to xor. +// FIXME: remove when we can teach regalloc that xor reg, reg is ok. +let Defs = [EFLAGS], isReMaterializable = 1, isAsCheapAsAMove = 1, + isPseudo = 1 in +def MOV32r0 : I<0, Pseudo, (outs GR32:$dst), (ins), "", + [(set GR32:$dst, 0)], IIC_ALU_NONMEM>, Sched<[WriteZero]>; + +// Other widths can also make use of the 32-bit xor, which may have a smaller +// encoding and avoid partial register updates. +def : Pat<(i8 0), (EXTRACT_SUBREG (MOV32r0), sub_8bit)>; +def : Pat<(i16 0), (EXTRACT_SUBREG (MOV32r0), sub_16bit)>; +def : Pat<(i64 0), (SUBREG_TO_REG (i64 0), (MOV32r0), sub_32bit)> { + let AddedComplexity = 20; +} + +let Predicates = [OptForSize, NotSlowIncDec, Not64BitMode], + AddedComplexity = 1 in { + // Pseudo instructions for materializing 1 and -1 using XOR+INC/DEC, + // which only require 3 bytes compared to MOV32ri which requires 5. + let Defs = [EFLAGS], isReMaterializable = 1, isPseudo = 1 in { + def MOV32r1 : I<0, Pseudo, (outs GR32:$dst), (ins), "", + [(set GR32:$dst, 1)]>; + def MOV32r_1 : I<0, Pseudo, (outs GR32:$dst), (ins), "", + [(set GR32:$dst, -1)]>; + } + + // MOV16ri is 4 bytes, so the instructions above are smaller. + def : Pat<(i16 1), (EXTRACT_SUBREG (MOV32r1), sub_16bit)>; + def : Pat<(i16 -1), (EXTRACT_SUBREG (MOV32r_1), sub_16bit)>; +} + +// Materialize i64 constant where top 32-bits are zero. This could theoretically +// use MOV32ri with a SUBREG_TO_REG to represent the zero-extension, however +// that would make it more difficult to rematerialize. +let isReMaterializable = 1, isAsCheapAsAMove = 1, + isPseudo = 1, hasSideEffects = 0 in +def MOV32ri64 : I<0, Pseudo, (outs GR32:$dst), (ins i64i32imm:$src), "", []>; + +// This 64-bit pseudo-move can be used for both a 64-bit constant that is +// actually the zero-extension of a 32-bit constant and for labels in the +// x86-64 small code model. 
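The MOV32r0/MOV32r1/MOV32r_1 pseudos and the MOV32ri64 pattern above all trade on encoding size plus the fact that writing a 32-bit register zero-extends to 64 bits. A C++ sketch of that trade-off (bytesToMaterialize64 is an illustrative helper, not LLVM code; the byte counts are standard x86 encodings, and the xor forms additionally clobber EFLAGS, which is why MOV32r0 lists Defs = [EFLAGS]):

#include <cstdint>
#include <cstdio>

static unsigned bytesToMaterialize64(int64_t V, bool In64BitMode) {
  if (V == 0)
    return 2;                                   // xor r32, r32
  if ((V == 1 || V == -1) && !In64BitMode)
    return 3;                                   // xor r32, r32 ; inc/dec r32
  if ((uint64_t)V <= 0xFFFFFFFFull)
    return 5;                                   // mov r32, imm32 (zero-extends)
  if (V >= INT32_MIN && V <= INT32_MAX)
    return 7;                                   // REX.W mov r64, simm32 (sign-extends)
  return 10;                                    // movabs r64, imm64
}

int main() {
  std::printf("0      -> %u bytes\n", bytesToMaterialize64(0, true));           // 2
  std::printf("1      -> %u bytes\n", bytesToMaterialize64(1, false));          // 3
  std::printf("2^32-1 -> %u bytes\n", bytesToMaterialize64(0xFFFFFFFF, true));  // 5
  std::printf("-4     -> %u bytes\n", bytesToMaterialize64(-4, true));          // 7
  std::printf("2^40   -> %u bytes\n", bytesToMaterialize64(1ll << 40, true));   // 10
  return 0;
}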
+def mov64imm32 : ComplexPattern<i64, 1, "selectMOV64Imm32", [imm, X86Wrapper]>; + +let AddedComplexity = 1 in +def : Pat<(i64 mov64imm32:$src), + (SUBREG_TO_REG (i64 0), (MOV32ri64 mov64imm32:$src), sub_32bit)>; + +// Use sbb to materialize carry bit. +let Uses = [EFLAGS], Defs = [EFLAGS], isPseudo = 1, SchedRW = [WriteALU] in { +// FIXME: These are pseudo ops that should be replaced with Pat<> patterns. +// However, Pat<> can't replicate the destination reg into the inputs of the +// result. +def SETB_C8r : I<0, Pseudo, (outs GR8:$dst), (ins), "", + [(set GR8:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>; +def SETB_C16r : I<0, Pseudo, (outs GR16:$dst), (ins), "", + [(set GR16:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>; +def SETB_C32r : I<0, Pseudo, (outs GR32:$dst), (ins), "", + [(set GR32:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>; +def SETB_C64r : I<0, Pseudo, (outs GR64:$dst), (ins), "", + [(set GR64:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>; +} // isCodeGenOnly + + +def : Pat<(i16 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))), + (SETB_C16r)>; +def : Pat<(i32 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))), + (SETB_C32r)>; +def : Pat<(i64 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))), + (SETB_C64r)>; + +def : Pat<(i16 (sext (i8 (X86setcc_c X86_COND_B, EFLAGS)))), + (SETB_C16r)>; +def : Pat<(i32 (sext (i8 (X86setcc_c X86_COND_B, EFLAGS)))), + (SETB_C32r)>; +def : Pat<(i64 (sext (i8 (X86setcc_c X86_COND_B, EFLAGS)))), + (SETB_C64r)>; + +// We canonicalize 'setb' to "(and (sbb reg,reg), 1)" on the hope that the and +// will be eliminated and that the sbb can be extended up to a wider type. When +// this happens, it is great. However, if we are left with an 8-bit sbb and an +// and, we might as well just match it as a setb. +def : Pat<(and (i8 (X86setcc_c X86_COND_B, EFLAGS)), 1), + (SETBr)>; + +// (add OP, SETB) -> (adc OP, 0) +def : Pat<(add (and (i8 (X86setcc_c X86_COND_B, EFLAGS)), 1), GR8:$op), + (ADC8ri GR8:$op, 0)>; +def : Pat<(add (and (i32 (X86setcc_c X86_COND_B, EFLAGS)), 1), GR32:$op), + (ADC32ri8 GR32:$op, 0)>; +def : Pat<(add (and (i64 (X86setcc_c X86_COND_B, EFLAGS)), 1), GR64:$op), + (ADC64ri8 GR64:$op, 0)>; + +// (sub OP, SETB) -> (sbb OP, 0) +def : Pat<(sub GR8:$op, (and (i8 (X86setcc_c X86_COND_B, EFLAGS)), 1)), + (SBB8ri GR8:$op, 0)>; +def : Pat<(sub GR32:$op, (and (i32 (X86setcc_c X86_COND_B, EFLAGS)), 1)), + (SBB32ri8 GR32:$op, 0)>; +def : Pat<(sub GR64:$op, (and (i64 (X86setcc_c X86_COND_B, EFLAGS)), 1)), + (SBB64ri8 GR64:$op, 0)>; + +// (sub OP, SETCC_CARRY) -> (adc OP, 0) +def : Pat<(sub GR8:$op, (i8 (X86setcc_c X86_COND_B, EFLAGS))), + (ADC8ri GR8:$op, 0)>; +def : Pat<(sub GR32:$op, (i32 (X86setcc_c X86_COND_B, EFLAGS))), + (ADC32ri8 GR32:$op, 0)>; +def : Pat<(sub GR64:$op, (i64 (X86setcc_c X86_COND_B, EFLAGS))), + (ADC64ri8 GR64:$op, 0)>; + +//===----------------------------------------------------------------------===// +// String Pseudo Instructions +// +let SchedRW = [WriteMicrocoded] in { +let Defs = [ECX,EDI,ESI], Uses = [ECX,EDI,ESI], isCodeGenOnly = 1 in { +def REP_MOVSB_32 : I<0xA4, RawFrm, (outs), (ins), "{rep;movsb|rep movsb}", + [(X86rep_movs i8)], IIC_REP_MOVS>, REP, + Requires<[Not64BitMode]>; +def REP_MOVSW_32 : I<0xA5, RawFrm, (outs), (ins), "{rep;movsw|rep movsw}", + [(X86rep_movs i16)], IIC_REP_MOVS>, REP, OpSize16, + Requires<[Not64BitMode]>; +def REP_MOVSD_32 : I<0xA5, RawFrm, (outs), (ins), "{rep;movsl|rep movsd}", + [(X86rep_movs i32)], IIC_REP_MOVS>, REP, OpSize32, + Requires<[Not64BitMode]>; +} + +let Defs = [RCX,RDI,RSI], Uses = [RCX,RDI,RSI], 
isCodeGenOnly = 1 in { +def REP_MOVSB_64 : I<0xA4, RawFrm, (outs), (ins), "{rep;movsb|rep movsb}", + [(X86rep_movs i8)], IIC_REP_MOVS>, REP, + Requires<[In64BitMode]>; +def REP_MOVSW_64 : I<0xA5, RawFrm, (outs), (ins), "{rep;movsw|rep movsw}", + [(X86rep_movs i16)], IIC_REP_MOVS>, REP, OpSize16, + Requires<[In64BitMode]>; +def REP_MOVSD_64 : I<0xA5, RawFrm, (outs), (ins), "{rep;movsl|rep movsd}", + [(X86rep_movs i32)], IIC_REP_MOVS>, REP, OpSize32, + Requires<[In64BitMode]>; +def REP_MOVSQ_64 : RI<0xA5, RawFrm, (outs), (ins), "{rep;movsq|rep movsq}", + [(X86rep_movs i64)], IIC_REP_MOVS>, REP, + Requires<[In64BitMode]>; +} + +// FIXME: Should use "(X86rep_stos AL)" as the pattern. +let Defs = [ECX,EDI], isCodeGenOnly = 1 in { + let Uses = [AL,ECX,EDI] in + def REP_STOSB_32 : I<0xAA, RawFrm, (outs), (ins), "{rep;stosb|rep stosb}", + [(X86rep_stos i8)], IIC_REP_STOS>, REP, + Requires<[Not64BitMode]>; + let Uses = [AX,ECX,EDI] in + def REP_STOSW_32 : I<0xAB, RawFrm, (outs), (ins), "{rep;stosw|rep stosw}", + [(X86rep_stos i16)], IIC_REP_STOS>, REP, OpSize16, + Requires<[Not64BitMode]>; + let Uses = [EAX,ECX,EDI] in + def REP_STOSD_32 : I<0xAB, RawFrm, (outs), (ins), "{rep;stosl|rep stosd}", + [(X86rep_stos i32)], IIC_REP_STOS>, REP, OpSize32, + Requires<[Not64BitMode]>; +} + +let Defs = [RCX,RDI], isCodeGenOnly = 1 in { + let Uses = [AL,RCX,RDI] in + def REP_STOSB_64 : I<0xAA, RawFrm, (outs), (ins), "{rep;stosb|rep stosb}", + [(X86rep_stos i8)], IIC_REP_STOS>, REP, + Requires<[In64BitMode]>; + let Uses = [AX,RCX,RDI] in + def REP_STOSW_64 : I<0xAB, RawFrm, (outs), (ins), "{rep;stosw|rep stosw}", + [(X86rep_stos i16)], IIC_REP_STOS>, REP, OpSize16, + Requires<[In64BitMode]>; + let Uses = [RAX,RCX,RDI] in + def REP_STOSD_64 : I<0xAB, RawFrm, (outs), (ins), "{rep;stosl|rep stosd}", + [(X86rep_stos i32)], IIC_REP_STOS>, REP, OpSize32, + Requires<[In64BitMode]>; + + let Uses = [RAX,RCX,RDI] in + def REP_STOSQ_64 : RI<0xAB, RawFrm, (outs), (ins), "{rep;stosq|rep stosq}", + [(X86rep_stos i64)], IIC_REP_STOS>, REP, + Requires<[In64BitMode]>; +} +} // SchedRW + +//===----------------------------------------------------------------------===// +// Thread Local Storage Instructions +// + +// ELF TLS Support +// All calls clobber the non-callee saved registers. ESP is marked as +// a use to prevent stack-pointer assignments that appear immediately +// before calls from potentially appearing dead. +let Defs = [EAX, ECX, EDX, FP0, FP1, FP2, FP3, FP4, FP5, FP6, FP7, + ST0, ST1, ST2, ST3, ST4, ST5, ST6, ST7, + MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7, + XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, + XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS], + Uses = [ESP] in { +def TLS_addr32 : I<0, Pseudo, (outs), (ins i32mem:$sym), + "# TLS_addr32", + [(X86tlsaddr tls32addr:$sym)]>, + Requires<[Not64BitMode]>; +def TLS_base_addr32 : I<0, Pseudo, (outs), (ins i32mem:$sym), + "# TLS_base_addr32", + [(X86tlsbaseaddr tls32baseaddr:$sym)]>, + Requires<[Not64BitMode]>; +} + +// All calls clobber the non-callee saved registers. RSP is marked as +// a use to prevent stack-pointer assignments that appear immediately +// before calls from potentially appearing dead. 
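+// For reference, on x86-64 ELF the general-dynamic access that TLS_addr64
+// stands for is ultimately a call sequence roughly of the form
+//   leaq sym@tlsgd(%rip), %rdi
+//   call __tls_get_addr@PLT
+// i.e. an ordinary call, which is why the full caller-saved GPR, x87, MMX and
+// XMM register set is listed as clobbered below.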
+let Defs = [RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11, + FP0, FP1, FP2, FP3, FP4, FP5, FP6, FP7, + ST0, ST1, ST2, ST3, ST4, ST5, ST6, ST7, + MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7, + XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, + XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS], + Uses = [RSP] in { +def TLS_addr64 : I<0, Pseudo, (outs), (ins i64mem:$sym), + "# TLS_addr64", + [(X86tlsaddr tls64addr:$sym)]>, + Requires<[In64BitMode]>; +def TLS_base_addr64 : I<0, Pseudo, (outs), (ins i64mem:$sym), + "# TLS_base_addr64", + [(X86tlsbaseaddr tls64baseaddr:$sym)]>, + Requires<[In64BitMode]>; +} + +// Darwin TLS Support +// For i386, the address of the thunk is passed on the stack, on return the +// address of the variable is in %eax. %ecx is trashed during the function +// call. All other registers are preserved. +let Defs = [EAX, ECX, EFLAGS], + Uses = [ESP], + usesCustomInserter = 1 in +def TLSCall_32 : I<0, Pseudo, (outs), (ins i32mem:$sym), + "# TLSCall_32", + [(X86TLSCall addr:$sym)]>, + Requires<[Not64BitMode]>; + +// For x86_64, the address of the thunk is passed in %rdi, on return +// the address of the variable is in %rax. All other registers are preserved. +let Defs = [RAX, EFLAGS], + Uses = [RSP, RDI], + usesCustomInserter = 1 in +def TLSCall_64 : I<0, Pseudo, (outs), (ins i64mem:$sym), + "# TLSCall_64", + [(X86TLSCall addr:$sym)]>, + Requires<[In64BitMode]>; + + +//===----------------------------------------------------------------------===// +// Conditional Move Pseudo Instructions + +// CMOV* - Used to implement the SELECT DAG operation. Expanded after +// instruction selection into a branch sequence. +multiclass CMOVrr_PSEUDO<RegisterClass RC, ValueType VT> { + def CMOV#NAME : I<0, Pseudo, + (outs RC:$dst), (ins RC:$t, RC:$f, i8imm:$cond), + "#CMOV_"#NAME#" PSEUDO!", + [(set RC:$dst, (VT (X86cmov RC:$t, RC:$f, imm:$cond, + EFLAGS)))]>; +} + +let usesCustomInserter = 1, Uses = [EFLAGS] in { + // X86 doesn't have 8-bit conditional moves. Use a customInserter to + // emit control flow. An alternative to this is to mark i8 SELECT as Promote, + // however that requires promoting the operands, and can induce additional + // i8 register pressure. + defm _GR8 : CMOVrr_PSEUDO<GR8, i8>; + + let Predicates = [NoCMov] in { + defm _GR32 : CMOVrr_PSEUDO<GR32, i32>; + defm _GR16 : CMOVrr_PSEUDO<GR16, i16>; + } // Predicates = [NoCMov] + + // fcmov doesn't handle all possible EFLAGS, provide a fallback if there is no + // SSE1/SSE2. 
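+  // Rough sketch of what the custom inserter produces for each CMOV_* pseudo:
+  // a conditional branch around a copy, merged by a PHI, conceptually
+  //   jCC  .Lsink        # condition holds: keep $t
+  //   ...                # fall-through block supplies $f
+  // .Lsink:
+  //   $dst = phi($t, $f)
+  // which is why these classes need usesCustomInserter instead of a Pat<>.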
+ let Predicates = [FPStackf32] in + defm _RFP32 : CMOVrr_PSEUDO<RFP32, f32>; + + let Predicates = [FPStackf64] in + defm _RFP64 : CMOVrr_PSEUDO<RFP64, f64>; + + defm _RFP80 : CMOVrr_PSEUDO<RFP80, f80>; + + defm _FR32 : CMOVrr_PSEUDO<FR32, f32>; + defm _FR64 : CMOVrr_PSEUDO<FR64, f64>; + defm _FR128 : CMOVrr_PSEUDO<FR128, f128>; + defm _V4F32 : CMOVrr_PSEUDO<VR128, v4f32>; + defm _V2F64 : CMOVrr_PSEUDO<VR128, v2f64>; + defm _V2I64 : CMOVrr_PSEUDO<VR128, v2i64>; + defm _V8F32 : CMOVrr_PSEUDO<VR256, v8f32>; + defm _V4F64 : CMOVrr_PSEUDO<VR256, v4f64>; + defm _V4I64 : CMOVrr_PSEUDO<VR256, v4i64>; + defm _V8I64 : CMOVrr_PSEUDO<VR512, v8i64>; + defm _V8F64 : CMOVrr_PSEUDO<VR512, v8f64>; + defm _V16F32 : CMOVrr_PSEUDO<VR512, v16f32>; + defm _V8I1 : CMOVrr_PSEUDO<VK8, v8i1>; + defm _V16I1 : CMOVrr_PSEUDO<VK16, v16i1>; + defm _V32I1 : CMOVrr_PSEUDO<VK32, v32i1>; + defm _V64I1 : CMOVrr_PSEUDO<VK64, v64i1>; +} // usesCustomInserter = 1, Uses = [EFLAGS] + +//===----------------------------------------------------------------------===// +// Normal-Instructions-With-Lock-Prefix Pseudo Instructions +//===----------------------------------------------------------------------===// + +// FIXME: Use normal instructions and add lock prefix dynamically. + +// Memory barriers + +// TODO: Get this to fold the constant into the instruction. +let isCodeGenOnly = 1, Defs = [EFLAGS] in +def OR32mrLocked : I<0x09, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$zero), + "or{l}\t{$zero, $dst|$dst, $zero}", [], + IIC_ALU_MEM>, Requires<[Not64BitMode]>, OpSize32, LOCK, + Sched<[WriteALULd, WriteRMW]>; + +let hasSideEffects = 1 in +def Int_MemBarrier : I<0, Pseudo, (outs), (ins), + "#MEMBARRIER", + [(X86MemBarrier)]>, Sched<[WriteLoad]>; + +// RegOpc corresponds to the mr version of the instruction +// ImmOpc corresponds to the mi version of the instruction +// ImmOpc8 corresponds to the mi8 version of the instruction +// ImmMod corresponds to the instruction format of the mi and mi8 versions +multiclass LOCK_ArithBinOp<bits<8> RegOpc, bits<8> ImmOpc, bits<8> ImmOpc8, + Format ImmMod, string mnemonic> { +let Defs = [EFLAGS], mayLoad = 1, mayStore = 1, isCodeGenOnly = 1, + SchedRW = [WriteALULd, WriteRMW] in { + +def NAME#8mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4}, + RegOpc{3}, RegOpc{2}, RegOpc{1}, 0 }, + MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src2), + !strconcat(mnemonic, "{b}\t", + "{$src2, $dst|$dst, $src2}"), + [], IIC_ALU_NONMEM>, LOCK; +def NAME#16mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4}, + RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 }, + MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2), + !strconcat(mnemonic, "{w}\t", + "{$src2, $dst|$dst, $src2}"), + [], IIC_ALU_NONMEM>, OpSize16, LOCK; +def NAME#32mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4}, + RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 }, + MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2), + !strconcat(mnemonic, "{l}\t", + "{$src2, $dst|$dst, $src2}"), + [], IIC_ALU_NONMEM>, OpSize32, LOCK; +def NAME#64mr : RI<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4}, + RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 }, + MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2), + !strconcat(mnemonic, "{q}\t", + "{$src2, $dst|$dst, $src2}"), + [], IIC_ALU_NONMEM>, LOCK; + +def NAME#8mi : Ii8<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4}, + ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 0 }, + ImmMod, (outs), (ins i8mem :$dst, i8imm :$src2), + !strconcat(mnemonic, "{b}\t", + "{$src2, $dst|$dst, $src2}"), + [], IIC_ALU_MEM>, LOCK; + +def NAME#16mi : Ii16<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4}, + 
ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 }, + ImmMod, (outs), (ins i16mem :$dst, i16imm :$src2), + !strconcat(mnemonic, "{w}\t", + "{$src2, $dst|$dst, $src2}"), + [], IIC_ALU_MEM>, OpSize16, LOCK; + +def NAME#32mi : Ii32<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4}, + ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 }, + ImmMod, (outs), (ins i32mem :$dst, i32imm :$src2), + !strconcat(mnemonic, "{l}\t", + "{$src2, $dst|$dst, $src2}"), + [], IIC_ALU_MEM>, OpSize32, LOCK; + +def NAME#64mi32 : RIi32S<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4}, + ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 }, + ImmMod, (outs), (ins i64mem :$dst, i64i32imm :$src2), + !strconcat(mnemonic, "{q}\t", + "{$src2, $dst|$dst, $src2}"), + [], IIC_ALU_MEM>, LOCK; + +def NAME#16mi8 : Ii8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4}, + ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 }, + ImmMod, (outs), (ins i16mem :$dst, i16i8imm :$src2), + !strconcat(mnemonic, "{w}\t", + "{$src2, $dst|$dst, $src2}"), + [], IIC_ALU_MEM>, OpSize16, LOCK; +def NAME#32mi8 : Ii8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4}, + ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 }, + ImmMod, (outs), (ins i32mem :$dst, i32i8imm :$src2), + !strconcat(mnemonic, "{l}\t", + "{$src2, $dst|$dst, $src2}"), + [], IIC_ALU_MEM>, OpSize32, LOCK; +def NAME#64mi8 : RIi8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4}, + ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 }, + ImmMod, (outs), (ins i64mem :$dst, i64i8imm :$src2), + !strconcat(mnemonic, "{q}\t", + "{$src2, $dst|$dst, $src2}"), + [], IIC_ALU_MEM>, LOCK; + +} + +} + +defm LOCK_ADD : LOCK_ArithBinOp<0x00, 0x80, 0x83, MRM0m, "add">; +defm LOCK_SUB : LOCK_ArithBinOp<0x28, 0x80, 0x83, MRM5m, "sub">; +defm LOCK_OR : LOCK_ArithBinOp<0x08, 0x80, 0x83, MRM1m, "or">; +defm LOCK_AND : LOCK_ArithBinOp<0x20, 0x80, 0x83, MRM4m, "and">; +defm LOCK_XOR : LOCK_ArithBinOp<0x30, 0x80, 0x83, MRM6m, "xor">; + +// Optimized codegen when the non-memory output is not used. +multiclass LOCK_ArithUnOp<bits<8> Opc8, bits<8> Opc, Format Form, + string mnemonic> { +let Defs = [EFLAGS], mayLoad = 1, mayStore = 1, isCodeGenOnly = 1, + SchedRW = [WriteALULd, WriteRMW] in { + +def NAME#8m : I<Opc8, Form, (outs), (ins i8mem :$dst), + !strconcat(mnemonic, "{b}\t$dst"), + [], IIC_UNARY_MEM>, LOCK; +def NAME#16m : I<Opc, Form, (outs), (ins i16mem:$dst), + !strconcat(mnemonic, "{w}\t$dst"), + [], IIC_UNARY_MEM>, OpSize16, LOCK; +def NAME#32m : I<Opc, Form, (outs), (ins i32mem:$dst), + !strconcat(mnemonic, "{l}\t$dst"), + [], IIC_UNARY_MEM>, OpSize32, LOCK; +def NAME#64m : RI<Opc, Form, (outs), (ins i64mem:$dst), + !strconcat(mnemonic, "{q}\t$dst"), + [], IIC_UNARY_MEM>, LOCK; +} +} + +defm LOCK_INC : LOCK_ArithUnOp<0xFE, 0xFF, MRM0m, "inc">; +defm LOCK_DEC : LOCK_ArithUnOp<0xFE, 0xFF, MRM1m, "dec">; + +// Atomic compare and swap. 
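+// Reminder of the underlying semantics: "lock cmpxchg{b,w,l,q} $swap, (mem)"
+// compares the accumulator (AL/AX/EAX/RAX) with (mem); if they are equal it
+// stores $swap, otherwise it loads (mem) into the accumulator, with ZF
+// reporting which case happened. That is why the defs/uses below pin the
+// accumulator register for every width.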
+multiclass LCMPXCHG_UnOp<bits<8> Opc, Format Form, string mnemonic, + SDPatternOperator frag, X86MemOperand x86memop, + InstrItinClass itin> { +let isCodeGenOnly = 1 in { + def NAME : I<Opc, Form, (outs), (ins x86memop:$ptr), + !strconcat(mnemonic, "\t$ptr"), + [(frag addr:$ptr)], itin>, TB, LOCK; +} +} + +multiclass LCMPXCHG_BinOp<bits<8> Opc8, bits<8> Opc, Format Form, + string mnemonic, SDPatternOperator frag, + InstrItinClass itin8, InstrItinClass itin> { +let isCodeGenOnly = 1, SchedRW = [WriteALULd, WriteRMW] in { + let Defs = [AL, EFLAGS], Uses = [AL] in + def NAME#8 : I<Opc8, Form, (outs), (ins i8mem:$ptr, GR8:$swap), + !strconcat(mnemonic, "{b}\t{$swap, $ptr|$ptr, $swap}"), + [(frag addr:$ptr, GR8:$swap, 1)], itin8>, TB, LOCK; + let Defs = [AX, EFLAGS], Uses = [AX] in + def NAME#16 : I<Opc, Form, (outs), (ins i16mem:$ptr, GR16:$swap), + !strconcat(mnemonic, "{w}\t{$swap, $ptr|$ptr, $swap}"), + [(frag addr:$ptr, GR16:$swap, 2)], itin>, TB, OpSize16, LOCK; + let Defs = [EAX, EFLAGS], Uses = [EAX] in + def NAME#32 : I<Opc, Form, (outs), (ins i32mem:$ptr, GR32:$swap), + !strconcat(mnemonic, "{l}\t{$swap, $ptr|$ptr, $swap}"), + [(frag addr:$ptr, GR32:$swap, 4)], itin>, TB, OpSize32, LOCK; + let Defs = [RAX, EFLAGS], Uses = [RAX] in + def NAME#64 : RI<Opc, Form, (outs), (ins i64mem:$ptr, GR64:$swap), + !strconcat(mnemonic, "{q}\t{$swap, $ptr|$ptr, $swap}"), + [(frag addr:$ptr, GR64:$swap, 8)], itin>, TB, LOCK; +} +} + +let Defs = [EAX, EDX, EFLAGS], Uses = [EAX, EBX, ECX, EDX], + SchedRW = [WriteALULd, WriteRMW] in { +defm LCMPXCHG8B : LCMPXCHG_UnOp<0xC7, MRM1m, "cmpxchg8b", + X86cas8, i64mem, + IIC_CMPX_LOCK_8B>; +} + +let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RBX, RCX, RDX], + Predicates = [HasCmpxchg16b], SchedRW = [WriteALULd, WriteRMW] in { +defm LCMPXCHG16B : LCMPXCHG_UnOp<0xC7, MRM1m, "cmpxchg16b", + X86cas16, i128mem, + IIC_CMPX_LOCK_16B>, REX_W; +} + +defm LCMPXCHG : LCMPXCHG_BinOp<0xB0, 0xB1, MRMDestMem, "cmpxchg", + X86cas, IIC_CMPX_LOCK_8, IIC_CMPX_LOCK>; + +// Atomic exchange and add +multiclass ATOMIC_LOAD_BINOP<bits<8> opc8, bits<8> opc, string mnemonic, + string frag, + InstrItinClass itin8, InstrItinClass itin> { + let Constraints = "$val = $dst", Defs = [EFLAGS], isCodeGenOnly = 1, + SchedRW = [WriteALULd, WriteRMW] in { + def NAME#8 : I<opc8, MRMSrcMem, (outs GR8:$dst), + (ins GR8:$val, i8mem:$ptr), + !strconcat(mnemonic, "{b}\t{$val, $ptr|$ptr, $val}"), + [(set GR8:$dst, + (!cast<PatFrag>(frag # "_8") addr:$ptr, GR8:$val))], + itin8>; + def NAME#16 : I<opc, MRMSrcMem, (outs GR16:$dst), + (ins GR16:$val, i16mem:$ptr), + !strconcat(mnemonic, "{w}\t{$val, $ptr|$ptr, $val}"), + [(set + GR16:$dst, + (!cast<PatFrag>(frag # "_16") addr:$ptr, GR16:$val))], + itin>, OpSize16; + def NAME#32 : I<opc, MRMSrcMem, (outs GR32:$dst), + (ins GR32:$val, i32mem:$ptr), + !strconcat(mnemonic, "{l}\t{$val, $ptr|$ptr, $val}"), + [(set + GR32:$dst, + (!cast<PatFrag>(frag # "_32") addr:$ptr, GR32:$val))], + itin>, OpSize32; + def NAME#64 : RI<opc, MRMSrcMem, (outs GR64:$dst), + (ins GR64:$val, i64mem:$ptr), + !strconcat(mnemonic, "{q}\t{$val, $ptr|$ptr, $val}"), + [(set + GR64:$dst, + (!cast<PatFrag>(frag # "_64") addr:$ptr, GR64:$val))], + itin>; + } +} + +defm LXADD : ATOMIC_LOAD_BINOP<0xc0, 0xc1, "xadd", "atomic_load_add", + IIC_XADD_LOCK_MEM8, IIC_XADD_LOCK_MEM>, + TB, LOCK; + +/* The following multiclass tries to make sure that in code like + * x.store (immediate op x.load(acquire), release) + * and + * x.store (register op x.load(acquire), release) + * an operation directly on 
memory is generated instead of wasting a register. + * It is not automatic as atomic_store/load are only lowered to MOV instructions + * extremely late to prevent them from being accidentally reordered in the backend + * (see below the RELEASE_MOV* / ACQUIRE_MOV* pseudo-instructions) + */ +multiclass RELEASE_BINOP_MI<SDNode op> { + def NAME#8mi : I<0, Pseudo, (outs), (ins i8mem:$dst, i8imm:$src), + "#BINOP "#NAME#"8mi PSEUDO!", + [(atomic_store_8 addr:$dst, (op + (atomic_load_8 addr:$dst), (i8 imm:$src)))]>; + def NAME#8mr : I<0, Pseudo, (outs), (ins i8mem:$dst, GR8:$src), + "#BINOP "#NAME#"8mr PSEUDO!", + [(atomic_store_8 addr:$dst, (op + (atomic_load_8 addr:$dst), GR8:$src))]>; + // NAME#16 is not generated as 16-bit arithmetic instructions are considered + // costly and avoided as far as possible by this backend anyway + def NAME#32mi : I<0, Pseudo, (outs), (ins i32mem:$dst, i32imm:$src), + "#BINOP "#NAME#"32mi PSEUDO!", + [(atomic_store_32 addr:$dst, (op + (atomic_load_32 addr:$dst), (i32 imm:$src)))]>; + def NAME#32mr : I<0, Pseudo, (outs), (ins i32mem:$dst, GR32:$src), + "#BINOP "#NAME#"32mr PSEUDO!", + [(atomic_store_32 addr:$dst, (op + (atomic_load_32 addr:$dst), GR32:$src))]>; + def NAME#64mi32 : I<0, Pseudo, (outs), (ins i64mem:$dst, i64i32imm:$src), + "#BINOP "#NAME#"64mi32 PSEUDO!", + [(atomic_store_64 addr:$dst, (op + (atomic_load_64 addr:$dst), (i64immSExt32:$src)))]>; + def NAME#64mr : I<0, Pseudo, (outs), (ins i64mem:$dst, GR64:$src), + "#BINOP "#NAME#"64mr PSEUDO!", + [(atomic_store_64 addr:$dst, (op + (atomic_load_64 addr:$dst), GR64:$src))]>; +} +let Defs = [EFLAGS] in { + defm RELEASE_ADD : RELEASE_BINOP_MI<add>; + defm RELEASE_AND : RELEASE_BINOP_MI<and>; + defm RELEASE_OR : RELEASE_BINOP_MI<or>; + defm RELEASE_XOR : RELEASE_BINOP_MI<xor>; + // Note: we don't deal with sub, because substractions of constants are + // optimized into additions before this code can run. +} + +// Same as above, but for floating-point. +// FIXME: imm version. +// FIXME: Version that doesn't clobber $src, using AVX's VADDSS. +// FIXME: This could also handle SIMD operations with *ps and *pd instructions. +let usesCustomInserter = 1 in { +multiclass RELEASE_FP_BINOP_MI<SDNode op> { + def NAME#32mr : I<0, Pseudo, (outs), (ins i32mem:$dst, FR32:$src), + "#BINOP "#NAME#"32mr PSEUDO!", + [(atomic_store_32 addr:$dst, + (i32 (bitconvert (op + (f32 (bitconvert (i32 (atomic_load_32 addr:$dst)))), + FR32:$src))))]>, Requires<[HasSSE1]>; + def NAME#64mr : I<0, Pseudo, (outs), (ins i64mem:$dst, FR64:$src), + "#BINOP "#NAME#"64mr PSEUDO!", + [(atomic_store_64 addr:$dst, + (i64 (bitconvert (op + (f64 (bitconvert (i64 (atomic_load_64 addr:$dst)))), + FR64:$src))))]>, Requires<[HasSSE2]>; +} +defm RELEASE_FADD : RELEASE_FP_BINOP_MI<fadd>; +// FIXME: Add fsub, fmul, fdiv, ... 
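+// Concrete example of the FP pattern above: for "std::atomic<float> x;
+// x.store(x.load(memory_order_acquire) + y, memory_order_release)" the 32-bit
+// atomic load and store stay ordinary integer moves while the add itself is
+// done in an SSE register; the bitconverts in the pattern exist only to move
+// the bits between the integer (atomic_load/store) and FP (fadd) types.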
+} + +multiclass RELEASE_UNOP<dag dag8, dag dag16, dag dag32, dag dag64> { + def NAME#8m : I<0, Pseudo, (outs), (ins i8mem:$dst), + "#UNOP "#NAME#"8m PSEUDO!", + [(atomic_store_8 addr:$dst, dag8)]>; + def NAME#16m : I<0, Pseudo, (outs), (ins i16mem:$dst), + "#UNOP "#NAME#"16m PSEUDO!", + [(atomic_store_16 addr:$dst, dag16)]>; + def NAME#32m : I<0, Pseudo, (outs), (ins i32mem:$dst), + "#UNOP "#NAME#"32m PSEUDO!", + [(atomic_store_32 addr:$dst, dag32)]>; + def NAME#64m : I<0, Pseudo, (outs), (ins i64mem:$dst), + "#UNOP "#NAME#"64m PSEUDO!", + [(atomic_store_64 addr:$dst, dag64)]>; +} + +let Defs = [EFLAGS] in { + defm RELEASE_INC : RELEASE_UNOP< + (add (atomic_load_8 addr:$dst), (i8 1)), + (add (atomic_load_16 addr:$dst), (i16 1)), + (add (atomic_load_32 addr:$dst), (i32 1)), + (add (atomic_load_64 addr:$dst), (i64 1))>, Requires<[NotSlowIncDec]>; + defm RELEASE_DEC : RELEASE_UNOP< + (add (atomic_load_8 addr:$dst), (i8 -1)), + (add (atomic_load_16 addr:$dst), (i16 -1)), + (add (atomic_load_32 addr:$dst), (i32 -1)), + (add (atomic_load_64 addr:$dst), (i64 -1))>, Requires<[NotSlowIncDec]>; +} +/* +TODO: These don't work because the type inference of TableGen fails. +TODO: find a way to fix it. +let Defs = [EFLAGS] in { + defm RELEASE_NEG : RELEASE_UNOP< + (ineg (atomic_load_8 addr:$dst)), + (ineg (atomic_load_16 addr:$dst)), + (ineg (atomic_load_32 addr:$dst)), + (ineg (atomic_load_64 addr:$dst))>; +} +// NOT doesn't set flags. +defm RELEASE_NOT : RELEASE_UNOP< + (not (atomic_load_8 addr:$dst)), + (not (atomic_load_16 addr:$dst)), + (not (atomic_load_32 addr:$dst)), + (not (atomic_load_64 addr:$dst))>; +*/ + +def RELEASE_MOV8mi : I<0, Pseudo, (outs), (ins i8mem:$dst, i8imm:$src), + "#RELEASE_MOV8mi PSEUDO!", + [(atomic_store_8 addr:$dst, (i8 imm:$src))]>; +def RELEASE_MOV16mi : I<0, Pseudo, (outs), (ins i16mem:$dst, i16imm:$src), + "#RELEASE_MOV16mi PSEUDO!", + [(atomic_store_16 addr:$dst, (i16 imm:$src))]>; +def RELEASE_MOV32mi : I<0, Pseudo, (outs), (ins i32mem:$dst, i32imm:$src), + "#RELEASE_MOV32mi PSEUDO!", + [(atomic_store_32 addr:$dst, (i32 imm:$src))]>; +def RELEASE_MOV64mi32 : I<0, Pseudo, (outs), (ins i64mem:$dst, i64i32imm:$src), + "#RELEASE_MOV64mi32 PSEUDO!", + [(atomic_store_64 addr:$dst, i64immSExt32:$src)]>; + +def RELEASE_MOV8mr : I<0, Pseudo, (outs), (ins i8mem :$dst, GR8 :$src), + "#RELEASE_MOV8mr PSEUDO!", + [(atomic_store_8 addr:$dst, GR8 :$src)]>; +def RELEASE_MOV16mr : I<0, Pseudo, (outs), (ins i16mem:$dst, GR16:$src), + "#RELEASE_MOV16mr PSEUDO!", + [(atomic_store_16 addr:$dst, GR16:$src)]>; +def RELEASE_MOV32mr : I<0, Pseudo, (outs), (ins i32mem:$dst, GR32:$src), + "#RELEASE_MOV32mr PSEUDO!", + [(atomic_store_32 addr:$dst, GR32:$src)]>; +def RELEASE_MOV64mr : I<0, Pseudo, (outs), (ins i64mem:$dst, GR64:$src), + "#RELEASE_MOV64mr PSEUDO!", + [(atomic_store_64 addr:$dst, GR64:$src)]>; + +def ACQUIRE_MOV8rm : I<0, Pseudo, (outs GR8 :$dst), (ins i8mem :$src), + "#ACQUIRE_MOV8rm PSEUDO!", + [(set GR8:$dst, (atomic_load_8 addr:$src))]>; +def ACQUIRE_MOV16rm : I<0, Pseudo, (outs GR16:$dst), (ins i16mem:$src), + "#ACQUIRE_MOV16rm PSEUDO!", + [(set GR16:$dst, (atomic_load_16 addr:$src))]>; +def ACQUIRE_MOV32rm : I<0, Pseudo, (outs GR32:$dst), (ins i32mem:$src), + "#ACQUIRE_MOV32rm PSEUDO!", + [(set GR32:$dst, (atomic_load_32 addr:$src))]>; +def ACQUIRE_MOV64rm : I<0, Pseudo, (outs GR64:$dst), (ins i64mem:$src), + "#ACQUIRE_MOV64rm PSEUDO!", + [(set GR64:$dst, (atomic_load_64 addr:$src))]>; + +//===----------------------------------------------------------------------===// +// 
DAG Pattern Matching Rules +//===----------------------------------------------------------------------===// + +// ConstantPool GlobalAddress, ExternalSymbol, and JumpTable +def : Pat<(i32 (X86Wrapper tconstpool :$dst)), (MOV32ri tconstpool :$dst)>; +def : Pat<(i32 (X86Wrapper tjumptable :$dst)), (MOV32ri tjumptable :$dst)>; +def : Pat<(i32 (X86Wrapper tglobaltlsaddr:$dst)),(MOV32ri tglobaltlsaddr:$dst)>; +def : Pat<(i32 (X86Wrapper tglobaladdr :$dst)), (MOV32ri tglobaladdr :$dst)>; +def : Pat<(i32 (X86Wrapper texternalsym:$dst)), (MOV32ri texternalsym:$dst)>; +def : Pat<(i32 (X86Wrapper mcsym:$dst)), (MOV32ri mcsym:$dst)>; +def : Pat<(i32 (X86Wrapper tblockaddress:$dst)), (MOV32ri tblockaddress:$dst)>; + +def : Pat<(add GR32:$src1, (X86Wrapper tconstpool:$src2)), + (ADD32ri GR32:$src1, tconstpool:$src2)>; +def : Pat<(add GR32:$src1, (X86Wrapper tjumptable:$src2)), + (ADD32ri GR32:$src1, tjumptable:$src2)>; +def : Pat<(add GR32:$src1, (X86Wrapper tglobaladdr :$src2)), + (ADD32ri GR32:$src1, tglobaladdr:$src2)>; +def : Pat<(add GR32:$src1, (X86Wrapper texternalsym:$src2)), + (ADD32ri GR32:$src1, texternalsym:$src2)>; +def : Pat<(add GR32:$src1, (X86Wrapper mcsym:$src2)), + (ADD32ri GR32:$src1, mcsym:$src2)>; +def : Pat<(add GR32:$src1, (X86Wrapper tblockaddress:$src2)), + (ADD32ri GR32:$src1, tblockaddress:$src2)>; + +def : Pat<(store (i32 (X86Wrapper tglobaladdr:$src)), addr:$dst), + (MOV32mi addr:$dst, tglobaladdr:$src)>; +def : Pat<(store (i32 (X86Wrapper texternalsym:$src)), addr:$dst), + (MOV32mi addr:$dst, texternalsym:$src)>; +def : Pat<(store (i32 (X86Wrapper mcsym:$src)), addr:$dst), + (MOV32mi addr:$dst, mcsym:$src)>; +def : Pat<(store (i32 (X86Wrapper tblockaddress:$src)), addr:$dst), + (MOV32mi addr:$dst, tblockaddress:$src)>; + +// ConstantPool GlobalAddress, ExternalSymbol, and JumpTable when not in small +// code model mode, should use 'movabs'. FIXME: This is really a hack, the +// 'movabs' predicate should handle this sort of thing. +def : Pat<(i64 (X86Wrapper tconstpool :$dst)), + (MOV64ri tconstpool :$dst)>, Requires<[FarData]>; +def : Pat<(i64 (X86Wrapper tjumptable :$dst)), + (MOV64ri tjumptable :$dst)>, Requires<[FarData]>; +def : Pat<(i64 (X86Wrapper tglobaladdr :$dst)), + (MOV64ri tglobaladdr :$dst)>, Requires<[FarData]>; +def : Pat<(i64 (X86Wrapper texternalsym:$dst)), + (MOV64ri texternalsym:$dst)>, Requires<[FarData]>; +def : Pat<(i64 (X86Wrapper mcsym:$dst)), + (MOV64ri mcsym:$dst)>, Requires<[FarData]>; +def : Pat<(i64 (X86Wrapper tblockaddress:$dst)), + (MOV64ri tblockaddress:$dst)>, Requires<[FarData]>; + +// In kernel code model, we can get the address of a label +// into a register with 'movq'. FIXME: This is a hack, the 'imm' predicate of +// the MOV64ri32 should accept these. +def : Pat<(i64 (X86Wrapper tconstpool :$dst)), + (MOV64ri32 tconstpool :$dst)>, Requires<[KernelCode]>; +def : Pat<(i64 (X86Wrapper tjumptable :$dst)), + (MOV64ri32 tjumptable :$dst)>, Requires<[KernelCode]>; +def : Pat<(i64 (X86Wrapper tglobaladdr :$dst)), + (MOV64ri32 tglobaladdr :$dst)>, Requires<[KernelCode]>; +def : Pat<(i64 (X86Wrapper texternalsym:$dst)), + (MOV64ri32 texternalsym:$dst)>, Requires<[KernelCode]>; +def : Pat<(i64 (X86Wrapper mcsym:$dst)), + (MOV64ri32 mcsym:$dst)>, Requires<[KernelCode]>; +def : Pat<(i64 (X86Wrapper tblockaddress:$dst)), + (MOV64ri32 tblockaddress:$dst)>, Requires<[KernelCode]>; + +// If we have small model and -static mode, it is safe to store global addresses +// directly as immediates. 
FIXME: This is really a hack, the 'imm' predicate +// for MOV64mi32 should handle this sort of thing. +def : Pat<(store (i64 (X86Wrapper tconstpool:$src)), addr:$dst), + (MOV64mi32 addr:$dst, tconstpool:$src)>, + Requires<[NearData, IsStatic]>; +def : Pat<(store (i64 (X86Wrapper tjumptable:$src)), addr:$dst), + (MOV64mi32 addr:$dst, tjumptable:$src)>, + Requires<[NearData, IsStatic]>; +def : Pat<(store (i64 (X86Wrapper tglobaladdr:$src)), addr:$dst), + (MOV64mi32 addr:$dst, tglobaladdr:$src)>, + Requires<[NearData, IsStatic]>; +def : Pat<(store (i64 (X86Wrapper texternalsym:$src)), addr:$dst), + (MOV64mi32 addr:$dst, texternalsym:$src)>, + Requires<[NearData, IsStatic]>; +def : Pat<(store (i64 (X86Wrapper mcsym:$src)), addr:$dst), + (MOV64mi32 addr:$dst, mcsym:$src)>, + Requires<[NearData, IsStatic]>; +def : Pat<(store (i64 (X86Wrapper tblockaddress:$src)), addr:$dst), + (MOV64mi32 addr:$dst, tblockaddress:$src)>, + Requires<[NearData, IsStatic]>; + +def : Pat<(i32 (X86RecoverFrameAlloc mcsym:$dst)), (MOV32ri mcsym:$dst)>; +def : Pat<(i64 (X86RecoverFrameAlloc mcsym:$dst)), (MOV64ri mcsym:$dst)>; + +// Calls + +// tls has some funny stuff here... +// This corresponds to movabs $foo@tpoff, %rax +def : Pat<(i64 (X86Wrapper tglobaltlsaddr :$dst)), + (MOV64ri32 tglobaltlsaddr :$dst)>; +// This corresponds to add $foo@tpoff, %rax +def : Pat<(add GR64:$src1, (X86Wrapper tglobaltlsaddr :$dst)), + (ADD64ri32 GR64:$src1, tglobaltlsaddr :$dst)>; + + +// Direct PC relative function call for small code model. 32-bit displacement +// sign extended to 64-bit. +def : Pat<(X86call (i64 tglobaladdr:$dst)), + (CALL64pcrel32 tglobaladdr:$dst)>; +def : Pat<(X86call (i64 texternalsym:$dst)), + (CALL64pcrel32 texternalsym:$dst)>; + +// Tailcall stuff. The TCRETURN instructions execute after the epilog, so they +// can never use callee-saved registers. That is the purpose of the GR64_TC +// register classes. +// +// The only volatile register that is never used by the calling convention is +// %r11. This happens when calling a vararg function with 6 arguments. +// +// Match an X86tcret that uses less than 7 volatile registers. +def X86tcret_6regs : PatFrag<(ops node:$ptr, node:$off), + (X86tcret node:$ptr, node:$off), [{ + // X86tcret args: (*chain, ptr, imm, regs..., glue) + unsigned NumRegs = 0; + for (unsigned i = 3, e = N->getNumOperands(); i != e; ++i) + if (isa<RegisterSDNode>(N->getOperand(i)) && ++NumRegs > 6) + return false; + return true; +}]>; + +def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off), + (TCRETURNri ptr_rc_tailcall:$dst, imm:$off)>, + Requires<[Not64BitMode]>; + +// FIXME: This is disabled for 32-bit PIC mode because the global base +// register which is part of the address mode may be assigned a +// callee-saved register. +def : Pat<(X86tcret (load addr:$dst), imm:$off), + (TCRETURNmi addr:$dst, imm:$off)>, + Requires<[Not64BitMode, IsNotPIC]>; + +def : Pat<(X86tcret (i32 tglobaladdr:$dst), imm:$off), + (TCRETURNdi tglobaladdr:$dst, imm:$off)>, + Requires<[NotLP64]>; + +def : Pat<(X86tcret (i32 texternalsym:$dst), imm:$off), + (TCRETURNdi texternalsym:$dst, imm:$off)>, + Requires<[NotLP64]>; + +def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off), + (TCRETURNri64 ptr_rc_tailcall:$dst, imm:$off)>, + Requires<[In64BitMode]>; + +// Don't fold loads into X86tcret requiring more than 6 regs. +// There wouldn't be enough scratch registers for base+index. 
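+// In other words: a folded memory-indirect tail call can need up to two GPRs
+// (base + index) for its address, and after the epilogue only volatile
+// registers not holding outgoing arguments are usable. With all six argument
+// registers live, %r11 is the only remaining scratch register, so the load is
+// folded only when the X86tcret_6regs check above (at most six register
+// operands) succeeds.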
+def : Pat<(X86tcret_6regs (load addr:$dst), imm:$off), + (TCRETURNmi64 addr:$dst, imm:$off)>, + Requires<[In64BitMode]>; + +def : Pat<(X86tcret (i64 tglobaladdr:$dst), imm:$off), + (TCRETURNdi64 tglobaladdr:$dst, imm:$off)>, + Requires<[IsLP64]>; + +def : Pat<(X86tcret (i64 texternalsym:$dst), imm:$off), + (TCRETURNdi64 texternalsym:$dst, imm:$off)>, + Requires<[IsLP64]>; + +// Normal calls, with various flavors of addresses. +def : Pat<(X86call (i32 tglobaladdr:$dst)), + (CALLpcrel32 tglobaladdr:$dst)>; +def : Pat<(X86call (i32 texternalsym:$dst)), + (CALLpcrel32 texternalsym:$dst)>; +def : Pat<(X86call (i32 imm:$dst)), + (CALLpcrel32 imm:$dst)>, Requires<[CallImmAddr]>; + +// Comparisons. + +// TEST R,R is smaller than CMP R,0 +def : Pat<(X86cmp GR8:$src1, 0), + (TEST8rr GR8:$src1, GR8:$src1)>; +def : Pat<(X86cmp GR16:$src1, 0), + (TEST16rr GR16:$src1, GR16:$src1)>; +def : Pat<(X86cmp GR32:$src1, 0), + (TEST32rr GR32:$src1, GR32:$src1)>; +def : Pat<(X86cmp GR64:$src1, 0), + (TEST64rr GR64:$src1, GR64:$src1)>; + +// Conditional moves with folded loads with operands swapped and conditions +// inverted. +multiclass CMOVmr<PatLeaf InvertedCond, Instruction Inst16, Instruction Inst32, + Instruction Inst64> { + let Predicates = [HasCMov] in { + def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, InvertedCond, EFLAGS), + (Inst16 GR16:$src2, addr:$src1)>; + def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, InvertedCond, EFLAGS), + (Inst32 GR32:$src2, addr:$src1)>; + def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, InvertedCond, EFLAGS), + (Inst64 GR64:$src2, addr:$src1)>; + } +} + +defm : CMOVmr<X86_COND_B , CMOVAE16rm, CMOVAE32rm, CMOVAE64rm>; +defm : CMOVmr<X86_COND_AE, CMOVB16rm , CMOVB32rm , CMOVB64rm>; +defm : CMOVmr<X86_COND_E , CMOVNE16rm, CMOVNE32rm, CMOVNE64rm>; +defm : CMOVmr<X86_COND_NE, CMOVE16rm , CMOVE32rm , CMOVE64rm>; +defm : CMOVmr<X86_COND_BE, CMOVA16rm , CMOVA32rm , CMOVA64rm>; +defm : CMOVmr<X86_COND_A , CMOVBE16rm, CMOVBE32rm, CMOVBE64rm>; +defm : CMOVmr<X86_COND_L , CMOVGE16rm, CMOVGE32rm, CMOVGE64rm>; +defm : CMOVmr<X86_COND_GE, CMOVL16rm , CMOVL32rm , CMOVL64rm>; +defm : CMOVmr<X86_COND_LE, CMOVG16rm , CMOVG32rm , CMOVG64rm>; +defm : CMOVmr<X86_COND_G , CMOVLE16rm, CMOVLE32rm, CMOVLE64rm>; +defm : CMOVmr<X86_COND_P , CMOVNP16rm, CMOVNP32rm, CMOVNP64rm>; +defm : CMOVmr<X86_COND_NP, CMOVP16rm , CMOVP32rm , CMOVP64rm>; +defm : CMOVmr<X86_COND_S , CMOVNS16rm, CMOVNS32rm, CMOVNS64rm>; +defm : CMOVmr<X86_COND_NS, CMOVS16rm , CMOVS32rm , CMOVS64rm>; +defm : CMOVmr<X86_COND_O , CMOVNO16rm, CMOVNO32rm, CMOVNO64rm>; +defm : CMOVmr<X86_COND_NO, CMOVO16rm , CMOVO32rm , CMOVO64rm>; + +// zextload bool -> zextload byte +def : Pat<(zextloadi8i1 addr:$src), (AND8ri (MOV8rm addr:$src), (i8 1))>; +def : Pat<(zextloadi16i1 addr:$src), (AND16ri8 (MOVZX16rm8 addr:$src), (i16 1))>; +def : Pat<(zextloadi32i1 addr:$src), (AND32ri8 (MOVZX32rm8 addr:$src), (i32 1))>; +def : Pat<(zextloadi64i1 addr:$src), + (SUBREG_TO_REG (i64 0), + (AND32ri8 (MOVZX32rm8 addr:$src), (i32 1)), sub_32bit)>; + +// extload bool -> extload byte +// When extloading from 16-bit and smaller memory locations into 64-bit +// registers, use zero-extending loads so that the entire 64-bit register is +// defined, avoiding partial-register updates. 
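+// Example: an i8 extload feeding an i64 use is selected as
+//   movzbl (mem), %eax   # clears bits 63:8 of %rax, no false dependence
+// wrapped in SUBREG_TO_REG, rather than "movb (mem), %al", which would merge
+// into the old value of %rax and cause a partial-register update.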
+ +def : Pat<(extloadi8i1 addr:$src), (MOV8rm addr:$src)>; +def : Pat<(extloadi16i1 addr:$src), (MOVZX16rm8 addr:$src)>; +def : Pat<(extloadi32i1 addr:$src), (MOVZX32rm8 addr:$src)>; +def : Pat<(extloadi16i8 addr:$src), (MOVZX16rm8 addr:$src)>; +def : Pat<(extloadi32i8 addr:$src), (MOVZX32rm8 addr:$src)>; +def : Pat<(extloadi32i16 addr:$src), (MOVZX32rm16 addr:$src)>; + +// For other extloads, use subregs, since the high contents of the register are +// defined after an extload. +def : Pat<(extloadi64i1 addr:$src), + (SUBREG_TO_REG (i64 0), (MOVZX32rm8 addr:$src), sub_32bit)>; +def : Pat<(extloadi64i8 addr:$src), + (SUBREG_TO_REG (i64 0), (MOVZX32rm8 addr:$src), sub_32bit)>; +def : Pat<(extloadi64i16 addr:$src), + (SUBREG_TO_REG (i64 0), (MOVZX32rm16 addr:$src), sub_32bit)>; +def : Pat<(extloadi64i32 addr:$src), + (SUBREG_TO_REG (i64 0), (MOV32rm addr:$src), sub_32bit)>; + +// anyext. Define these to do an explicit zero-extend to +// avoid partial-register updates. +def : Pat<(i16 (anyext GR8 :$src)), (EXTRACT_SUBREG + (MOVZX32rr8 GR8 :$src), sub_16bit)>; +def : Pat<(i32 (anyext GR8 :$src)), (MOVZX32rr8 GR8 :$src)>; + +// Except for i16 -> i32 since isel expect i16 ops to be promoted to i32. +def : Pat<(i32 (anyext GR16:$src)), + (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR16:$src, sub_16bit)>; + +def : Pat<(i64 (anyext GR8 :$src)), + (SUBREG_TO_REG (i64 0), (MOVZX32rr8 GR8 :$src), sub_32bit)>; +def : Pat<(i64 (anyext GR16:$src)), + (SUBREG_TO_REG (i64 0), (MOVZX32rr16 GR16 :$src), sub_32bit)>; +def : Pat<(i64 (anyext GR32:$src)), + (SUBREG_TO_REG (i64 0), GR32:$src, sub_32bit)>; + + +// Any instruction that defines a 32-bit result leaves the high half of the +// register. Truncate can be lowered to EXTRACT_SUBREG. CopyFromReg may +// be copying from a truncate. And x86's cmov doesn't do anything if the +// condition is false. But any other 32-bit operation will zero-extend +// up to 64 bits. +def def32 : PatLeaf<(i32 GR32:$src), [{ + return N->getOpcode() != ISD::TRUNCATE && + N->getOpcode() != TargetOpcode::EXTRACT_SUBREG && + N->getOpcode() != ISD::CopyFromReg && + N->getOpcode() != ISD::AssertSext && + N->getOpcode() != X86ISD::CMOV; +}]>; + +// In the case of a 32-bit def that is known to implicitly zero-extend, +// we can use a SUBREG_TO_REG. +def : Pat<(i64 (zext def32:$src)), + (SUBREG_TO_REG (i64 0), GR32:$src, sub_32bit)>; + +//===----------------------------------------------------------------------===// +// Pattern match OR as ADD +//===----------------------------------------------------------------------===// + +// If safe, we prefer to pattern match OR as ADD at isel time. ADD can be +// 3-addressified into an LEA instruction to avoid copies. However, we also +// want to finally emit these instructions as an or at the end of the code +// generator to make the generated code easier to read. To do this, we select +// into "disjoint bits" pseudo ops. + +// Treat an 'or' node is as an 'add' if the or'ed bits are known to be zero. 
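+// Example of the payoff: in "(or (shl x, 3), 3)" the low three bits of the
+// shifted value are known zero, so the or behaves exactly like an add and the
+// whole expression can become a single three-address LEA, roughly
+//   leal 3(,%reg,8), %dst
+// which a real, two-address OR could never be turned into.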
+def or_is_add : PatFrag<(ops node:$lhs, node:$rhs), (or node:$lhs, node:$rhs),[{ + if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N->getOperand(1))) + return CurDAG->MaskedValueIsZero(N->getOperand(0), CN->getAPIntValue()); + + APInt KnownZero0, KnownOne0; + CurDAG->computeKnownBits(N->getOperand(0), KnownZero0, KnownOne0, 0); + APInt KnownZero1, KnownOne1; + CurDAG->computeKnownBits(N->getOperand(1), KnownZero1, KnownOne1, 0); + return (~KnownZero0 & ~KnownZero1) == 0; +}]>; + + +// (or x1, x2) -> (add x1, x2) if two operands are known not to share bits. +// Try this before the selecting to OR. +let AddedComplexity = 5, SchedRW = [WriteALU] in { + +let isConvertibleToThreeAddress = 1, + Constraints = "$src1 = $dst", Defs = [EFLAGS] in { +let isCommutable = 1 in { +def ADD16rr_DB : I<0, Pseudo, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2), + "", // orw/addw REG, REG + [(set GR16:$dst, (or_is_add GR16:$src1, GR16:$src2))]>; +def ADD32rr_DB : I<0, Pseudo, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2), + "", // orl/addl REG, REG + [(set GR32:$dst, (or_is_add GR32:$src1, GR32:$src2))]>; +def ADD64rr_DB : I<0, Pseudo, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2), + "", // orq/addq REG, REG + [(set GR64:$dst, (or_is_add GR64:$src1, GR64:$src2))]>; +} // isCommutable + +// NOTE: These are order specific, we want the ri8 forms to be listed +// first so that they are slightly preferred to the ri forms. + +def ADD16ri8_DB : I<0, Pseudo, + (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2), + "", // orw/addw REG, imm8 + [(set GR16:$dst,(or_is_add GR16:$src1,i16immSExt8:$src2))]>; +def ADD16ri_DB : I<0, Pseudo, (outs GR16:$dst), (ins GR16:$src1, i16imm:$src2), + "", // orw/addw REG, imm + [(set GR16:$dst, (or_is_add GR16:$src1, imm:$src2))]>; + +def ADD32ri8_DB : I<0, Pseudo, + (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2), + "", // orl/addl REG, imm8 + [(set GR32:$dst,(or_is_add GR32:$src1,i32immSExt8:$src2))]>; +def ADD32ri_DB : I<0, Pseudo, (outs GR32:$dst), (ins GR32:$src1, i32imm:$src2), + "", // orl/addl REG, imm + [(set GR32:$dst, (or_is_add GR32:$src1, imm:$src2))]>; + + +def ADD64ri8_DB : I<0, Pseudo, + (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2), + "", // orq/addq REG, imm8 + [(set GR64:$dst, (or_is_add GR64:$src1, + i64immSExt8:$src2))]>; +def ADD64ri32_DB : I<0, Pseudo, + (outs GR64:$dst), (ins GR64:$src1, i64i32imm:$src2), + "", // orq/addq REG, imm + [(set GR64:$dst, (or_is_add GR64:$src1, + i64immSExt32:$src2))]>; +} +} // AddedComplexity, SchedRW + + +//===----------------------------------------------------------------------===// +// Some peepholes +//===----------------------------------------------------------------------===// + +// Odd encoding trick: -128 fits into an 8-bit immediate field while +// +128 doesn't, so in this special case use a sub instead of an add. +def : Pat<(add GR16:$src1, 128), + (SUB16ri8 GR16:$src1, -128)>; +def : Pat<(store (add (loadi16 addr:$dst), 128), addr:$dst), + (SUB16mi8 addr:$dst, -128)>; + +def : Pat<(add GR32:$src1, 128), + (SUB32ri8 GR32:$src1, -128)>; +def : Pat<(store (add (loadi32 addr:$dst), 128), addr:$dst), + (SUB32mi8 addr:$dst, -128)>; + +def : Pat<(add GR64:$src1, 128), + (SUB64ri8 GR64:$src1, -128)>; +def : Pat<(store (add (loadi64 addr:$dst), 128), addr:$dst), + (SUB64mi8 addr:$dst, -128)>; + +// The same trick applies for 32-bit immediate fields in 64-bit +// instructions. 
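+// Concretely: +2147483648 does not fit in a sign-extended 32-bit immediate but
+// -2147483648 does, so instead of materializing the constant in a register for
+// an add, the pattern below emits the single instruction
+//   subq $-2147483648, %rax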
+def : Pat<(add GR64:$src1, 0x0000000080000000), + (SUB64ri32 GR64:$src1, 0xffffffff80000000)>; +def : Pat<(store (add (loadi64 addr:$dst), 0x00000000800000000), addr:$dst), + (SUB64mi32 addr:$dst, 0xffffffff80000000)>; + +// To avoid needing to materialize an immediate in a register, use a 32-bit and +// with implicit zero-extension instead of a 64-bit and if the immediate has at +// least 32 bits of leading zeros. If in addition the last 32 bits can be +// represented with a sign extension of a 8 bit constant, use that. +// This can also reduce instruction size by eliminating the need for the REX +// prefix. + +// AddedComplexity is needed to give priority over i64immSExt8 and i64immSExt32. +let AddedComplexity = 1 in { +def : Pat<(and GR64:$src, i64immZExt32SExt8:$imm), + (SUBREG_TO_REG + (i64 0), + (AND32ri8 + (EXTRACT_SUBREG GR64:$src, sub_32bit), + (i32 (GetLo8XForm imm:$imm))), + sub_32bit)>; + +def : Pat<(and GR64:$src, i64immZExt32:$imm), + (SUBREG_TO_REG + (i64 0), + (AND32ri + (EXTRACT_SUBREG GR64:$src, sub_32bit), + (i32 (GetLo32XForm imm:$imm))), + sub_32bit)>; +} // AddedComplexity = 1 + + +// AddedComplexity is needed due to the increased complexity on the +// i64immZExt32SExt8 and i64immZExt32 patterns above. Applying this to all +// the MOVZX patterns keeps thems together in DAGIsel tables. +let AddedComplexity = 1 in { +// r & (2^16-1) ==> movz +def : Pat<(and GR32:$src1, 0xffff), + (MOVZX32rr16 (EXTRACT_SUBREG GR32:$src1, sub_16bit))>; +// r & (2^8-1) ==> movz +def : Pat<(and GR32:$src1, 0xff), + (MOVZX32rr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src1, + GR32_ABCD)), + sub_8bit))>, + Requires<[Not64BitMode]>; +// r & (2^8-1) ==> movz +def : Pat<(and GR16:$src1, 0xff), + (EXTRACT_SUBREG (MOVZX32rr8 (EXTRACT_SUBREG + (i16 (COPY_TO_REGCLASS GR16:$src1, GR16_ABCD)), sub_8bit)), + sub_16bit)>, + Requires<[Not64BitMode]>; + +// r & (2^32-1) ==> movz +def : Pat<(and GR64:$src, 0x00000000FFFFFFFF), + (SUBREG_TO_REG (i64 0), + (MOV32rr (EXTRACT_SUBREG GR64:$src, sub_32bit)), + sub_32bit)>; +// r & (2^16-1) ==> movz +def : Pat<(and GR64:$src, 0xffff), + (SUBREG_TO_REG (i64 0), + (MOVZX32rr16 (i16 (EXTRACT_SUBREG GR64:$src, sub_16bit))), + sub_32bit)>; +// r & (2^8-1) ==> movz +def : Pat<(and GR64:$src, 0xff), + (SUBREG_TO_REG (i64 0), + (MOVZX32rr8 (i8 (EXTRACT_SUBREG GR64:$src, sub_8bit))), + sub_32bit)>; +// r & (2^8-1) ==> movz +def : Pat<(and GR32:$src1, 0xff), + (MOVZX32rr8 (EXTRACT_SUBREG GR32:$src1, sub_8bit))>, + Requires<[In64BitMode]>; +// r & (2^8-1) ==> movz +def : Pat<(and GR16:$src1, 0xff), + (EXTRACT_SUBREG (MOVZX32rr8 (i8 + (EXTRACT_SUBREG GR16:$src1, sub_8bit))), sub_16bit)>, + Requires<[In64BitMode]>; +} // AddedComplexity = 1 + + +// sext_inreg patterns +def : Pat<(sext_inreg GR32:$src, i16), + (MOVSX32rr16 (EXTRACT_SUBREG GR32:$src, sub_16bit))>; +def : Pat<(sext_inreg GR32:$src, i8), + (MOVSX32rr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, + GR32_ABCD)), + sub_8bit))>, + Requires<[Not64BitMode]>; + +def : Pat<(sext_inreg GR16:$src, i8), + (EXTRACT_SUBREG (i32 (MOVSX32rr8 (EXTRACT_SUBREG + (i32 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)), sub_8bit))), + sub_16bit)>, + Requires<[Not64BitMode]>; + +def : Pat<(sext_inreg GR64:$src, i32), + (MOVSX64rr32 (EXTRACT_SUBREG GR64:$src, sub_32bit))>; +def : Pat<(sext_inreg GR64:$src, i16), + (MOVSX64rr16 (EXTRACT_SUBREG GR64:$src, sub_16bit))>; +def : Pat<(sext_inreg GR64:$src, i8), + (MOVSX64rr8 (EXTRACT_SUBREG GR64:$src, sub_8bit))>; +def : Pat<(sext_inreg GR32:$src, i8), + (MOVSX32rr8 (EXTRACT_SUBREG 
GR32:$src, sub_8bit))>, + Requires<[In64BitMode]>; +def : Pat<(sext_inreg GR16:$src, i8), + (EXTRACT_SUBREG (MOVSX32rr8 + (EXTRACT_SUBREG GR16:$src, sub_8bit)), sub_16bit)>, + Requires<[In64BitMode]>; + +// sext, sext_load, zext, zext_load +def: Pat<(i16 (sext GR8:$src)), + (EXTRACT_SUBREG (MOVSX32rr8 GR8:$src), sub_16bit)>; +def: Pat<(sextloadi16i8 addr:$src), + (EXTRACT_SUBREG (MOVSX32rm8 addr:$src), sub_16bit)>; +def: Pat<(i16 (zext GR8:$src)), + (EXTRACT_SUBREG (MOVZX32rr8 GR8:$src), sub_16bit)>; +def: Pat<(zextloadi16i8 addr:$src), + (EXTRACT_SUBREG (MOVZX32rm8 addr:$src), sub_16bit)>; + +// trunc patterns +def : Pat<(i16 (trunc GR32:$src)), + (EXTRACT_SUBREG GR32:$src, sub_16bit)>; +def : Pat<(i8 (trunc GR32:$src)), + (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, GR32_ABCD)), + sub_8bit)>, + Requires<[Not64BitMode]>; +def : Pat<(i8 (trunc GR16:$src)), + (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)), + sub_8bit)>, + Requires<[Not64BitMode]>; +def : Pat<(i32 (trunc GR64:$src)), + (EXTRACT_SUBREG GR64:$src, sub_32bit)>; +def : Pat<(i16 (trunc GR64:$src)), + (EXTRACT_SUBREG GR64:$src, sub_16bit)>; +def : Pat<(i8 (trunc GR64:$src)), + (EXTRACT_SUBREG GR64:$src, sub_8bit)>; +def : Pat<(i8 (trunc GR32:$src)), + (EXTRACT_SUBREG GR32:$src, sub_8bit)>, + Requires<[In64BitMode]>; +def : Pat<(i8 (trunc GR16:$src)), + (EXTRACT_SUBREG GR16:$src, sub_8bit)>, + Requires<[In64BitMode]>; + +// h-register tricks +def : Pat<(i8 (trunc (srl_su GR16:$src, (i8 8)))), + (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)), + sub_8bit_hi)>, + Requires<[Not64BitMode]>; +def : Pat<(i8 (trunc (srl_su GR32:$src, (i8 8)))), + (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, GR32_ABCD)), + sub_8bit_hi)>, + Requires<[Not64BitMode]>; +def : Pat<(srl GR16:$src, (i8 8)), + (EXTRACT_SUBREG + (MOVZX32rr8 + (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)), + sub_8bit_hi)), + sub_16bit)>, + Requires<[Not64BitMode]>; +def : Pat<(i32 (zext (srl_su GR16:$src, (i8 8)))), + (MOVZX32rr8 (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, + GR16_ABCD)), + sub_8bit_hi))>, + Requires<[Not64BitMode]>; +def : Pat<(i32 (anyext (srl_su GR16:$src, (i8 8)))), + (MOVZX32rr8 (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, + GR16_ABCD)), + sub_8bit_hi))>, + Requires<[Not64BitMode]>; +def : Pat<(and (srl_su GR32:$src, (i8 8)), (i32 255)), + (MOVZX32rr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, + GR32_ABCD)), + sub_8bit_hi))>, + Requires<[Not64BitMode]>; +def : Pat<(srl (and_su GR32:$src, 0xff00), (i8 8)), + (MOVZX32rr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, + GR32_ABCD)), + sub_8bit_hi))>, + Requires<[Not64BitMode]>; + +// h-register tricks. +// For now, be conservative on x86-64 and use an h-register extract only if the +// value is immediately zero-extended or stored, which are somewhat common +// cases. This uses a bunch of code to prevent a register requiring a REX prefix +// from being allocated in the same instruction as the h register, as there's +// currently no way to describe this requirement to the register allocator. + +// h-register extract and zero-extend. 
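+// Background for the _NOREX forms below: AH/BH/CH/DH cannot be encoded in any
+// instruction that carries a REX prefix (those encodings mean SPL/BPL/SIL/DIL
+// there), so e.g. "(x >> 8) & 0xff" becomes
+//   movzbl %ah, %ecx
+// only when no operand forces a REX prefix; MOVZX32_NOREXrr8 / MOV8mr_NOREX
+// together with the *_ABCD register classes enforce that constraint.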
+def : Pat<(and (srl_su GR64:$src, (i8 8)), (i64 255)), + (SUBREG_TO_REG + (i64 0), + (MOVZX32_NOREXrr8 + (EXTRACT_SUBREG (i64 (COPY_TO_REGCLASS GR64:$src, GR64_ABCD)), + sub_8bit_hi)), + sub_32bit)>; +def : Pat<(and (srl_su GR32:$src, (i8 8)), (i32 255)), + (MOVZX32_NOREXrr8 + (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, GR32_ABCD)), + sub_8bit_hi))>, + Requires<[In64BitMode]>; +def : Pat<(srl (and_su GR32:$src, 0xff00), (i8 8)), + (MOVZX32_NOREXrr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, + GR32_ABCD)), + sub_8bit_hi))>, + Requires<[In64BitMode]>; +def : Pat<(srl GR16:$src, (i8 8)), + (EXTRACT_SUBREG + (MOVZX32_NOREXrr8 + (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)), + sub_8bit_hi)), + sub_16bit)>, + Requires<[In64BitMode]>; +def : Pat<(i32 (zext (srl_su GR16:$src, (i8 8)))), + (MOVZX32_NOREXrr8 + (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)), + sub_8bit_hi))>, + Requires<[In64BitMode]>; +def : Pat<(i32 (anyext (srl_su GR16:$src, (i8 8)))), + (MOVZX32_NOREXrr8 + (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)), + sub_8bit_hi))>, + Requires<[In64BitMode]>; +def : Pat<(i64 (zext (srl_su GR16:$src, (i8 8)))), + (SUBREG_TO_REG + (i64 0), + (MOVZX32_NOREXrr8 + (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)), + sub_8bit_hi)), + sub_32bit)>; +def : Pat<(i64 (anyext (srl_su GR16:$src, (i8 8)))), + (SUBREG_TO_REG + (i64 0), + (MOVZX32_NOREXrr8 + (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)), + sub_8bit_hi)), + sub_32bit)>; + +// h-register extract and store. +def : Pat<(store (i8 (trunc_su (srl_su GR64:$src, (i8 8)))), addr:$dst), + (MOV8mr_NOREX + addr:$dst, + (EXTRACT_SUBREG (i64 (COPY_TO_REGCLASS GR64:$src, GR64_ABCD)), + sub_8bit_hi))>; +def : Pat<(store (i8 (trunc_su (srl_su GR32:$src, (i8 8)))), addr:$dst), + (MOV8mr_NOREX + addr:$dst, + (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, GR32_ABCD)), + sub_8bit_hi))>, + Requires<[In64BitMode]>; +def : Pat<(store (i8 (trunc_su (srl_su GR16:$src, (i8 8)))), addr:$dst), + (MOV8mr_NOREX + addr:$dst, + (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)), + sub_8bit_hi))>, + Requires<[In64BitMode]>; + + +// (shl x, 1) ==> (add x, x) +// Note that if x is undef (immediate or otherwise), we could theoretically +// end up with the two uses of x getting different values, producing a result +// where the least significant bit is not 0. However, the probability of this +// happening is considered low enough that this is officially not a +// "real problem". +def : Pat<(shl GR8 :$src1, (i8 1)), (ADD8rr GR8 :$src1, GR8 :$src1)>; +def : Pat<(shl GR16:$src1, (i8 1)), (ADD16rr GR16:$src1, GR16:$src1)>; +def : Pat<(shl GR32:$src1, (i8 1)), (ADD32rr GR32:$src1, GR32:$src1)>; +def : Pat<(shl GR64:$src1, (i8 1)), (ADD64rr GR64:$src1, GR64:$src1)>; + +// Helper imms that check if a mask doesn't change significant shift bits. +def immShift32 : ImmLeaf<i8, [{ + return countTrailingOnes<uint64_t>(Imm) >= 5; +}]>; +def immShift64 : ImmLeaf<i8, [{ + return countTrailingOnes<uint64_t>(Imm) >= 6; +}]>; + +// Shift amount is implicitly masked. 
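+// The hardware masks the CL shift count to 5 bits for 8/16/32-bit operands and
+// to 6 bits for 64-bit operands, so an explicit mask in the input is
+// redundant, e.g.
+//   andb $31, %cl
+//   shll %cl, %eax
+// is selected as just "shll %cl, %eax". The immShift32/immShift64 leaves above
+// accept any mask that preserves at least those low bits.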
+multiclass MaskedShiftAmountPats<SDNode frag, string name> { + // (shift x (and y, 31)) ==> (shift x, y) + def : Pat<(frag GR8:$src1, (and CL, immShift32)), + (!cast<Instruction>(name # "8rCL") GR8:$src1)>; + def : Pat<(frag GR16:$src1, (and CL, immShift32)), + (!cast<Instruction>(name # "16rCL") GR16:$src1)>; + def : Pat<(frag GR32:$src1, (and CL, immShift32)), + (!cast<Instruction>(name # "32rCL") GR32:$src1)>; + def : Pat<(store (frag (loadi8 addr:$dst), (and CL, immShift32)), addr:$dst), + (!cast<Instruction>(name # "8mCL") addr:$dst)>; + def : Pat<(store (frag (loadi16 addr:$dst), (and CL, immShift32)), addr:$dst), + (!cast<Instruction>(name # "16mCL") addr:$dst)>; + def : Pat<(store (frag (loadi32 addr:$dst), (and CL, immShift32)), addr:$dst), + (!cast<Instruction>(name # "32mCL") addr:$dst)>; + + // (shift x (and y, 63)) ==> (shift x, y) + def : Pat<(frag GR64:$src1, (and CL, immShift64)), + (!cast<Instruction>(name # "64rCL") GR64:$src1)>; + def : Pat<(store (frag (loadi64 addr:$dst), (and CL, 63)), addr:$dst), + (!cast<Instruction>(name # "64mCL") addr:$dst)>; +} + +defm : MaskedShiftAmountPats<shl, "SHL">; +defm : MaskedShiftAmountPats<srl, "SHR">; +defm : MaskedShiftAmountPats<sra, "SAR">; +defm : MaskedShiftAmountPats<rotl, "ROL">; +defm : MaskedShiftAmountPats<rotr, "ROR">; + +// (anyext (setcc_carry)) -> (setcc_carry) +def : Pat<(i16 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))), + (SETB_C16r)>; +def : Pat<(i32 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))), + (SETB_C32r)>; +def : Pat<(i32 (anyext (i16 (X86setcc_c X86_COND_B, EFLAGS)))), + (SETB_C32r)>; + + + + +//===----------------------------------------------------------------------===// +// EFLAGS-defining Patterns +//===----------------------------------------------------------------------===// + +// add reg, reg +def : Pat<(add GR8 :$src1, GR8 :$src2), (ADD8rr GR8 :$src1, GR8 :$src2)>; +def : Pat<(add GR16:$src1, GR16:$src2), (ADD16rr GR16:$src1, GR16:$src2)>; +def : Pat<(add GR32:$src1, GR32:$src2), (ADD32rr GR32:$src1, GR32:$src2)>; + +// add reg, mem +def : Pat<(add GR8:$src1, (loadi8 addr:$src2)), + (ADD8rm GR8:$src1, addr:$src2)>; +def : Pat<(add GR16:$src1, (loadi16 addr:$src2)), + (ADD16rm GR16:$src1, addr:$src2)>; +def : Pat<(add GR32:$src1, (loadi32 addr:$src2)), + (ADD32rm GR32:$src1, addr:$src2)>; + +// add reg, imm +def : Pat<(add GR8 :$src1, imm:$src2), (ADD8ri GR8:$src1 , imm:$src2)>; +def : Pat<(add GR16:$src1, imm:$src2), (ADD16ri GR16:$src1, imm:$src2)>; +def : Pat<(add GR32:$src1, imm:$src2), (ADD32ri GR32:$src1, imm:$src2)>; +def : Pat<(add GR16:$src1, i16immSExt8:$src2), + (ADD16ri8 GR16:$src1, i16immSExt8:$src2)>; +def : Pat<(add GR32:$src1, i32immSExt8:$src2), + (ADD32ri8 GR32:$src1, i32immSExt8:$src2)>; + +// sub reg, reg +def : Pat<(sub GR8 :$src1, GR8 :$src2), (SUB8rr GR8 :$src1, GR8 :$src2)>; +def : Pat<(sub GR16:$src1, GR16:$src2), (SUB16rr GR16:$src1, GR16:$src2)>; +def : Pat<(sub GR32:$src1, GR32:$src2), (SUB32rr GR32:$src1, GR32:$src2)>; + +// sub reg, mem +def : Pat<(sub GR8:$src1, (loadi8 addr:$src2)), + (SUB8rm GR8:$src1, addr:$src2)>; +def : Pat<(sub GR16:$src1, (loadi16 addr:$src2)), + (SUB16rm GR16:$src1, addr:$src2)>; +def : Pat<(sub GR32:$src1, (loadi32 addr:$src2)), + (SUB32rm GR32:$src1, addr:$src2)>; + +// sub reg, imm +def : Pat<(sub GR8:$src1, imm:$src2), + (SUB8ri GR8:$src1, imm:$src2)>; +def : Pat<(sub GR16:$src1, imm:$src2), + (SUB16ri GR16:$src1, imm:$src2)>; +def : Pat<(sub GR32:$src1, imm:$src2), + (SUB32ri GR32:$src1, imm:$src2)>; +def : Pat<(sub GR16:$src1, 
i16immSExt8:$src2), + (SUB16ri8 GR16:$src1, i16immSExt8:$src2)>; +def : Pat<(sub GR32:$src1, i32immSExt8:$src2), + (SUB32ri8 GR32:$src1, i32immSExt8:$src2)>; + +// sub 0, reg +def : Pat<(X86sub_flag 0, GR8 :$src), (NEG8r GR8 :$src)>; +def : Pat<(X86sub_flag 0, GR16:$src), (NEG16r GR16:$src)>; +def : Pat<(X86sub_flag 0, GR32:$src), (NEG32r GR32:$src)>; +def : Pat<(X86sub_flag 0, GR64:$src), (NEG64r GR64:$src)>; + +// mul reg, reg +def : Pat<(mul GR16:$src1, GR16:$src2), + (IMUL16rr GR16:$src1, GR16:$src2)>; +def : Pat<(mul GR32:$src1, GR32:$src2), + (IMUL32rr GR32:$src1, GR32:$src2)>; + +// mul reg, mem +def : Pat<(mul GR16:$src1, (loadi16 addr:$src2)), + (IMUL16rm GR16:$src1, addr:$src2)>; +def : Pat<(mul GR32:$src1, (loadi32 addr:$src2)), + (IMUL32rm GR32:$src1, addr:$src2)>; + +// mul reg, imm +def : Pat<(mul GR16:$src1, imm:$src2), + (IMUL16rri GR16:$src1, imm:$src2)>; +def : Pat<(mul GR32:$src1, imm:$src2), + (IMUL32rri GR32:$src1, imm:$src2)>; +def : Pat<(mul GR16:$src1, i16immSExt8:$src2), + (IMUL16rri8 GR16:$src1, i16immSExt8:$src2)>; +def : Pat<(mul GR32:$src1, i32immSExt8:$src2), + (IMUL32rri8 GR32:$src1, i32immSExt8:$src2)>; + +// reg = mul mem, imm +def : Pat<(mul (loadi16 addr:$src1), imm:$src2), + (IMUL16rmi addr:$src1, imm:$src2)>; +def : Pat<(mul (loadi32 addr:$src1), imm:$src2), + (IMUL32rmi addr:$src1, imm:$src2)>; +def : Pat<(mul (loadi16 addr:$src1), i16immSExt8:$src2), + (IMUL16rmi8 addr:$src1, i16immSExt8:$src2)>; +def : Pat<(mul (loadi32 addr:$src1), i32immSExt8:$src2), + (IMUL32rmi8 addr:$src1, i32immSExt8:$src2)>; + +// Patterns for nodes that do not produce flags, for instructions that do. + +// addition +def : Pat<(add GR64:$src1, GR64:$src2), + (ADD64rr GR64:$src1, GR64:$src2)>; +def : Pat<(add GR64:$src1, i64immSExt8:$src2), + (ADD64ri8 GR64:$src1, i64immSExt8:$src2)>; +def : Pat<(add GR64:$src1, i64immSExt32:$src2), + (ADD64ri32 GR64:$src1, i64immSExt32:$src2)>; +def : Pat<(add GR64:$src1, (loadi64 addr:$src2)), + (ADD64rm GR64:$src1, addr:$src2)>; + +// subtraction +def : Pat<(sub GR64:$src1, GR64:$src2), + (SUB64rr GR64:$src1, GR64:$src2)>; +def : Pat<(sub GR64:$src1, (loadi64 addr:$src2)), + (SUB64rm GR64:$src1, addr:$src2)>; +def : Pat<(sub GR64:$src1, i64immSExt8:$src2), + (SUB64ri8 GR64:$src1, i64immSExt8:$src2)>; +def : Pat<(sub GR64:$src1, i64immSExt32:$src2), + (SUB64ri32 GR64:$src1, i64immSExt32:$src2)>; + +// Multiply +def : Pat<(mul GR64:$src1, GR64:$src2), + (IMUL64rr GR64:$src1, GR64:$src2)>; +def : Pat<(mul GR64:$src1, (loadi64 addr:$src2)), + (IMUL64rm GR64:$src1, addr:$src2)>; +def : Pat<(mul GR64:$src1, i64immSExt8:$src2), + (IMUL64rri8 GR64:$src1, i64immSExt8:$src2)>; +def : Pat<(mul GR64:$src1, i64immSExt32:$src2), + (IMUL64rri32 GR64:$src1, i64immSExt32:$src2)>; +def : Pat<(mul (loadi64 addr:$src1), i64immSExt8:$src2), + (IMUL64rmi8 addr:$src1, i64immSExt8:$src2)>; +def : Pat<(mul (loadi64 addr:$src1), i64immSExt32:$src2), + (IMUL64rmi32 addr:$src1, i64immSExt32:$src2)>; + +// Increment/Decrement reg. +// Do not make INC/DEC if it is slow +let Predicates = [NotSlowIncDec] in { + def : Pat<(add GR8:$src, 1), (INC8r GR8:$src)>; + def : Pat<(add GR16:$src, 1), (INC16r GR16:$src)>; + def : Pat<(add GR32:$src, 1), (INC32r GR32:$src)>; + def : Pat<(add GR64:$src, 1), (INC64r GR64:$src)>; + def : Pat<(add GR8:$src, -1), (DEC8r GR8:$src)>; + def : Pat<(add GR16:$src, -1), (DEC16r GR16:$src)>; + def : Pat<(add GR32:$src, -1), (DEC32r GR32:$src)>; + def : Pat<(add GR64:$src, -1), (DEC64r GR64:$src)>; +} + +// or reg/reg. 
+def : Pat<(or GR8 :$src1, GR8 :$src2), (OR8rr GR8 :$src1, GR8 :$src2)>; +def : Pat<(or GR16:$src1, GR16:$src2), (OR16rr GR16:$src1, GR16:$src2)>; +def : Pat<(or GR32:$src1, GR32:$src2), (OR32rr GR32:$src1, GR32:$src2)>; +def : Pat<(or GR64:$src1, GR64:$src2), (OR64rr GR64:$src1, GR64:$src2)>; + +// or reg/mem +def : Pat<(or GR8:$src1, (loadi8 addr:$src2)), + (OR8rm GR8:$src1, addr:$src2)>; +def : Pat<(or GR16:$src1, (loadi16 addr:$src2)), + (OR16rm GR16:$src1, addr:$src2)>; +def : Pat<(or GR32:$src1, (loadi32 addr:$src2)), + (OR32rm GR32:$src1, addr:$src2)>; +def : Pat<(or GR64:$src1, (loadi64 addr:$src2)), + (OR64rm GR64:$src1, addr:$src2)>; + +// or reg/imm +def : Pat<(or GR8:$src1 , imm:$src2), (OR8ri GR8 :$src1, imm:$src2)>; +def : Pat<(or GR16:$src1, imm:$src2), (OR16ri GR16:$src1, imm:$src2)>; +def : Pat<(or GR32:$src1, imm:$src2), (OR32ri GR32:$src1, imm:$src2)>; +def : Pat<(or GR16:$src1, i16immSExt8:$src2), + (OR16ri8 GR16:$src1, i16immSExt8:$src2)>; +def : Pat<(or GR32:$src1, i32immSExt8:$src2), + (OR32ri8 GR32:$src1, i32immSExt8:$src2)>; +def : Pat<(or GR64:$src1, i64immSExt8:$src2), + (OR64ri8 GR64:$src1, i64immSExt8:$src2)>; +def : Pat<(or GR64:$src1, i64immSExt32:$src2), + (OR64ri32 GR64:$src1, i64immSExt32:$src2)>; + +// xor reg/reg +def : Pat<(xor GR8 :$src1, GR8 :$src2), (XOR8rr GR8 :$src1, GR8 :$src2)>; +def : Pat<(xor GR16:$src1, GR16:$src2), (XOR16rr GR16:$src1, GR16:$src2)>; +def : Pat<(xor GR32:$src1, GR32:$src2), (XOR32rr GR32:$src1, GR32:$src2)>; +def : Pat<(xor GR64:$src1, GR64:$src2), (XOR64rr GR64:$src1, GR64:$src2)>; + +// xor reg/mem +def : Pat<(xor GR8:$src1, (loadi8 addr:$src2)), + (XOR8rm GR8:$src1, addr:$src2)>; +def : Pat<(xor GR16:$src1, (loadi16 addr:$src2)), + (XOR16rm GR16:$src1, addr:$src2)>; +def : Pat<(xor GR32:$src1, (loadi32 addr:$src2)), + (XOR32rm GR32:$src1, addr:$src2)>; +def : Pat<(xor GR64:$src1, (loadi64 addr:$src2)), + (XOR64rm GR64:$src1, addr:$src2)>; + +// xor reg/imm +def : Pat<(xor GR8:$src1, imm:$src2), + (XOR8ri GR8:$src1, imm:$src2)>; +def : Pat<(xor GR16:$src1, imm:$src2), + (XOR16ri GR16:$src1, imm:$src2)>; +def : Pat<(xor GR32:$src1, imm:$src2), + (XOR32ri GR32:$src1, imm:$src2)>; +def : Pat<(xor GR16:$src1, i16immSExt8:$src2), + (XOR16ri8 GR16:$src1, i16immSExt8:$src2)>; +def : Pat<(xor GR32:$src1, i32immSExt8:$src2), + (XOR32ri8 GR32:$src1, i32immSExt8:$src2)>; +def : Pat<(xor GR64:$src1, i64immSExt8:$src2), + (XOR64ri8 GR64:$src1, i64immSExt8:$src2)>; +def : Pat<(xor GR64:$src1, i64immSExt32:$src2), + (XOR64ri32 GR64:$src1, i64immSExt32:$src2)>; + +// and reg/reg +def : Pat<(and GR8 :$src1, GR8 :$src2), (AND8rr GR8 :$src1, GR8 :$src2)>; +def : Pat<(and GR16:$src1, GR16:$src2), (AND16rr GR16:$src1, GR16:$src2)>; +def : Pat<(and GR32:$src1, GR32:$src2), (AND32rr GR32:$src1, GR32:$src2)>; +def : Pat<(and GR64:$src1, GR64:$src2), (AND64rr GR64:$src1, GR64:$src2)>; + +// and reg/mem +def : Pat<(and GR8:$src1, (loadi8 addr:$src2)), + (AND8rm GR8:$src1, addr:$src2)>; +def : Pat<(and GR16:$src1, (loadi16 addr:$src2)), + (AND16rm GR16:$src1, addr:$src2)>; +def : Pat<(and GR32:$src1, (loadi32 addr:$src2)), + (AND32rm GR32:$src1, addr:$src2)>; +def : Pat<(and GR64:$src1, (loadi64 addr:$src2)), + (AND64rm GR64:$src1, addr:$src2)>; + +// and reg/imm +def : Pat<(and GR8:$src1, imm:$src2), + (AND8ri GR8:$src1, imm:$src2)>; +def : Pat<(and GR16:$src1, imm:$src2), + (AND16ri GR16:$src1, imm:$src2)>; +def : Pat<(and GR32:$src1, imm:$src2), + (AND32ri GR32:$src1, imm:$src2)>; +def : Pat<(and GR16:$src1, i16immSExt8:$src2), + (AND16ri8 
GR16:$src1, i16immSExt8:$src2)>; +def : Pat<(and GR32:$src1, i32immSExt8:$src2), + (AND32ri8 GR32:$src1, i32immSExt8:$src2)>; +def : Pat<(and GR64:$src1, i64immSExt8:$src2), + (AND64ri8 GR64:$src1, i64immSExt8:$src2)>; +def : Pat<(and GR64:$src1, i64immSExt32:$src2), + (AND64ri32 GR64:$src1, i64immSExt32:$src2)>; + +// Bit scan instruction patterns to match explicit zero-undef behavior. +def : Pat<(cttz_zero_undef GR16:$src), (BSF16rr GR16:$src)>; +def : Pat<(cttz_zero_undef GR32:$src), (BSF32rr GR32:$src)>; +def : Pat<(cttz_zero_undef GR64:$src), (BSF64rr GR64:$src)>; +def : Pat<(cttz_zero_undef (loadi16 addr:$src)), (BSF16rm addr:$src)>; +def : Pat<(cttz_zero_undef (loadi32 addr:$src)), (BSF32rm addr:$src)>; +def : Pat<(cttz_zero_undef (loadi64 addr:$src)), (BSF64rm addr:$src)>; + +// When HasMOVBE is enabled it is possible to get a non-legalized +// register-register 16 bit bswap. This maps it to a ROL instruction. +let Predicates = [HasMOVBE] in { + def : Pat<(bswap GR16:$src), (ROL16ri GR16:$src, (i8 8))>; +} diff --git a/contrib/llvm/lib/Target/X86/X86InstrControl.td b/contrib/llvm/lib/Target/X86/X86InstrControl.td new file mode 100644 index 0000000..8c351a5 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86InstrControl.td @@ -0,0 +1,329 @@ +//===-- X86InstrControl.td - Control Flow Instructions -----*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the X86 jump, return, call, and related instructions. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Control Flow Instructions. +// + +// Return instructions. +// +// The X86retflag return instructions are variadic because we may add ST0 and +// ST1 arguments when returning values on the x87 stack. 
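A small C++ illustration of the comment above (the function name is invented for the example): on 32-bit x86 a floating-point result is returned on the x87 stack, so the value reaches the return as an extra ST0 operand (and ST1 for complex types) on the variadic RET node rather than through a fixed register class.

    long double one() { return 1.0L; }   // result is expected to be returned in ST(0)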
+let isTerminator = 1, isReturn = 1, isBarrier = 1, + hasCtrlDep = 1, FPForm = SpecialFP, SchedRW = [WriteJumpLd] in { + def RETL : I <0xC3, RawFrm, (outs), (ins variable_ops), + "ret{l}", [(X86retflag 0)], IIC_RET>, OpSize32, + Requires<[Not64BitMode]>; + def RETQ : I <0xC3, RawFrm, (outs), (ins variable_ops), + "ret{q}", [(X86retflag 0)], IIC_RET>, OpSize32, + Requires<[In64BitMode]>; + def RETW : I <0xC3, RawFrm, (outs), (ins), + "ret{w}", + [], IIC_RET>, OpSize16; + def RETIL : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt, variable_ops), + "ret{l}\t$amt", + [(X86retflag timm:$amt)], IIC_RET_IMM>, OpSize32, + Requires<[Not64BitMode]>; + def RETIQ : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt, variable_ops), + "ret{q}\t$amt", + [(X86retflag timm:$amt)], IIC_RET_IMM>, OpSize32, + Requires<[In64BitMode]>; + def RETIW : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt), + "ret{w}\t$amt", + [], IIC_RET_IMM>, OpSize16; + def LRETL : I <0xCB, RawFrm, (outs), (ins), + "{l}ret{l|f}", [], IIC_RET>, OpSize32; + def LRETQ : RI <0xCB, RawFrm, (outs), (ins), + "{l}ret{|f}q", [], IIC_RET>, Requires<[In64BitMode]>; + def LRETW : I <0xCB, RawFrm, (outs), (ins), + "{l}ret{w|f}", [], IIC_RET>, OpSize16; + def LRETIL : Ii16<0xCA, RawFrm, (outs), (ins i16imm:$amt), + "{l}ret{l|f}\t$amt", [], IIC_RET>, OpSize32; + def LRETIQ : RIi16<0xCA, RawFrm, (outs), (ins i16imm:$amt), + "{l}ret{|f}q\t$amt", [], IIC_RET>, Requires<[In64BitMode]>; + def LRETIW : Ii16<0xCA, RawFrm, (outs), (ins i16imm:$amt), + "{l}ret{w|f}\t$amt", [], IIC_RET>, OpSize16; + + // The machine return from interrupt instruction, but sometimes we need to + // perform a post-epilogue stack adjustment. Codegen emits the pseudo form + // which expands to include an SP adjustment if necessary. + def IRET16 : I <0xcf, RawFrm, (outs), (ins), "iret{w}", [], IIC_IRET>, + OpSize16; + def IRET32 : I <0xcf, RawFrm, (outs), (ins), "iret{l|d}", [], + IIC_IRET>, OpSize32; + def IRET64 : RI <0xcf, RawFrm, (outs), (ins), "iretq", [], + IIC_IRET>, Requires<[In64BitMode]>; + let isCodeGenOnly = 1 in + def IRET : PseudoI<(outs), (ins i16imm:$adj), [(X86iret timm:$adj)]>; + +} + +// Unconditional branches. +let isBarrier = 1, isBranch = 1, isTerminator = 1, SchedRW = [WriteJump] in { + def JMP_1 : Ii8PCRel<0xEB, RawFrm, (outs), (ins brtarget8:$dst), + "jmp\t$dst", [(br bb:$dst)], IIC_JMP_REL>; + let hasSideEffects = 0, isCodeGenOnly = 1, ForceDisassemble = 1 in { + def JMP_2 : Ii16PCRel<0xE9, RawFrm, (outs), (ins brtarget16:$dst), + "jmp\t$dst", [], IIC_JMP_REL>, OpSize16; + def JMP_4 : Ii32PCRel<0xE9, RawFrm, (outs), (ins brtarget32:$dst), + "jmp\t$dst", [], IIC_JMP_REL>, OpSize32; + } +} + +// Conditional Branches. 
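Before the ICBr instantiations that follow, a standalone sketch of the opcode layout they encode; the helper names are hypothetical, and the byte values come straight from the defm list below (0x70..0x7F for the rel8 forms, 0x0F 0x80..0x8F for the rel16/rel32 forms, the 0x0F escape being the TB modifier on the _2/_4 variants).

    // Illustrative only, not LLVM code. CC is the x86 condition-code nibble
    // (0 = O, 2 = B, 4 = E, ..., 0xF = G).
    unsigned shortJccOpcode(unsigned CC) { return 0x70 + CC; } // e.g. JE (CC = 4) -> 0x74
    unsigned nearJccOpcode(unsigned CC)  { return 0x80 + CC; } // emitted after the 0x0F escape byte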
+let isBranch = 1, isTerminator = 1, Uses = [EFLAGS], SchedRW = [WriteJump] in { + multiclass ICBr<bits<8> opc1, bits<8> opc4, string asm, PatFrag Cond> { + def _1 : Ii8PCRel <opc1, RawFrm, (outs), (ins brtarget8:$dst), asm, + [(X86brcond bb:$dst, Cond, EFLAGS)], IIC_Jcc>; + let hasSideEffects = 0, isCodeGenOnly = 1, ForceDisassemble = 1 in { + def _2 : Ii16PCRel<opc4, RawFrm, (outs), (ins brtarget16:$dst), asm, + [], IIC_Jcc>, OpSize16, TB; + def _4 : Ii32PCRel<opc4, RawFrm, (outs), (ins brtarget32:$dst), asm, + [], IIC_Jcc>, TB, OpSize32; + } + } +} + +defm JO : ICBr<0x70, 0x80, "jo\t$dst" , X86_COND_O>; +defm JNO : ICBr<0x71, 0x81, "jno\t$dst", X86_COND_NO>; +defm JB : ICBr<0x72, 0x82, "jb\t$dst" , X86_COND_B>; +defm JAE : ICBr<0x73, 0x83, "jae\t$dst", X86_COND_AE>; +defm JE : ICBr<0x74, 0x84, "je\t$dst" , X86_COND_E>; +defm JNE : ICBr<0x75, 0x85, "jne\t$dst", X86_COND_NE>; +defm JBE : ICBr<0x76, 0x86, "jbe\t$dst", X86_COND_BE>; +defm JA : ICBr<0x77, 0x87, "ja\t$dst" , X86_COND_A>; +defm JS : ICBr<0x78, 0x88, "js\t$dst" , X86_COND_S>; +defm JNS : ICBr<0x79, 0x89, "jns\t$dst", X86_COND_NS>; +defm JP : ICBr<0x7A, 0x8A, "jp\t$dst" , X86_COND_P>; +defm JNP : ICBr<0x7B, 0x8B, "jnp\t$dst", X86_COND_NP>; +defm JL : ICBr<0x7C, 0x8C, "jl\t$dst" , X86_COND_L>; +defm JGE : ICBr<0x7D, 0x8D, "jge\t$dst", X86_COND_GE>; +defm JLE : ICBr<0x7E, 0x8E, "jle\t$dst", X86_COND_LE>; +defm JG : ICBr<0x7F, 0x8F, "jg\t$dst" , X86_COND_G>; + +// jcx/jecx/jrcx instructions. +let isBranch = 1, isTerminator = 1, hasSideEffects = 0, SchedRW = [WriteJump] in { + // These are the 32-bit versions of this instruction for the asmparser. In + // 32-bit mode, the address size prefix is jcxz and the unprefixed version is + // jecxz. + let Uses = [CX] in + def JCXZ : Ii8PCRel<0xE3, RawFrm, (outs), (ins brtarget8:$dst), + "jcxz\t$dst", [], IIC_JCXZ>, AdSize16, + Requires<[Not64BitMode]>; + let Uses = [ECX] in + def JECXZ : Ii8PCRel<0xE3, RawFrm, (outs), (ins brtarget8:$dst), + "jecxz\t$dst", [], IIC_JCXZ>, AdSize32; + + let Uses = [RCX] in + def JRCXZ : Ii8PCRel<0xE3, RawFrm, (outs), (ins brtarget8:$dst), + "jrcxz\t$dst", [], IIC_JCXZ>, AdSize64, + Requires<[In64BitMode]>; +} + +// Indirect branches +let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in { + def JMP16r : I<0xFF, MRM4r, (outs), (ins GR16:$dst), "jmp{w}\t{*}$dst", + [(brind GR16:$dst)], IIC_JMP_REG>, Requires<[Not64BitMode]>, + OpSize16, Sched<[WriteJump]>; + def JMP16m : I<0xFF, MRM4m, (outs), (ins i16mem:$dst), "jmp{w}\t{*}$dst", + [(brind (loadi16 addr:$dst))], IIC_JMP_MEM>, + Requires<[Not64BitMode]>, OpSize16, Sched<[WriteJumpLd]>; + + def JMP32r : I<0xFF, MRM4r, (outs), (ins GR32:$dst), "jmp{l}\t{*}$dst", + [(brind GR32:$dst)], IIC_JMP_REG>, Requires<[Not64BitMode]>, + OpSize32, Sched<[WriteJump]>; + def JMP32m : I<0xFF, MRM4m, (outs), (ins i32mem:$dst), "jmp{l}\t{*}$dst", + [(brind (loadi32 addr:$dst))], IIC_JMP_MEM>, + Requires<[Not64BitMode]>, OpSize32, Sched<[WriteJumpLd]>; + + def JMP64r : I<0xFF, MRM4r, (outs), (ins GR64:$dst), "jmp{q}\t{*}$dst", + [(brind GR64:$dst)], IIC_JMP_REG>, Requires<[In64BitMode]>, + Sched<[WriteJump]>; + def JMP64m : I<0xFF, MRM4m, (outs), (ins i64mem:$dst), "jmp{q}\t{*}$dst", + [(brind (loadi64 addr:$dst))], IIC_JMP_MEM>, + Requires<[In64BitMode]>, Sched<[WriteJumpLd]>; + + let Predicates = [Not64BitMode] in { + def FARJMP16i : Iseg16<0xEA, RawFrmImm16, (outs), + (ins i16imm:$off, i16imm:$seg), + "ljmp{w}\t$seg, $off", [], + IIC_JMP_FAR_PTR>, OpSize16, Sched<[WriteJump]>; + def FARJMP32i : Iseg32<0xEA, 
RawFrmImm16, (outs), + (ins i32imm:$off, i16imm:$seg), + "ljmp{l}\t$seg, $off", [], + IIC_JMP_FAR_PTR>, OpSize32, Sched<[WriteJump]>; + } + def FARJMP64 : RI<0xFF, MRM5m, (outs), (ins opaque80mem:$dst), + "ljmp{q}\t{*}$dst", [], IIC_JMP_FAR_MEM>, + Sched<[WriteJump]>; + + def FARJMP16m : I<0xFF, MRM5m, (outs), (ins opaque32mem:$dst), + "ljmp{w}\t{*}$dst", [], IIC_JMP_FAR_MEM>, OpSize16, + Sched<[WriteJumpLd]>; + def FARJMP32m : I<0xFF, MRM5m, (outs), (ins opaque48mem:$dst), + "ljmp{l}\t{*}$dst", [], IIC_JMP_FAR_MEM>, OpSize32, + Sched<[WriteJumpLd]>; +} + + +// Loop instructions +let SchedRW = [WriteJump] in { +def LOOP : Ii8PCRel<0xE2, RawFrm, (outs), (ins brtarget8:$dst), "loop\t$dst", [], IIC_LOOP>; +def LOOPE : Ii8PCRel<0xE1, RawFrm, (outs), (ins brtarget8:$dst), "loope\t$dst", [], IIC_LOOPE>; +def LOOPNE : Ii8PCRel<0xE0, RawFrm, (outs), (ins brtarget8:$dst), "loopne\t$dst", [], IIC_LOOPNE>; +} + +//===----------------------------------------------------------------------===// +// Call Instructions... +// +let isCall = 1 in + // All calls clobber the non-callee saved registers. ESP is marked as + // a use to prevent stack-pointer assignments that appear immediately + // before calls from potentially appearing dead. Uses for argument + // registers are added manually. + let Uses = [ESP] in { + def CALLpcrel32 : Ii32PCRel<0xE8, RawFrm, + (outs), (ins i32imm_pcrel:$dst), + "call{l}\t$dst", [], IIC_CALL_RI>, OpSize32, + Requires<[Not64BitMode]>, Sched<[WriteJump]>; + let hasSideEffects = 0 in + def CALLpcrel16 : Ii16PCRel<0xE8, RawFrm, + (outs), (ins i16imm_pcrel:$dst), + "call{w}\t$dst", [], IIC_CALL_RI>, OpSize16, + Sched<[WriteJump]>; + def CALL16r : I<0xFF, MRM2r, (outs), (ins GR16:$dst), + "call{w}\t{*}$dst", [(X86call GR16:$dst)], IIC_CALL_RI>, + OpSize16, Requires<[Not64BitMode]>, Sched<[WriteJump]>; + def CALL16m : I<0xFF, MRM2m, (outs), (ins i16mem:$dst), + "call{w}\t{*}$dst", [(X86call (loadi16 addr:$dst))], + IIC_CALL_MEM>, OpSize16, + Requires<[Not64BitMode,FavorMemIndirectCall]>, + Sched<[WriteJumpLd]>; + def CALL32r : I<0xFF, MRM2r, (outs), (ins GR32:$dst), + "call{l}\t{*}$dst", [(X86call GR32:$dst)], IIC_CALL_RI>, + OpSize32, Requires<[Not64BitMode]>, Sched<[WriteJump]>; + def CALL32m : I<0xFF, MRM2m, (outs), (ins i32mem:$dst), + "call{l}\t{*}$dst", [(X86call (loadi32 addr:$dst))], + IIC_CALL_MEM>, OpSize32, + Requires<[Not64BitMode,FavorMemIndirectCall]>, + Sched<[WriteJumpLd]>; + + let Predicates = [Not64BitMode] in { + def FARCALL16i : Iseg16<0x9A, RawFrmImm16, (outs), + (ins i16imm:$off, i16imm:$seg), + "lcall{w}\t$seg, $off", [], + IIC_CALL_FAR_PTR>, OpSize16, Sched<[WriteJump]>; + def FARCALL32i : Iseg32<0x9A, RawFrmImm16, (outs), + (ins i32imm:$off, i16imm:$seg), + "lcall{l}\t$seg, $off", [], + IIC_CALL_FAR_PTR>, OpSize32, Sched<[WriteJump]>; + } + + def FARCALL16m : I<0xFF, MRM3m, (outs), (ins opaque32mem:$dst), + "lcall{w}\t{*}$dst", [], IIC_CALL_FAR_MEM>, OpSize16, + Sched<[WriteJumpLd]>; + def FARCALL32m : I<0xFF, MRM3m, (outs), (ins opaque48mem:$dst), + "lcall{l}\t{*}$dst", [], IIC_CALL_FAR_MEM>, OpSize32, + Sched<[WriteJumpLd]>; + } + + +// Tail call stuff. 
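A hedged sketch of what the tail-call pseudos defined next are for (function names are invented): a call in tail position is selected as a TCRETURN pseudo and, once the caller's frame has been torn down, emitted as the corresponding TAILJMP, that is, a plain jump to the callee that reuses the caller's return address instead of pushing a new one.

    int callee(int);
    int caller(int x) { return callee(x + 1); }  // with optimization, expected to end in "jmp callee" rather than call+ret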
+ +let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, + isCodeGenOnly = 1, SchedRW = [WriteJumpLd] in + let Uses = [ESP] in { + def TCRETURNdi : PseudoI<(outs), + (ins i32imm_pcrel:$dst, i32imm:$offset), []>; + def TCRETURNri : PseudoI<(outs), + (ins ptr_rc_tailcall:$dst, i32imm:$offset), []>; + let mayLoad = 1 in + def TCRETURNmi : PseudoI<(outs), + (ins i32mem_TC:$dst, i32imm:$offset), []>; + + // FIXME: The should be pseudo instructions that are lowered when going to + // mcinst. + def TAILJMPd : Ii32PCRel<0xE9, RawFrm, (outs), + (ins i32imm_pcrel:$dst), + "jmp\t$dst", + [], IIC_JMP_REL>; + def TAILJMPr : I<0xFF, MRM4r, (outs), (ins ptr_rc_tailcall:$dst), + "", [], IIC_JMP_REG>; // FIXME: Remove encoding when JIT is dead. + let mayLoad = 1 in + def TAILJMPm : I<0xFF, MRM4m, (outs), (ins i32mem_TC:$dst), + "jmp{l}\t{*}$dst", [], IIC_JMP_MEM>; +} + + +//===----------------------------------------------------------------------===// +// Call Instructions... +// + +// RSP is marked as a use to prevent stack-pointer assignments that appear +// immediately before calls from potentially appearing dead. Uses for argument +// registers are added manually. +let isCall = 1, Uses = [RSP], SchedRW = [WriteJump] in { + // NOTE: this pattern doesn't match "X86call imm", because we do not know + // that the offset between an arbitrary immediate and the call will fit in + // the 32-bit pcrel field that we have. + def CALL64pcrel32 : Ii32PCRel<0xE8, RawFrm, + (outs), (ins i64i32imm_pcrel:$dst), + "call{q}\t$dst", [], IIC_CALL_RI>, OpSize32, + Requires<[In64BitMode]>; + def CALL64r : I<0xFF, MRM2r, (outs), (ins GR64:$dst), + "call{q}\t{*}$dst", [(X86call GR64:$dst)], + IIC_CALL_RI>, + Requires<[In64BitMode]>; + def CALL64m : I<0xFF, MRM2m, (outs), (ins i64mem:$dst), + "call{q}\t{*}$dst", [(X86call (loadi64 addr:$dst))], + IIC_CALL_MEM>, + Requires<[In64BitMode,FavorMemIndirectCall]>; + + def FARCALL64 : RI<0xFF, MRM3m, (outs), (ins opaque80mem:$dst), + "lcall{q}\t{*}$dst", [], IIC_CALL_FAR_MEM>; +} + +let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, + isCodeGenOnly = 1, Uses = [RSP], usesCustomInserter = 1, + SchedRW = [WriteJump] in { + def TCRETURNdi64 : PseudoI<(outs), + (ins i64i32imm_pcrel:$dst, i32imm:$offset), + []>; + def TCRETURNri64 : PseudoI<(outs), + (ins ptr_rc_tailcall:$dst, i32imm:$offset), []>; + let mayLoad = 1 in + def TCRETURNmi64 : PseudoI<(outs), + (ins i64mem_TC:$dst, i32imm:$offset), []>; + + def TAILJMPd64 : Ii32PCRel<0xE9, RawFrm, (outs), (ins i64i32imm_pcrel:$dst), + "jmp\t$dst", [], IIC_JMP_REL>; + def TAILJMPr64 : I<0xFF, MRM4r, (outs), (ins ptr_rc_tailcall:$dst), + "jmp{q}\t{*}$dst", [], IIC_JMP_MEM>; + + let mayLoad = 1 in + def TAILJMPm64 : I<0xFF, MRM4m, (outs), (ins i64mem_TC:$dst), + "jmp{q}\t{*}$dst", [], IIC_JMP_MEM>; + + // Win64 wants jumps leaving the function to have a REX_W prefix. 
+ let hasREX_WPrefix = 1 in { + def TAILJMPd64_REX : Ii32PCRel<0xE9, RawFrm, (outs), + (ins i64i32imm_pcrel:$dst), + "rex64 jmp\t$dst", [], IIC_JMP_REL>; + def TAILJMPr64_REX : I<0xFF, MRM4r, (outs), (ins ptr_rc_tailcall:$dst), + "rex64 jmp{q}\t{*}$dst", [], IIC_JMP_MEM>; + + let mayLoad = 1 in + def TAILJMPm64_REX : I<0xFF, MRM4m, (outs), (ins i64mem_TC:$dst), + "rex64 jmp{q}\t{*}$dst", [], IIC_JMP_MEM>; + } +} diff --git a/contrib/llvm/lib/Target/X86/X86InstrExtension.td b/contrib/llvm/lib/Target/X86/X86InstrExtension.td new file mode 100644 index 0000000..c4b2d6d --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86InstrExtension.td @@ -0,0 +1,182 @@ +//===-- X86InstrExtension.td - Sign and Zero Extensions ----*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the sign and zero extension operations. +// +//===----------------------------------------------------------------------===// + +let hasSideEffects = 0 in { + let Defs = [AX], Uses = [AL] in + def CBW : I<0x98, RawFrm, (outs), (ins), + "{cbtw|cbw}", [], IIC_CBW>, OpSize16; // AX = signext(AL) + let Defs = [EAX], Uses = [AX] in + def CWDE : I<0x98, RawFrm, (outs), (ins), + "{cwtl|cwde}", [], IIC_CBW>, OpSize32; // EAX = signext(AX) + + let Defs = [AX,DX], Uses = [AX] in + def CWD : I<0x99, RawFrm, (outs), (ins), + "{cwtd|cwd}", [], IIC_CBW>, OpSize16; // DX:AX = signext(AX) + let Defs = [EAX,EDX], Uses = [EAX] in + def CDQ : I<0x99, RawFrm, (outs), (ins), + "{cltd|cdq}", [], IIC_CBW>, OpSize32; // EDX:EAX = signext(EAX) + + + let Defs = [RAX], Uses = [EAX] in + def CDQE : RI<0x98, RawFrm, (outs), (ins), + "{cltq|cdqe}", [], IIC_CBW>; // RAX = signext(EAX) + + let Defs = [RAX,RDX], Uses = [RAX] in + def CQO : RI<0x99, RawFrm, (outs), (ins), + "{cqto|cqo}", [], IIC_CBW>; // RDX:RAX = signext(RAX) +} + + + +// Sign/Zero extenders +let hasSideEffects = 0 in { +def MOVSX16rr8 : I<0xBE, MRMSrcReg, (outs GR16:$dst), (ins GR8:$src), + "movs{bw|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVSX_R16_R8>, + TB, OpSize16, Sched<[WriteALU]>; +let mayLoad = 1 in +def MOVSX16rm8 : I<0xBE, MRMSrcMem, (outs GR16:$dst), (ins i8mem:$src), + "movs{bw|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVSX_R16_M8>, + TB, OpSize16, Sched<[WriteALULd]>; +} // hasSideEffects = 0 +def MOVSX32rr8 : I<0xBE, MRMSrcReg, (outs GR32:$dst), (ins GR8:$src), + "movs{bl|x}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (sext GR8:$src))], IIC_MOVSX>, TB, + OpSize32, Sched<[WriteALU]>; +def MOVSX32rm8 : I<0xBE, MRMSrcMem, (outs GR32:$dst), (ins i8mem :$src), + "movs{bl|x}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (sextloadi32i8 addr:$src))], IIC_MOVSX>, TB, + OpSize32, Sched<[WriteALULd]>; +def MOVSX32rr16: I<0xBF, MRMSrcReg, (outs GR32:$dst), (ins GR16:$src), + "movs{wl|x}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (sext GR16:$src))], IIC_MOVSX>, TB, + OpSize32, Sched<[WriteALU]>; +def MOVSX32rm16: I<0xBF, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src), + "movs{wl|x}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (sextloadi32i16 addr:$src))], IIC_MOVSX>, + OpSize32, TB, Sched<[WriteALULd]>; + +let hasSideEffects = 0 in { +def MOVZX16rr8 : I<0xB6, MRMSrcReg, (outs GR16:$dst), (ins GR8:$src), + "movz{bw|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX_R16_R8>, + TB, OpSize16, Sched<[WriteALU]>; +let mayLoad = 1 in +def MOVZX16rm8 : I<0xB6, 
MRMSrcMem, (outs GR16:$dst), (ins i8mem:$src), + "movz{bw|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX_R16_M8>, + TB, OpSize16, Sched<[WriteALULd]>; +} // hasSideEffects = 0 +def MOVZX32rr8 : I<0xB6, MRMSrcReg, (outs GR32:$dst), (ins GR8 :$src), + "movz{bl|x}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (zext GR8:$src))], IIC_MOVZX>, TB, + OpSize32, Sched<[WriteALU]>; +def MOVZX32rm8 : I<0xB6, MRMSrcMem, (outs GR32:$dst), (ins i8mem :$src), + "movz{bl|x}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (zextloadi32i8 addr:$src))], IIC_MOVZX>, TB, + OpSize32, Sched<[WriteALULd]>; +def MOVZX32rr16: I<0xB7, MRMSrcReg, (outs GR32:$dst), (ins GR16:$src), + "movz{wl|x}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (zext GR16:$src))], IIC_MOVZX>, TB, + OpSize32, Sched<[WriteALU]>; +def MOVZX32rm16: I<0xB7, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src), + "movz{wl|x}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (zextloadi32i16 addr:$src))], IIC_MOVZX>, + TB, OpSize32, Sched<[WriteALULd]>; + +// These are the same as the regular MOVZX32rr8 and MOVZX32rm8 +// except that they use GR32_NOREX for the output operand register class +// instead of GR32. This allows them to operate on h registers on x86-64. +let hasSideEffects = 0, isCodeGenOnly = 1 in { +def MOVZX32_NOREXrr8 : I<0xB6, MRMSrcReg, + (outs GR32_NOREX:$dst), (ins GR8_NOREX:$src), + "movz{bl|x}\t{$src, $dst|$dst, $src} # NOREX", + [], IIC_MOVZX>, TB, Sched<[WriteALU]>; +let mayLoad = 1 in +def MOVZX32_NOREXrm8 : I<0xB6, MRMSrcMem, + (outs GR32_NOREX:$dst), (ins i8mem_NOREX:$src), + "movz{bl|x}\t{$src, $dst|$dst, $src} # NOREX", + [], IIC_MOVZX>, TB, Sched<[WriteALULd]>; + +def MOVSX32_NOREXrr8 : I<0xBE, MRMSrcReg, + (outs GR32_NOREX:$dst), (ins GR8_NOREX:$src), + "movs{bl|x}\t{$src, $dst|$dst, $src} # NOREX", + [], IIC_MOVSX>, TB, Sched<[WriteALU]>; +let mayLoad = 1 in +def MOVSX32_NOREXrm8 : I<0xBE, MRMSrcMem, + (outs GR32_NOREX:$dst), (ins i8mem_NOREX:$src), + "movs{bl|x}\t{$src, $dst|$dst, $src} # NOREX", + [], IIC_MOVSX>, TB, Sched<[WriteALULd]>; +} + +// MOVSX64rr8 always has a REX prefix and it has an 8-bit register +// operand, which makes it a rare instruction with an 8-bit register +// operand that can never access an h register. If support for h registers +// were generalized, this would require a special register class. 
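A short illustration of how these extension instructions are reached from source level (function names invented; the noted selections are expectations, not guarantees):

    int      widenS(short x)         { return x; }  // sign extension, e.g. MOVSX32rr16 ("movswl")
    unsigned widenZ(unsigned char x) { return x; }  // zero extension, e.g. MOVZX32rr8 ("movzbl")
    unsigned long long widen64(unsigned x) { return x; }  // handled below by a 32-bit MOV wrapped in SUBREG_TO_REG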
+def MOVSX64rr8 : RI<0xBE, MRMSrcReg, (outs GR64:$dst), (ins GR8 :$src), + "movs{bq|x}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (sext GR8:$src))], IIC_MOVSX>, TB, + Sched<[WriteALU]>; +def MOVSX64rm8 : RI<0xBE, MRMSrcMem, (outs GR64:$dst), (ins i8mem :$src), + "movs{bq|x}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (sextloadi64i8 addr:$src))], IIC_MOVSX>, + TB, Sched<[WriteALULd]>; +def MOVSX64rr16: RI<0xBF, MRMSrcReg, (outs GR64:$dst), (ins GR16:$src), + "movs{wq|x}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (sext GR16:$src))], IIC_MOVSX>, TB, + Sched<[WriteALU]>; +def MOVSX64rm16: RI<0xBF, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src), + "movs{wq|x}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (sextloadi64i16 addr:$src))], IIC_MOVSX>, + TB, Sched<[WriteALULd]>; +def MOVSX64rr32: RI<0x63, MRMSrcReg, (outs GR64:$dst), (ins GR32:$src), + "movs{lq|xd}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (sext GR32:$src))], IIC_MOVSX>, + Sched<[WriteALU]>, Requires<[In64BitMode]>; +def MOVSX64rm32: RI<0x63, MRMSrcMem, (outs GR64:$dst), (ins i32mem:$src), + "movs{lq|xd}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (sextloadi64i32 addr:$src))], IIC_MOVSX>, + Sched<[WriteALULd]>, Requires<[In64BitMode]>; + +// movzbq and movzwq encodings for the disassembler +def MOVZX64rr8_Q : RI<0xB6, MRMSrcReg, (outs GR64:$dst), (ins GR8:$src), + "movz{bq|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX>, + TB, Sched<[WriteALU]>; +def MOVZX64rm8_Q : RI<0xB6, MRMSrcMem, (outs GR64:$dst), (ins i8mem:$src), + "movz{bq|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX>, + TB, Sched<[WriteALULd]>; +def MOVZX64rr16_Q : RI<0xB7, MRMSrcReg, (outs GR64:$dst), (ins GR16:$src), + "movz{wq|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX>, + TB, Sched<[WriteALU]>; +def MOVZX64rm16_Q : RI<0xB7, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src), + "movz{wq|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX>, + TB, Sched<[WriteALULd]>; + +// 64-bit zero-extension patterns use SUBREG_TO_REG and an operation writing a +// 32-bit register. +def : Pat<(i64 (zext GR8:$src)), + (SUBREG_TO_REG (i64 0), (MOVZX32rr8 GR8:$src), sub_32bit)>; +def : Pat<(zextloadi64i8 addr:$src), + (SUBREG_TO_REG (i64 0), (MOVZX32rm8 addr:$src), sub_32bit)>; + +def : Pat<(i64 (zext GR16:$src)), + (SUBREG_TO_REG (i64 0), (MOVZX32rr16 GR16:$src), sub_32bit)>; +def : Pat<(zextloadi64i16 addr:$src), + (SUBREG_TO_REG (i64 0), (MOVZX32rm16 addr:$src), sub_32bit)>; + +// The preferred way to do 32-bit-to-64-bit zero extension on x86-64 is to use a +// SUBREG_TO_REG to utilize implicit zero-extension, however this isn't possible +// when the 32-bit value is defined by a truncate or is copied from something +// where the high bits aren't necessarily all zero. In such cases, we fall back +// to these explicit zext instructions. +def : Pat<(i64 (zext GR32:$src)), + (SUBREG_TO_REG (i64 0), (MOV32rr GR32:$src), sub_32bit)>; +def : Pat<(i64 (zextloadi64i32 addr:$src)), + (SUBREG_TO_REG (i64 0), (MOV32rm addr:$src), sub_32bit)>; diff --git a/contrib/llvm/lib/Target/X86/X86InstrFMA.td b/contrib/llvm/lib/Target/X86/X86InstrFMA.td new file mode 100644 index 0000000..fd800cf --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86InstrFMA.td @@ -0,0 +1,441 @@ +//===-- X86InstrFMA.td - FMA Instruction Set ---------------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This file describes FMA (Fused Multiply-Add) instructions. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// FMA3 - Intel 3 operand Fused Multiply-Add instructions +//===----------------------------------------------------------------------===// + +// For all FMA opcodes declared in fma3p_rm and fma3s_rm milticlasses defined +// below, both the register and memory variants are commutable. +// For the register form the commutable operands are 1, 2 and 3. +// For the memory variant the folded operand must be in 3. Thus, +// in that case, only the operands 1 and 2 can be swapped. +// Commuting some of operands may require the opcode change. +// FMA*213*: +// operands 1 and 2 (memory & register forms): *213* --> *213*(no changes); +// operands 1 and 3 (register forms only): *213* --> *231*; +// operands 2 and 3 (register forms only): *213* --> *132*. +// FMA*132*: +// operands 1 and 2 (memory & register forms): *132* --> *231*; +// operands 1 and 3 (register forms only): *132* --> *132*(no changes); +// operands 2 and 3 (register forms only): *132* --> *213*. +// FMA*231*: +// operands 1 and 2 (memory & register forms): *231* --> *132*; +// operands 1 and 3 (register forms only): *231* --> *213*; +// operands 2 and 3 (register forms only): *231* --> *231*(no changes). + +let Constraints = "$src1 = $dst", hasSideEffects = 0, isCommutable = 1 in +multiclass fma3p_rm<bits<8> opc, string OpcodeStr, + PatFrag MemFrag128, PatFrag MemFrag256, + ValueType OpVT128, ValueType OpVT256, + SDPatternOperator Op = null_frag> { + let usesCustomInserter = 1 in + def r : FMA3<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, VR128:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + [(set VR128:$dst, (OpVT128 (Op VR128:$src2, + VR128:$src1, VR128:$src3)))]>; + + let mayLoad = 1 in + def m : FMA3<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, f128mem:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + [(set VR128:$dst, (OpVT128 (Op VR128:$src2, VR128:$src1, + (MemFrag128 addr:$src3))))]>; + + let usesCustomInserter = 1 in + def rY : FMA3<opc, MRMSrcReg, (outs VR256:$dst), + (ins VR256:$src1, VR256:$src2, VR256:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + [(set VR256:$dst, (OpVT256 (Op VR256:$src2, VR256:$src1, + VR256:$src3)))]>, VEX_L; + + let mayLoad = 1 in + def mY : FMA3<opc, MRMSrcMem, (outs VR256:$dst), + (ins VR256:$src1, VR256:$src2, f256mem:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + [(set VR256:$dst, + (OpVT256 (Op VR256:$src2, VR256:$src1, + (MemFrag256 addr:$src3))))]>, VEX_L; +} + +multiclass fma3p_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231, + string OpcodeStr, string PackTy, + PatFrag MemFrag128, PatFrag MemFrag256, + SDNode Op, ValueType OpTy128, ValueType OpTy256> { + defm r213 : fma3p_rm<opc213, + !strconcat(OpcodeStr, "213", PackTy), + MemFrag128, MemFrag256, OpTy128, OpTy256, Op>; + defm r132 : fma3p_rm<opc132, + !strconcat(OpcodeStr, "132", PackTy), + MemFrag128, MemFrag256, OpTy128, OpTy256>; + defm r231 : fma3p_rm<opc231, + !strconcat(OpcodeStr, "231", PackTy), + MemFrag128, MemFrag256, OpTy128, OpTy256>; +} + +// Fused Multiply-Add +let ExeDomain = SSEPackedSingle in { + defm VFMADDPS : 
fma3p_forms<0x98, 0xA8, 0xB8, "vfmadd", "ps", loadv4f32, + loadv8f32, X86Fmadd, v4f32, v8f32>; + defm VFMSUBPS : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "ps", loadv4f32, + loadv8f32, X86Fmsub, v4f32, v8f32>; + defm VFMADDSUBPS : fma3p_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "ps", + loadv4f32, loadv8f32, X86Fmaddsub, + v4f32, v8f32>; + defm VFMSUBADDPS : fma3p_forms<0x97, 0xA7, 0xB7, "vfmsubadd", "ps", + loadv4f32, loadv8f32, X86Fmsubadd, + v4f32, v8f32>; +} + +let ExeDomain = SSEPackedDouble in { + defm VFMADDPD : fma3p_forms<0x98, 0xA8, 0xB8, "vfmadd", "pd", loadv2f64, + loadv4f64, X86Fmadd, v2f64, v4f64>, VEX_W; + defm VFMSUBPD : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "pd", loadv2f64, + loadv4f64, X86Fmsub, v2f64, v4f64>, VEX_W; + defm VFMADDSUBPD : fma3p_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "pd", + loadv2f64, loadv4f64, X86Fmaddsub, + v2f64, v4f64>, VEX_W; + defm VFMSUBADDPD : fma3p_forms<0x97, 0xA7, 0xB7, "vfmsubadd", "pd", + loadv2f64, loadv4f64, X86Fmsubadd, + v2f64, v4f64>, VEX_W; +} + +// Fused Negative Multiply-Add +let ExeDomain = SSEPackedSingle in { + defm VFNMADDPS : fma3p_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "ps", loadv4f32, + loadv8f32, X86Fnmadd, v4f32, v8f32>; + defm VFNMSUBPS : fma3p_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "ps", loadv4f32, + loadv8f32, X86Fnmsub, v4f32, v8f32>; +} +let ExeDomain = SSEPackedDouble in { + defm VFNMADDPD : fma3p_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "pd", loadv2f64, + loadv4f64, X86Fnmadd, v2f64, v4f64>, VEX_W; + defm VFNMSUBPD : fma3p_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "pd", + loadv2f64, loadv4f64, X86Fnmsub, v2f64, + v4f64>, VEX_W; +} + +// All source register operands of FMA opcodes defined in fma3s_rm multiclass +// can be commuted. In many cases such commute transformation requres an opcode +// adjustment, for example, commuting the operands 1 and 2 in FMA*132 form +// would require an opcode change to FMA*231: +// FMA*132* reg1, reg2, reg3; // reg1 * reg3 + reg2; +// --> +// FMA*231* reg2, reg1, reg3; // reg1 * reg3 + reg2; +// Please see more detailed comment at the very beginning of the section +// defining FMA3 opcodes above. +let Constraints = "$src1 = $dst", isCommutable = 1, hasSideEffects = 0 in +multiclass fma3s_rm<bits<8> opc, string OpcodeStr, + X86MemOperand x86memop, RegisterClass RC, + SDPatternOperator OpNode = null_frag> { + let usesCustomInserter = 1 in + def r : FMA3<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2, RC:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + [(set RC:$dst, (OpNode RC:$src2, RC:$src1, RC:$src3))]>; + + let mayLoad = 1 in + def m : FMA3<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, RC:$src2, x86memop:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + [(set RC:$dst, + (OpNode RC:$src2, RC:$src1, (load addr:$src3)))]>; +} + +// These FMA*_Int instructions are defined specially for being used when +// the scalar FMA intrinsics are lowered to machine instructions, and in that +// sense, they are similar to existing ADD*_Int, SUB*_Int, MUL*_Int, etc. +// instructions. +// +// All of the FMA*_Int opcodes are defined as commutable here. +// Commuting the 2nd and 3rd source register operands of FMAs is quite trivial +// and the corresponding optimizations have been developed. +// Commuting the 1st operand of FMA*_Int requires some additional analysis, +// the commute optimization is legal only if all users of FMA*_Int use only +// the lowest element of the FMA*_Int instruction. 
Even though such analysis
+// may not be implemented yet, we allow the routines doing the actual commute
+// transformation to decide if one or another instruction is commutable or not.
+let Constraints = "$src1 = $dst", isCommutable = 1, isCodeGenOnly = 1,
+ hasSideEffects = 0 in
+multiclass fma3s_rm_int<bits<8> opc, string OpcodeStr,
+ Operand memopr, RegisterClass RC> {
+ def r_Int : FMA3<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, RC:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ []>;
+
+ let mayLoad = 1 in
+ def m_Int : FMA3<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, memopr:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ []>;
+}
+
+multiclass fma3s_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
+ string OpStr, string PackTy,
+ SDNode OpNode, RegisterClass RC,
+ X86MemOperand x86memop> {
+ defm r132 : fma3s_rm<opc132, !strconcat(OpStr, "132", PackTy), x86memop, RC>;
+ defm r213 : fma3s_rm<opc213, !strconcat(OpStr, "213", PackTy), x86memop, RC,
+ OpNode>;
+ defm r231 : fma3s_rm<opc231, !strconcat(OpStr, "231", PackTy), x86memop, RC>;
+}
+
+// The FMA 213 form is created for lowering of scalar FMA intrinsics
+// to machine instructions.
+// The FMA 132 form can be obtained trivially by commuting the 2nd and 3rd
+// operands of the FMA 213 form.
+// The FMA 231 form can be obtained only by commuting the 1st operand of 213 or 132
+// forms and is possible only after special analysis of all uses of the initial
+// instruction. Such analysis does not exist yet, and thus introducing the 231
+// form of FMA*_Int instructions is done using an optimistic assumption that
+// such analysis will be implemented eventually.
+multiclass fma3s_int_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
+ string OpStr, string PackTy,
+ RegisterClass RC, Operand memop> {
+ defm r132 : fma3s_rm_int<opc132, !strconcat(OpStr, "132", PackTy),
+ memop, RC>;
+ defm r213 : fma3s_rm_int<opc213, !strconcat(OpStr, "213", PackTy),
+ memop, RC>;
+ defm r231 : fma3s_rm_int<opc231, !strconcat(OpStr, "231", PackTy),
+ memop, RC>;
+}
+
+multiclass fma3s<bits<8> opc132, bits<8> opc213, bits<8> opc231,
+ string OpStr, Intrinsic IntF32, Intrinsic IntF64,
+ SDNode OpNode> {
+ let ExeDomain = SSEPackedSingle in
+ defm SS : fma3s_forms<opc132, opc213, opc231, OpStr, "ss", OpNode,
+ FR32, f32mem>,
+ fma3s_int_forms<opc132, opc213, opc231, OpStr, "ss", VR128, ssmem>;
+
+ let ExeDomain = SSEPackedDouble in
+ defm SD : fma3s_forms<opc132, opc213, opc231, OpStr, "sd", OpNode,
+ FR64, f64mem>,
+ fma3s_int_forms<opc132, opc213, opc231, OpStr, "sd", VR128, sdmem>,
+ VEX_W;
+
+ // These patterns use the 123 ordering, instead of 213, even though
+ // they match the intrinsic to the 213 version of the instruction.
+ // This is because src1 is tied to dest, and the scalar intrinsics
+ // require the pass-through values to come from the first source
+ // operand, not the second.
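The operand-swap rules spelled out at the top of this FMA3 section reduce to a small mapping; the following self-contained C++ sketch merely restates that table (the enum and helper names are hypothetical and are not LLVM's API).

    enum FMA3Form { Form132, Form213, Form231 };
    enum SwapPair { Swap12, Swap13, Swap23 };  // which pair of source operands is exchanged

    // Returns the form whose operand order matches the original computation after
    // the given swap; Swap13 and Swap23 apply to the register forms only.
    FMA3Form commuteFMA3Form(FMA3Form F, SwapPair P) {
      switch (P) {
      case Swap12: return F == Form213 ? Form213 : (F == Form132 ? Form231 : Form132);
      case Swap13: return F == Form213 ? Form231 : (F == Form132 ? Form132 : Form213);
      case Swap23: return F == Form213 ? Form132 : (F == Form132 ? Form213 : Form231);
      }
      return F;
    }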
+ def : Pat<(IntF32 VR128:$src1, VR128:$src2, VR128:$src3), + (COPY_TO_REGCLASS(!cast<Instruction>(NAME#"SSr213r_Int") + $src1, $src2, $src3), VR128)>; + + def : Pat<(IntF64 VR128:$src1, VR128:$src2, VR128:$src3), + (COPY_TO_REGCLASS(!cast<Instruction>(NAME#"SDr213r_Int") + $src1, $src2, $src3), VR128)>; +} + +defm VFMADD : fma3s<0x99, 0xA9, 0xB9, "vfmadd", int_x86_fma_vfmadd_ss, + int_x86_fma_vfmadd_sd, X86Fmadd>, VEX_LIG; +defm VFMSUB : fma3s<0x9B, 0xAB, 0xBB, "vfmsub", int_x86_fma_vfmsub_ss, + int_x86_fma_vfmsub_sd, X86Fmsub>, VEX_LIG; + +defm VFNMADD : fma3s<0x9D, 0xAD, 0xBD, "vfnmadd", int_x86_fma_vfnmadd_ss, + int_x86_fma_vfnmadd_sd, X86Fnmadd>, VEX_LIG; +defm VFNMSUB : fma3s<0x9F, 0xAF, 0xBF, "vfnmsub", int_x86_fma_vfnmsub_ss, + int_x86_fma_vfnmsub_sd, X86Fnmsub>, VEX_LIG; + + +//===----------------------------------------------------------------------===// +// FMA4 - AMD 4 operand Fused Multiply-Add instructions +//===----------------------------------------------------------------------===// + + +multiclass fma4s<bits<8> opc, string OpcodeStr, RegisterClass RC, + X86MemOperand x86memop, ValueType OpVT, SDNode OpNode, + PatFrag mem_frag> { + let isCommutable = 1 in + def rr : FMA4<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2, RC:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set RC:$dst, + (OpVT (OpNode RC:$src1, RC:$src2, RC:$src3)))]>, VEX_W, VEX_LIG, MemOp4; + def rm : FMA4<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, RC:$src2, x86memop:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set RC:$dst, (OpNode RC:$src1, RC:$src2, + (mem_frag addr:$src3)))]>, VEX_W, VEX_LIG, MemOp4; + def mr : FMA4<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, x86memop:$src2, RC:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set RC:$dst, + (OpNode RC:$src1, (mem_frag addr:$src2), RC:$src3))]>, VEX_LIG; +// For disassembler +let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in + def rr_REV : FMA4<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2, RC:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>, + VEX_LIG; +} + +multiclass fma4s_int<bits<8> opc, string OpcodeStr, Operand memop, + ComplexPattern mem_cpat, Intrinsic Int> { +let isCodeGenOnly = 1 in { + let isCommutable = 1 in + def rr_Int : FMA4<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, VR128:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR128:$dst, + (Int VR128:$src1, VR128:$src2, VR128:$src3))]>, VEX_W, VEX_LIG, MemOp4; + def rm_Int : FMA4<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, memop:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR128:$dst, (Int VR128:$src1, VR128:$src2, + mem_cpat:$src3))]>, VEX_W, VEX_LIG, MemOp4; + def mr_Int : FMA4<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, memop:$src2, VR128:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR128:$dst, + (Int VR128:$src1, mem_cpat:$src2, VR128:$src3))]>, VEX_LIG; +} // isCodeGenOnly = 1 +} + +multiclass fma4p<bits<8> opc, string OpcodeStr, SDNode OpNode, + ValueType OpVT128, ValueType OpVT256, + PatFrag ld_frag128, PatFrag ld_frag256> { + let isCommutable = 1 in + def rr : FMA4<opc, MRMSrcReg, (outs VR128:$dst), + (ins 
VR128:$src1, VR128:$src2, VR128:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR128:$dst, + (OpVT128 (OpNode VR128:$src1, VR128:$src2, VR128:$src3)))]>, + VEX_W, MemOp4; + def rm : FMA4<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, f128mem:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR128:$dst, (OpNode VR128:$src1, VR128:$src2, + (ld_frag128 addr:$src3)))]>, VEX_W, MemOp4; + def mr : FMA4<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, f128mem:$src2, VR128:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR128:$dst, + (OpNode VR128:$src1, (ld_frag128 addr:$src2), VR128:$src3))]>; + let isCommutable = 1 in + def rrY : FMA4<opc, MRMSrcReg, (outs VR256:$dst), + (ins VR256:$src1, VR256:$src2, VR256:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR256:$dst, + (OpVT256 (OpNode VR256:$src1, VR256:$src2, VR256:$src3)))]>, + VEX_W, MemOp4, VEX_L; + def rmY : FMA4<opc, MRMSrcMem, (outs VR256:$dst), + (ins VR256:$src1, VR256:$src2, f256mem:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR256:$dst, (OpNode VR256:$src1, VR256:$src2, + (ld_frag256 addr:$src3)))]>, VEX_W, MemOp4, VEX_L; + def mrY : FMA4<opc, MRMSrcMem, (outs VR256:$dst), + (ins VR256:$src1, f256mem:$src2, VR256:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR256:$dst, (OpNode VR256:$src1, + (ld_frag256 addr:$src2), VR256:$src3))]>, VEX_L; +// For disassembler +let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in { + def rr_REV : FMA4<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, VR128:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>; + def rrY_REV : FMA4<opc, MRMSrcReg, (outs VR256:$dst), + (ins VR256:$src1, VR256:$src2, VR256:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>, + VEX_L; +} // isCodeGenOnly = 1 +} + +let ExeDomain = SSEPackedSingle in { + // Scalar Instructions + defm VFMADDSS4 : fma4s<0x6A, "vfmaddss", FR32, f32mem, f32, X86Fmadd, loadf32>, + fma4s_int<0x6A, "vfmaddss", ssmem, sse_load_f32, + int_x86_fma_vfmadd_ss>; + defm VFMSUBSS4 : fma4s<0x6E, "vfmsubss", FR32, f32mem, f32, X86Fmsub, loadf32>, + fma4s_int<0x6E, "vfmsubss", ssmem, sse_load_f32, + int_x86_fma_vfmsub_ss>; + defm VFNMADDSS4 : fma4s<0x7A, "vfnmaddss", FR32, f32mem, f32, + X86Fnmadd, loadf32>, + fma4s_int<0x7A, "vfnmaddss", ssmem, sse_load_f32, + int_x86_fma_vfnmadd_ss>; + defm VFNMSUBSS4 : fma4s<0x7E, "vfnmsubss", FR32, f32mem, f32, + X86Fnmsub, loadf32>, + fma4s_int<0x7E, "vfnmsubss", ssmem, sse_load_f32, + int_x86_fma_vfnmsub_ss>; + // Packed Instructions + defm VFMADDPS4 : fma4p<0x68, "vfmaddps", X86Fmadd, v4f32, v8f32, + loadv4f32, loadv8f32>; + defm VFMSUBPS4 : fma4p<0x6C, "vfmsubps", X86Fmsub, v4f32, v8f32, + loadv4f32, loadv8f32>; + defm VFNMADDPS4 : fma4p<0x78, "vfnmaddps", X86Fnmadd, v4f32, v8f32, + loadv4f32, loadv8f32>; + defm VFNMSUBPS4 : fma4p<0x7C, "vfnmsubps", X86Fnmsub, v4f32, v8f32, + loadv4f32, loadv8f32>; + defm VFMADDSUBPS4 : fma4p<0x5C, "vfmaddsubps", X86Fmaddsub, v4f32, v8f32, + loadv4f32, loadv8f32>; + defm VFMSUBADDPS4 : fma4p<0x5E, "vfmsubaddps", X86Fmsubadd, v4f32, v8f32, + loadv4f32, loadv8f32>; +} + +let ExeDomain = SSEPackedDouble in { + // 
Scalar Instructions + defm VFMADDSD4 : fma4s<0x6B, "vfmaddsd", FR64, f64mem, f64, X86Fmadd, loadf64>, + fma4s_int<0x6B, "vfmaddsd", sdmem, sse_load_f64, + int_x86_fma_vfmadd_sd>; + defm VFMSUBSD4 : fma4s<0x6F, "vfmsubsd", FR64, f64mem, f64, X86Fmsub, loadf64>, + fma4s_int<0x6F, "vfmsubsd", sdmem, sse_load_f64, + int_x86_fma_vfmsub_sd>; + defm VFNMADDSD4 : fma4s<0x7B, "vfnmaddsd", FR64, f64mem, f64, + X86Fnmadd, loadf64>, + fma4s_int<0x7B, "vfnmaddsd", sdmem, sse_load_f64, + int_x86_fma_vfnmadd_sd>; + defm VFNMSUBSD4 : fma4s<0x7F, "vfnmsubsd", FR64, f64mem, f64, + X86Fnmsub, loadf64>, + fma4s_int<0x7F, "vfnmsubsd", sdmem, sse_load_f64, + int_x86_fma_vfnmsub_sd>; + // Packed Instructions + defm VFMADDPD4 : fma4p<0x69, "vfmaddpd", X86Fmadd, v2f64, v4f64, + loadv2f64, loadv4f64>; + defm VFMSUBPD4 : fma4p<0x6D, "vfmsubpd", X86Fmsub, v2f64, v4f64, + loadv2f64, loadv4f64>; + defm VFNMADDPD4 : fma4p<0x79, "vfnmaddpd", X86Fnmadd, v2f64, v4f64, + loadv2f64, loadv4f64>; + defm VFNMSUBPD4 : fma4p<0x7D, "vfnmsubpd", X86Fnmsub, v2f64, v4f64, + loadv2f64, loadv4f64>; + defm VFMADDSUBPD4 : fma4p<0x5D, "vfmaddsubpd", X86Fmaddsub, v2f64, v4f64, + loadv2f64, loadv4f64>; + defm VFMSUBADDPD4 : fma4p<0x5F, "vfmsubaddpd", X86Fmsubadd, v2f64, v4f64, + loadv2f64, loadv4f64>; +} + diff --git a/contrib/llvm/lib/Target/X86/X86InstrFPStack.td b/contrib/llvm/lib/Target/X86/X86InstrFPStack.td new file mode 100644 index 0000000..03ae211 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86InstrFPStack.td @@ -0,0 +1,729 @@ +//===- X86InstrFPStack.td - FPU Instruction Set ------------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the X86 x87 FPU instruction set, defining the +// instructions, and properties of the instructions which are needed for code +// generation, machine code emission, and analysis. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// FPStack specific DAG Nodes. 
+//===----------------------------------------------------------------------===// + +def SDTX86FpGet2 : SDTypeProfile<2, 0, [SDTCisVT<0, f80>, + SDTCisVT<1, f80>]>; +def SDTX86Fld : SDTypeProfile<1, 2, [SDTCisFP<0>, + SDTCisPtrTy<1>, + SDTCisVT<2, OtherVT>]>; +def SDTX86Fst : SDTypeProfile<0, 3, [SDTCisFP<0>, + SDTCisPtrTy<1>, + SDTCisVT<2, OtherVT>]>; +def SDTX86Fild : SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisPtrTy<1>, + SDTCisVT<2, OtherVT>]>; +def SDTX86Fnstsw : SDTypeProfile<1, 1, [SDTCisVT<0, i16>, SDTCisVT<1, i16>]>; +def SDTX86FpToIMem : SDTypeProfile<0, 2, [SDTCisFP<0>, SDTCisPtrTy<1>]>; + +def SDTX86CwdStore : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>; + +def X86fld : SDNode<"X86ISD::FLD", SDTX86Fld, + [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; +def X86fst : SDNode<"X86ISD::FST", SDTX86Fst, + [SDNPHasChain, SDNPInGlue, SDNPMayStore, + SDNPMemOperand]>; +def X86fild : SDNode<"X86ISD::FILD", SDTX86Fild, + [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; +def X86fildflag : SDNode<"X86ISD::FILD_FLAG", SDTX86Fild, + [SDNPHasChain, SDNPOutGlue, SDNPMayLoad, + SDNPMemOperand]>; +def X86fp_stsw : SDNode<"X86ISD::FNSTSW16r", SDTX86Fnstsw>; +def X86fp_to_i16mem : SDNode<"X86ISD::FP_TO_INT16_IN_MEM", SDTX86FpToIMem, + [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; +def X86fp_to_i32mem : SDNode<"X86ISD::FP_TO_INT32_IN_MEM", SDTX86FpToIMem, + [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; +def X86fp_to_i64mem : SDNode<"X86ISD::FP_TO_INT64_IN_MEM", SDTX86FpToIMem, + [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; +def X86fp_cwd_get16 : SDNode<"X86ISD::FNSTCW16m", SDTX86CwdStore, + [SDNPHasChain, SDNPMayStore, SDNPSideEffect, + SDNPMemOperand]>; + +//===----------------------------------------------------------------------===// +// FPStack pattern fragments +//===----------------------------------------------------------------------===// + +def fpimm0 : PatLeaf<(fpimm), [{ + return N->isExactlyValue(+0.0); +}]>; + +def fpimmneg0 : PatLeaf<(fpimm), [{ + return N->isExactlyValue(-0.0); +}]>; + +def fpimm1 : PatLeaf<(fpimm), [{ + return N->isExactlyValue(+1.0); +}]>; + +def fpimmneg1 : PatLeaf<(fpimm), [{ + return N->isExactlyValue(-1.0); +}]>; + +// Some 'special' instructions +let usesCustomInserter = 1 in { // Expanded after instruction selection. + def FP32_TO_INT16_IN_MEM : PseudoI<(outs), (ins i16mem:$dst, RFP32:$src), + [(X86fp_to_i16mem RFP32:$src, addr:$dst)]>; + def FP32_TO_INT32_IN_MEM : PseudoI<(outs), (ins i32mem:$dst, RFP32:$src), + [(X86fp_to_i32mem RFP32:$src, addr:$dst)]>; + def FP32_TO_INT64_IN_MEM : PseudoI<(outs), (ins i64mem:$dst, RFP32:$src), + [(X86fp_to_i64mem RFP32:$src, addr:$dst)]>; + def FP64_TO_INT16_IN_MEM : PseudoI<(outs), (ins i16mem:$dst, RFP64:$src), + [(X86fp_to_i16mem RFP64:$src, addr:$dst)]>; + def FP64_TO_INT32_IN_MEM : PseudoI<(outs), (ins i32mem:$dst, RFP64:$src), + [(X86fp_to_i32mem RFP64:$src, addr:$dst)]>; + def FP64_TO_INT64_IN_MEM : PseudoI<(outs), (ins i64mem:$dst, RFP64:$src), + [(X86fp_to_i64mem RFP64:$src, addr:$dst)]>; + def FP80_TO_INT16_IN_MEM : PseudoI<(outs), (ins i16mem:$dst, RFP80:$src), + [(X86fp_to_i16mem RFP80:$src, addr:$dst)]>; + def FP80_TO_INT32_IN_MEM : PseudoI<(outs), (ins i32mem:$dst, RFP80:$src), + [(X86fp_to_i32mem RFP80:$src, addr:$dst)]>; + def FP80_TO_INT64_IN_MEM : PseudoI<(outs), (ins i64mem:$dst, RFP80:$src), + [(X86fp_to_i64mem RFP80:$src, addr:$dst)]>; +} + +// All FP Stack operations are represented with four instructions here. 
The +// first three instructions, generated by the instruction selector, use "RFP32" +// "RFP64" or "RFP80" registers: traditional register files to reference 32-bit, +// 64-bit or 80-bit floating point values. These sizes apply to the values, +// not the registers, which are always 80 bits; RFP32, RFP64 and RFP80 can be +// copied to each other without losing information. These instructions are all +// pseudo instructions and use the "_Fp" suffix. +// In some cases there are additional variants with a mixture of different +// register sizes. +// The second instruction is defined with FPI, which is the actual instruction +// emitted by the assembler. These use "RST" registers, although frequently +// the actual register(s) used are implicit. These are always 80 bits. +// The FP stackifier pass converts one to the other after register allocation +// occurs. +// +// Note that the FpI instruction should have instruction selection info (e.g. +// a pattern) and the FPI instruction should have emission info (e.g. opcode +// encoding and asm printing info). + +// FpIf32, FpIf64 - Floating Point Pseudo Instruction template. +// f32 instructions can use SSE1 and are predicated on FPStackf32 == !SSE1. +// f64 instructions can use SSE2 and are predicated on FPStackf64 == !SSE2. +// f80 instructions cannot use SSE and use neither of these. +class FpIf32<dag outs, dag ins, FPFormat fp, list<dag> pattern> : + FpI_<outs, ins, fp, pattern>, Requires<[FPStackf32]>; +class FpIf64<dag outs, dag ins, FPFormat fp, list<dag> pattern> : + FpI_<outs, ins, fp, pattern>, Requires<[FPStackf64]>; + +// Factoring for arithmetic. +multiclass FPBinary_rr<SDNode OpNode> { +// Register op register -> register +// These are separated out because they have no reversed form. +def _Fp32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, RFP32:$src2), TwoArgFP, + [(set RFP32:$dst, (OpNode RFP32:$src1, RFP32:$src2))]>; +def _Fp64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, RFP64:$src2), TwoArgFP, + [(set RFP64:$dst, (OpNode RFP64:$src1, RFP64:$src2))]>; +def _Fp80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, RFP80:$src2), TwoArgFP, + [(set RFP80:$dst, (OpNode RFP80:$src1, RFP80:$src2))]>; +} +// The FopST0 series are not included here because of the irregularities +// in where the 'r' goes in assembly output. +// These instructions cannot address 80-bit memory. 
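A hedged source-level example of the "ST(0) = ST(0) op [mem]" shapes the FPBinary multiclass below generates (the function name is invented): when only the x87 unit is available, the load is expected to be folded into the arithmetic instruction through the _Fp64m pseudo and, after FP stackification, emitted as the real add with a 64-bit memory operand.

    double acc(double a, const double *p) { return a + *p; }  // e.g. "faddl (mem)" on an x87-only configuration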
+multiclass FPBinary<SDNode OpNode, Format fp, string asmstring, + bit Forward = 1> { +// ST(0) = ST(0) + [mem] +def _Fp32m : FpIf32<(outs RFP32:$dst), + (ins RFP32:$src1, f32mem:$src2), OneArgFPRW, + [!if(Forward, + (set RFP32:$dst, + (OpNode RFP32:$src1, (loadf32 addr:$src2))), + (set RFP32:$dst, + (OpNode (loadf32 addr:$src2), RFP32:$src1)))]>; +def _Fp64m : FpIf64<(outs RFP64:$dst), + (ins RFP64:$src1, f64mem:$src2), OneArgFPRW, + [!if(Forward, + (set RFP64:$dst, + (OpNode RFP64:$src1, (loadf64 addr:$src2))), + (set RFP64:$dst, + (OpNode (loadf64 addr:$src2), RFP64:$src1)))]>; +def _Fp64m32: FpIf64<(outs RFP64:$dst), + (ins RFP64:$src1, f32mem:$src2), OneArgFPRW, + [!if(Forward, + (set RFP64:$dst, + (OpNode RFP64:$src1, (f64 (extloadf32 addr:$src2)))), + (set RFP64:$dst, + (OpNode (f64 (extloadf32 addr:$src2)), RFP64:$src1)))]>; +def _Fp80m32: FpI_<(outs RFP80:$dst), + (ins RFP80:$src1, f32mem:$src2), OneArgFPRW, + [!if(Forward, + (set RFP80:$dst, + (OpNode RFP80:$src1, (f80 (extloadf32 addr:$src2)))), + (set RFP80:$dst, + (OpNode (f80 (extloadf32 addr:$src2)), RFP80:$src1)))]>; +def _Fp80m64: FpI_<(outs RFP80:$dst), + (ins RFP80:$src1, f64mem:$src2), OneArgFPRW, + [!if(Forward, + (set RFP80:$dst, + (OpNode RFP80:$src1, (f80 (extloadf64 addr:$src2)))), + (set RFP80:$dst, + (OpNode (f80 (extloadf64 addr:$src2)), RFP80:$src1)))]>; +let mayLoad = 1 in +def _F32m : FPI<0xD8, fp, (outs), (ins f32mem:$src), + !strconcat("f", asmstring, "{s}\t$src")>; +let mayLoad = 1 in +def _F64m : FPI<0xDC, fp, (outs), (ins f64mem:$src), + !strconcat("f", asmstring, "{l}\t$src")>; +// ST(0) = ST(0) + [memint] +def _FpI16m32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, i16mem:$src2), + OneArgFPRW, + [!if(Forward, + (set RFP32:$dst, + (OpNode RFP32:$src1, (X86fild addr:$src2, i16))), + (set RFP32:$dst, + (OpNode (X86fild addr:$src2, i16), RFP32:$src1)))]>; +def _FpI32m32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, i32mem:$src2), + OneArgFPRW, + [!if(Forward, + (set RFP32:$dst, + (OpNode RFP32:$src1, (X86fild addr:$src2, i32))), + (set RFP32:$dst, + (OpNode (X86fild addr:$src2, i32), RFP32:$src1)))]>; +def _FpI16m64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, i16mem:$src2), + OneArgFPRW, + [!if(Forward, + (set RFP64:$dst, + (OpNode RFP64:$src1, (X86fild addr:$src2, i16))), + (set RFP64:$dst, + (OpNode (X86fild addr:$src2, i16), RFP64:$src1)))]>; +def _FpI32m64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, i32mem:$src2), + OneArgFPRW, + [!if(Forward, + (set RFP64:$dst, + (OpNode RFP64:$src1, (X86fild addr:$src2, i32))), + (set RFP64:$dst, + (OpNode (X86fild addr:$src2, i32), RFP64:$src1)))]>; +def _FpI16m80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, i16mem:$src2), + OneArgFPRW, + [!if(Forward, + (set RFP80:$dst, + (OpNode RFP80:$src1, (X86fild addr:$src2, i16))), + (set RFP80:$dst, + (OpNode (X86fild addr:$src2, i16), RFP80:$src1)))]>; +def _FpI32m80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, i32mem:$src2), + OneArgFPRW, + [!if(Forward, + (set RFP80:$dst, + (OpNode RFP80:$src1, (X86fild addr:$src2, i32))), + (set RFP80:$dst, + (OpNode (X86fild addr:$src2, i32), RFP80:$src1)))]>; +let mayLoad = 1 in +def _FI16m : FPI<0xDE, fp, (outs), (ins i16mem:$src), + !strconcat("fi", asmstring, "{s}\t$src")>; +let mayLoad = 1 in +def _FI32m : FPI<0xDA, fp, (outs), (ins i32mem:$src), + !strconcat("fi", asmstring, "{l}\t$src")>; +} + +let Defs = [FPSW] in { +// FPBinary_rr just defines pseudo-instructions, no need to set a scheduling +// resources. 
+defm ADD : FPBinary_rr<fadd>; +defm SUB : FPBinary_rr<fsub>; +defm MUL : FPBinary_rr<fmul>; +defm DIV : FPBinary_rr<fdiv>; +// Sets the scheduling resources for the actual NAME#_F<size>m defintions. +let SchedRW = [WriteFAddLd] in { +defm ADD : FPBinary<fadd, MRM0m, "add">; +defm SUB : FPBinary<fsub, MRM4m, "sub">; +defm SUBR: FPBinary<fsub ,MRM5m, "subr", 0>; +} +let SchedRW = [WriteFMulLd] in { +defm MUL : FPBinary<fmul, MRM1m, "mul">; +} +let SchedRW = [WriteFDivLd] in { +defm DIV : FPBinary<fdiv, MRM6m, "div">; +defm DIVR: FPBinary<fdiv, MRM7m, "divr", 0>; +} +} + +class FPST0rInst<Format fp, string asm> + : FPI<0xD8, fp, (outs), (ins RST:$op), asm>; +class FPrST0Inst<Format fp, string asm> + : FPI<0xDC, fp, (outs), (ins RST:$op), asm>; +class FPrST0PInst<Format fp, string asm> + : FPI<0xDE, fp, (outs), (ins RST:$op), asm>; + +// NOTE: GAS and apparently all other AT&T style assemblers have a broken notion +// of some of the 'reverse' forms of the fsub and fdiv instructions. As such, +// we have to put some 'r's in and take them out of weird places. +let SchedRW = [WriteFAdd] in { +def ADD_FST0r : FPST0rInst <MRM0r, "fadd\t$op">; +def ADD_FrST0 : FPrST0Inst <MRM0r, "fadd\t{%st(0), $op|$op, st(0)}">; +def ADD_FPrST0 : FPrST0PInst<MRM0r, "faddp\t$op">; +def SUBR_FST0r : FPST0rInst <MRM5r, "fsubr\t$op">; +def SUB_FrST0 : FPrST0Inst <MRM5r, "fsub{r}\t{%st(0), $op|$op, st(0)}">; +def SUB_FPrST0 : FPrST0PInst<MRM5r, "fsub{r}p\t$op">; +def SUB_FST0r : FPST0rInst <MRM4r, "fsub\t$op">; +def SUBR_FrST0 : FPrST0Inst <MRM4r, "fsub{|r}\t{%st(0), $op|$op, st(0)}">; +def SUBR_FPrST0 : FPrST0PInst<MRM4r, "fsub{|r}p\t$op">; +} // SchedRW +let SchedRW = [WriteFMul] in { +def MUL_FST0r : FPST0rInst <MRM1r, "fmul\t$op">; +def MUL_FrST0 : FPrST0Inst <MRM1r, "fmul\t{%st(0), $op|$op, st(0)}">; +def MUL_FPrST0 : FPrST0PInst<MRM1r, "fmulp\t$op">; +} // SchedRW +let SchedRW = [WriteFDiv] in { +def DIVR_FST0r : FPST0rInst <MRM7r, "fdivr\t$op">; +def DIV_FrST0 : FPrST0Inst <MRM7r, "fdiv{r}\t{%st(0), $op|$op, st(0)}">; +def DIV_FPrST0 : FPrST0PInst<MRM7r, "fdiv{r}p\t$op">; +def DIV_FST0r : FPST0rInst <MRM6r, "fdiv\t$op">; +def DIVR_FrST0 : FPrST0Inst <MRM6r, "fdiv{|r}\t{%st(0), $op|$op, st(0)}">; +def DIVR_FPrST0 : FPrST0PInst<MRM6r, "fdiv{|r}p\t$op">; +} // SchedRW + +def COM_FST0r : FPST0rInst <MRM2r, "fcom\t$op">; +def COMP_FST0r : FPST0rInst <MRM3r, "fcomp\t$op">; + +// Unary operations. +multiclass FPUnary<SDNode OpNode, Format fp, string asmstring> { +def _Fp32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src), OneArgFPRW, + [(set RFP32:$dst, (OpNode RFP32:$src))]>; +def _Fp64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src), OneArgFPRW, + [(set RFP64:$dst, (OpNode RFP64:$src))]>; +def _Fp80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src), OneArgFPRW, + [(set RFP80:$dst, (OpNode RFP80:$src))]>; +def _F : FPI<0xD9, fp, (outs), (ins), asmstring>; +} + +let Defs = [FPSW] in { +defm CHS : FPUnary<fneg, MRM_E0, "fchs">; +defm ABS : FPUnary<fabs, MRM_E1, "fabs">; +let SchedRW = [WriteFSqrt] in { +defm SQRT: FPUnary<fsqrt,MRM_FA, "fsqrt">; +} +defm SIN : FPUnary<fsin, MRM_FE, "fsin">; +defm COS : FPUnary<fcos, MRM_FF, "fcos">; + +let hasSideEffects = 0 in { +def TST_Fp32 : FpIf32<(outs), (ins RFP32:$src), OneArgFP, []>; +def TST_Fp64 : FpIf64<(outs), (ins RFP64:$src), OneArgFP, []>; +def TST_Fp80 : FpI_<(outs), (ins RFP80:$src), OneArgFP, []>; +} +def TST_F : FPI<0xD9, MRM_E4, (outs), (ins), "ftst">; +} // Defs = [FPSW] + +// Versions of FP instructions that take a single memory operand. 
Added for the +// disassembler; remove as they are included with patterns elsewhere. +def FCOM32m : FPI<0xD8, MRM2m, (outs), (ins f32mem:$src), "fcom{s}\t$src">; +def FCOMP32m : FPI<0xD8, MRM3m, (outs), (ins f32mem:$src), "fcomp{s}\t$src">; + +def FLDENVm : FPI<0xD9, MRM4m, (outs), (ins f32mem:$src), "fldenv\t$src">; +def FSTENVm : FPI<0xD9, MRM6m, (outs f32mem:$dst), (ins), "fnstenv\t$dst">; + +def FICOM32m : FPI<0xDA, MRM2m, (outs), (ins i32mem:$src), "ficom{l}\t$src">; +def FICOMP32m: FPI<0xDA, MRM3m, (outs), (ins i32mem:$src), "ficomp{l}\t$src">; + +def FCOM64m : FPI<0xDC, MRM2m, (outs), (ins f64mem:$src), "fcom{l}\t$src">; +def FCOMP64m : FPI<0xDC, MRM3m, (outs), (ins f64mem:$src), "fcomp{l}\t$src">; + +def FRSTORm : FPI<0xDD, MRM4m, (outs f32mem:$dst), (ins), "frstor\t$dst">; +def FSAVEm : FPI<0xDD, MRM6m, (outs f32mem:$dst), (ins), "fnsave\t$dst">; +def FNSTSWm : FPI<0xDD, MRM7m, (outs i16mem:$dst), (ins), "fnstsw\t$dst">; + +def FICOM16m : FPI<0xDE, MRM2m, (outs), (ins i16mem:$src), "ficom{s}\t$src">; +def FICOMP16m: FPI<0xDE, MRM3m, (outs), (ins i16mem:$src), "ficomp{s}\t$src">; + +def FBLDm : FPI<0xDF, MRM4m, (outs), (ins f80mem:$src), "fbld\t$src">; +def FBSTPm : FPI<0xDF, MRM6m, (outs f80mem:$dst), (ins), "fbstp\t$dst">; + +// Floating point cmovs. +class FpIf32CMov<dag outs, dag ins, FPFormat fp, list<dag> pattern> : + FpI_<outs, ins, fp, pattern>, Requires<[FPStackf32, HasCMov]>; +class FpIf64CMov<dag outs, dag ins, FPFormat fp, list<dag> pattern> : + FpI_<outs, ins, fp, pattern>, Requires<[FPStackf64, HasCMov]>; + +multiclass FPCMov<PatLeaf cc> { + def _Fp32 : FpIf32CMov<(outs RFP32:$dst), (ins RFP32:$src1, RFP32:$src2), + CondMovFP, + [(set RFP32:$dst, (X86cmov RFP32:$src1, RFP32:$src2, + cc, EFLAGS))]>; + def _Fp64 : FpIf64CMov<(outs RFP64:$dst), (ins RFP64:$src1, RFP64:$src2), + CondMovFP, + [(set RFP64:$dst, (X86cmov RFP64:$src1, RFP64:$src2, + cc, EFLAGS))]>; + def _Fp80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, RFP80:$src2), + CondMovFP, + [(set RFP80:$dst, (X86cmov RFP80:$src1, RFP80:$src2, + cc, EFLAGS))]>, + Requires<[HasCMov]>; +} + +let Defs = [FPSW] in { +let Uses = [EFLAGS], Constraints = "$src1 = $dst" in { +defm CMOVB : FPCMov<X86_COND_B>; +defm CMOVBE : FPCMov<X86_COND_BE>; +defm CMOVE : FPCMov<X86_COND_E>; +defm CMOVP : FPCMov<X86_COND_P>; +defm CMOVNB : FPCMov<X86_COND_AE>; +defm CMOVNBE: FPCMov<X86_COND_A>; +defm CMOVNE : FPCMov<X86_COND_NE>; +defm CMOVNP : FPCMov<X86_COND_NP>; +} // Uses = [EFLAGS], Constraints = "$src1 = $dst" + +let Predicates = [HasCMov] in { +// These are not factored because there's no clean way to pass DA/DB. +def CMOVB_F : FPI<0xDA, MRM0r, (outs), (ins RST:$op), + "fcmovb\t{$op, %st(0)|st(0), $op}">; +def CMOVBE_F : FPI<0xDA, MRM2r, (outs), (ins RST:$op), + "fcmovbe\t{$op, %st(0)|st(0), $op}">; +def CMOVE_F : FPI<0xDA, MRM1r, (outs), (ins RST:$op), + "fcmove\t{$op, %st(0)|st(0), $op}">; +def CMOVP_F : FPI<0xDA, MRM3r, (outs), (ins RST:$op), + "fcmovu\t{$op, %st(0)|st(0), $op}">; +def CMOVNB_F : FPI<0xDB, MRM0r, (outs), (ins RST:$op), + "fcmovnb\t{$op, %st(0)|st(0), $op}">; +def CMOVNBE_F: FPI<0xDB, MRM2r, (outs), (ins RST:$op), + "fcmovnbe\t{$op, %st(0)|st(0), $op}">; +def CMOVNE_F : FPI<0xDB, MRM1r, (outs), (ins RST:$op), + "fcmovne\t{$op, %st(0)|st(0), $op}">; +def CMOVNP_F : FPI<0xDB, MRM3r, (outs), (ins RST:$op), + "fcmovnu\t{$op, %st(0)|st(0), $op}">; +} // Predicates = [HasCMov] + +// Floating point loads & stores. 
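A small illustration for the load patterns that follow (invented function name): x87 loads always widen to the internal 80-bit format, so a float load feeding double arithmetic can be selected as a single FLD of the 32-bit memory operand via the extloadf32 patterns, with no separate conversion step.

    double widen(const float *p) { return *p; }  // expected to become a single "flds" under x87-only codegen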
+let canFoldAsLoad = 1 in { +def LD_Fp32m : FpIf32<(outs RFP32:$dst), (ins f32mem:$src), ZeroArgFP, + [(set RFP32:$dst, (loadf32 addr:$src))]>; +let isReMaterializable = 1 in + def LD_Fp64m : FpIf64<(outs RFP64:$dst), (ins f64mem:$src), ZeroArgFP, + [(set RFP64:$dst, (loadf64 addr:$src))]>; +def LD_Fp80m : FpI_<(outs RFP80:$dst), (ins f80mem:$src), ZeroArgFP, + [(set RFP80:$dst, (loadf80 addr:$src))]>; +} +def LD_Fp32m64 : FpIf64<(outs RFP64:$dst), (ins f32mem:$src), ZeroArgFP, + [(set RFP64:$dst, (f64 (extloadf32 addr:$src)))]>; +def LD_Fp64m80 : FpI_<(outs RFP80:$dst), (ins f64mem:$src), ZeroArgFP, + [(set RFP80:$dst, (f80 (extloadf64 addr:$src)))]>; +def LD_Fp32m80 : FpI_<(outs RFP80:$dst), (ins f32mem:$src), ZeroArgFP, + [(set RFP80:$dst, (f80 (extloadf32 addr:$src)))]>; +def ILD_Fp16m32: FpIf32<(outs RFP32:$dst), (ins i16mem:$src), ZeroArgFP, + [(set RFP32:$dst, (X86fild addr:$src, i16))]>; +def ILD_Fp32m32: FpIf32<(outs RFP32:$dst), (ins i32mem:$src), ZeroArgFP, + [(set RFP32:$dst, (X86fild addr:$src, i32))]>; +def ILD_Fp64m32: FpIf32<(outs RFP32:$dst), (ins i64mem:$src), ZeroArgFP, + [(set RFP32:$dst, (X86fild addr:$src, i64))]>; +def ILD_Fp16m64: FpIf64<(outs RFP64:$dst), (ins i16mem:$src), ZeroArgFP, + [(set RFP64:$dst, (X86fild addr:$src, i16))]>; +def ILD_Fp32m64: FpIf64<(outs RFP64:$dst), (ins i32mem:$src), ZeroArgFP, + [(set RFP64:$dst, (X86fild addr:$src, i32))]>; +def ILD_Fp64m64: FpIf64<(outs RFP64:$dst), (ins i64mem:$src), ZeroArgFP, + [(set RFP64:$dst, (X86fild addr:$src, i64))]>; +def ILD_Fp16m80: FpI_<(outs RFP80:$dst), (ins i16mem:$src), ZeroArgFP, + [(set RFP80:$dst, (X86fild addr:$src, i16))]>; +def ILD_Fp32m80: FpI_<(outs RFP80:$dst), (ins i32mem:$src), ZeroArgFP, + [(set RFP80:$dst, (X86fild addr:$src, i32))]>; +def ILD_Fp64m80: FpI_<(outs RFP80:$dst), (ins i64mem:$src), ZeroArgFP, + [(set RFP80:$dst, (X86fild addr:$src, i64))]>; + +def ST_Fp32m : FpIf32<(outs), (ins f32mem:$op, RFP32:$src), OneArgFP, + [(store RFP32:$src, addr:$op)]>; +def ST_Fp64m32 : FpIf64<(outs), (ins f32mem:$op, RFP64:$src), OneArgFP, + [(truncstoref32 RFP64:$src, addr:$op)]>; +def ST_Fp64m : FpIf64<(outs), (ins f64mem:$op, RFP64:$src), OneArgFP, + [(store RFP64:$src, addr:$op)]>; +def ST_Fp80m32 : FpI_<(outs), (ins f32mem:$op, RFP80:$src), OneArgFP, + [(truncstoref32 RFP80:$src, addr:$op)]>; +def ST_Fp80m64 : FpI_<(outs), (ins f64mem:$op, RFP80:$src), OneArgFP, + [(truncstoref64 RFP80:$src, addr:$op)]>; +// FST does not support 80-bit memory target; FSTP must be used. 
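+// For illustration: the x87 ISA has no non-popping 80-bit store; only the
+// popping form exists (ST_FP80m below, "fstp{t}", encoded DB /7), so an
+// 80-bit spill looks like
+//   fstpt (%esp)     # store ST(0) as 80-bit extended precision and pop
+// and only the popping pseudo ST_FpP80m carries an f80 store pattern.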
+ +let mayStore = 1, hasSideEffects = 0 in { +def ST_FpP32m : FpIf32<(outs), (ins f32mem:$op, RFP32:$src), OneArgFP, []>; +def ST_FpP64m32 : FpIf64<(outs), (ins f32mem:$op, RFP64:$src), OneArgFP, []>; +def ST_FpP64m : FpIf64<(outs), (ins f64mem:$op, RFP64:$src), OneArgFP, []>; +def ST_FpP80m32 : FpI_<(outs), (ins f32mem:$op, RFP80:$src), OneArgFP, []>; +def ST_FpP80m64 : FpI_<(outs), (ins f64mem:$op, RFP80:$src), OneArgFP, []>; +} +def ST_FpP80m : FpI_<(outs), (ins f80mem:$op, RFP80:$src), OneArgFP, + [(store RFP80:$src, addr:$op)]>; +let mayStore = 1, hasSideEffects = 0 in { +def IST_Fp16m32 : FpIf32<(outs), (ins i16mem:$op, RFP32:$src), OneArgFP, []>; +def IST_Fp32m32 : FpIf32<(outs), (ins i32mem:$op, RFP32:$src), OneArgFP, []>; +def IST_Fp64m32 : FpIf32<(outs), (ins i64mem:$op, RFP32:$src), OneArgFP, []>; +def IST_Fp16m64 : FpIf64<(outs), (ins i16mem:$op, RFP64:$src), OneArgFP, []>; +def IST_Fp32m64 : FpIf64<(outs), (ins i32mem:$op, RFP64:$src), OneArgFP, []>; +def IST_Fp64m64 : FpIf64<(outs), (ins i64mem:$op, RFP64:$src), OneArgFP, []>; +def IST_Fp16m80 : FpI_<(outs), (ins i16mem:$op, RFP80:$src), OneArgFP, []>; +def IST_Fp32m80 : FpI_<(outs), (ins i32mem:$op, RFP80:$src), OneArgFP, []>; +def IST_Fp64m80 : FpI_<(outs), (ins i64mem:$op, RFP80:$src), OneArgFP, []>; +} + +let mayLoad = 1, SchedRW = [WriteLoad] in { +def LD_F32m : FPI<0xD9, MRM0m, (outs), (ins f32mem:$src), "fld{s}\t$src", + IIC_FLD>; +def LD_F64m : FPI<0xDD, MRM0m, (outs), (ins f64mem:$src), "fld{l}\t$src", + IIC_FLD>; +def LD_F80m : FPI<0xDB, MRM5m, (outs), (ins f80mem:$src), "fld{t}\t$src", + IIC_FLD80>; +def ILD_F16m : FPI<0xDF, MRM0m, (outs), (ins i16mem:$src), "fild{s}\t$src", + IIC_FILD>; +def ILD_F32m : FPI<0xDB, MRM0m, (outs), (ins i32mem:$src), "fild{l}\t$src", + IIC_FILD>; +def ILD_F64m : FPI<0xDF, MRM5m, (outs), (ins i64mem:$src), "fild{ll}\t$src", + IIC_FILD>; +} +let mayStore = 1, SchedRW = [WriteStore] in { +def ST_F32m : FPI<0xD9, MRM2m, (outs), (ins f32mem:$dst), "fst{s}\t$dst", + IIC_FST>; +def ST_F64m : FPI<0xDD, MRM2m, (outs), (ins f64mem:$dst), "fst{l}\t$dst", + IIC_FST>; +def ST_FP32m : FPI<0xD9, MRM3m, (outs), (ins f32mem:$dst), "fstp{s}\t$dst", + IIC_FST>; +def ST_FP64m : FPI<0xDD, MRM3m, (outs), (ins f64mem:$dst), "fstp{l}\t$dst", + IIC_FST>; +def ST_FP80m : FPI<0xDB, MRM7m, (outs), (ins f80mem:$dst), "fstp{t}\t$dst", + IIC_FST80>; +def IST_F16m : FPI<0xDF, MRM2m, (outs), (ins i16mem:$dst), "fist{s}\t$dst", + IIC_FIST>; +def IST_F32m : FPI<0xDB, MRM2m, (outs), (ins i32mem:$dst), "fist{l}\t$dst", + IIC_FIST>; +def IST_FP16m : FPI<0xDF, MRM3m, (outs), (ins i16mem:$dst), "fistp{s}\t$dst", + IIC_FIST>; +def IST_FP32m : FPI<0xDB, MRM3m, (outs), (ins i32mem:$dst), "fistp{l}\t$dst", + IIC_FIST>; +def IST_FP64m : FPI<0xDF, MRM7m, (outs), (ins i64mem:$dst), "fistp{ll}\t$dst", + IIC_FIST>; +} + +// FISTTP requires SSE3 even though it's a FPStack op. 
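+// For illustration: unlike FIST/FISTP, FISTTP always converts with truncation
+// (round toward zero) regardless of the rounding control in the FP control
+// word, so a C-style cast of ST(0) to a 64-bit integer needs just
+//   fisttpll (%rsp)  # truncate ST(0) to i64, store, and pop
+// which is what the X86fp_to_i*mem patterns below select when SSE3 is present.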
+let Predicates = [HasSSE3] in { +def ISTT_Fp16m32 : FpI_<(outs), (ins i16mem:$op, RFP32:$src), OneArgFP, + [(X86fp_to_i16mem RFP32:$src, addr:$op)]>; +def ISTT_Fp32m32 : FpI_<(outs), (ins i32mem:$op, RFP32:$src), OneArgFP, + [(X86fp_to_i32mem RFP32:$src, addr:$op)]>; +def ISTT_Fp64m32 : FpI_<(outs), (ins i64mem:$op, RFP32:$src), OneArgFP, + [(X86fp_to_i64mem RFP32:$src, addr:$op)]>; +def ISTT_Fp16m64 : FpI_<(outs), (ins i16mem:$op, RFP64:$src), OneArgFP, + [(X86fp_to_i16mem RFP64:$src, addr:$op)]>; +def ISTT_Fp32m64 : FpI_<(outs), (ins i32mem:$op, RFP64:$src), OneArgFP, + [(X86fp_to_i32mem RFP64:$src, addr:$op)]>; +def ISTT_Fp64m64 : FpI_<(outs), (ins i64mem:$op, RFP64:$src), OneArgFP, + [(X86fp_to_i64mem RFP64:$src, addr:$op)]>; +def ISTT_Fp16m80 : FpI_<(outs), (ins i16mem:$op, RFP80:$src), OneArgFP, + [(X86fp_to_i16mem RFP80:$src, addr:$op)]>; +def ISTT_Fp32m80 : FpI_<(outs), (ins i32mem:$op, RFP80:$src), OneArgFP, + [(X86fp_to_i32mem RFP80:$src, addr:$op)]>; +def ISTT_Fp64m80 : FpI_<(outs), (ins i64mem:$op, RFP80:$src), OneArgFP, + [(X86fp_to_i64mem RFP80:$src, addr:$op)]>; +} // Predicates = [HasSSE3] + +let mayStore = 1, SchedRW = [WriteStore] in { +def ISTT_FP16m : FPI<0xDF, MRM1m, (outs), (ins i16mem:$dst), "fisttp{s}\t$dst", + IIC_FST>; +def ISTT_FP32m : FPI<0xDB, MRM1m, (outs), (ins i32mem:$dst), "fisttp{l}\t$dst", + IIC_FST>; +def ISTT_FP64m : FPI<0xDD, MRM1m, (outs), (ins i64mem:$dst), + "fisttp{ll}\t$dst", IIC_FST>; +} + +// FP Stack manipulation instructions. +let SchedRW = [WriteMove] in { +def LD_Frr : FPI<0xD9, MRM0r, (outs), (ins RST:$op), "fld\t$op", IIC_FLD>; +def ST_Frr : FPI<0xDD, MRM2r, (outs), (ins RST:$op), "fst\t$op", IIC_FST>; +def ST_FPrr : FPI<0xDD, MRM3r, (outs), (ins RST:$op), "fstp\t$op", IIC_FST>; +def XCH_F : FPI<0xD9, MRM1r, (outs), (ins RST:$op), "fxch\t$op", IIC_FXCH>; +} + +// Floating point constant loads. +let isReMaterializable = 1 in { +def LD_Fp032 : FpIf32<(outs RFP32:$dst), (ins), ZeroArgFP, + [(set RFP32:$dst, fpimm0)]>; +def LD_Fp132 : FpIf32<(outs RFP32:$dst), (ins), ZeroArgFP, + [(set RFP32:$dst, fpimm1)]>; +def LD_Fp064 : FpIf64<(outs RFP64:$dst), (ins), ZeroArgFP, + [(set RFP64:$dst, fpimm0)]>; +def LD_Fp164 : FpIf64<(outs RFP64:$dst), (ins), ZeroArgFP, + [(set RFP64:$dst, fpimm1)]>; +def LD_Fp080 : FpI_<(outs RFP80:$dst), (ins), ZeroArgFP, + [(set RFP80:$dst, fpimm0)]>; +def LD_Fp180 : FpI_<(outs RFP80:$dst), (ins), ZeroArgFP, + [(set RFP80:$dst, fpimm1)]>; +} + +let SchedRW = [WriteZero] in { +def LD_F0 : FPI<0xD9, MRM_EE, (outs), (ins), "fldz", IIC_FLDZ>; +def LD_F1 : FPI<0xD9, MRM_E8, (outs), (ins), "fld1", IIC_FIST>; +} + +// Floating point compares. 
+let SchedRW = [WriteFAdd] in { +def UCOM_Fpr32 : FpIf32<(outs), (ins RFP32:$lhs, RFP32:$rhs), CompareFP, + [(set FPSW, (trunc (X86cmp RFP32:$lhs, RFP32:$rhs)))]>; +def UCOM_Fpr64 : FpIf64<(outs), (ins RFP64:$lhs, RFP64:$rhs), CompareFP, + [(set FPSW, (trunc (X86cmp RFP64:$lhs, RFP64:$rhs)))]>; +def UCOM_Fpr80 : FpI_ <(outs), (ins RFP80:$lhs, RFP80:$rhs), CompareFP, + [(set FPSW, (trunc (X86cmp RFP80:$lhs, RFP80:$rhs)))]>; +} // SchedRW +} // Defs = [FPSW] + +let SchedRW = [WriteFAdd] in { +// CC = ST(0) cmp ST(i) +let Defs = [EFLAGS, FPSW] in { +def UCOM_FpIr32: FpIf32<(outs), (ins RFP32:$lhs, RFP32:$rhs), CompareFP, + [(set EFLAGS, (X86cmp RFP32:$lhs, RFP32:$rhs))]>; +def UCOM_FpIr64: FpIf64<(outs), (ins RFP64:$lhs, RFP64:$rhs), CompareFP, + [(set EFLAGS, (X86cmp RFP64:$lhs, RFP64:$rhs))]>; +def UCOM_FpIr80: FpI_<(outs), (ins RFP80:$lhs, RFP80:$rhs), CompareFP, + [(set EFLAGS, (X86cmp RFP80:$lhs, RFP80:$rhs))]>; +} + +let Defs = [FPSW], Uses = [ST0] in { +def UCOM_Fr : FPI<0xDD, MRM4r, // FPSW = cmp ST(0) with ST(i) + (outs), (ins RST:$reg), "fucom\t$reg", IIC_FUCOM>; +def UCOM_FPr : FPI<0xDD, MRM5r, // FPSW = cmp ST(0) with ST(i), pop + (outs), (ins RST:$reg), "fucomp\t$reg", IIC_FUCOM>; +def UCOM_FPPr : FPI<0xDA, MRM_E9, // cmp ST(0) with ST(1), pop, pop + (outs), (ins), "fucompp", IIC_FUCOM>; +} + +let Defs = [EFLAGS, FPSW], Uses = [ST0] in { +def UCOM_FIr : FPI<0xDB, MRM5r, // CC = cmp ST(0) with ST(i) + (outs), (ins RST:$reg), "fucomi\t$reg", IIC_FUCOMI>; +def UCOM_FIPr : FPI<0xDF, MRM5r, // CC = cmp ST(0) with ST(i), pop + (outs), (ins RST:$reg), "fucompi\t$reg", IIC_FUCOMI>; +} + +let Defs = [EFLAGS, FPSW] in { +def COM_FIr : FPI<0xDB, MRM6r, (outs), (ins RST:$reg), + "fcomi\t$reg", IIC_FCOMI>; +def COM_FIPr : FPI<0xDF, MRM6r, (outs), (ins RST:$reg), + "fcompi\t$reg", IIC_FCOMI>; +} +} // SchedRW + +// Floating point flag ops. +let SchedRW = [WriteALU] in { +let Defs = [AX], Uses = [FPSW] in +def FNSTSW16r : I<0xDF, MRM_E0, // AX = fp flags + (outs), (ins), "fnstsw\t{%ax|ax}", + [(set AX, (X86fp_stsw FPSW))], IIC_FNSTSW>; + +def FNSTCW16m : I<0xD9, MRM7m, // [mem16] = X87 control world + (outs), (ins i16mem:$dst), "fnstcw\t$dst", + [(X86fp_cwd_get16 addr:$dst)], IIC_FNSTCW>; +} // SchedRW +let mayLoad = 1 in +def FLDCW16m : I<0xD9, MRM5m, // X87 control world = [mem16] + (outs), (ins i16mem:$dst), "fldcw\t$dst", [], IIC_FLDCW>, + Sched<[WriteLoad]>; + +// FPU control instructions +let SchedRW = [WriteMicrocoded] in { +let Defs = [FPSW] in +def FNINIT : I<0xDB, MRM_E3, (outs), (ins), "fninit", [], IIC_FNINIT>; +def FFREE : FPI<0xDD, MRM0r, (outs), (ins RST:$reg), + "ffree\t$reg", IIC_FFREE>; +// Clear exceptions + +let Defs = [FPSW] in +def FNCLEX : I<0xDB, MRM_E2, (outs), (ins), "fnclex", [], IIC_FNCLEX>; +} // SchedRW + +// Operandless floating-point instructions for the disassembler. 
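+// A worked note on the compare forms above: without FCOMI/FUCOMI the FPU
+// condition codes have to be routed through AX and EFLAGS by hand, e.g.
+//   fucompp          # compare ST(0) with ST(1), pop twice (UCOM_FPPr)
+//   fnstsw %ax       # AX = FPSW (FNSTSW16r)
+//   sahf             # copy AH into EFLAGS, then branch with jae/jne/...
+// whereas UCOM_FIr/COM_FIr ("fucomi"/"fcomi") set EFLAGS directly.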
+let SchedRW = [WriteMicrocoded] in { +def WAIT : I<0x9B, RawFrm, (outs), (ins), "wait", [], IIC_WAIT>; + +def FNOP : I<0xD9, MRM_D0, (outs), (ins), "fnop", [], IIC_FNOP>; +def FXAM : I<0xD9, MRM_E5, (outs), (ins), "fxam", [], IIC_FXAM>; +def FLDL2T : I<0xD9, MRM_E9, (outs), (ins), "fldl2t", [], IIC_FLDL>; +def FLDL2E : I<0xD9, MRM_EA, (outs), (ins), "fldl2e", [], IIC_FLDL>; +def FLDPI : I<0xD9, MRM_EB, (outs), (ins), "fldpi", [], IIC_FLDL>; +def FLDLG2 : I<0xD9, MRM_EC, (outs), (ins), "fldlg2", [], IIC_FLDL>; +def FLDLN2 : I<0xD9, MRM_ED, (outs), (ins), "fldln2", [], IIC_FLDL>; +def F2XM1 : I<0xD9, MRM_F0, (outs), (ins), "f2xm1", [], IIC_F2XM1>; +def FYL2X : I<0xD9, MRM_F1, (outs), (ins), "fyl2x", [], IIC_FYL2X>; +def FPTAN : I<0xD9, MRM_F2, (outs), (ins), "fptan", [], IIC_FPTAN>; +def FPATAN : I<0xD9, MRM_F3, (outs), (ins), "fpatan", [], IIC_FPATAN>; +def FXTRACT : I<0xD9, MRM_F4, (outs), (ins), "fxtract", [], IIC_FXTRACT>; +def FPREM1 : I<0xD9, MRM_F5, (outs), (ins), "fprem1", [], IIC_FPREM1>; +def FDECSTP : I<0xD9, MRM_F6, (outs), (ins), "fdecstp", [], IIC_FPSTP>; +def FINCSTP : I<0xD9, MRM_F7, (outs), (ins), "fincstp", [], IIC_FPSTP>; +def FPREM : I<0xD9, MRM_F8, (outs), (ins), "fprem", [], IIC_FPREM>; +def FYL2XP1 : I<0xD9, MRM_F9, (outs), (ins), "fyl2xp1", [], IIC_FYL2XP1>; +def FSINCOS : I<0xD9, MRM_FB, (outs), (ins), "fsincos", [], IIC_FSINCOS>; +def FRNDINT : I<0xD9, MRM_FC, (outs), (ins), "frndint", [], IIC_FRNDINT>; +def FSCALE : I<0xD9, MRM_FD, (outs), (ins), "fscale", [], IIC_FSCALE>; +def FCOMPP : I<0xDE, MRM_D9, (outs), (ins), "fcompp", [], IIC_FCOMPP>; + +let Predicates = [HasFXSR] in { + def FXSAVE : I<0xAE, MRM0m, (outs), (ins opaque512mem:$dst), + "fxsave\t$dst", [(int_x86_fxsave addr:$dst)], IIC_FXSAVE>, TB; + def FXSAVE64 : RI<0xAE, MRM0m, (outs), (ins opaque512mem:$dst), + "fxsave64\t$dst", [(int_x86_fxsave64 addr:$dst)], + IIC_FXSAVE>, TB, Requires<[In64BitMode]>; + def FXRSTOR : I<0xAE, MRM1m, (outs), (ins opaque512mem:$src), + "fxrstor\t$src", [(int_x86_fxrstor addr:$src)], IIC_FXRSTOR>, TB; + def FXRSTOR64 : RI<0xAE, MRM1m, (outs), (ins opaque512mem:$src), + "fxrstor64\t$src", [(int_x86_fxrstor64 addr:$src)], + IIC_FXRSTOR>, TB, Requires<[In64BitMode]>; +} // Predicates = [FeatureFXSR] +} // SchedRW + +//===----------------------------------------------------------------------===// +// Non-Instruction Patterns +//===----------------------------------------------------------------------===// + +// Required for RET of f32 / f64 / f80 values. +def : Pat<(X86fld addr:$src, f32), (LD_Fp32m addr:$src)>; +def : Pat<(X86fld addr:$src, f64), (LD_Fp64m addr:$src)>; +def : Pat<(X86fld addr:$src, f80), (LD_Fp80m addr:$src)>; + +// Required for CALL which return f32 / f64 / f80 values. 
+def : Pat<(X86fst RFP32:$src, addr:$op, f32), (ST_Fp32m addr:$op, RFP32:$src)>; +def : Pat<(X86fst RFP64:$src, addr:$op, f32), (ST_Fp64m32 addr:$op, + RFP64:$src)>; +def : Pat<(X86fst RFP64:$src, addr:$op, f64), (ST_Fp64m addr:$op, RFP64:$src)>; +def : Pat<(X86fst RFP80:$src, addr:$op, f32), (ST_Fp80m32 addr:$op, + RFP80:$src)>; +def : Pat<(X86fst RFP80:$src, addr:$op, f64), (ST_Fp80m64 addr:$op, + RFP80:$src)>; +def : Pat<(X86fst RFP80:$src, addr:$op, f80), (ST_FpP80m addr:$op, + RFP80:$src)>; + +// Floating point constant -0.0 and -1.0 +def : Pat<(f32 fpimmneg0), (CHS_Fp32 (LD_Fp032))>, Requires<[FPStackf32]>; +def : Pat<(f32 fpimmneg1), (CHS_Fp32 (LD_Fp132))>, Requires<[FPStackf32]>; +def : Pat<(f64 fpimmneg0), (CHS_Fp64 (LD_Fp064))>, Requires<[FPStackf64]>; +def : Pat<(f64 fpimmneg1), (CHS_Fp64 (LD_Fp164))>, Requires<[FPStackf64]>; +def : Pat<(f80 fpimmneg0), (CHS_Fp80 (LD_Fp080))>; +def : Pat<(f80 fpimmneg1), (CHS_Fp80 (LD_Fp180))>; + +// Used to conv. i64 to f64 since there isn't a SSE version. +def : Pat<(X86fildflag addr:$src, i64), (ILD_Fp64m64 addr:$src)>; + +// FP extensions map onto simple pseudo-value conversions if they are to/from +// the FP stack. +def : Pat<(f64 (fextend RFP32:$src)), (COPY_TO_REGCLASS RFP32:$src, RFP64)>, + Requires<[FPStackf32]>; +def : Pat<(f80 (fextend RFP32:$src)), (COPY_TO_REGCLASS RFP32:$src, RFP80)>, + Requires<[FPStackf32]>; +def : Pat<(f80 (fextend RFP64:$src)), (COPY_TO_REGCLASS RFP64:$src, RFP80)>, + Requires<[FPStackf64]>; + +// FP truncations map onto simple pseudo-value conversions if they are to/from +// the FP stack. We have validated that only value-preserving truncations make +// it through isel. +def : Pat<(f32 (fround RFP64:$src)), (COPY_TO_REGCLASS RFP64:$src, RFP32)>, + Requires<[FPStackf32]>; +def : Pat<(f32 (fround RFP80:$src)), (COPY_TO_REGCLASS RFP80:$src, RFP32)>, + Requires<[FPStackf32]>; +def : Pat<(f64 (fround RFP80:$src)), (COPY_TO_REGCLASS RFP80:$src, RFP64)>, + Requires<[FPStackf64]>; diff --git a/contrib/llvm/lib/Target/X86/X86InstrFormats.td b/contrib/llvm/lib/Target/X86/X86InstrFormats.td new file mode 100644 index 0000000..e2fa295 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86InstrFormats.td @@ -0,0 +1,948 @@ +//===-- X86InstrFormats.td - X86 Instruction Formats -------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// X86 Instruction Format Definitions. +// + +// Format specifies the encoding used by the instruction. This is part of the +// ad-hoc solution used to emit machine instruction encodings by our machine +// code emitter. 
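+// For illustration: the MRM0r-MRM7r and MRM0m-MRM7m forms name the /0../7
+// opcode extension carried in the reg field of the ModRM byte, with a register
+// or a memory r/m operand respectively. E.g. "fld m32fp" is encoded as D9 /0,
+// which is why the FP-stack file above has
+//   def LD_F32m : FPI<0xD9, MRM0m, (outs), (ins f32mem:$src), "fld{s}\t$src", IIC_FLD>;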
+class Format<bits<7> val> { + bits<7> Value = val; +} + +def Pseudo : Format<0>; def RawFrm : Format<1>; +def AddRegFrm : Format<2>; def MRMDestReg : Format<3>; +def MRMDestMem : Format<4>; def MRMSrcReg : Format<5>; +def MRMSrcMem : Format<6>; def RawFrmMemOffs : Format<7>; +def RawFrmSrc : Format<8>; def RawFrmDst : Format<9>; +def RawFrmDstSrc: Format<10>; +def RawFrmImm8 : Format<11>; +def RawFrmImm16 : Format<12>; +def MRMXr : Format<14>; def MRMXm : Format<15>; +def MRM0r : Format<16>; def MRM1r : Format<17>; def MRM2r : Format<18>; +def MRM3r : Format<19>; def MRM4r : Format<20>; def MRM5r : Format<21>; +def MRM6r : Format<22>; def MRM7r : Format<23>; +def MRM0m : Format<24>; def MRM1m : Format<25>; def MRM2m : Format<26>; +def MRM3m : Format<27>; def MRM4m : Format<28>; def MRM5m : Format<29>; +def MRM6m : Format<30>; def MRM7m : Format<31>; +def MRM_C0 : Format<32>; def MRM_C1 : Format<33>; def MRM_C2 : Format<34>; +def MRM_C3 : Format<35>; def MRM_C4 : Format<36>; def MRM_C5 : Format<37>; +def MRM_C6 : Format<38>; def MRM_C7 : Format<39>; def MRM_C8 : Format<40>; +def MRM_C9 : Format<41>; def MRM_CA : Format<42>; def MRM_CB : Format<43>; +def MRM_CC : Format<44>; def MRM_CD : Format<45>; def MRM_CE : Format<46>; +def MRM_CF : Format<47>; def MRM_D0 : Format<48>; def MRM_D1 : Format<49>; +def MRM_D2 : Format<50>; def MRM_D3 : Format<51>; def MRM_D4 : Format<52>; +def MRM_D5 : Format<53>; def MRM_D6 : Format<54>; def MRM_D7 : Format<55>; +def MRM_D8 : Format<56>; def MRM_D9 : Format<57>; def MRM_DA : Format<58>; +def MRM_DB : Format<59>; def MRM_DC : Format<60>; def MRM_DD : Format<61>; +def MRM_DE : Format<62>; def MRM_DF : Format<63>; def MRM_E0 : Format<64>; +def MRM_E1 : Format<65>; def MRM_E2 : Format<66>; def MRM_E3 : Format<67>; +def MRM_E4 : Format<68>; def MRM_E5 : Format<69>; def MRM_E6 : Format<70>; +def MRM_E7 : Format<71>; def MRM_E8 : Format<72>; def MRM_E9 : Format<73>; +def MRM_EA : Format<74>; def MRM_EB : Format<75>; def MRM_EC : Format<76>; +def MRM_ED : Format<77>; def MRM_EE : Format<78>; def MRM_EF : Format<79>; +def MRM_F0 : Format<80>; def MRM_F1 : Format<81>; def MRM_F2 : Format<82>; +def MRM_F3 : Format<83>; def MRM_F4 : Format<84>; def MRM_F5 : Format<85>; +def MRM_F6 : Format<86>; def MRM_F7 : Format<87>; def MRM_F8 : Format<88>; +def MRM_F9 : Format<89>; def MRM_FA : Format<90>; def MRM_FB : Format<91>; +def MRM_FC : Format<92>; def MRM_FD : Format<93>; def MRM_FE : Format<94>; +def MRM_FF : Format<95>; + +// ImmType - This specifies the immediate type used by an instruction. This is +// part of the ad-hoc solution used to emit machine instruction encodings by our +// machine code emitter. +class ImmType<bits<4> val> { + bits<4> Value = val; +} +def NoImm : ImmType<0>; +def Imm8 : ImmType<1>; +def Imm8PCRel : ImmType<2>; +def Imm16 : ImmType<3>; +def Imm16PCRel : ImmType<4>; +def Imm32 : ImmType<5>; +def Imm32PCRel : ImmType<6>; +def Imm32S : ImmType<7>; +def Imm64 : ImmType<8>; + +// FPFormat - This specifies what form this FP instruction has. This is used by +// the Floating-Point stackifier pass. +class FPFormat<bits<3> val> { + bits<3> Value = val; +} +def NotFP : FPFormat<0>; +def ZeroArgFP : FPFormat<1>; +def OneArgFP : FPFormat<2>; +def OneArgFPRW : FPFormat<3>; +def TwoArgFP : FPFormat<4>; +def CompareFP : FPFormat<5>; +def CondMovFP : FPFormat<6>; +def SpecialFP : FPFormat<7>; + +// Class specifying the SSE execution domain, used by the SSEDomainFix pass. +// Keep in sync with tables in X86InstrInfo.cpp. 
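+// For illustration: the domain chosen here is stored in ExeDomain and packed
+// into TSFlags{28-27} in X86Inst below, which is what lets the domain-fix pass
+// swap an instruction for an equivalent one in another domain when that avoids
+// a domain-crossing penalty. Conceptually, a register-to-register copy has
+// equivalent encodings in several domains:
+//   movaps %xmm0, %xmm1   # SSEPackedSingle
+//   movdqa %xmm0, %xmm1   # SSEPackedInt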
+class Domain<bits<2> val> { + bits<2> Value = val; +} +def GenericDomain : Domain<0>; +def SSEPackedSingle : Domain<1>; +def SSEPackedDouble : Domain<2>; +def SSEPackedInt : Domain<3>; + +// Class specifying the vector form of the decompressed +// displacement of 8-bit. +class CD8VForm<bits<3> val> { + bits<3> Value = val; +} +def CD8VF : CD8VForm<0>; // v := VL +def CD8VH : CD8VForm<1>; // v := VL/2 +def CD8VQ : CD8VForm<2>; // v := VL/4 +def CD8VO : CD8VForm<3>; // v := VL/8 +// The tuple (subvector) forms. +def CD8VT1 : CD8VForm<4>; // v := 1 +def CD8VT2 : CD8VForm<5>; // v := 2 +def CD8VT4 : CD8VForm<6>; // v := 4 +def CD8VT8 : CD8VForm<7>; // v := 8 + +// Class specifying the prefix used an opcode extension. +class Prefix<bits<3> val> { + bits<3> Value = val; +} +def NoPrfx : Prefix<0>; +def PS : Prefix<1>; +def PD : Prefix<2>; +def XS : Prefix<3>; +def XD : Prefix<4>; + +// Class specifying the opcode map. +class Map<bits<3> val> { + bits<3> Value = val; +} +def OB : Map<0>; +def TB : Map<1>; +def T8 : Map<2>; +def TA : Map<3>; +def XOP8 : Map<4>; +def XOP9 : Map<5>; +def XOPA : Map<6>; + +// Class specifying the encoding +class Encoding<bits<2> val> { + bits<2> Value = val; +} +def EncNormal : Encoding<0>; +def EncVEX : Encoding<1>; +def EncXOP : Encoding<2>; +def EncEVEX : Encoding<3>; + +// Operand size for encodings that change based on mode. +class OperandSize<bits<2> val> { + bits<2> Value = val; +} +def OpSizeFixed : OperandSize<0>; // Never needs a 0x66 prefix. +def OpSize16 : OperandSize<1>; // Needs 0x66 prefix in 32-bit mode. +def OpSize32 : OperandSize<2>; // Needs 0x66 prefix in 16-bit mode. + +// Address size for encodings that change based on mode. +class AddressSize<bits<2> val> { + bits<2> Value = val; +} +def AdSizeX : AddressSize<0>; // Address size determined using addr operand. +def AdSize16 : AddressSize<1>; // Encodes a 16-bit address. +def AdSize32 : AddressSize<2>; // Encodes a 32-bit address. +def AdSize64 : AddressSize<3>; // Encodes a 64-bit address. + +// Prefix byte classes which are used to indicate to the ad-hoc machine code +// emitter that various prefix bytes are required. 
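+// For illustration, these mixins are appended to instruction defs to fill in
+// the prefix-related fields of X86Inst below; a hypothetical
+//   def FOOrr : I<0xNN, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), "foo\t...", []>, T8PD, REX_W;
+// (placeholder opcode and mnemonic) ends up with OpMap = T8, OpPrefix = PD and
+// hasREX_WPrefix = 1.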
+class OpSize16 { OperandSize OpSize = OpSize16; } +class OpSize32 { OperandSize OpSize = OpSize32; } +class AdSize16 { AddressSize AdSize = AdSize16; } +class AdSize32 { AddressSize AdSize = AdSize32; } +class AdSize64 { AddressSize AdSize = AdSize64; } +class REX_W { bit hasREX_WPrefix = 1; } +class LOCK { bit hasLockPrefix = 1; } +class REP { bit hasREPPrefix = 1; } +class TB { Map OpMap = TB; } +class T8 { Map OpMap = T8; } +class TA { Map OpMap = TA; } +class XOP8 { Map OpMap = XOP8; Prefix OpPrefix = PS; } +class XOP9 { Map OpMap = XOP9; Prefix OpPrefix = PS; } +class XOPA { Map OpMap = XOPA; Prefix OpPrefix = PS; } +class OBXS { Prefix OpPrefix = XS; } +class PS : TB { Prefix OpPrefix = PS; } +class PD : TB { Prefix OpPrefix = PD; } +class XD : TB { Prefix OpPrefix = XD; } +class XS : TB { Prefix OpPrefix = XS; } +class T8PS : T8 { Prefix OpPrefix = PS; } +class T8PD : T8 { Prefix OpPrefix = PD; } +class T8XD : T8 { Prefix OpPrefix = XD; } +class T8XS : T8 { Prefix OpPrefix = XS; } +class TAPS : TA { Prefix OpPrefix = PS; } +class TAPD : TA { Prefix OpPrefix = PD; } +class TAXD : TA { Prefix OpPrefix = XD; } +class VEX { Encoding OpEnc = EncVEX; } +class VEX_W { bit hasVEX_WPrefix = 1; } +class VEX_4V : VEX { bit hasVEX_4V = 1; } +class VEX_4VOp3 : VEX { bit hasVEX_4VOp3 = 1; } +class VEX_I8IMM { bit hasVEX_i8ImmReg = 1; } +class VEX_L { bit hasVEX_L = 1; } +class VEX_LIG { bit ignoresVEX_L = 1; } +class EVEX : VEX { Encoding OpEnc = EncEVEX; } +class EVEX_4V : VEX_4V { Encoding OpEnc = EncEVEX; } +class EVEX_K { bit hasEVEX_K = 1; } +class EVEX_KZ : EVEX_K { bit hasEVEX_Z = 1; } +class EVEX_B { bit hasEVEX_B = 1; } +class EVEX_RC { bit hasEVEX_RC = 1; } +class EVEX_V512 { bit hasEVEX_L2 = 1; bit hasVEX_L = 0; } +class EVEX_V256 { bit hasEVEX_L2 = 0; bit hasVEX_L = 1; } +class EVEX_V128 { bit hasEVEX_L2 = 0; bit hasVEX_L = 0; } + +// Specify AVX512 8-bit compressed displacement encoding based on the vector +// element size in bits (8, 16, 32, 64) and the CDisp8 form. +class EVEX_CD8<int esize, CD8VForm form> { + int CD8_EltSize = !srl(esize, 3); + bits<3> CD8_Form = form.Value; +} + +class Has3DNow0F0FOpcode { bit has3DNow0F0FOpcode = 1; } +class MemOp4 { bit hasMemOp4Prefix = 1; } +class XOP { Encoding OpEnc = EncXOP; } +class XOP_4V : XOP { bit hasVEX_4V = 1; } +class XOP_4VOp3 : XOP { bit hasVEX_4VOp3 = 1; } + +class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins, + string AsmStr, + InstrItinClass itin, + Domain d = GenericDomain> + : Instruction { + let Namespace = "X86"; + + bits<8> Opcode = opcod; + Format Form = f; + bits<7> FormBits = Form.Value; + ImmType ImmT = i; + + dag OutOperandList = outs; + dag InOperandList = ins; + string AsmString = AsmStr; + + // If this is a pseudo instruction, mark it isCodeGenOnly. + let isCodeGenOnly = !eq(!cast<string>(f), "Pseudo"); + + let Itinerary = itin; + + // + // Attributes specific to X86 instructions... + // + bit ForceDisassemble = 0; // Force instruction to disassemble even though it's + // isCodeGenonly. Needed to hide an ambiguous + // AsmString from the parser, but still disassemble. + + OperandSize OpSize = OpSizeFixed; // Does this instruction's encoding change + // based on operand size of the mode? + bits<2> OpSizeBits = OpSize.Value; + AddressSize AdSize = AdSizeX; // Does this instruction's encoding change + // based on address size of the mode? + bits<2> AdSizeBits = AdSize.Value; + + Prefix OpPrefix = NoPrfx; // Which prefix byte does this inst have? 
+ bits<3> OpPrefixBits = OpPrefix.Value; + Map OpMap = OB; // Which opcode map does this inst have? + bits<3> OpMapBits = OpMap.Value; + bit hasREX_WPrefix = 0; // Does this inst require the REX.W prefix? + FPFormat FPForm = NotFP; // What flavor of FP instruction is this? + bit hasLockPrefix = 0; // Does this inst have a 0xF0 prefix? + Domain ExeDomain = d; + bit hasREPPrefix = 0; // Does this inst have a REP prefix? + Encoding OpEnc = EncNormal; // Encoding used by this instruction + bits<2> OpEncBits = OpEnc.Value; + bit hasVEX_WPrefix = 0; // Does this inst set the VEX_W field? + bit hasVEX_4V = 0; // Does this inst require the VEX.VVVV field? + bit hasVEX_4VOp3 = 0; // Does this inst require the VEX.VVVV field to + // encode the third operand? + bit hasVEX_i8ImmReg = 0; // Does this inst require the last source register + // to be encoded in a immediate field? + bit hasVEX_L = 0; // Does this inst use large (256-bit) registers? + bit ignoresVEX_L = 0; // Does this instruction ignore the L-bit + bit hasEVEX_K = 0; // Does this inst require masking? + bit hasEVEX_Z = 0; // Does this inst set the EVEX_Z field? + bit hasEVEX_L2 = 0; // Does this inst set the EVEX_L2 field? + bit hasEVEX_B = 0; // Does this inst set the EVEX_B field? + bits<3> CD8_Form = 0; // Compressed disp8 form - vector-width. + // Declare it int rather than bits<4> so that all bits are defined when + // assigning to bits<7>. + int CD8_EltSize = 0; // Compressed disp8 form - element-size in bytes. + bit has3DNow0F0FOpcode =0;// Wacky 3dNow! encoding? + bit hasMemOp4Prefix = 0; // Same bit as VEX_W, but used for swapping operands + bit hasEVEX_RC = 0; // Explicitly specified rounding control in FP instruction. + + bits<2> EVEX_LL; + let EVEX_LL{0} = hasVEX_L; + let EVEX_LL{1} = hasEVEX_L2; + // Vector size in bytes. + bits<7> VectSize = !shl(16, EVEX_LL); + + // The scaling factor for AVX512's compressed displacement is either + // - the size of a power-of-two number of elements or + // - the size of a single element for broadcasts or + // - the total vector size divided by a power-of-two number. + // Possible values are: 0 (non-AVX512 inst), 1, 2, 4, 8, 16, 32 and 64. + bits<7> CD8_Scale = !if (!eq (OpEnc.Value, EncEVEX.Value), + !if (CD8_Form{2}, + !shl(CD8_EltSize, CD8_Form{1-0}), + !if (hasEVEX_B, + CD8_EltSize, + !srl(VectSize, CD8_Form{1-0}))), 0); + + // TSFlags layout should be kept in sync with X86BaseInfo.h. + let TSFlags{6-0} = FormBits; + let TSFlags{8-7} = OpSizeBits; + let TSFlags{10-9} = AdSizeBits; + let TSFlags{13-11} = OpPrefixBits; + let TSFlags{16-14} = OpMapBits; + let TSFlags{17} = hasREX_WPrefix; + let TSFlags{21-18} = ImmT.Value; + let TSFlags{24-22} = FPForm.Value; + let TSFlags{25} = hasLockPrefix; + let TSFlags{26} = hasREPPrefix; + let TSFlags{28-27} = ExeDomain.Value; + let TSFlags{30-29} = OpEncBits; + let TSFlags{38-31} = Opcode; + let TSFlags{39} = hasVEX_WPrefix; + let TSFlags{40} = hasVEX_4V; + let TSFlags{41} = hasVEX_4VOp3; + let TSFlags{42} = hasVEX_i8ImmReg; + let TSFlags{43} = hasVEX_L; + let TSFlags{44} = ignoresVEX_L; + let TSFlags{45} = hasEVEX_K; + let TSFlags{46} = hasEVEX_Z; + let TSFlags{47} = hasEVEX_L2; + let TSFlags{48} = hasEVEX_B; + // If we run out of TSFlags bits, it's possible to encode this in 3 bits. 
+ let TSFlags{55-49} = CD8_Scale; + let TSFlags{56} = has3DNow0F0FOpcode; + let TSFlags{57} = hasMemOp4Prefix; + let TSFlags{58} = hasEVEX_RC; +} + +class PseudoI<dag oops, dag iops, list<dag> pattern> + : X86Inst<0, Pseudo, NoImm, oops, iops, "", NoItinerary> { + let Pattern = pattern; +} + +class I<bits<8> o, Format f, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary, + Domain d = GenericDomain> + : X86Inst<o, f, NoImm, outs, ins, asm, itin, d> { + let Pattern = pattern; + let CodeSize = 3; +} +class Ii8 <bits<8> o, Format f, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary, + Domain d = GenericDomain> + : X86Inst<o, f, Imm8, outs, ins, asm, itin, d> { + let Pattern = pattern; + let CodeSize = 3; +} +class Ii8PCRel<bits<8> o, Format f, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : X86Inst<o, f, Imm8PCRel, outs, ins, asm, itin> { + let Pattern = pattern; + let CodeSize = 3; +} +class Ii16<bits<8> o, Format f, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : X86Inst<o, f, Imm16, outs, ins, asm, itin> { + let Pattern = pattern; + let CodeSize = 3; +} +class Ii32<bits<8> o, Format f, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : X86Inst<o, f, Imm32, outs, ins, asm, itin> { + let Pattern = pattern; + let CodeSize = 3; +} +class Ii32S<bits<8> o, Format f, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : X86Inst<o, f, Imm32S, outs, ins, asm, itin> { + let Pattern = pattern; + let CodeSize = 3; +} + +class Ii16PCRel<bits<8> o, Format f, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : X86Inst<o, f, Imm16PCRel, outs, ins, asm, itin> { + let Pattern = pattern; + let CodeSize = 3; +} + +class Ii32PCRel<bits<8> o, Format f, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : X86Inst<o, f, Imm32PCRel, outs, ins, asm, itin> { + let Pattern = pattern; + let CodeSize = 3; +} + +// FPStack Instruction Templates: +// FPI - Floating Point Instruction template. +class FPI<bits<8> o, Format F, dag outs, dag ins, string asm, + InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, [], itin> {} + +// FpI_ - Floating Point Pseudo Instruction template. Not Predicated. 
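+// For illustration: FpI_-derived pseudos (the _Fp32/_Fp64/_Fp80 defs in the
+// FP-stack file) operate on the RFP32/RFP64/RFP80 register classes and carry
+// the selection patterns; their FPForm value is what the FP stackifier uses
+// when it later rewrites them onto the physical ST(i) stack as real FPI
+// instructions. A hypothetical one-operand pseudo would be
+//   def MYOP_Fp32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src), OneArgFPRW, []>;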
+class FpI_<dag outs, dag ins, FPFormat fp, list<dag> pattern, + InstrItinClass itin = NoItinerary> + : X86Inst<0, Pseudo, NoImm, outs, ins, "", itin> { + let FPForm = fp; + let Pattern = pattern; +} + +// Templates for instructions that use a 16- or 32-bit segmented address as +// their only operand: lcall (FAR CALL) and ljmp (FAR JMP) +// +// Iseg16 - 16-bit segment selector, 16-bit offset +// Iseg32 - 16-bit segment selector, 32-bit offset + +class Iseg16 <bits<8> o, Format f, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : X86Inst<o, f, Imm16, outs, ins, asm, itin> { + let Pattern = pattern; + let CodeSize = 3; +} + +class Iseg32 <bits<8> o, Format f, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : X86Inst<o, f, Imm32, outs, ins, asm, itin> { + let Pattern = pattern; + let CodeSize = 3; +} + +// SI - SSE 1 & 2 scalar instructions +class SI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary, + Domain d = GenericDomain> + : I<o, F, outs, ins, asm, pattern, itin, d> { + let Predicates = !if(!eq(OpEnc.Value, EncEVEX.Value), [HasAVX512], + !if(!eq(OpEnc.Value, EncVEX.Value), [UseAVX], + !if(!eq(OpPrefix.Value, XS.Value), [UseSSE1], + !if(!eq(OpPrefix.Value, XD.Value), [UseSSE2], + !if(!eq(OpPrefix.Value, PD.Value), [UseSSE2], + [UseSSE1]))))); + + // AVX instructions have a 'v' prefix in the mnemonic + let AsmString = !if(!eq(OpEnc.Value, EncEVEX.Value), !strconcat("v", asm), + !if(!eq(OpEnc.Value, EncVEX.Value), !strconcat("v", asm), + asm)); +} + +// SI - SSE 1 & 2 scalar intrinsics - vex form available on AVX512 +class SI_Int<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary, + Domain d = GenericDomain> + : I<o, F, outs, ins, asm, pattern, itin, d> { + let Predicates = !if(!eq(OpEnc.Value, EncEVEX.Value), [HasAVX512], + !if(!eq(OpEnc.Value, EncVEX.Value), [HasAVX], + !if(!eq(OpPrefix.Value, XS.Value), [UseSSE1], + !if(!eq(OpPrefix.Value, XD.Value), [UseSSE2], + !if(!eq(OpPrefix.Value, PD.Value), [UseSSE2], + [UseSSE1]))))); + + // AVX instructions have a 'v' prefix in the mnemonic + let AsmString = !if(!eq(OpEnc.Value, EncEVEX.Value), !strconcat("v", asm), + !if(!eq(OpEnc.Value, EncVEX.Value), !strconcat("v", asm), + asm)); +} +// SIi8 - SSE 1 & 2 scalar instructions - vex form available on AVX512 +class SIi8<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : Ii8<o, F, outs, ins, asm, pattern, itin> { + let Predicates = !if(!eq(OpEnc.Value, EncEVEX.Value), [HasAVX512], + !if(!eq(OpEnc.Value, EncVEX.Value), [HasAVX], + !if(!eq(OpPrefix.Value, XS.Value), [UseSSE1], + [UseSSE2]))); + + // AVX instructions have a 'v' prefix in the mnemonic + let AsmString = !if(!eq(OpEnc.Value, EncEVEX.Value), !strconcat("v", asm), + !if(!eq(OpEnc.Value, EncVEX.Value), !strconcat("v", asm), + asm)); +} + +// PI - SSE 1 & 2 packed instructions +class PI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, + InstrItinClass itin, Domain d> + : I<o, F, outs, ins, asm, pattern, itin, d> { + let Predicates = !if(!eq(OpEnc.Value, EncEVEX.Value), [HasAVX512], + !if(!eq(OpEnc.Value, EncVEX.Value), [HasAVX], + !if(!eq(OpPrefix.Value, PD.Value), [UseSSE2], + [UseSSE1]))); + + // AVX instructions have a 'v' prefix in the mnemonic + let AsmString = !if(!eq(OpEnc.Value, EncEVEX.Value), !strconcat("v", asm), + !if(!eq(OpEnc.Value, EncVEX.Value), 
!strconcat("v", asm), + asm)); +} + +// MMXPI - SSE 1 & 2 packed instructions with MMX operands +class MMXPI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, + InstrItinClass itin, Domain d> + : I<o, F, outs, ins, asm, pattern, itin, d> { + let Predicates = !if(!eq(OpPrefix.Value, PD.Value), [HasSSE2], + [HasSSE1]); +} + +// PIi8 - SSE 1 & 2 packed instructions with immediate +class PIi8<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin, Domain d> + : Ii8<o, F, outs, ins, asm, pattern, itin, d> { + let Predicates = !if(!eq(OpEnc.Value, EncEVEX.Value), [HasAVX512], + !if(!eq(OpEnc.Value, EncVEX.Value), [HasAVX], + !if(!eq(OpPrefix.Value, PD.Value), [UseSSE2], + [UseSSE1]))); + + // AVX instructions have a 'v' prefix in the mnemonic + let AsmString = !if(!eq(OpEnc.Value, EncEVEX.Value), !strconcat("v", asm), + !if(!eq(OpEnc.Value, EncVEX.Value), !strconcat("v", asm), + asm)); +} + +// SSE1 Instruction Templates: +// +// SSI - SSE1 instructions with XS prefix. +// PSI - SSE1 instructions with PS prefix. +// PSIi8 - SSE1 instructions with ImmT == Imm8 and PS prefix. +// VSSI - SSE1 instructions with XS prefix in AVX form. +// VPSI - SSE1 instructions with PS prefix in AVX form, packed single. + +class SSI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[UseSSE1]>; +class SSIi8<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : Ii8<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[UseSSE1]>; +class PSI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin, SSEPackedSingle>, PS, + Requires<[UseSSE1]>; +class PSIi8<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedSingle>, PS, + Requires<[UseSSE1]>; +class VSSI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin>, XS, + Requires<[HasAVX]>; +class VPSI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin, SSEPackedSingle>, PS, + Requires<[HasAVX]>; + +// SSE2 Instruction Templates: +// +// SDI - SSE2 instructions with XD prefix. +// SDIi8 - SSE2 instructions with ImmT == Imm8 and XD prefix. +// S2SI - SSE2 instructions with XS prefix. +// SSDIi8 - SSE2 instructions with ImmT == Imm8 and XS prefix. +// PDI - SSE2 instructions with PD prefix, packed double domain. +// PDIi8 - SSE2 instructions with ImmT == Imm8 and PD prefix. +// VSDI - SSE2 scalar instructions with XD prefix in AVX form. +// VPDI - SSE2 vector instructions with PD prefix in AVX form, +// packed double domain. +// VS2I - SSE2 scalar instructions with PD prefix in AVX form. +// S2I - SSE2 scalar instructions with PD prefix. +// MMXSDIi8 - SSE2 instructions with ImmT == Imm8 and XD prefix as well as +// MMX operands. +// MMXSSDIi8 - SSE2 instructions with ImmT == Imm8 and XS prefix as well as +// MMX operands. 
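+// For illustration (a sketch, not how X86InstrSSE.td actually spells such
+// defs): real SSE2 scalar ops such as addsd are declared through SDI, which
+// supplies the XD (0xF2) prefix and the UseSSE2 predicate; a hypothetical
+//   def FOOSDrr : SDI<0xNN, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src1, FR64:$src2),
+//                     "foosd\t{$src2, $dst|$dst, $src2}", []>;
+// (placeholder opcode and mnemonic) would pick both up automatically.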
+ +class SDI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin>, XD, Requires<[UseSSE2]>; +class SDIi8<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : Ii8<o, F, outs, ins, asm, pattern, itin>, XD, Requires<[UseSSE2]>; +class S2SI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[UseSSE2]>; +class S2SIi8<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : Ii8<o, F, outs, ins, asm, pattern>, XS, Requires<[UseSSE2]>; +class PDI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, PD, + Requires<[UseSSE2]>; +class PDIi8<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, PD, + Requires<[UseSSE2]>; +class VSDI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin>, XD, + Requires<[UseAVX]>; +class VS2SI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin>, XS, + Requires<[HasAVX]>; +class VPDI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin, SSEPackedDouble>, + PD, Requires<[HasAVX]>; +class VS2I<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin>, PD, + Requires<[UseAVX]>; +class S2I<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin>, PD, Requires<[UseSSE2]>; +class MMXSDIi8<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : Ii8<o, F, outs, ins, asm, pattern, itin>, XD, Requires<[HasSSE2]>; +class MMXS2SIi8<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : Ii8<o, F, outs, ins, asm, pattern>, XS, Requires<[HasSSE2]>; + +// SSE3 Instruction Templates: +// +// S3I - SSE3 instructions with PD prefixes. +// S3SI - SSE3 instructions with XS prefix. +// S3DI - SSE3 instructions with XD prefix. + +class S3SI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin, SSEPackedSingle>, XS, + Requires<[UseSSE3]>; +class S3DI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, XD, + Requires<[UseSSE3]>; +class S3I<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, PD, + Requires<[UseSSE3]>; + + +// SSSE3 Instruction Templates: +// +// SS38I - SSSE3 instructions with T8 prefix. +// SS3AI - SSSE3 instructions with TA prefix. 
+// MMXSS38I - SSSE3 instructions with T8 prefix and MMX operands. +// MMXSS3AI - SSSE3 instructions with TA prefix and MMX operands. +// +// Note: SSSE3 instructions have 64-bit and 128-bit versions. The 64-bit version +// uses the MMX registers. The 64-bit versions are grouped with the MMX +// classes. They need to be enabled even if AVX is enabled. + +class SS38I<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8PD, + Requires<[UseSSSE3]>; +class SS3AI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD, + Requires<[UseSSSE3]>; +class MMXSS38I<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8PS, + Requires<[HasSSSE3]>; +class MMXSS3AI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPS, + Requires<[HasSSSE3]>; + +// SSE4.1 Instruction Templates: +// +// SS48I - SSE 4.1 instructions with T8 prefix. +// SS41AIi8 - SSE 4.1 instructions with TA prefix and ImmT == Imm8. +// +class SS48I<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8PD, + Requires<[UseSSE41]>; +class SS4AIi8<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD, + Requires<[UseSSE41]>; + +// SSE4.2 Instruction Templates: +// +// SS428I - SSE 4.2 instructions with T8 prefix. +class SS428I<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8PD, + Requires<[UseSSE42]>; + +// SS42FI - SSE 4.2 instructions with T8XD prefix. +// NOTE: 'HasSSE42' is used as SS42FI is only used for CRC32 insns. +class SS42FI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin>, T8XD, Requires<[HasSSE42]>; + +// SS42AI = SSE 4.2 instructions with TA prefix +class SS42AI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD, + Requires<[UseSSE42]>; + +// AVX Instruction Templates: +// Instructions introduced in AVX (no SSE equivalent forms) +// +// AVX8I - AVX instructions with T8PD prefix. +// AVXAIi8 - AVX instructions with TAPD prefix and ImmT = Imm8. +class AVX8I<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8PD, + Requires<[HasAVX]>; +class AVXAIi8<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD, + Requires<[HasAVX]>; + +// AVX2 Instruction Templates: +// Instructions introduced in AVX2 (no SSE equivalent forms) +// +// AVX28I - AVX2 instructions with T8PD prefix. +// AVX2AIi8 - AVX2 instructions with TAPD prefix and ImmT = Imm8. 
+class AVX28I<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8PD, + Requires<[HasAVX2]>; +class AVX2AIi8<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD, + Requires<[HasAVX2]>; + + +// AVX-512 Instruction Templates: +// Instructions introduced in AVX-512 (no SSE equivalent forms) +// +// AVX5128I - AVX-512 instructions with T8PD prefix. +// AVX512AIi8 - AVX-512 instructions with TAPD prefix and ImmT = Imm8. +// AVX512PDI - AVX-512 instructions with PD, double packed. +// AVX512PSI - AVX-512 instructions with PS, single packed. +// AVX512XS8I - AVX-512 instructions with T8 and XS prefixes. +// AVX512XSI - AVX-512 instructions with XS prefix, generic domain. +// AVX512BI - AVX-512 instructions with PD, int packed domain. +// AVX512SI - AVX-512 scalar instructions with PD prefix. + +class AVX5128I<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8PD, + Requires<[HasAVX512]>; +class AVX5128IBase : T8PD { + Domain ExeDomain = SSEPackedInt; +} +class AVX512XS8I<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8XS, + Requires<[HasAVX512]>; +class AVX512XSI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin>, XS, + Requires<[HasAVX512]>; +class AVX512XDI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, XD, + Requires<[HasAVX512]>; +class AVX512BI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, PD, + Requires<[HasAVX512]>; +class AVX512BIBase : PD { + Domain ExeDomain = SSEPackedInt; +} +class AVX512BIi8<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, PD, + Requires<[HasAVX512]>; +class AVX512BIi8Base : PD { + Domain ExeDomain = SSEPackedInt; + ImmType ImmT = Imm8; +} +class AVX512XSIi8Base : XS { + Domain ExeDomain = SSEPackedInt; + ImmType ImmT = Imm8; +} +class AVX512XDIi8Base : XD { + Domain ExeDomain = SSEPackedInt; + ImmType ImmT = Imm8; +} +class AVX512PSIi8Base : PS { + Domain ExeDomain = SSEPackedSingle; + ImmType ImmT = Imm8; +} +class AVX512PDIi8Base : PD { + Domain ExeDomain = SSEPackedDouble; + ImmType ImmT = Imm8; +} +class AVX512AIi8<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD, + Requires<[HasAVX512]>; +class AVX512AIi8Base : TAPD { + Domain ExeDomain = SSEPackedInt; + ImmType ImmT = Imm8; +} +class AVX512Ii8<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, + Requires<[HasAVX512]>; +class AVX512PDI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, 
ins, asm, pattern, itin, SSEPackedDouble>, PD, + Requires<[HasAVX512]>; +class AVX512PSI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin, SSEPackedSingle>, PS, + Requires<[HasAVX512]>; +class AVX512PIi8<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, Domain d, InstrItinClass itin = NoItinerary> + : Ii8<o, F, outs, ins, asm, pattern, itin, d>, Requires<[HasAVX512]>; +class AVX512PI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, Domain d, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin, d>, Requires<[HasAVX512]>; +class AVX512FMA3<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag>pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin>, T8PD, + EVEX_4V, Requires<[HasAVX512]>; +class AVX512FMA3Base : T8PD, EVEX_4V; + +class AVX512<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag>pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin>, Requires<[HasAVX512]>; + +// AES Instruction Templates: +// +// AES8I +// These use the same encoding as the SSE4.2 T8 and TA encodings. +class AES8I<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag>pattern, InstrItinClass itin = IIC_AES> + : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8PD, + Requires<[HasAES]>; + +class AESAI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD, + Requires<[HasAES]>; + +// PCLMUL Instruction Templates +class PCLMULIi8<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag>pattern, InstrItinClass itin = NoItinerary> + : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD, + Requires<[HasPCLMUL]>; + +class AVXPCLMULIi8<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag>pattern, InstrItinClass itin = NoItinerary> + : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD, + VEX_4V, Requires<[HasAVX, HasPCLMUL]>; + +// FMA3 Instruction Templates +class FMA3<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag>pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin>, T8PD, + VEX_4V, FMASC, Requires<[HasFMA]>; + +// FMA4 Instruction Templates +class FMA4<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag>pattern, InstrItinClass itin = NoItinerary> + : Ii8<o, F, outs, ins, asm, pattern, itin>, TAPD, + VEX_4V, VEX_I8IMM, FMASC, Requires<[HasFMA4]>; + +// XOP 2, 3 and 4 Operand Instruction Template +class IXOP<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, + XOP9, Requires<[HasXOP]>; + +// XOP 2, 3 and 4 Operand Instruction Templates with imm byte +class IXOPi8<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, + XOP8, Requires<[HasXOP]>; + +// XOP 5 operand instruction (VEX encoding!) +class IXOP5<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag>pattern, InstrItinClass itin = NoItinerary> + : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD, + VEX_4V, VEX_I8IMM, Requires<[HasXOP]>; + +// X86-64 Instruction templates... 
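+// For illustration: the R* templates below are simply their base templates
+// with the REX_W mixin, i.e. they force the REX.W prefix that selects 64-bit
+// operand size. A hypothetical 64-bit register-register op would be
+//   def FOO64rr : RI<0xNN, MRMDestReg, (outs GR64:$dst),
+//                    (ins GR64:$src1, GR64:$src2), "foo{q}\t...", []>;
+// (placeholder opcode and mnemonic).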
+// + +class RI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin>, REX_W; +class RIi8 <bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : Ii8<o, F, outs, ins, asm, pattern, itin>, REX_W; +class RIi16 <bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : Ii16<o, F, outs, ins, asm, pattern, itin>, REX_W; +class RIi32 <bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : Ii32<o, F, outs, ins, asm, pattern, itin>, REX_W; +class RIi32S <bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : Ii32S<o, F, outs, ins, asm, pattern, itin>, REX_W; + +class RIi64<bits<8> o, Format f, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : X86Inst<o, f, Imm64, outs, ins, asm, itin>, REX_W { + let Pattern = pattern; + let CodeSize = 3; +} + +class RIi64_NOREX<bits<8> o, Format f, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : X86Inst<o, f, Imm64, outs, ins, asm, itin> { + let Pattern = pattern; + let CodeSize = 3; +} + +class RS2I<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : S2I<o, F, outs, ins, asm, pattern, itin>, REX_W; +class VRS2I<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : VS2I<o, F, outs, ins, asm, pattern, itin>, VEX_W; + +// MMX Instruction templates +// + +// MMXI - MMX instructions with TB prefix. +// MMXI32 - MMX instructions with TB prefix valid only in 32 bit mode. +// MMXI64 - MMX instructions with TB prefix valid only in 64 bit mode. +// MMX2I - MMX / SSE2 instructions with PD prefix. +// MMXIi8 - MMX instructions with ImmT == Imm8 and PS prefix. +// MMXIi8 - MMX instructions with ImmT == Imm8 and PS prefix. +// MMXID - MMX instructions with XD prefix. +// MMXIS - MMX instructions with XS prefix. 
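+// For illustration: MMXRI below is the PS-prefixed MMX template plus REX_W,
+// i.e. it is intended for forms that need a 64-bit operand size, such as the
+// moves between %mm registers and 64-bit GPRs (e.g. "movq %mm0, %rax").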
+class MMXI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin>, PS, Requires<[HasMMX]>; +class MMXI32<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin>, PS, Requires<[HasMMX,Not64BitMode]>; +class MMXI64<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin>, PS, Requires<[HasMMX,In64BitMode]>; +class MMXRI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin>, PS, REX_W, Requires<[HasMMX]>; +class MMX2I<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin>, PD, Requires<[HasMMX]>; +class MMXIi8<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : Ii8<o, F, outs, ins, asm, pattern, itin>, PS, Requires<[HasMMX]>; +class MMXID<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : Ii8<o, F, outs, ins, asm, pattern, itin>, XD, Requires<[HasMMX]>; +class MMXIS<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, InstrItinClass itin = NoItinerary> + : Ii8<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[HasMMX]>; diff --git a/contrib/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/contrib/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td new file mode 100644 index 0000000..829cedd --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -0,0 +1,1037 @@ +//===-- X86InstrFragmentsSIMD.td - x86 SIMD ISA ------------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file provides pattern fragments useful for SIMD instructions. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// MMX specific DAG Nodes. +//===----------------------------------------------------------------------===// + +// Low word of MMX to GPR. +def MMX_X86movd2w : SDNode<"X86ISD::MMX_MOVD2W", SDTypeProfile<1, 1, + [SDTCisVT<0, i32>, SDTCisVT<1, x86mmx>]>>; +// GPR to low word of MMX. +def MMX_X86movw2d : SDNode<"X86ISD::MMX_MOVW2D", SDTypeProfile<1, 1, + [SDTCisVT<0, x86mmx>, SDTCisVT<1, i32>]>>; + +//===----------------------------------------------------------------------===// +// MMX Pattern Fragments +//===----------------------------------------------------------------------===// + +def load_mmx : PatFrag<(ops node:$ptr), (x86mmx (load node:$ptr))>; +def load_mvmmx : PatFrag<(ops node:$ptr), + (x86mmx (MMX_X86movw2d (load node:$ptr)))>; +def bc_mmx : PatFrag<(ops node:$in), (x86mmx (bitconvert node:$in))>; + +//===----------------------------------------------------------------------===// +// SSE specific DAG Nodes. 
+//===----------------------------------------------------------------------===// + +def SDTX86VFCMP : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<1, 2>, + SDTCisFP<1>, SDTCisVT<3, i8>, + SDTCisVec<1>]>; +def SDTX86CmpTestSae : SDTypeProfile<1, 3, [SDTCisVT<0, i32>, + SDTCisSameAs<1, 2>, SDTCisInt<3>]>; + +def X86fmin : SDNode<"X86ISD::FMIN", SDTFPBinOp>; +def X86fmax : SDNode<"X86ISD::FMAX", SDTFPBinOp>; + +// Commutative and Associative FMIN and FMAX. +def X86fminc : SDNode<"X86ISD::FMINC", SDTFPBinOp, + [SDNPCommutative, SDNPAssociative]>; +def X86fmaxc : SDNode<"X86ISD::FMAXC", SDTFPBinOp, + [SDNPCommutative, SDNPAssociative]>; + +def X86fand : SDNode<"X86ISD::FAND", SDTFPBinOp, + [SDNPCommutative, SDNPAssociative]>; +def X86for : SDNode<"X86ISD::FOR", SDTFPBinOp, + [SDNPCommutative, SDNPAssociative]>; +def X86fxor : SDNode<"X86ISD::FXOR", SDTFPBinOp, + [SDNPCommutative, SDNPAssociative]>; +def X86fandn : SDNode<"X86ISD::FANDN", SDTFPBinOp, + [SDNPCommutative, SDNPAssociative]>; +def X86frsqrt : SDNode<"X86ISD::FRSQRT", SDTFPUnaryOp>; +def X86frcp : SDNode<"X86ISD::FRCP", SDTFPUnaryOp>; +def X86frsqrt14s: SDNode<"X86ISD::FRSQRT", SDTFPBinOp>; +def X86frcp14s : SDNode<"X86ISD::FRCP", SDTFPBinOp>; +def X86fgetsign: SDNode<"X86ISD::FGETSIGNx86",SDTFPToIntOp>; +def X86fhadd : SDNode<"X86ISD::FHADD", SDTFPBinOp>; +def X86fhsub : SDNode<"X86ISD::FHSUB", SDTFPBinOp>; +def X86hadd : SDNode<"X86ISD::HADD", SDTIntBinOp>; +def X86hsub : SDNode<"X86ISD::HSUB", SDTIntBinOp>; +def X86comi : SDNode<"X86ISD::COMI", SDTX86CmpTest>; +def X86comiSae : SDNode<"X86ISD::COMI", SDTX86CmpTestSae>; +def X86ucomi : SDNode<"X86ISD::UCOMI", SDTX86CmpTest>; +def X86ucomiSae: SDNode<"X86ISD::UCOMI", SDTX86CmpTestSae>; +def X86cmps : SDNode<"X86ISD::FSETCC", SDTX86Cmps>; +//def X86cmpsd : SDNode<"X86ISD::FSETCCsd", SDTX86Cmpsd>; +def X86cvtdq2pd: SDNode<"X86ISD::CVTDQ2PD", + SDTypeProfile<1, 1, [SDTCisVT<0, v2f64>, + SDTCisVT<1, v4i32>]>>; +def X86cvtudq2pd: SDNode<"X86ISD::CVTUDQ2PD", + SDTypeProfile<1, 1, [SDTCisVT<0, v2f64>, + SDTCisVT<1, v4i32>]>>; +def X86pshufb : SDNode<"X86ISD::PSHUFB", + SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i8>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>]>>; +def X86psadbw : SDNode<"X86ISD::PSADBW", + SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i64>, + SDTCVecEltisVT<1, i8>, + SDTCisSameSizeAs<0,1>, + SDTCisSameAs<1,2>]>>; +def X86dbpsadbw : SDNode<"X86ISD::DBPSADBW", + SDTypeProfile<1, 3, [SDTCVecEltisVT<0, i16>, + SDTCVecEltisVT<1, i8>, + SDTCisSameSizeAs<0,1>, + SDTCisSameAs<1,2>, SDTCisInt<3>]>>; +def X86andnp : SDNode<"X86ISD::ANDNP", + SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>]>>; +def X86psign : SDNode<"X86ISD::PSIGN", + SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>]>>; +def X86pextrb : SDNode<"X86ISD::PEXTRB", + SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisVT<1, v16i8>, + SDTCisPtrTy<2>]>>; +def X86pextrw : SDNode<"X86ISD::PEXTRW", + SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisVT<1, v8i16>, + SDTCisPtrTy<2>]>>; +def X86pinsrb : SDNode<"X86ISD::PINSRB", + SDTypeProfile<1, 3, [SDTCisVT<0, v16i8>, SDTCisSameAs<0,1>, + SDTCisVT<2, i32>, SDTCisPtrTy<3>]>>; +def X86pinsrw : SDNode<"X86ISD::PINSRW", + SDTypeProfile<1, 3, [SDTCisVT<0, v8i16>, SDTCisSameAs<0,1>, + SDTCisVT<2, i32>, SDTCisPtrTy<3>]>>; +def X86insertps : SDNode<"X86ISD::INSERTPS", + SDTypeProfile<1, 3, [SDTCisVT<0, v4f32>, SDTCisSameAs<0,1>, + SDTCisVT<2, v4f32>, SDTCisVT<3, i8>]>>; +def X86vzmovl : SDNode<"X86ISD::VZEXT_MOVL", + SDTypeProfile<1, 1, 
[SDTCisSameAs<0,1>]>>; + +def X86vzload : SDNode<"X86ISD::VZEXT_LOAD", SDTLoad, + [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; + +def X86vzext : SDNode<"X86ISD::VZEXT", + SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisInt<0>, SDTCisInt<1>, + SDTCisOpSmallerThanOp<1, 0>]>>; + +def X86vsext : SDNode<"X86ISD::VSEXT", + SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisInt<0>, SDTCisInt<1>, + SDTCisOpSmallerThanOp<1, 0>]>>; + +def SDTVtrunc : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisInt<0>, SDTCisInt<1>, + SDTCisOpSmallerThanOp<0, 1>]>; + +def X86vtrunc : SDNode<"X86ISD::VTRUNC", SDTVtrunc>; +def X86vtruncs : SDNode<"X86ISD::VTRUNCS", SDTVtrunc>; +def X86vtruncus : SDNode<"X86ISD::VTRUNCUS", SDTVtrunc>; + +def X86trunc : SDNode<"X86ISD::TRUNC", + SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisInt<1>, + SDTCisOpSmallerThanOp<0, 1>]>>; +def X86vfpext : SDNode<"X86ISD::VFPEXT", + SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisFP<0>, SDTCisFP<1>, + SDTCisOpSmallerThanOp<1, 0>]>>; +def X86vfpround: SDNode<"X86ISD::VFPROUND", + SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisFP<0>, SDTCisFP<1>, + SDTCisOpSmallerThanOp<0, 1>]>>; + +def X86fround: SDNode<"X86ISD::VFPROUND", + SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisFP<1>,SDTCisFP<2>, + SDTCVecEltisVT<0, f32>, + SDTCVecEltisVT<1, f64>, + SDTCVecEltisVT<2, f64>, + SDTCisOpSmallerThanOp<0, 1>]>>; +def X86froundRnd: SDNode<"X86ISD::VFPROUND", + SDTypeProfile<1, 3, [SDTCisFP<0>, SDTCisFP<1>,SDTCisFP<2>, + SDTCVecEltisVT<0, f32>, + SDTCVecEltisVT<1, f64>, + SDTCVecEltisVT<2, f64>, + SDTCisOpSmallerThanOp<0, 1>, + SDTCisInt<3>]>>; + +def X86fpext : SDNode<"X86ISD::VFPEXT", + SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisFP<1>,SDTCisFP<2>, + SDTCVecEltisVT<0, f64>, + SDTCVecEltisVT<1, f32>, + SDTCVecEltisVT<2, f32>, + SDTCisOpSmallerThanOp<1, 0>]>>; + +def X86fpextRnd : SDNode<"X86ISD::VFPEXT", + SDTypeProfile<1, 3, [SDTCisFP<0>, SDTCisFP<1>,SDTCisFP<2>, + SDTCVecEltisVT<0, f64>, + SDTCVecEltisVT<1, f32>, + SDTCVecEltisVT<2, f32>, + SDTCisOpSmallerThanOp<1, 0>, + SDTCisInt<3>]>>; + +def X86vshldq : SDNode<"X86ISD::VSHLDQ", SDTIntShiftOp>; +def X86vshrdq : SDNode<"X86ISD::VSRLDQ", SDTIntShiftOp>; +def X86cmpp : SDNode<"X86ISD::CMPP", SDTX86VFCMP>; +def X86pcmpeq : SDNode<"X86ISD::PCMPEQ", SDTIntBinOp, [SDNPCommutative]>; +def X86pcmpgt : SDNode<"X86ISD::PCMPGT", SDTIntBinOp>; + +def X86IntCmpMask : SDTypeProfile<1, 2, + [SDTCisVec<0>, SDTCisSameAs<1, 2>, SDTCisInt<1>]>; +def X86pcmpeqm : SDNode<"X86ISD::PCMPEQM", X86IntCmpMask, [SDNPCommutative]>; +def X86pcmpgtm : SDNode<"X86ISD::PCMPGTM", X86IntCmpMask>; + +def X86CmpMaskCC : + SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCVecEltisVT<0, i1>, + SDTCisVec<1>, SDTCisSameAs<2, 1>, + SDTCisSameNumEltsAs<0, 1>, SDTCisVT<3, i8>]>; +def X86CmpMaskCCRound : + SDTypeProfile<1, 4, [SDTCisVec<0>,SDTCVecEltisVT<0, i1>, + SDTCisVec<1>, SDTCisSameAs<2, 1>, + SDTCisSameNumEltsAs<0, 1>, SDTCisVT<3, i8>, + SDTCisInt<4>]>; +def X86CmpMaskCCScalar : + SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>]>; + +def X86CmpMaskCCScalarRound : + SDTypeProfile<1, 4, [SDTCisInt<0>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>, + SDTCisInt<4>]>; + +def X86cmpm : SDNode<"X86ISD::CMPM", X86CmpMaskCC>; +def X86cmpmRnd : SDNode<"X86ISD::CMPM_RND", X86CmpMaskCCRound>; +def X86cmpmu : SDNode<"X86ISD::CMPMU", X86CmpMaskCC>; +def X86cmpms : SDNode<"X86ISD::FSETCC", X86CmpMaskCCScalar>; +def X86cmpmsRnd : SDNode<"X86ISD::FSETCC", X86CmpMaskCCScalarRound>; + +def X86vshl : SDNode<"X86ISD::VSHL", + 
SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisVec<2>]>>; +def X86vsrl : SDNode<"X86ISD::VSRL", + SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisVec<2>]>>; +def X86vsra : SDNode<"X86ISD::VSRA", + SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisVec<2>]>>; + +def X86vshli : SDNode<"X86ISD::VSHLI", SDTIntShiftOp>; +def X86vsrli : SDNode<"X86ISD::VSRLI", SDTIntShiftOp>; +def X86vsrai : SDNode<"X86ISD::VSRAI", SDTIntShiftOp>; + +def X86vprot : SDNode<"X86ISD::VPROT", + SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>]>>; +def X86vproti : SDNode<"X86ISD::VPROTI", + SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisVT<2, i8>]>>; + +def X86vpshl : SDNode<"X86ISD::VPSHL", + SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>]>>; +def X86vpsha : SDNode<"X86ISD::VPSHA", + SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>]>>; + +def X86vpcom : SDNode<"X86ISD::VPCOM", + SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>, + SDTCisVT<3, i8>]>>; +def X86vpcomu : SDNode<"X86ISD::VPCOMU", + SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>, + SDTCisVT<3, i8>]>>; + +def SDTX86CmpPTest : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, + SDTCisVec<1>, + SDTCisSameAs<2, 1>]>; +def X86addus : SDNode<"X86ISD::ADDUS", SDTIntBinOp>; +def X86subus : SDNode<"X86ISD::SUBUS", SDTIntBinOp>; +def X86adds : SDNode<"X86ISD::ADDS", SDTIntBinOp>; +def X86subs : SDNode<"X86ISD::SUBS", SDTIntBinOp>; +def X86mulhrs : SDNode<"X86ISD::MULHRS" , SDTIntBinOp>; +def X86avg : SDNode<"X86ISD::AVG" , SDTIntBinOp>; +def X86ptest : SDNode<"X86ISD::PTEST", SDTX86CmpPTest>; +def X86testp : SDNode<"X86ISD::TESTP", SDTX86CmpPTest>; +def X86kortest : SDNode<"X86ISD::KORTEST", SDTX86CmpPTest>; +def X86ktest : SDNode<"X86ISD::KTEST", SDTX86CmpPTest>; +def X86testm : SDNode<"X86ISD::TESTM", SDTypeProfile<1, 2, [SDTCisVec<0>, + SDTCisVec<1>, SDTCisSameAs<2, 1>, + SDTCVecEltisVT<0, i1>, + SDTCisSameNumEltsAs<0, 1>]>>; +def X86testnm : SDNode<"X86ISD::TESTNM", SDTypeProfile<1, 2, [SDTCisVec<0>, + SDTCisVec<1>, SDTCisSameAs<2, 1>, + SDTCVecEltisVT<0, i1>, + SDTCisSameNumEltsAs<0, 1>]>>; +def X86select : SDNode<"X86ISD::SELECT" , SDTSelect>; + +def X86pmuludq : SDNode<"X86ISD::PMULUDQ", + SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i64>, + SDTCVecEltisVT<1, i32>, + SDTCisSameSizeAs<0,1>, + SDTCisSameAs<1,2>]>>; +def X86pmuldq : SDNode<"X86ISD::PMULDQ", + SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i64>, + SDTCVecEltisVT<1, i32>, + SDTCisSameSizeAs<0,1>, + SDTCisSameAs<1,2>]>>; + +def X86extrqi : SDNode<"X86ISD::EXTRQI", + SDTypeProfile<1, 3, [SDTCisVT<0, v2i64>, SDTCisSameAs<0,1>, + SDTCisVT<2, i8>, SDTCisVT<3, i8>]>>; +def X86insertqi : SDNode<"X86ISD::INSERTQI", + SDTypeProfile<1, 4, [SDTCisVT<0, v2i64>, SDTCisSameAs<0,1>, + SDTCisSameAs<1,2>, SDTCisVT<3, i8>, + SDTCisVT<4, i8>]>>; + +// Specific shuffle nodes - At some point ISD::VECTOR_SHUFFLE will always get +// translated into one of the target nodes below during lowering. +// Note: this is a work in progress... 
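To make the comment above concrete: once lowering has replaced a generic ISD::VECTOR_SHUFFLE with one of the target nodes declared below, instruction selection matches that node directly in a pattern. The pattern here is a simplified sketch under that assumption; EXAMPLE_PSHUFDri is a placeholder instruction name, not a definition from this import.

// Hypothetical selection pattern for the X86PShufd node declared below;
// the destination instruction name is a placeholder.
def : Pat<(v4i32 (X86PShufd VR128:$src, (i8 imm:$imm))),
          (EXAMPLE_PSHUFDri VR128:$src, imm:$imm)>;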
+def SDTShuff1Op : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0,1>]>; +def SDTShuff2Op : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>]>; + +def SDTShuff2OpM : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisSameSizeAs<0,2>, + SDTCisSameNumEltsAs<0,2>]>; +def SDTShuff2OpI : SDTypeProfile<1, 2, [SDTCisVec<0>, + SDTCisSameAs<0,1>, SDTCisVT<2, i8>]>; +def SDTShuff3OpI : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>, SDTCisVT<3, i8>]>; +def SDTFPBinOpImmRound: SDTypeProfile<1, 4, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>, SDTCisInt<3>, SDTCisInt<4>]>; +def SDTFPUnaryOpImmRound: SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisInt<2>, SDTCisInt<3>]>; + +def SDTVBroadcast : SDTypeProfile<1, 1, [SDTCisVec<0>]>; +def SDTVBroadcastm : SDTypeProfile<1, 1, [SDTCisVec<0>, + SDTCisInt<0>, SDTCisInt<1>]>; + +def SDTBlend : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisSameAs<1,2>, SDTCisVT<3, i8>]>; + +def SDTTernlog : SDTypeProfile<1, 4, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>, SDTCisSameAs<0,3>, + SDTCisVT<4, i8>]>; + +def SDTFPBinOpRound : SDTypeProfile<1, 3, [ // fadd_round, fmul_round, etc. + SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisFP<0>, SDTCisInt<3>]>; + +def SDTFPUnaryOpRound : SDTypeProfile<1, 2, [ // fsqrt_round, fgetexp_round, etc. + SDTCisSameAs<0, 1>, SDTCisFP<0>, SDTCisInt<2>]>; + +def SDTFma : SDTypeProfile<1, 3, [SDTCisSameAs<0,1>, + SDTCisSameAs<1,2>, SDTCisSameAs<1,3>]>; +def SDTFmaRound : SDTypeProfile<1, 4, [SDTCisSameAs<0,1>, + SDTCisSameAs<1,2>, SDTCisSameAs<1,3>, SDTCisInt<4>]>; +def STDFp1SrcRm : SDTypeProfile<1, 2, [SDTCisSameAs<0,1>, + SDTCisVec<0>, SDTCisVT<2, i32>]>; +def STDFp2SrcRm : SDTypeProfile<1, 3, [SDTCisSameAs<0,1>, + SDTCisVec<0>, SDTCisVT<3, i32>]>; +def STDFp3SrcRm : SDTypeProfile<1, 4, [SDTCisSameAs<0,1>, + SDTCisVec<0>, SDTCisVT<3, i32>, SDTCisVT<4, i32>]>; + +def X86PAlignr : SDNode<"X86ISD::PALIGNR", SDTShuff3OpI>; +def X86VAlign : SDNode<"X86ISD::VALIGN", SDTShuff3OpI>; + +def X86Abs : SDNode<"X86ISD::ABS", SDTIntUnaryOp>; +def X86Conflict : SDNode<"X86ISD::CONFLICT", SDTIntUnaryOp>; + +def X86PShufd : SDNode<"X86ISD::PSHUFD", SDTShuff2OpI>; +def X86PShufhw : SDNode<"X86ISD::PSHUFHW", SDTShuff2OpI>; +def X86PShuflw : SDNode<"X86ISD::PSHUFLW", SDTShuff2OpI>; + +def X86Shufp : SDNode<"X86ISD::SHUFP", SDTShuff3OpI>; +def X86Shuf128 : SDNode<"X86ISD::SHUF128", SDTShuff3OpI>; + +def X86Movddup : SDNode<"X86ISD::MOVDDUP", SDTShuff1Op>; +def X86Movshdup : SDNode<"X86ISD::MOVSHDUP", SDTShuff1Op>; +def X86Movsldup : SDNode<"X86ISD::MOVSLDUP", SDTShuff1Op>; + +def X86Movsd : SDNode<"X86ISD::MOVSD", SDTShuff2Op>; +def X86Movss : SDNode<"X86ISD::MOVSS", SDTShuff2Op>; + +def X86Movlhps : SDNode<"X86ISD::MOVLHPS", SDTShuff2Op>; +def X86Movlhpd : SDNode<"X86ISD::MOVLHPD", SDTShuff2Op>; +def X86Movhlps : SDNode<"X86ISD::MOVHLPS", SDTShuff2Op>; + +def X86Movlps : SDNode<"X86ISD::MOVLPS", SDTShuff2Op>; +def X86Movlpd : SDNode<"X86ISD::MOVLPD", SDTShuff2Op>; + +def SDTPack : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisSameSizeAs<0,1>, + SDTCisSameAs<1,2>]>; +def X86Packss : SDNode<"X86ISD::PACKSS", SDTPack>; +def X86Packus : SDNode<"X86ISD::PACKUS", SDTPack>; + +def X86Unpckl : SDNode<"X86ISD::UNPCKL", SDTShuff2Op>; +def X86Unpckh : SDNode<"X86ISD::UNPCKH", SDTShuff2Op>; + +def X86vpmaddubsw : SDNode<"X86ISD::VPMADDUBSW" , SDTPack>; +def X86vpmaddwd : SDNode<"X86ISD::VPMADDWD" , SDTPack>; + +def X86VPermilpv : 
SDNode<"X86ISD::VPERMILPV", SDTShuff2OpM>; +def X86VPermilpi : SDNode<"X86ISD::VPERMILPI", SDTShuff2OpI>; +def X86VPermv : SDNode<"X86ISD::VPERMV", + SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisInt<1>, + SDTCisSameNumEltsAs<0,1>, + SDTCisSameSizeAs<0,1>, + SDTCisSameAs<0,2>]>>; +def X86VPermi : SDNode<"X86ISD::VPERMI", SDTShuff2OpI>; +def X86VPermt2 : SDNode<"X86ISD::VPERMV3", + SDTypeProfile<1, 3, [SDTCisVec<0>, + SDTCisSameAs<0,1>, SDTCisInt<2>, + SDTCisVec<2>, SDTCisSameNumEltsAs<0, 2>, + SDTCisSameSizeAs<0,2>, + SDTCisSameAs<0,3>]>, []>; + +def X86VPermi2X : SDNode<"X86ISD::VPERMIV3", + SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisInt<1>, + SDTCisVec<1>, SDTCisSameNumEltsAs<0, 1>, + SDTCisSameSizeAs<0,1>, + SDTCisSameAs<0,2>, + SDTCisSameAs<0,3>]>, []>; + +def X86vpternlog : SDNode<"X86ISD::VPTERNLOG", SDTTernlog>; + +def X86VPerm2x128 : SDNode<"X86ISD::VPERM2X128", SDTShuff3OpI>; + +def X86VFixupimm : SDNode<"X86ISD::VFIXUPIMM", SDTFPBinOpImmRound>; +def X86VRange : SDNode<"X86ISD::VRANGE", SDTFPBinOpImmRound>; +def X86VReduce : SDNode<"X86ISD::VREDUCE", SDTFPUnaryOpImmRound>; +def X86VRndScale : SDNode<"X86ISD::VRNDSCALE", SDTFPUnaryOpImmRound>; +def X86VGetMant : SDNode<"X86ISD::VGETMANT", SDTFPUnaryOpImmRound>; +def X86Vfpclass : SDNode<"X86ISD::VFPCLASS", + SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCVecEltisVT<0, i1>, + SDTCisVec<1>, SDTCisFP<1>, + SDTCisSameNumEltsAs<0,1>, + SDTCisVT<2, i32>]>, []>; +def X86Vfpclasss : SDNode<"X86ISD::VFPCLASSS", + SDTypeProfile<1, 2, [SDTCisVT<0, i1>, + SDTCisFP<1>, SDTCisVT<2, i32>]>,[]>; + +def X86SubVBroadcast : SDNode<"X86ISD::SUBV_BROADCAST", + SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisSubVecOfVec<1, 0>]>, []>; +// SDTCisSubVecOfVec restriction cannot be applied for 128 bit version of VBROADCASTI32x2. 
+def X86SubV32x2Broadcast : SDNode<"X86ISD::SUBV_BROADCAST", + SDTypeProfile<1, 1, [SDTCisVec<0>, + SDTCisSameAs<0,1>]>, []>; + +def X86VBroadcast : SDNode<"X86ISD::VBROADCAST", SDTVBroadcast>; +def X86VBroadcastm : SDNode<"X86ISD::VBROADCASTM", SDTVBroadcastm>; +def X86Vinsert : SDNode<"X86ISD::VINSERT", SDTypeProfile<1, 3, + [SDTCisSameAs<0, 1>, SDTCisEltOfVec<2, 1>, + SDTCisPtrTy<3>]>, []>; +def X86Vextract : SDNode<"X86ISD::VEXTRACT", SDTypeProfile<1, 2, + [SDTCisEltOfVec<0, 1>, SDTCisVec<1>, + SDTCisPtrTy<2>]>, []>; + +def X86Blendi : SDNode<"X86ISD::BLENDI", SDTBlend>; + +def X86Addsub : SDNode<"X86ISD::ADDSUB", SDTFPBinOp>; + +def X86faddRnd : SDNode<"X86ISD::FADD_RND", SDTFPBinOpRound>; +def X86fsubRnd : SDNode<"X86ISD::FSUB_RND", SDTFPBinOpRound>; +def X86fmulRnd : SDNode<"X86ISD::FMUL_RND", SDTFPBinOpRound>; +def X86fdivRnd : SDNode<"X86ISD::FDIV_RND", SDTFPBinOpRound>; +def X86fmaxRnd : SDNode<"X86ISD::FMAX_RND", SDTFPBinOpRound>; +def X86scalef : SDNode<"X86ISD::SCALEF", SDTFPBinOpRound>; +def X86fminRnd : SDNode<"X86ISD::FMIN_RND", SDTFPBinOpRound>; +def X86fsqrtRnd : SDNode<"X86ISD::FSQRT_RND", SDTFPUnaryOpRound>; +def X86fsqrtRnds : SDNode<"X86ISD::FSQRT_RND", STDFp2SrcRm>; +def X86fgetexpRnd : SDNode<"X86ISD::FGETEXP_RND", SDTFPUnaryOpRound>; +def X86fgetexpRnds : SDNode<"X86ISD::FGETEXP_RND", STDFp2SrcRm>; + +def X86Fmadd : SDNode<"X86ISD::FMADD", SDTFma>; +def X86Fnmadd : SDNode<"X86ISD::FNMADD", SDTFma>; +def X86Fmsub : SDNode<"X86ISD::FMSUB", SDTFma>; +def X86Fnmsub : SDNode<"X86ISD::FNMSUB", SDTFma>; +def X86Fmaddsub : SDNode<"X86ISD::FMADDSUB", SDTFma>; +def X86Fmsubadd : SDNode<"X86ISD::FMSUBADD", SDTFma>; + +def X86FmaddRnd : SDNode<"X86ISD::FMADD_RND", SDTFmaRound>; +def X86FnmaddRnd : SDNode<"X86ISD::FNMADD_RND", SDTFmaRound>; +def X86FmsubRnd : SDNode<"X86ISD::FMSUB_RND", SDTFmaRound>; +def X86FnmsubRnd : SDNode<"X86ISD::FNMSUB_RND", SDTFmaRound>; +def X86FmaddsubRnd : SDNode<"X86ISD::FMADDSUB_RND", SDTFmaRound>; +def X86FmsubaddRnd : SDNode<"X86ISD::FMSUBADD_RND", SDTFmaRound>; + +def X86rsqrt28 : SDNode<"X86ISD::RSQRT28", STDFp1SrcRm>; +def X86rcp28 : SDNode<"X86ISD::RCP28", STDFp1SrcRm>; +def X86exp2 : SDNode<"X86ISD::EXP2", STDFp1SrcRm>; + +def X86rsqrt28s : SDNode<"X86ISD::RSQRT28", STDFp2SrcRm>; +def X86rcp28s : SDNode<"X86ISD::RCP28", STDFp2SrcRm>; +def X86RndScales : SDNode<"X86ISD::VRNDSCALE", STDFp3SrcRm>; +def X86Reduces : SDNode<"X86ISD::VREDUCE", STDFp3SrcRm>; +def X86GetMants : SDNode<"X86ISD::VGETMANT", STDFp3SrcRm>; + +def SDT_PCMPISTRI : SDTypeProfile<2, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, + SDTCisVT<2, v16i8>, SDTCisVT<3, v16i8>, + SDTCisVT<4, i8>]>; +def SDT_PCMPESTRI : SDTypeProfile<2, 5, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, + SDTCisVT<2, v16i8>, SDTCisVT<3, i32>, + SDTCisVT<4, v16i8>, SDTCisVT<5, i32>, + SDTCisVT<6, i8>]>; + +def X86pcmpistri : SDNode<"X86ISD::PCMPISTRI", SDT_PCMPISTRI>; +def X86pcmpestri : SDNode<"X86ISD::PCMPESTRI", SDT_PCMPESTRI>; + +def X86compress: SDNode<"X86ISD::COMPRESS", SDTypeProfile<1, 1, + [SDTCisSameAs<0, 1>, SDTCisVec<1>]>, []>; +def X86expand : SDNode<"X86ISD::EXPAND", SDTypeProfile<1, 1, + [SDTCisSameAs<0, 1>, SDTCisVec<1>]>, []>; + +def SDTintToFPRound: SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisFP<0>, + SDTCisSameAs<0,1>, SDTCisInt<2>, + SDTCisVT<3, i32>]>; + +def SDTDoubleToInt: SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisInt<0>, SDTCVecEltisVT<1, f64>]>; +def SDTFloatToInt: SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisInt<0>, SDTCVecEltisVT<1, f32>]>; + +def SDTDoubleToIntRnd: 
SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisInt<0>, SDTCVecEltisVT<1, f64>]>; +def SDTSDoubleToIntRnd: SDTypeProfile<1, 2, [SDTCisInt<0>,SDTCisFP<1>, + SDTCVecEltisVT<1, f64>, SDTCisInt<2>]>; +def SDTFloatToIntRnd: SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisInt<0>, SDTCVecEltisVT<1, f32>]>; +def SDTSFloatToIntRnd: SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisFP<1>, + SDTCVecEltisVT<1, f32>, SDTCisInt<2>]>; +def SDTVintToFPRound: SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisFP<0>, SDTCVecEltisVT<1, i32>, + SDTCisInt<2>]>; +def SDTVlongToFPRound: SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisFP<0>, SDTCVecEltisVT<1, i64>, + SDTCisInt<2>]>; + +def SDTVFPToIntRound: SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisFP<1>, SDTCVecEltisVT<0, i32>, + SDTCisInt<2>]>; +def SDTVFPToLongRound: SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisFP<1>, SDTCVecEltisVT<0, i64>, + SDTCisInt<2>]>; + +// Scalar +def X86SintToFpRnd : SDNode<"X86ISD::SINT_TO_FP_RND", SDTintToFPRound>; +def X86UintToFpRnd : SDNode<"X86ISD::UINT_TO_FP_RND", SDTintToFPRound>; + +def X86cvttss2IntRnd : SDNode<"X86ISD::FP_TO_SINT_RND", SDTSFloatToIntRnd>; +def X86cvttss2UIntRnd : SDNode<"X86ISD::FP_TO_UINT_RND", SDTSFloatToIntRnd>; +def X86cvttsd2IntRnd : SDNode<"X86ISD::FP_TO_SINT_RND", SDTSDoubleToIntRnd>; +def X86cvttsd2UIntRnd : SDNode<"X86ISD::FP_TO_UINT_RND", SDTSDoubleToIntRnd>; +// Vector with rounding mode + +// cvtt fp-to-int staff +def X86VFpToSintRnd : SDNode<"ISD::FP_TO_SINT", SDTVFPToIntRound>; +def X86VFpToUintRnd : SDNode<"ISD::FP_TO_UINT", SDTVFPToIntRound>; +def X86VFpToSlongRnd : SDNode<"ISD::FP_TO_SINT", SDTVFPToLongRound>; +def X86VFpToUlongRnd : SDNode<"ISD::FP_TO_UINT", SDTVFPToLongRound>; + +def X86VSintToFpRnd : SDNode<"ISD::SINT_TO_FP", SDTVintToFPRound>; +def X86VUintToFpRnd : SDNode<"ISD::UINT_TO_FP", SDTVintToFPRound>; +def X86VSlongToFpRnd : SDNode<"ISD::SINT_TO_FP", SDTVlongToFPRound>; +def X86VUlongToFpRnd : SDNode<"ISD::UINT_TO_FP", SDTVlongToFPRound>; + +// cvt fp-to-int staff +def X86cvtps2IntRnd : SDNode<"X86ISD::FP_TO_SINT_RND", SDTFloatToIntRnd>; +def X86cvtps2UIntRnd : SDNode<"X86ISD::FP_TO_UINT_RND", SDTFloatToIntRnd>; +def X86cvtpd2IntRnd : SDNode<"X86ISD::FP_TO_SINT_RND", SDTDoubleToIntRnd>; +def X86cvtpd2UIntRnd : SDNode<"X86ISD::FP_TO_UINT_RND", SDTDoubleToIntRnd>; + +// Vector without rounding mode +def X86cvtps2Int : SDNode<"X86ISD::FP_TO_SINT_RND", SDTFloatToInt>; +def X86cvtps2UInt : SDNode<"X86ISD::FP_TO_UINT_RND", SDTFloatToInt>; +def X86cvtpd2Int : SDNode<"X86ISD::FP_TO_SINT_RND", SDTDoubleToInt>; +def X86cvtpd2UInt : SDNode<"X86ISD::FP_TO_UINT_RND", SDTDoubleToInt>; + +def X86cvtph2ps : SDNode<"ISD::FP16_TO_FP", + SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, + SDTCVecEltisVT<0, f32>, + SDTCVecEltisVT<1, i16>, + SDTCisFP<0>, + SDTCisVT<2, i32>]> >; + +def X86cvtps2ph : SDNode<"ISD::FP_TO_FP16", + SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisVec<1>, + SDTCVecEltisVT<0, i16>, + SDTCVecEltisVT<1, f32>, + SDTCisFP<1>, SDTCisVT<2, i32>, + SDTCisVT<3, i32>]> >; +def X86vfpextRnd : SDNode<"X86ISD::VFPEXT", + SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisFP<0>, SDTCisFP<1>, + SDTCVecEltisVT<0, f64>, + SDTCVecEltisVT<1, f32>, + SDTCisOpSmallerThanOp<1, 0>, + SDTCisVT<2, i32>]>>; +def X86vfproundRnd: SDNode<"X86ISD::VFPROUND", + SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisFP<0>, SDTCisFP<1>, + SDTCVecEltisVT<0, f32>, + SDTCVecEltisVT<1, f64>, + SDTCisOpSmallerThanOp<0, 1>, + SDTCisVT<2, i32>]>>; + +def 
X86cvt2mask : SDNode<"X86ISD::CVT2MASK", SDTIntTruncOp>; + +//===----------------------------------------------------------------------===// +// SSE Complex Patterns +//===----------------------------------------------------------------------===// + +// These are 'extloads' from a scalar to the low element of a vector, zeroing +// the top elements. These are used for the SSE 'ss' and 'sd' instruction +// forms. +def sse_load_f32 : ComplexPattern<v4f32, 5, "selectScalarSSELoad", [], + [SDNPHasChain, SDNPMayLoad, SDNPMemOperand, + SDNPWantRoot]>; +def sse_load_f64 : ComplexPattern<v2f64, 5, "selectScalarSSELoad", [], + [SDNPHasChain, SDNPMayLoad, SDNPMemOperand, + SDNPWantRoot]>; + +def ssmem : Operand<v4f32> { + let PrintMethod = "printf32mem"; + let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc_nosp, i32imm, i8imm); + let ParserMatchClass = X86Mem32AsmOperand; + let OperandType = "OPERAND_MEMORY"; +} +def sdmem : Operand<v2f64> { + let PrintMethod = "printf64mem"; + let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc_nosp, i32imm, i8imm); + let ParserMatchClass = X86Mem64AsmOperand; + let OperandType = "OPERAND_MEMORY"; +} + +//===----------------------------------------------------------------------===// +// SSE pattern fragments +//===----------------------------------------------------------------------===// + +// 128-bit load pattern fragments +// NOTE: all 128-bit integer vector loads are promoted to v2i64 +def loadv4f32 : PatFrag<(ops node:$ptr), (v4f32 (load node:$ptr))>; +def loadv2f64 : PatFrag<(ops node:$ptr), (v2f64 (load node:$ptr))>; +def loadv2i64 : PatFrag<(ops node:$ptr), (v2i64 (load node:$ptr))>; + +// 256-bit load pattern fragments +// NOTE: all 256-bit integer vector loads are promoted to v4i64 +def loadv8f32 : PatFrag<(ops node:$ptr), (v8f32 (load node:$ptr))>; +def loadv4f64 : PatFrag<(ops node:$ptr), (v4f64 (load node:$ptr))>; +def loadv4i64 : PatFrag<(ops node:$ptr), (v4i64 (load node:$ptr))>; + +// 512-bit load pattern fragments +def loadv16f32 : PatFrag<(ops node:$ptr), (v16f32 (load node:$ptr))>; +def loadv8f64 : PatFrag<(ops node:$ptr), (v8f64 (load node:$ptr))>; +def loadv64i8 : PatFrag<(ops node:$ptr), (v64i8 (load node:$ptr))>; +def loadv32i16 : PatFrag<(ops node:$ptr), (v32i16 (load node:$ptr))>; +def loadv16i32 : PatFrag<(ops node:$ptr), (v16i32 (load node:$ptr))>; +def loadv8i64 : PatFrag<(ops node:$ptr), (v8i64 (load node:$ptr))>; + +// 128-/256-/512-bit extload pattern fragments +def extloadv2f32 : PatFrag<(ops node:$ptr), (v2f64 (extloadvf32 node:$ptr))>; +def extloadv4f32 : PatFrag<(ops node:$ptr), (v4f64 (extloadvf32 node:$ptr))>; +def extloadv8f32 : PatFrag<(ops node:$ptr), (v8f64 (extloadvf32 node:$ptr))>; + +// These are needed to match a scalar load that is used in a vector-only +// math instruction such as the FP logical ops: andps, andnps, orps, xorps. +// The memory operand is required to be a 128-bit load, so it must be converted +// from a vector to a scalar. +def loadf32_128 : PatFrag<(ops node:$ptr), + (f32 (extractelt (loadv4f32 node:$ptr), (iPTR 0)))>; +def loadf64_128 : PatFrag<(ops node:$ptr), + (f64 (extractelt (loadv2f64 node:$ptr), (iPTR 0)))>; + +// Like 'store', but always requires 128-bit vector alignment. +def alignedstore : PatFrag<(ops node:$val, node:$ptr), + (store node:$val, node:$ptr), [{ + return cast<StoreSDNode>(N)->getAlignment() >= 16; +}]>; + +// Like 'store', but always requires 256-bit vector alignment. 
+def alignedstore256 : PatFrag<(ops node:$val, node:$ptr), + (store node:$val, node:$ptr), [{ + return cast<StoreSDNode>(N)->getAlignment() >= 32; +}]>; + +// Like 'store', but always requires 512-bit vector alignment. +def alignedstore512 : PatFrag<(ops node:$val, node:$ptr), + (store node:$val, node:$ptr), [{ + return cast<StoreSDNode>(N)->getAlignment() >= 64; +}]>; + +// Like 'load', but always requires 128-bit vector alignment. +def alignedload : PatFrag<(ops node:$ptr), (load node:$ptr), [{ + return cast<LoadSDNode>(N)->getAlignment() >= 16; +}]>; + +// Like 'X86vzload', but always requires 128-bit vector alignment. +def alignedX86vzload : PatFrag<(ops node:$ptr), (X86vzload node:$ptr), [{ + return cast<MemSDNode>(N)->getAlignment() >= 16; +}]>; + +// Like 'load', but always requires 256-bit vector alignment. +def alignedload256 : PatFrag<(ops node:$ptr), (load node:$ptr), [{ + return cast<LoadSDNode>(N)->getAlignment() >= 32; +}]>; + +// Like 'load', but always requires 512-bit vector alignment. +def alignedload512 : PatFrag<(ops node:$ptr), (load node:$ptr), [{ + return cast<LoadSDNode>(N)->getAlignment() >= 64; +}]>; + +def alignedloadfsf32 : PatFrag<(ops node:$ptr), + (f32 (alignedload node:$ptr))>; +def alignedloadfsf64 : PatFrag<(ops node:$ptr), + (f64 (alignedload node:$ptr))>; + +// 128-bit aligned load pattern fragments +// NOTE: all 128-bit integer vector loads are promoted to v2i64 +def alignedloadv4f32 : PatFrag<(ops node:$ptr), + (v4f32 (alignedload node:$ptr))>; +def alignedloadv2f64 : PatFrag<(ops node:$ptr), + (v2f64 (alignedload node:$ptr))>; +def alignedloadv2i64 : PatFrag<(ops node:$ptr), + (v2i64 (alignedload node:$ptr))>; + +// 256-bit aligned load pattern fragments +// NOTE: all 256-bit integer vector loads are promoted to v4i64 +def alignedloadv8f32 : PatFrag<(ops node:$ptr), + (v8f32 (alignedload256 node:$ptr))>; +def alignedloadv4f64 : PatFrag<(ops node:$ptr), + (v4f64 (alignedload256 node:$ptr))>; +def alignedloadv4i64 : PatFrag<(ops node:$ptr), + (v4i64 (alignedload256 node:$ptr))>; + +// 512-bit aligned load pattern fragments +def alignedloadv16f32 : PatFrag<(ops node:$ptr), + (v16f32 (alignedload512 node:$ptr))>; +def alignedloadv16i32 : PatFrag<(ops node:$ptr), + (v16i32 (alignedload512 node:$ptr))>; +def alignedloadv8f64 : PatFrag<(ops node:$ptr), + (v8f64 (alignedload512 node:$ptr))>; +def alignedloadv8i64 : PatFrag<(ops node:$ptr), + (v8i64 (alignedload512 node:$ptr))>; + +// Like 'load', but uses special alignment checks suitable for use in +// memory operands in most SSE instructions, which are required to +// be naturally aligned on some targets but not on others. If the subtarget +// allows unaligned accesses, match any load, though this may require +// setting a feature bit in the processor (on startup, for example). +// Opteron 10h and later implement such a feature. 
+def memop : PatFrag<(ops node:$ptr), (load node:$ptr), [{ + return Subtarget->hasSSEUnalignedMem() + || cast<LoadSDNode>(N)->getAlignment() >= 16; +}]>; + +def memopfsf32 : PatFrag<(ops node:$ptr), (f32 (memop node:$ptr))>; +def memopfsf64 : PatFrag<(ops node:$ptr), (f64 (memop node:$ptr))>; + +// 128-bit memop pattern fragments +// NOTE: all 128-bit integer vector loads are promoted to v2i64 +def memopv4f32 : PatFrag<(ops node:$ptr), (v4f32 (memop node:$ptr))>; +def memopv2f64 : PatFrag<(ops node:$ptr), (v2f64 (memop node:$ptr))>; +def memopv2i64 : PatFrag<(ops node:$ptr), (v2i64 (memop node:$ptr))>; + +// These are needed to match a scalar memop that is used in a vector-only +// math instruction such as the FP logical ops: andps, andnps, orps, xorps. +// The memory operand is required to be a 128-bit load, so it must be converted +// from a vector to a scalar. +def memopfsf32_128 : PatFrag<(ops node:$ptr), + (f32 (extractelt (memopv4f32 node:$ptr), (iPTR 0)))>; +def memopfsf64_128 : PatFrag<(ops node:$ptr), + (f64 (extractelt (memopv2f64 node:$ptr), (iPTR 0)))>; + + +// SSSE3 uses MMX registers for some instructions. They aren't aligned on a +// 16-byte boundary. +// FIXME: 8 byte alignment for mmx reads is not required +def memop64 : PatFrag<(ops node:$ptr), (load node:$ptr), [{ + return cast<LoadSDNode>(N)->getAlignment() >= 8; +}]>; + +def memopmmx : PatFrag<(ops node:$ptr), (x86mmx (memop64 node:$ptr))>; + +def mgatherv4i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (masked_gather node:$src1, node:$src2, node:$src3) , [{ + if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N)) + return (Mgt->getIndex().getValueType() == MVT::v4i32 || + Mgt->getBasePtr().getValueType() == MVT::v4i32); + return false; +}]>; + +def mgatherv8i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (masked_gather node:$src1, node:$src2, node:$src3) , [{ + if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N)) + return (Mgt->getIndex().getValueType() == MVT::v8i32 || + Mgt->getBasePtr().getValueType() == MVT::v8i32); + return false; +}]>; + +def mgatherv2i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (masked_gather node:$src1, node:$src2, node:$src3) , [{ + if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N)) + return (Mgt->getIndex().getValueType() == MVT::v2i64 || + Mgt->getBasePtr().getValueType() == MVT::v2i64); + return false; +}]>; +def mgatherv4i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (masked_gather node:$src1, node:$src2, node:$src3) , [{ + if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N)) + return (Mgt->getIndex().getValueType() == MVT::v4i64 || + Mgt->getBasePtr().getValueType() == MVT::v4i64); + return false; +}]>; +def mgatherv8i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (masked_gather node:$src1, node:$src2, node:$src3) , [{ + if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N)) + return (Mgt->getIndex().getValueType() == MVT::v8i64 || + Mgt->getBasePtr().getValueType() == MVT::v8i64); + return false; +}]>; +def mgatherv16i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (masked_gather node:$src1, node:$src2, node:$src3) , [{ + if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N)) + return (Mgt->getIndex().getValueType() == MVT::v16i32 || + Mgt->getBasePtr().getValueType() == MVT::v16i32); + return false; +}]>; + +def mscatterv2i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (masked_scatter node:$src1, node:$src2, node:$src3) , [{ + if (MaskedScatterSDNode *Sc = 
dyn_cast<MaskedScatterSDNode>(N)) + return (Sc->getIndex().getValueType() == MVT::v2i64 || + Sc->getBasePtr().getValueType() == MVT::v2i64); + return false; +}]>; + +def mscatterv4i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (masked_scatter node:$src1, node:$src2, node:$src3) , [{ + if (MaskedScatterSDNode *Sc = dyn_cast<MaskedScatterSDNode>(N)) + return (Sc->getIndex().getValueType() == MVT::v4i32 || + Sc->getBasePtr().getValueType() == MVT::v4i32); + return false; +}]>; + +def mscatterv4i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (masked_scatter node:$src1, node:$src2, node:$src3) , [{ + if (MaskedScatterSDNode *Sc = dyn_cast<MaskedScatterSDNode>(N)) + return (Sc->getIndex().getValueType() == MVT::v4i64 || + Sc->getBasePtr().getValueType() == MVT::v4i64); + return false; +}]>; + +def mscatterv8i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (masked_scatter node:$src1, node:$src2, node:$src3) , [{ + if (MaskedScatterSDNode *Sc = dyn_cast<MaskedScatterSDNode>(N)) + return (Sc->getIndex().getValueType() == MVT::v8i32 || + Sc->getBasePtr().getValueType() == MVT::v8i32); + return false; +}]>; + +def mscatterv8i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (masked_scatter node:$src1, node:$src2, node:$src3) , [{ + if (MaskedScatterSDNode *Sc = dyn_cast<MaskedScatterSDNode>(N)) + return (Sc->getIndex().getValueType() == MVT::v8i64 || + Sc->getBasePtr().getValueType() == MVT::v8i64); + return false; +}]>; +def mscatterv16i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (masked_scatter node:$src1, node:$src2, node:$src3) , [{ + if (MaskedScatterSDNode *Sc = dyn_cast<MaskedScatterSDNode>(N)) + return (Sc->getIndex().getValueType() == MVT::v16i32 || + Sc->getBasePtr().getValueType() == MVT::v16i32); + return false; +}]>; + +// 128-bit bitconvert pattern fragments +def bc_v4f32 : PatFrag<(ops node:$in), (v4f32 (bitconvert node:$in))>; +def bc_v2f64 : PatFrag<(ops node:$in), (v2f64 (bitconvert node:$in))>; +def bc_v16i8 : PatFrag<(ops node:$in), (v16i8 (bitconvert node:$in))>; +def bc_v8i16 : PatFrag<(ops node:$in), (v8i16 (bitconvert node:$in))>; +def bc_v4i32 : PatFrag<(ops node:$in), (v4i32 (bitconvert node:$in))>; +def bc_v2i64 : PatFrag<(ops node:$in), (v2i64 (bitconvert node:$in))>; + +// 256-bit bitconvert pattern fragments +def bc_v32i8 : PatFrag<(ops node:$in), (v32i8 (bitconvert node:$in))>; +def bc_v16i16 : PatFrag<(ops node:$in), (v16i16 (bitconvert node:$in))>; +def bc_v8i32 : PatFrag<(ops node:$in), (v8i32 (bitconvert node:$in))>; +def bc_v4i64 : PatFrag<(ops node:$in), (v4i64 (bitconvert node:$in))>; +def bc_v8f32 : PatFrag<(ops node:$in), (v8f32 (bitconvert node:$in))>; + +// 512-bit bitconvert pattern fragments +def bc_v16i32 : PatFrag<(ops node:$in), (v16i32 (bitconvert node:$in))>; +def bc_v8i64 : PatFrag<(ops node:$in), (v8i64 (bitconvert node:$in))>; +def bc_v8f64 : PatFrag<(ops node:$in), (v8f64 (bitconvert node:$in))>; +def bc_v16f32 : PatFrag<(ops node:$in), (v16f32 (bitconvert node:$in))>; + +def vzmovl_v2i64 : PatFrag<(ops node:$src), + (bitconvert (v2i64 (X86vzmovl + (v2i64 (scalar_to_vector (loadi64 node:$src))))))>; +def vzmovl_v4i32 : PatFrag<(ops node:$src), + (bitconvert (v4i32 (X86vzmovl + (v4i32 (scalar_to_vector (loadi32 node:$src))))))>; + +def vzload_v2i64 : PatFrag<(ops node:$src), + (bitconvert (v2i64 (X86vzload node:$src)))>; + + +def fp32imm0 : PatLeaf<(f32 fpimm), [{ + return N->isExactlyValue(+0.0); +}]>; + +def I8Imm : SDNodeXForm<imm, [{ + // Transformation function: get the low 8 bits. 
+ return getI8Imm((uint8_t)N->getZExtValue(), SDLoc(N)); +}]>; + +def FROUND_NO_EXC : ImmLeaf<i32, [{ return Imm == 8; }]>; +def FROUND_CURRENT : ImmLeaf<i32, [{ + return Imm == X86::STATIC_ROUNDING::CUR_DIRECTION; +}]>; + +// BYTE_imm - Transform bit immediates into byte immediates. +def BYTE_imm : SDNodeXForm<imm, [{ + // Transformation function: imm >> 3 + return getI32Imm(N->getZExtValue() >> 3, SDLoc(N)); +}]>; + +// EXTRACT_get_vextract128_imm xform function: convert extract_subvector index +// to VEXTRACTF128/VEXTRACTI128 imm. +def EXTRACT_get_vextract128_imm : SDNodeXForm<extract_subvector, [{ + return getI8Imm(X86::getExtractVEXTRACT128Immediate(N), SDLoc(N)); +}]>; + +// INSERT_get_vinsert128_imm xform function: convert insert_subvector index to +// VINSERTF128/VINSERTI128 imm. +def INSERT_get_vinsert128_imm : SDNodeXForm<insert_subvector, [{ + return getI8Imm(X86::getInsertVINSERT128Immediate(N), SDLoc(N)); +}]>; + +// EXTRACT_get_vextract256_imm xform function: convert extract_subvector index +// to VEXTRACTF64x4 imm. +def EXTRACT_get_vextract256_imm : SDNodeXForm<extract_subvector, [{ + return getI8Imm(X86::getExtractVEXTRACT256Immediate(N), SDLoc(N)); +}]>; + +// INSERT_get_vinsert256_imm xform function: convert insert_subvector index to +// VINSERTF64x4 imm. +def INSERT_get_vinsert256_imm : SDNodeXForm<insert_subvector, [{ + return getI8Imm(X86::getInsertVINSERT256Immediate(N), SDLoc(N)); +}]>; + +def vextract128_extract : PatFrag<(ops node:$bigvec, node:$index), + (extract_subvector node:$bigvec, + node:$index), [{ + return X86::isVEXTRACT128Index(N); +}], EXTRACT_get_vextract128_imm>; + +def vinsert128_insert : PatFrag<(ops node:$bigvec, node:$smallvec, + node:$index), + (insert_subvector node:$bigvec, node:$smallvec, + node:$index), [{ + return X86::isVINSERT128Index(N); +}], INSERT_get_vinsert128_imm>; + + +def vextract256_extract : PatFrag<(ops node:$bigvec, node:$index), + (extract_subvector node:$bigvec, + node:$index), [{ + return X86::isVEXTRACT256Index(N); +}], EXTRACT_get_vextract256_imm>; + +def vinsert256_insert : PatFrag<(ops node:$bigvec, node:$smallvec, + node:$index), + (insert_subvector node:$bigvec, node:$smallvec, + node:$index), [{ + return X86::isVINSERT256Index(N); +}], INSERT_get_vinsert256_imm>; + +def masked_load_aligned128 : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (masked_load node:$src1, node:$src2, node:$src3), [{ + if (auto *Load = dyn_cast<MaskedLoadSDNode>(N)) + return Load->getAlignment() >= 16; + return false; +}]>; + +def masked_load_aligned256 : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (masked_load node:$src1, node:$src2, node:$src3), [{ + if (auto *Load = dyn_cast<MaskedLoadSDNode>(N)) + return Load->getAlignment() >= 32; + return false; +}]>; + +def masked_load_aligned512 : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (masked_load node:$src1, node:$src2, node:$src3), [{ + if (auto *Load = dyn_cast<MaskedLoadSDNode>(N)) + return Load->getAlignment() >= 64; + return false; +}]>; + +def masked_load_unaligned : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (masked_load node:$src1, node:$src2, node:$src3), [{ + return isa<MaskedLoadSDNode>(N); +}]>; + +// masked store fragments. 
+// X86mstore can't be implemented in core DAG files because some targets +// doesn't support vector type ( llvm-tblgen will fail) +def X86mstore : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (masked_store node:$src1, node:$src2, node:$src3), [{ + return !cast<MaskedStoreSDNode>(N)->isTruncatingStore(); +}]>; + +def masked_store_aligned128 : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (X86mstore node:$src1, node:$src2, node:$src3), [{ + if (auto *Store = dyn_cast<MaskedStoreSDNode>(N)) + return Store->getAlignment() >= 16; + return false; +}]>; + +def masked_store_aligned256 : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (X86mstore node:$src1, node:$src2, node:$src3), [{ + if (auto *Store = dyn_cast<MaskedStoreSDNode>(N)) + return Store->getAlignment() >= 32; + return false; +}]>; + +def masked_store_aligned512 : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (X86mstore node:$src1, node:$src2, node:$src3), [{ + if (auto *Store = dyn_cast<MaskedStoreSDNode>(N)) + return Store->getAlignment() >= 64; + return false; +}]>; + +def masked_store_unaligned : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (X86mstore node:$src1, node:$src2, node:$src3), [{ + return isa<MaskedStoreSDNode>(N); +}]>; + +// masked truncstore fragments +// X86mtruncstore can't be implemented in core DAG files because some targets +// doesn't support vector type ( llvm-tblgen will fail) +def X86mtruncstore : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (masked_store node:$src1, node:$src2, node:$src3), [{ + return cast<MaskedStoreSDNode>(N)->isTruncatingStore(); +}]>; +def masked_truncstorevi8 : + PatFrag<(ops node:$src1, node:$src2, node:$src3), + (X86mtruncstore node:$src1, node:$src2, node:$src3), [{ + return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8; +}]>; +def masked_truncstorevi16 : + PatFrag<(ops node:$src1, node:$src2, node:$src3), + (X86mtruncstore node:$src1, node:$src2, node:$src3), [{ + return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i16; +}]>; +def masked_truncstorevi32 : + PatFrag<(ops node:$src1, node:$src2, node:$src3), + (X86mtruncstore node:$src1, node:$src2, node:$src3), [{ + return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32; +}]>; diff --git a/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp b/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp new file mode 100644 index 0000000..246804e --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -0,0 +1,7330 @@ +//===-- X86InstrInfo.cpp - X86 Instruction Information --------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the X86 implementation of the TargetInstrInfo class. 
+// +//===----------------------------------------------------------------------===// + +#include "X86InstrInfo.h" +#include "X86.h" +#include "X86InstrBuilder.h" +#include "X86MachineFunctionInfo.h" +#include "X86Subtarget.h" +#include "X86TargetMachine.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/CodeGen/LiveVariables.h" +#include "llvm/CodeGen/MachineConstantPool.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/StackMaps.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetOptions.h" +#include <limits> + +using namespace llvm; + +#define DEBUG_TYPE "x86-instr-info" + +#define GET_INSTRINFO_CTOR_DTOR +#include "X86GenInstrInfo.inc" + +static cl::opt<bool> +NoFusing("disable-spill-fusing", + cl::desc("Disable fusing of spill code into instructions")); +static cl::opt<bool> +PrintFailedFusing("print-failed-fuse-candidates", + cl::desc("Print instructions that the allocator wants to" + " fuse, but the X86 backend currently can't"), + cl::Hidden); +static cl::opt<bool> +ReMatPICStubLoad("remat-pic-stub-load", + cl::desc("Re-materialize load from stub in PIC mode"), + cl::init(false), cl::Hidden); + +enum { + // Select which memory operand is being unfolded. + // (stored in bits 0 - 3) + TB_INDEX_0 = 0, + TB_INDEX_1 = 1, + TB_INDEX_2 = 2, + TB_INDEX_3 = 3, + TB_INDEX_4 = 4, + TB_INDEX_MASK = 0xf, + + // Do not insert the reverse map (MemOp -> RegOp) into the table. + // This may be needed because there is a many -> one mapping. + TB_NO_REVERSE = 1 << 4, + + // Do not insert the forward map (RegOp -> MemOp) into the table. + // This is needed for Native Client, which prohibits branch + // instructions from using a memory operand. + TB_NO_FORWARD = 1 << 5, + + TB_FOLDED_LOAD = 1 << 6, + TB_FOLDED_STORE = 1 << 7, + + // Minimum alignment required for load/store. + // Used for RegOp->MemOp conversion. + // (stored in bits 8 - 15) + TB_ALIGN_SHIFT = 8, + TB_ALIGN_NONE = 0 << TB_ALIGN_SHIFT, + TB_ALIGN_16 = 16 << TB_ALIGN_SHIFT, + TB_ALIGN_32 = 32 << TB_ALIGN_SHIFT, + TB_ALIGN_64 = 64 << TB_ALIGN_SHIFT, + TB_ALIGN_MASK = 0xff << TB_ALIGN_SHIFT +}; + +struct X86MemoryFoldTableEntry { + uint16_t RegOp; + uint16_t MemOp; + uint16_t Flags; +}; + +// Pin the vtable to this file. +void X86InstrInfo::anchor() {} + +X86InstrInfo::X86InstrInfo(X86Subtarget &STI) + : X86GenInstrInfo((STI.isTarget64BitLP64() ? X86::ADJCALLSTACKDOWN64 + : X86::ADJCALLSTACKDOWN32), + (STI.isTarget64BitLP64() ? 
X86::ADJCALLSTACKUP64 + : X86::ADJCALLSTACKUP32), + X86::CATCHRET), + Subtarget(STI), RI(STI.getTargetTriple()) { + + static const X86MemoryFoldTableEntry MemoryFoldTable2Addr[] = { + { X86::ADC32ri, X86::ADC32mi, 0 }, + { X86::ADC32ri8, X86::ADC32mi8, 0 }, + { X86::ADC32rr, X86::ADC32mr, 0 }, + { X86::ADC64ri32, X86::ADC64mi32, 0 }, + { X86::ADC64ri8, X86::ADC64mi8, 0 }, + { X86::ADC64rr, X86::ADC64mr, 0 }, + { X86::ADD16ri, X86::ADD16mi, 0 }, + { X86::ADD16ri8, X86::ADD16mi8, 0 }, + { X86::ADD16ri_DB, X86::ADD16mi, TB_NO_REVERSE }, + { X86::ADD16ri8_DB, X86::ADD16mi8, TB_NO_REVERSE }, + { X86::ADD16rr, X86::ADD16mr, 0 }, + { X86::ADD16rr_DB, X86::ADD16mr, TB_NO_REVERSE }, + { X86::ADD32ri, X86::ADD32mi, 0 }, + { X86::ADD32ri8, X86::ADD32mi8, 0 }, + { X86::ADD32ri_DB, X86::ADD32mi, TB_NO_REVERSE }, + { X86::ADD32ri8_DB, X86::ADD32mi8, TB_NO_REVERSE }, + { X86::ADD32rr, X86::ADD32mr, 0 }, + { X86::ADD32rr_DB, X86::ADD32mr, TB_NO_REVERSE }, + { X86::ADD64ri32, X86::ADD64mi32, 0 }, + { X86::ADD64ri8, X86::ADD64mi8, 0 }, + { X86::ADD64ri32_DB,X86::ADD64mi32, TB_NO_REVERSE }, + { X86::ADD64ri8_DB, X86::ADD64mi8, TB_NO_REVERSE }, + { X86::ADD64rr, X86::ADD64mr, 0 }, + { X86::ADD64rr_DB, X86::ADD64mr, TB_NO_REVERSE }, + { X86::ADD8ri, X86::ADD8mi, 0 }, + { X86::ADD8rr, X86::ADD8mr, 0 }, + { X86::AND16ri, X86::AND16mi, 0 }, + { X86::AND16ri8, X86::AND16mi8, 0 }, + { X86::AND16rr, X86::AND16mr, 0 }, + { X86::AND32ri, X86::AND32mi, 0 }, + { X86::AND32ri8, X86::AND32mi8, 0 }, + { X86::AND32rr, X86::AND32mr, 0 }, + { X86::AND64ri32, X86::AND64mi32, 0 }, + { X86::AND64ri8, X86::AND64mi8, 0 }, + { X86::AND64rr, X86::AND64mr, 0 }, + { X86::AND8ri, X86::AND8mi, 0 }, + { X86::AND8rr, X86::AND8mr, 0 }, + { X86::DEC16r, X86::DEC16m, 0 }, + { X86::DEC32r, X86::DEC32m, 0 }, + { X86::DEC64r, X86::DEC64m, 0 }, + { X86::DEC8r, X86::DEC8m, 0 }, + { X86::INC16r, X86::INC16m, 0 }, + { X86::INC32r, X86::INC32m, 0 }, + { X86::INC64r, X86::INC64m, 0 }, + { X86::INC8r, X86::INC8m, 0 }, + { X86::NEG16r, X86::NEG16m, 0 }, + { X86::NEG32r, X86::NEG32m, 0 }, + { X86::NEG64r, X86::NEG64m, 0 }, + { X86::NEG8r, X86::NEG8m, 0 }, + { X86::NOT16r, X86::NOT16m, 0 }, + { X86::NOT32r, X86::NOT32m, 0 }, + { X86::NOT64r, X86::NOT64m, 0 }, + { X86::NOT8r, X86::NOT8m, 0 }, + { X86::OR16ri, X86::OR16mi, 0 }, + { X86::OR16ri8, X86::OR16mi8, 0 }, + { X86::OR16rr, X86::OR16mr, 0 }, + { X86::OR32ri, X86::OR32mi, 0 }, + { X86::OR32ri8, X86::OR32mi8, 0 }, + { X86::OR32rr, X86::OR32mr, 0 }, + { X86::OR64ri32, X86::OR64mi32, 0 }, + { X86::OR64ri8, X86::OR64mi8, 0 }, + { X86::OR64rr, X86::OR64mr, 0 }, + { X86::OR8ri, X86::OR8mi, 0 }, + { X86::OR8rr, X86::OR8mr, 0 }, + { X86::ROL16r1, X86::ROL16m1, 0 }, + { X86::ROL16rCL, X86::ROL16mCL, 0 }, + { X86::ROL16ri, X86::ROL16mi, 0 }, + { X86::ROL32r1, X86::ROL32m1, 0 }, + { X86::ROL32rCL, X86::ROL32mCL, 0 }, + { X86::ROL32ri, X86::ROL32mi, 0 }, + { X86::ROL64r1, X86::ROL64m1, 0 }, + { X86::ROL64rCL, X86::ROL64mCL, 0 }, + { X86::ROL64ri, X86::ROL64mi, 0 }, + { X86::ROL8r1, X86::ROL8m1, 0 }, + { X86::ROL8rCL, X86::ROL8mCL, 0 }, + { X86::ROL8ri, X86::ROL8mi, 0 }, + { X86::ROR16r1, X86::ROR16m1, 0 }, + { X86::ROR16rCL, X86::ROR16mCL, 0 }, + { X86::ROR16ri, X86::ROR16mi, 0 }, + { X86::ROR32r1, X86::ROR32m1, 0 }, + { X86::ROR32rCL, X86::ROR32mCL, 0 }, + { X86::ROR32ri, X86::ROR32mi, 0 }, + { X86::ROR64r1, X86::ROR64m1, 0 }, + { X86::ROR64rCL, X86::ROR64mCL, 0 }, + { X86::ROR64ri, X86::ROR64mi, 0 }, + { X86::ROR8r1, X86::ROR8m1, 0 }, + { X86::ROR8rCL, X86::ROR8mCL, 0 }, + { X86::ROR8ri, X86::ROR8mi, 0 }, + { 
X86::SAR16r1, X86::SAR16m1, 0 }, + { X86::SAR16rCL, X86::SAR16mCL, 0 }, + { X86::SAR16ri, X86::SAR16mi, 0 }, + { X86::SAR32r1, X86::SAR32m1, 0 }, + { X86::SAR32rCL, X86::SAR32mCL, 0 }, + { X86::SAR32ri, X86::SAR32mi, 0 }, + { X86::SAR64r1, X86::SAR64m1, 0 }, + { X86::SAR64rCL, X86::SAR64mCL, 0 }, + { X86::SAR64ri, X86::SAR64mi, 0 }, + { X86::SAR8r1, X86::SAR8m1, 0 }, + { X86::SAR8rCL, X86::SAR8mCL, 0 }, + { X86::SAR8ri, X86::SAR8mi, 0 }, + { X86::SBB32ri, X86::SBB32mi, 0 }, + { X86::SBB32ri8, X86::SBB32mi8, 0 }, + { X86::SBB32rr, X86::SBB32mr, 0 }, + { X86::SBB64ri32, X86::SBB64mi32, 0 }, + { X86::SBB64ri8, X86::SBB64mi8, 0 }, + { X86::SBB64rr, X86::SBB64mr, 0 }, + { X86::SHL16rCL, X86::SHL16mCL, 0 }, + { X86::SHL16ri, X86::SHL16mi, 0 }, + { X86::SHL32rCL, X86::SHL32mCL, 0 }, + { X86::SHL32ri, X86::SHL32mi, 0 }, + { X86::SHL64rCL, X86::SHL64mCL, 0 }, + { X86::SHL64ri, X86::SHL64mi, 0 }, + { X86::SHL8rCL, X86::SHL8mCL, 0 }, + { X86::SHL8ri, X86::SHL8mi, 0 }, + { X86::SHLD16rrCL, X86::SHLD16mrCL, 0 }, + { X86::SHLD16rri8, X86::SHLD16mri8, 0 }, + { X86::SHLD32rrCL, X86::SHLD32mrCL, 0 }, + { X86::SHLD32rri8, X86::SHLD32mri8, 0 }, + { X86::SHLD64rrCL, X86::SHLD64mrCL, 0 }, + { X86::SHLD64rri8, X86::SHLD64mri8, 0 }, + { X86::SHR16r1, X86::SHR16m1, 0 }, + { X86::SHR16rCL, X86::SHR16mCL, 0 }, + { X86::SHR16ri, X86::SHR16mi, 0 }, + { X86::SHR32r1, X86::SHR32m1, 0 }, + { X86::SHR32rCL, X86::SHR32mCL, 0 }, + { X86::SHR32ri, X86::SHR32mi, 0 }, + { X86::SHR64r1, X86::SHR64m1, 0 }, + { X86::SHR64rCL, X86::SHR64mCL, 0 }, + { X86::SHR64ri, X86::SHR64mi, 0 }, + { X86::SHR8r1, X86::SHR8m1, 0 }, + { X86::SHR8rCL, X86::SHR8mCL, 0 }, + { X86::SHR8ri, X86::SHR8mi, 0 }, + { X86::SHRD16rrCL, X86::SHRD16mrCL, 0 }, + { X86::SHRD16rri8, X86::SHRD16mri8, 0 }, + { X86::SHRD32rrCL, X86::SHRD32mrCL, 0 }, + { X86::SHRD32rri8, X86::SHRD32mri8, 0 }, + { X86::SHRD64rrCL, X86::SHRD64mrCL, 0 }, + { X86::SHRD64rri8, X86::SHRD64mri8, 0 }, + { X86::SUB16ri, X86::SUB16mi, 0 }, + { X86::SUB16ri8, X86::SUB16mi8, 0 }, + { X86::SUB16rr, X86::SUB16mr, 0 }, + { X86::SUB32ri, X86::SUB32mi, 0 }, + { X86::SUB32ri8, X86::SUB32mi8, 0 }, + { X86::SUB32rr, X86::SUB32mr, 0 }, + { X86::SUB64ri32, X86::SUB64mi32, 0 }, + { X86::SUB64ri8, X86::SUB64mi8, 0 }, + { X86::SUB64rr, X86::SUB64mr, 0 }, + { X86::SUB8ri, X86::SUB8mi, 0 }, + { X86::SUB8rr, X86::SUB8mr, 0 }, + { X86::XOR16ri, X86::XOR16mi, 0 }, + { X86::XOR16ri8, X86::XOR16mi8, 0 }, + { X86::XOR16rr, X86::XOR16mr, 0 }, + { X86::XOR32ri, X86::XOR32mi, 0 }, + { X86::XOR32ri8, X86::XOR32mi8, 0 }, + { X86::XOR32rr, X86::XOR32mr, 0 }, + { X86::XOR64ri32, X86::XOR64mi32, 0 }, + { X86::XOR64ri8, X86::XOR64mi8, 0 }, + { X86::XOR64rr, X86::XOR64mr, 0 }, + { X86::XOR8ri, X86::XOR8mi, 0 }, + { X86::XOR8rr, X86::XOR8mr, 0 } + }; + + for (X86MemoryFoldTableEntry Entry : MemoryFoldTable2Addr) { + AddTableEntry(RegOp2MemOpTable2Addr, MemOp2RegOpTable, + Entry.RegOp, Entry.MemOp, + // Index 0, folded load and store, no alignment requirement. 
+ Entry.Flags | TB_INDEX_0 | TB_FOLDED_LOAD | TB_FOLDED_STORE); + } + + static const X86MemoryFoldTableEntry MemoryFoldTable0[] = { + { X86::BT16ri8, X86::BT16mi8, TB_FOLDED_LOAD }, + { X86::BT32ri8, X86::BT32mi8, TB_FOLDED_LOAD }, + { X86::BT64ri8, X86::BT64mi8, TB_FOLDED_LOAD }, + { X86::CALL32r, X86::CALL32m, TB_FOLDED_LOAD }, + { X86::CALL64r, X86::CALL64m, TB_FOLDED_LOAD }, + { X86::CMP16ri, X86::CMP16mi, TB_FOLDED_LOAD }, + { X86::CMP16ri8, X86::CMP16mi8, TB_FOLDED_LOAD }, + { X86::CMP16rr, X86::CMP16mr, TB_FOLDED_LOAD }, + { X86::CMP32ri, X86::CMP32mi, TB_FOLDED_LOAD }, + { X86::CMP32ri8, X86::CMP32mi8, TB_FOLDED_LOAD }, + { X86::CMP32rr, X86::CMP32mr, TB_FOLDED_LOAD }, + { X86::CMP64ri32, X86::CMP64mi32, TB_FOLDED_LOAD }, + { X86::CMP64ri8, X86::CMP64mi8, TB_FOLDED_LOAD }, + { X86::CMP64rr, X86::CMP64mr, TB_FOLDED_LOAD }, + { X86::CMP8ri, X86::CMP8mi, TB_FOLDED_LOAD }, + { X86::CMP8rr, X86::CMP8mr, TB_FOLDED_LOAD }, + { X86::DIV16r, X86::DIV16m, TB_FOLDED_LOAD }, + { X86::DIV32r, X86::DIV32m, TB_FOLDED_LOAD }, + { X86::DIV64r, X86::DIV64m, TB_FOLDED_LOAD }, + { X86::DIV8r, X86::DIV8m, TB_FOLDED_LOAD }, + { X86::EXTRACTPSrr, X86::EXTRACTPSmr, TB_FOLDED_STORE }, + { X86::IDIV16r, X86::IDIV16m, TB_FOLDED_LOAD }, + { X86::IDIV32r, X86::IDIV32m, TB_FOLDED_LOAD }, + { X86::IDIV64r, X86::IDIV64m, TB_FOLDED_LOAD }, + { X86::IDIV8r, X86::IDIV8m, TB_FOLDED_LOAD }, + { X86::IMUL16r, X86::IMUL16m, TB_FOLDED_LOAD }, + { X86::IMUL32r, X86::IMUL32m, TB_FOLDED_LOAD }, + { X86::IMUL64r, X86::IMUL64m, TB_FOLDED_LOAD }, + { X86::IMUL8r, X86::IMUL8m, TB_FOLDED_LOAD }, + { X86::JMP32r, X86::JMP32m, TB_FOLDED_LOAD }, + { X86::JMP64r, X86::JMP64m, TB_FOLDED_LOAD }, + { X86::MOV16ri, X86::MOV16mi, TB_FOLDED_STORE }, + { X86::MOV16rr, X86::MOV16mr, TB_FOLDED_STORE }, + { X86::MOV32ri, X86::MOV32mi, TB_FOLDED_STORE }, + { X86::MOV32rr, X86::MOV32mr, TB_FOLDED_STORE }, + { X86::MOV64ri32, X86::MOV64mi32, TB_FOLDED_STORE }, + { X86::MOV64rr, X86::MOV64mr, TB_FOLDED_STORE }, + { X86::MOV8ri, X86::MOV8mi, TB_FOLDED_STORE }, + { X86::MOV8rr, X86::MOV8mr, TB_FOLDED_STORE }, + { X86::MOV8rr_NOREX, X86::MOV8mr_NOREX, TB_FOLDED_STORE }, + { X86::MOVAPDrr, X86::MOVAPDmr, TB_FOLDED_STORE | TB_ALIGN_16 }, + { X86::MOVAPSrr, X86::MOVAPSmr, TB_FOLDED_STORE | TB_ALIGN_16 }, + { X86::MOVDQArr, X86::MOVDQAmr, TB_FOLDED_STORE | TB_ALIGN_16 }, + { X86::MOVPDI2DIrr, X86::MOVPDI2DImr, TB_FOLDED_STORE }, + { X86::MOVPQIto64rr,X86::MOVPQI2QImr, TB_FOLDED_STORE }, + { X86::MOVSDto64rr, X86::MOVSDto64mr, TB_FOLDED_STORE }, + { X86::MOVSS2DIrr, X86::MOVSS2DImr, TB_FOLDED_STORE }, + { X86::MOVUPDrr, X86::MOVUPDmr, TB_FOLDED_STORE }, + { X86::MOVUPSrr, X86::MOVUPSmr, TB_FOLDED_STORE }, + { X86::MUL16r, X86::MUL16m, TB_FOLDED_LOAD }, + { X86::MUL32r, X86::MUL32m, TB_FOLDED_LOAD }, + { X86::MUL64r, X86::MUL64m, TB_FOLDED_LOAD }, + { X86::MUL8r, X86::MUL8m, TB_FOLDED_LOAD }, + { X86::PEXTRDrr, X86::PEXTRDmr, TB_FOLDED_STORE }, + { X86::PEXTRQrr, X86::PEXTRQmr, TB_FOLDED_STORE }, + { X86::PUSH16r, X86::PUSH16rmm, TB_FOLDED_LOAD }, + { X86::PUSH32r, X86::PUSH32rmm, TB_FOLDED_LOAD }, + { X86::PUSH64r, X86::PUSH64rmm, TB_FOLDED_LOAD }, + { X86::SETAEr, X86::SETAEm, TB_FOLDED_STORE }, + { X86::SETAr, X86::SETAm, TB_FOLDED_STORE }, + { X86::SETBEr, X86::SETBEm, TB_FOLDED_STORE }, + { X86::SETBr, X86::SETBm, TB_FOLDED_STORE }, + { X86::SETEr, X86::SETEm, TB_FOLDED_STORE }, + { X86::SETGEr, X86::SETGEm, TB_FOLDED_STORE }, + { X86::SETGr, X86::SETGm, TB_FOLDED_STORE }, + { X86::SETLEr, X86::SETLEm, TB_FOLDED_STORE }, + { X86::SETLr, X86::SETLm, 
TB_FOLDED_STORE }, + { X86::SETNEr, X86::SETNEm, TB_FOLDED_STORE }, + { X86::SETNOr, X86::SETNOm, TB_FOLDED_STORE }, + { X86::SETNPr, X86::SETNPm, TB_FOLDED_STORE }, + { X86::SETNSr, X86::SETNSm, TB_FOLDED_STORE }, + { X86::SETOr, X86::SETOm, TB_FOLDED_STORE }, + { X86::SETPr, X86::SETPm, TB_FOLDED_STORE }, + { X86::SETSr, X86::SETSm, TB_FOLDED_STORE }, + { X86::TAILJMPr, X86::TAILJMPm, TB_FOLDED_LOAD }, + { X86::TAILJMPr64, X86::TAILJMPm64, TB_FOLDED_LOAD }, + { X86::TAILJMPr64_REX, X86::TAILJMPm64_REX, TB_FOLDED_LOAD }, + { X86::TEST16ri, X86::TEST16mi, TB_FOLDED_LOAD }, + { X86::TEST32ri, X86::TEST32mi, TB_FOLDED_LOAD }, + { X86::TEST64ri32, X86::TEST64mi32, TB_FOLDED_LOAD }, + { X86::TEST8ri, X86::TEST8mi, TB_FOLDED_LOAD }, + + // AVX 128-bit versions of foldable instructions + { X86::VEXTRACTPSrr,X86::VEXTRACTPSmr, TB_FOLDED_STORE }, + { X86::VEXTRACTF128rr, X86::VEXTRACTF128mr, TB_FOLDED_STORE | TB_ALIGN_16 }, + { X86::VMOVAPDrr, X86::VMOVAPDmr, TB_FOLDED_STORE | TB_ALIGN_16 }, + { X86::VMOVAPSrr, X86::VMOVAPSmr, TB_FOLDED_STORE | TB_ALIGN_16 }, + { X86::VMOVDQArr, X86::VMOVDQAmr, TB_FOLDED_STORE | TB_ALIGN_16 }, + { X86::VMOVPDI2DIrr,X86::VMOVPDI2DImr, TB_FOLDED_STORE }, + { X86::VMOVPQIto64rr, X86::VMOVPQI2QImr,TB_FOLDED_STORE }, + { X86::VMOVSDto64rr,X86::VMOVSDto64mr, TB_FOLDED_STORE }, + { X86::VMOVSS2DIrr, X86::VMOVSS2DImr, TB_FOLDED_STORE }, + { X86::VMOVUPDrr, X86::VMOVUPDmr, TB_FOLDED_STORE }, + { X86::VMOVUPSrr, X86::VMOVUPSmr, TB_FOLDED_STORE }, + { X86::VPEXTRDrr, X86::VPEXTRDmr, TB_FOLDED_STORE }, + { X86::VPEXTRQrr, X86::VPEXTRQmr, TB_FOLDED_STORE }, + + // AVX 256-bit foldable instructions + { X86::VEXTRACTI128rr, X86::VEXTRACTI128mr, TB_FOLDED_STORE | TB_ALIGN_16 }, + { X86::VMOVAPDYrr, X86::VMOVAPDYmr, TB_FOLDED_STORE | TB_ALIGN_32 }, + { X86::VMOVAPSYrr, X86::VMOVAPSYmr, TB_FOLDED_STORE | TB_ALIGN_32 }, + { X86::VMOVDQAYrr, X86::VMOVDQAYmr, TB_FOLDED_STORE | TB_ALIGN_32 }, + { X86::VMOVUPDYrr, X86::VMOVUPDYmr, TB_FOLDED_STORE }, + { X86::VMOVUPSYrr, X86::VMOVUPSYmr, TB_FOLDED_STORE }, + + // AVX-512 foldable instructions + { X86::VMOVPDI2DIZrr, X86::VMOVPDI2DIZmr, TB_FOLDED_STORE }, + { X86::VMOVAPDZrr, X86::VMOVAPDZmr, TB_FOLDED_STORE | TB_ALIGN_64 }, + { X86::VMOVAPSZrr, X86::VMOVAPSZmr, TB_FOLDED_STORE | TB_ALIGN_64 }, + { X86::VMOVDQA32Zrr, X86::VMOVDQA32Zmr, TB_FOLDED_STORE | TB_ALIGN_64 }, + { X86::VMOVDQA64Zrr, X86::VMOVDQA64Zmr, TB_FOLDED_STORE | TB_ALIGN_64 }, + { X86::VMOVUPDZrr, X86::VMOVUPDZmr, TB_FOLDED_STORE }, + { X86::VMOVUPSZrr, X86::VMOVUPSZmr, TB_FOLDED_STORE }, + { X86::VMOVDQU8Zrr, X86::VMOVDQU8Zmr, TB_FOLDED_STORE }, + { X86::VMOVDQU16Zrr, X86::VMOVDQU16Zmr, TB_FOLDED_STORE }, + { X86::VMOVDQU32Zrr, X86::VMOVDQU32Zmr, TB_FOLDED_STORE }, + { X86::VMOVDQU64Zrr, X86::VMOVDQU64Zmr, TB_FOLDED_STORE }, + + // AVX-512 foldable instructions (256-bit versions) + { X86::VMOVAPDZ256rr, X86::VMOVAPDZ256mr, TB_FOLDED_STORE | TB_ALIGN_32 }, + { X86::VMOVAPSZ256rr, X86::VMOVAPSZ256mr, TB_FOLDED_STORE | TB_ALIGN_32 }, + { X86::VMOVDQA32Z256rr, X86::VMOVDQA32Z256mr, TB_FOLDED_STORE | TB_ALIGN_32 }, + { X86::VMOVDQA64Z256rr, X86::VMOVDQA64Z256mr, TB_FOLDED_STORE | TB_ALIGN_32 }, + { X86::VMOVUPDZ256rr, X86::VMOVUPDZ256mr, TB_FOLDED_STORE }, + { X86::VMOVUPSZ256rr, X86::VMOVUPSZ256mr, TB_FOLDED_STORE }, + { X86::VMOVDQU8Z256rr, X86::VMOVDQU8Z256mr, TB_FOLDED_STORE }, + { X86::VMOVDQU16Z256rr, X86::VMOVDQU16Z256mr, TB_FOLDED_STORE }, + { X86::VMOVDQU32Z256rr, X86::VMOVDQU32Z256mr, TB_FOLDED_STORE }, + { X86::VMOVDQU64Z256rr, X86::VMOVDQU64Z256mr, TB_FOLDED_STORE 
}, + + // AVX-512 foldable instructions (128-bit versions) + { X86::VMOVAPDZ128rr, X86::VMOVAPDZ128mr, TB_FOLDED_STORE | TB_ALIGN_16 }, + { X86::VMOVAPSZ128rr, X86::VMOVAPSZ128mr, TB_FOLDED_STORE | TB_ALIGN_16 }, + { X86::VMOVDQA32Z128rr, X86::VMOVDQA32Z128mr, TB_FOLDED_STORE | TB_ALIGN_16 }, + { X86::VMOVDQA64Z128rr, X86::VMOVDQA64Z128mr, TB_FOLDED_STORE | TB_ALIGN_16 }, + { X86::VMOVUPDZ128rr, X86::VMOVUPDZ128mr, TB_FOLDED_STORE }, + { X86::VMOVUPSZ128rr, X86::VMOVUPSZ128mr, TB_FOLDED_STORE }, + { X86::VMOVDQU8Z128rr, X86::VMOVDQU8Z128mr, TB_FOLDED_STORE }, + { X86::VMOVDQU16Z128rr, X86::VMOVDQU16Z128mr, TB_FOLDED_STORE }, + { X86::VMOVDQU32Z128rr, X86::VMOVDQU32Z128mr, TB_FOLDED_STORE }, + { X86::VMOVDQU64Z128rr, X86::VMOVDQU64Z128mr, TB_FOLDED_STORE }, + + // F16C foldable instructions + { X86::VCVTPS2PHrr, X86::VCVTPS2PHmr, TB_FOLDED_STORE }, + { X86::VCVTPS2PHYrr, X86::VCVTPS2PHYmr, TB_FOLDED_STORE } + }; + + for (X86MemoryFoldTableEntry Entry : MemoryFoldTable0) { + AddTableEntry(RegOp2MemOpTable0, MemOp2RegOpTable, + Entry.RegOp, Entry.MemOp, TB_INDEX_0 | Entry.Flags); + } + + static const X86MemoryFoldTableEntry MemoryFoldTable1[] = { + { X86::BSF16rr, X86::BSF16rm, 0 }, + { X86::BSF32rr, X86::BSF32rm, 0 }, + { X86::BSF64rr, X86::BSF64rm, 0 }, + { X86::BSR16rr, X86::BSR16rm, 0 }, + { X86::BSR32rr, X86::BSR32rm, 0 }, + { X86::BSR64rr, X86::BSR64rm, 0 }, + { X86::CMP16rr, X86::CMP16rm, 0 }, + { X86::CMP32rr, X86::CMP32rm, 0 }, + { X86::CMP64rr, X86::CMP64rm, 0 }, + { X86::CMP8rr, X86::CMP8rm, 0 }, + { X86::CVTSD2SSrr, X86::CVTSD2SSrm, 0 }, + { X86::CVTSI2SD64rr, X86::CVTSI2SD64rm, 0 }, + { X86::CVTSI2SDrr, X86::CVTSI2SDrm, 0 }, + { X86::CVTSI2SS64rr, X86::CVTSI2SS64rm, 0 }, + { X86::CVTSI2SSrr, X86::CVTSI2SSrm, 0 }, + { X86::CVTSS2SDrr, X86::CVTSS2SDrm, 0 }, + { X86::CVTTSD2SI64rr, X86::CVTTSD2SI64rm, 0 }, + { X86::CVTTSD2SIrr, X86::CVTTSD2SIrm, 0 }, + { X86::CVTTSS2SI64rr, X86::CVTTSS2SI64rm, 0 }, + { X86::CVTTSS2SIrr, X86::CVTTSS2SIrm, 0 }, + { X86::IMUL16rri, X86::IMUL16rmi, 0 }, + { X86::IMUL16rri8, X86::IMUL16rmi8, 0 }, + { X86::IMUL32rri, X86::IMUL32rmi, 0 }, + { X86::IMUL32rri8, X86::IMUL32rmi8, 0 }, + { X86::IMUL64rri32, X86::IMUL64rmi32, 0 }, + { X86::IMUL64rri8, X86::IMUL64rmi8, 0 }, + { X86::Int_COMISDrr, X86::Int_COMISDrm, 0 }, + { X86::Int_COMISSrr, X86::Int_COMISSrm, 0 }, + { X86::CVTSD2SI64rr, X86::CVTSD2SI64rm, 0 }, + { X86::CVTSD2SIrr, X86::CVTSD2SIrm, 0 }, + { X86::CVTSS2SI64rr, X86::CVTSS2SI64rm, 0 }, + { X86::CVTSS2SIrr, X86::CVTSS2SIrm, 0 }, + { X86::CVTDQ2PDrr, X86::CVTDQ2PDrm, TB_ALIGN_16 }, + { X86::CVTDQ2PSrr, X86::CVTDQ2PSrm, TB_ALIGN_16 }, + { X86::CVTPD2DQrr, X86::CVTPD2DQrm, TB_ALIGN_16 }, + { X86::CVTPD2PSrr, X86::CVTPD2PSrm, TB_ALIGN_16 }, + { X86::CVTPS2DQrr, X86::CVTPS2DQrm, TB_ALIGN_16 }, + { X86::CVTPS2PDrr, X86::CVTPS2PDrm, TB_ALIGN_16 }, + { X86::CVTTPD2DQrr, X86::CVTTPD2DQrm, TB_ALIGN_16 }, + { X86::CVTTPS2DQrr, X86::CVTTPS2DQrm, TB_ALIGN_16 }, + { X86::Int_CVTTSD2SI64rr,X86::Int_CVTTSD2SI64rm, 0 }, + { X86::Int_CVTTSD2SIrr, X86::Int_CVTTSD2SIrm, 0 }, + { X86::Int_CVTTSS2SI64rr,X86::Int_CVTTSS2SI64rm, 0 }, + { X86::Int_CVTTSS2SIrr, X86::Int_CVTTSS2SIrm, 0 }, + { X86::Int_UCOMISDrr, X86::Int_UCOMISDrm, 0 }, + { X86::Int_UCOMISSrr, X86::Int_UCOMISSrm, 0 }, + { X86::MOV16rr, X86::MOV16rm, 0 }, + { X86::MOV32rr, X86::MOV32rm, 0 }, + { X86::MOV64rr, X86::MOV64rm, 0 }, + { X86::MOV64toPQIrr, X86::MOVQI2PQIrm, 0 }, + { X86::MOV64toSDrr, X86::MOV64toSDrm, 0 }, + { X86::MOV8rr, X86::MOV8rm, 0 }, + { X86::MOVAPDrr, X86::MOVAPDrm, TB_ALIGN_16 }, + { 
X86::MOVAPSrr, X86::MOVAPSrm, TB_ALIGN_16 }, + { X86::MOVDDUPrr, X86::MOVDDUPrm, 0 }, + { X86::MOVDI2PDIrr, X86::MOVDI2PDIrm, 0 }, + { X86::MOVDI2SSrr, X86::MOVDI2SSrm, 0 }, + { X86::MOVDQArr, X86::MOVDQArm, TB_ALIGN_16 }, + { X86::MOVSHDUPrr, X86::MOVSHDUPrm, TB_ALIGN_16 }, + { X86::MOVSLDUPrr, X86::MOVSLDUPrm, TB_ALIGN_16 }, + { X86::MOVSX16rr8, X86::MOVSX16rm8, 0 }, + { X86::MOVSX32rr16, X86::MOVSX32rm16, 0 }, + { X86::MOVSX32rr8, X86::MOVSX32rm8, 0 }, + { X86::MOVSX64rr16, X86::MOVSX64rm16, 0 }, + { X86::MOVSX64rr32, X86::MOVSX64rm32, 0 }, + { X86::MOVSX64rr8, X86::MOVSX64rm8, 0 }, + { X86::MOVUPDrr, X86::MOVUPDrm, TB_ALIGN_16 }, + { X86::MOVUPSrr, X86::MOVUPSrm, 0 }, + { X86::MOVZPQILo2PQIrr, X86::MOVZPQILo2PQIrm, TB_ALIGN_16 }, + { X86::MOVZX16rr8, X86::MOVZX16rm8, 0 }, + { X86::MOVZX32rr16, X86::MOVZX32rm16, 0 }, + { X86::MOVZX32_NOREXrr8, X86::MOVZX32_NOREXrm8, 0 }, + { X86::MOVZX32rr8, X86::MOVZX32rm8, 0 }, + { X86::PABSBrr128, X86::PABSBrm128, TB_ALIGN_16 }, + { X86::PABSDrr128, X86::PABSDrm128, TB_ALIGN_16 }, + { X86::PABSWrr128, X86::PABSWrm128, TB_ALIGN_16 }, + { X86::PCMPESTRIrr, X86::PCMPESTRIrm, TB_ALIGN_16 }, + { X86::PCMPESTRM128rr, X86::PCMPESTRM128rm, TB_ALIGN_16 }, + { X86::PCMPISTRIrr, X86::PCMPISTRIrm, TB_ALIGN_16 }, + { X86::PCMPISTRM128rr, X86::PCMPISTRM128rm, TB_ALIGN_16 }, + { X86::PHMINPOSUWrr128, X86::PHMINPOSUWrm128, TB_ALIGN_16 }, + { X86::PMOVSXBDrr, X86::PMOVSXBDrm, TB_ALIGN_16 }, + { X86::PMOVSXBQrr, X86::PMOVSXBQrm, TB_ALIGN_16 }, + { X86::PMOVSXBWrr, X86::PMOVSXBWrm, TB_ALIGN_16 }, + { X86::PMOVSXDQrr, X86::PMOVSXDQrm, TB_ALIGN_16 }, + { X86::PMOVSXWDrr, X86::PMOVSXWDrm, TB_ALIGN_16 }, + { X86::PMOVSXWQrr, X86::PMOVSXWQrm, TB_ALIGN_16 }, + { X86::PMOVZXBDrr, X86::PMOVZXBDrm, TB_ALIGN_16 }, + { X86::PMOVZXBQrr, X86::PMOVZXBQrm, TB_ALIGN_16 }, + { X86::PMOVZXBWrr, X86::PMOVZXBWrm, TB_ALIGN_16 }, + { X86::PMOVZXDQrr, X86::PMOVZXDQrm, TB_ALIGN_16 }, + { X86::PMOVZXWDrr, X86::PMOVZXWDrm, TB_ALIGN_16 }, + { X86::PMOVZXWQrr, X86::PMOVZXWQrm, TB_ALIGN_16 }, + { X86::PSHUFDri, X86::PSHUFDmi, TB_ALIGN_16 }, + { X86::PSHUFHWri, X86::PSHUFHWmi, TB_ALIGN_16 }, + { X86::PSHUFLWri, X86::PSHUFLWmi, TB_ALIGN_16 }, + { X86::PTESTrr, X86::PTESTrm, TB_ALIGN_16 }, + { X86::RCPPSr, X86::RCPPSm, TB_ALIGN_16 }, + { X86::RCPSSr, X86::RCPSSm, 0 }, + { X86::RCPSSr_Int, X86::RCPSSm_Int, 0 }, + { X86::ROUNDPDr, X86::ROUNDPDm, TB_ALIGN_16 }, + { X86::ROUNDPSr, X86::ROUNDPSm, TB_ALIGN_16 }, + { X86::RSQRTPSr, X86::RSQRTPSm, TB_ALIGN_16 }, + { X86::RSQRTSSr, X86::RSQRTSSm, 0 }, + { X86::RSQRTSSr_Int, X86::RSQRTSSm_Int, 0 }, + { X86::SQRTPDr, X86::SQRTPDm, TB_ALIGN_16 }, + { X86::SQRTPSr, X86::SQRTPSm, TB_ALIGN_16 }, + { X86::SQRTSDr, X86::SQRTSDm, 0 }, + { X86::SQRTSDr_Int, X86::SQRTSDm_Int, 0 }, + { X86::SQRTSSr, X86::SQRTSSm, 0 }, + { X86::SQRTSSr_Int, X86::SQRTSSm_Int, 0 }, + { X86::TEST16rr, X86::TEST16rm, 0 }, + { X86::TEST32rr, X86::TEST32rm, 0 }, + { X86::TEST64rr, X86::TEST64rm, 0 }, + { X86::TEST8rr, X86::TEST8rm, 0 }, + // FIXME: TEST*rr EAX,EAX ---> CMP [mem], 0 + { X86::UCOMISDrr, X86::UCOMISDrm, 0 }, + { X86::UCOMISSrr, X86::UCOMISSrm, 0 }, + + // MMX version of foldable instructions + { X86::MMX_CVTPD2PIirr, X86::MMX_CVTPD2PIirm, 0 }, + { X86::MMX_CVTPI2PDirr, X86::MMX_CVTPI2PDirm, 0 }, + { X86::MMX_CVTPS2PIirr, X86::MMX_CVTPS2PIirm, 0 }, + { X86::MMX_CVTTPD2PIirr, X86::MMX_CVTTPD2PIirm, 0 }, + { X86::MMX_CVTTPS2PIirr, X86::MMX_CVTTPS2PIirm, 0 }, + { X86::MMX_MOVD64to64rr, X86::MMX_MOVQ64rm, 0 }, + { X86::MMX_PABSBrr64, X86::MMX_PABSBrm64, 0 }, + { X86::MMX_PABSDrr64, 
X86::MMX_PABSDrm64, 0 }, + { X86::MMX_PABSWrr64, X86::MMX_PABSWrm64, 0 }, + { X86::MMX_PSHUFWri, X86::MMX_PSHUFWmi, 0 }, + + // 3DNow! version of foldable instructions + { X86::PF2IDrr, X86::PF2IDrm, 0 }, + { X86::PF2IWrr, X86::PF2IWrm, 0 }, + { X86::PFRCPrr, X86::PFRCPrm, 0 }, + { X86::PFRSQRTrr, X86::PFRSQRTrm, 0 }, + { X86::PI2FDrr, X86::PI2FDrm, 0 }, + { X86::PI2FWrr, X86::PI2FWrm, 0 }, + { X86::PSWAPDrr, X86::PSWAPDrm, 0 }, + + // AVX 128-bit versions of foldable instructions + { X86::Int_VCOMISDrr, X86::Int_VCOMISDrm, 0 }, + { X86::Int_VCOMISSrr, X86::Int_VCOMISSrm, 0 }, + { X86::Int_VUCOMISDrr, X86::Int_VUCOMISDrm, 0 }, + { X86::Int_VUCOMISSrr, X86::Int_VUCOMISSrm, 0 }, + { X86::VCVTTSD2SI64rr, X86::VCVTTSD2SI64rm, 0 }, + { X86::Int_VCVTTSD2SI64rr,X86::Int_VCVTTSD2SI64rm,0 }, + { X86::VCVTTSD2SIrr, X86::VCVTTSD2SIrm, 0 }, + { X86::Int_VCVTTSD2SIrr,X86::Int_VCVTTSD2SIrm, 0 }, + { X86::VCVTTSS2SI64rr, X86::VCVTTSS2SI64rm, 0 }, + { X86::Int_VCVTTSS2SI64rr,X86::Int_VCVTTSS2SI64rm,0 }, + { X86::VCVTTSS2SIrr, X86::VCVTTSS2SIrm, 0 }, + { X86::Int_VCVTTSS2SIrr,X86::Int_VCVTTSS2SIrm, 0 }, + { X86::VCVTSD2SI64rr, X86::VCVTSD2SI64rm, 0 }, + { X86::VCVTSD2SIrr, X86::VCVTSD2SIrm, 0 }, + { X86::VCVTSS2SI64rr, X86::VCVTSS2SI64rm, 0 }, + { X86::VCVTSS2SIrr, X86::VCVTSS2SIrm, 0 }, + { X86::VCVTDQ2PDrr, X86::VCVTDQ2PDrm, 0 }, + { X86::VCVTDQ2PSrr, X86::VCVTDQ2PSrm, 0 }, + { X86::VCVTPD2DQrr, X86::VCVTPD2DQXrm, 0 }, + { X86::VCVTPD2PSrr, X86::VCVTPD2PSXrm, 0 }, + { X86::VCVTPS2DQrr, X86::VCVTPS2DQrm, 0 }, + { X86::VCVTPS2PDrr, X86::VCVTPS2PDrm, 0 }, + { X86::VCVTTPD2DQrr, X86::VCVTTPD2DQXrm, 0 }, + { X86::VCVTTPS2DQrr, X86::VCVTTPS2DQrm, 0 }, + { X86::VMOV64toPQIrr, X86::VMOVQI2PQIrm, 0 }, + { X86::VMOV64toSDrr, X86::VMOV64toSDrm, 0 }, + { X86::VMOVAPDrr, X86::VMOVAPDrm, TB_ALIGN_16 }, + { X86::VMOVAPSrr, X86::VMOVAPSrm, TB_ALIGN_16 }, + { X86::VMOVDDUPrr, X86::VMOVDDUPrm, 0 }, + { X86::VMOVDI2PDIrr, X86::VMOVDI2PDIrm, 0 }, + { X86::VMOVDI2SSrr, X86::VMOVDI2SSrm, 0 }, + { X86::VMOVDQArr, X86::VMOVDQArm, TB_ALIGN_16 }, + { X86::VMOVSLDUPrr, X86::VMOVSLDUPrm, 0 }, + { X86::VMOVSHDUPrr, X86::VMOVSHDUPrm, 0 }, + { X86::VMOVUPDrr, X86::VMOVUPDrm, 0 }, + { X86::VMOVUPSrr, X86::VMOVUPSrm, 0 }, + { X86::VMOVZPQILo2PQIrr,X86::VMOVZPQILo2PQIrm, TB_ALIGN_16 }, + { X86::VPABSBrr128, X86::VPABSBrm128, 0 }, + { X86::VPABSDrr128, X86::VPABSDrm128, 0 }, + { X86::VPABSWrr128, X86::VPABSWrm128, 0 }, + { X86::VPCMPESTRIrr, X86::VPCMPESTRIrm, 0 }, + { X86::VPCMPESTRM128rr, X86::VPCMPESTRM128rm, 0 }, + { X86::VPCMPISTRIrr, X86::VPCMPISTRIrm, 0 }, + { X86::VPCMPISTRM128rr, X86::VPCMPISTRM128rm, 0 }, + { X86::VPHMINPOSUWrr128, X86::VPHMINPOSUWrm128, 0 }, + { X86::VPERMILPDri, X86::VPERMILPDmi, 0 }, + { X86::VPERMILPSri, X86::VPERMILPSmi, 0 }, + { X86::VPMOVSXBDrr, X86::VPMOVSXBDrm, 0 }, + { X86::VPMOVSXBQrr, X86::VPMOVSXBQrm, 0 }, + { X86::VPMOVSXBWrr, X86::VPMOVSXBWrm, 0 }, + { X86::VPMOVSXDQrr, X86::VPMOVSXDQrm, 0 }, + { X86::VPMOVSXWDrr, X86::VPMOVSXWDrm, 0 }, + { X86::VPMOVSXWQrr, X86::VPMOVSXWQrm, 0 }, + { X86::VPMOVZXBDrr, X86::VPMOVZXBDrm, 0 }, + { X86::VPMOVZXBQrr, X86::VPMOVZXBQrm, 0 }, + { X86::VPMOVZXBWrr, X86::VPMOVZXBWrm, 0 }, + { X86::VPMOVZXDQrr, X86::VPMOVZXDQrm, 0 }, + { X86::VPMOVZXWDrr, X86::VPMOVZXWDrm, 0 }, + { X86::VPMOVZXWQrr, X86::VPMOVZXWQrm, 0 }, + { X86::VPSHUFDri, X86::VPSHUFDmi, 0 }, + { X86::VPSHUFHWri, X86::VPSHUFHWmi, 0 }, + { X86::VPSHUFLWri, X86::VPSHUFLWmi, 0 }, + { X86::VPTESTrr, X86::VPTESTrm, 0 }, + { X86::VRCPPSr, X86::VRCPPSm, 0 }, + { X86::VROUNDPDr, X86::VROUNDPDm, 0 }, + { 
X86::VROUNDPSr, X86::VROUNDPSm, 0 }, + { X86::VRSQRTPSr, X86::VRSQRTPSm, 0 }, + { X86::VSQRTPDr, X86::VSQRTPDm, 0 }, + { X86::VSQRTPSr, X86::VSQRTPSm, 0 }, + { X86::VTESTPDrr, X86::VTESTPDrm, 0 }, + { X86::VTESTPSrr, X86::VTESTPSrm, 0 }, + { X86::VUCOMISDrr, X86::VUCOMISDrm, 0 }, + { X86::VUCOMISSrr, X86::VUCOMISSrm, 0 }, + + // AVX 256-bit foldable instructions + { X86::VCVTDQ2PDYrr, X86::VCVTDQ2PDYrm, 0 }, + { X86::VCVTDQ2PSYrr, X86::VCVTDQ2PSYrm, 0 }, + { X86::VCVTPD2DQYrr, X86::VCVTPD2DQYrm, 0 }, + { X86::VCVTPD2PSYrr, X86::VCVTPD2PSYrm, 0 }, + { X86::VCVTPS2DQYrr, X86::VCVTPS2DQYrm, 0 }, + { X86::VCVTPS2PDYrr, X86::VCVTPS2PDYrm, 0 }, + { X86::VCVTTPD2DQYrr, X86::VCVTTPD2DQYrm, 0 }, + { X86::VCVTTPS2DQYrr, X86::VCVTTPS2DQYrm, 0 }, + { X86::VMOVAPDYrr, X86::VMOVAPDYrm, TB_ALIGN_32 }, + { X86::VMOVAPSYrr, X86::VMOVAPSYrm, TB_ALIGN_32 }, + { X86::VMOVDDUPYrr, X86::VMOVDDUPYrm, 0 }, + { X86::VMOVDQAYrr, X86::VMOVDQAYrm, TB_ALIGN_32 }, + { X86::VMOVSLDUPYrr, X86::VMOVSLDUPYrm, 0 }, + { X86::VMOVSHDUPYrr, X86::VMOVSHDUPYrm, 0 }, + { X86::VMOVUPDYrr, X86::VMOVUPDYrm, 0 }, + { X86::VMOVUPSYrr, X86::VMOVUPSYrm, 0 }, + { X86::VPERMILPDYri, X86::VPERMILPDYmi, 0 }, + { X86::VPERMILPSYri, X86::VPERMILPSYmi, 0 }, + { X86::VPTESTYrr, X86::VPTESTYrm, 0 }, + { X86::VRCPPSYr, X86::VRCPPSYm, 0 }, + { X86::VROUNDYPDr, X86::VROUNDYPDm, 0 }, + { X86::VROUNDYPSr, X86::VROUNDYPSm, 0 }, + { X86::VRSQRTPSYr, X86::VRSQRTPSYm, 0 }, + { X86::VSQRTPDYr, X86::VSQRTPDYm, 0 }, + { X86::VSQRTPSYr, X86::VSQRTPSYm, 0 }, + { X86::VTESTPDYrr, X86::VTESTPDYrm, 0 }, + { X86::VTESTPSYrr, X86::VTESTPSYrm, 0 }, + + // AVX2 foldable instructions + + // VBROADCASTS{SD}rr register instructions were an AVX2 addition while the + // VBROADCASTS{SD}rm memory instructions were available from AVX1. + // TB_NO_REVERSE prevents unfolding from introducing an illegal instruction + // on AVX1 targets. The VPBROADCAST instructions are all AVX2 instructions + // so they don't need an equivalent limitation. 
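The TB_NO_REVERSE note above is what makes these one-directional entries work: the flag keeps a pair in the reg->mem (folding) table while leaving it out of the mem->reg (unfolding) table. Below is a minimal sketch of a registration helper in that spirit; the helper name, map types, and flag bit value are illustrative assumptions, not the exact code in this file.

#include "llvm/ADT/DenseMap.h"
#include <cassert>
#include <cstdint>
#include <utility>

// Placeholder bit for the sketch; the real TB_* flags (index, load/store,
// alignment, TB_NO_REVERSE, ...) are defined in X86InstrInfo.h.
enum : uint16_t { SK_NO_REVERSE = 1u << 14 };

using FoldTable = llvm::DenseMap<unsigned, std::pair<unsigned, uint16_t>>;

// Record a (register form, memory form) pair. The reg->mem table drives
// load/store folding; the mem->reg table drives unfolding and is skipped for
// one-directional entries, so an AVX1-only memory form is never "unfolded"
// into an AVX2-only register form.
static void addFoldTableEntry(FoldTable &RegToMem, FoldTable &MemToReg,
                              unsigned RegOp, unsigned MemOp, uint16_t Flags) {
  assert(!RegToMem.count(RegOp) && "duplicate reg->mem entry");
  RegToMem[RegOp] = std::make_pair(MemOp, Flags);
  if ((Flags & SK_NO_REVERSE) == 0) {
    assert(!MemToReg.count(MemOp) && "duplicate mem->reg entry");
    MemToReg[MemOp] = std::make_pair(RegOp, Flags);
  }
}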
+ { X86::VBROADCASTSSrr, X86::VBROADCASTSSrm, TB_NO_REVERSE }, + { X86::VBROADCASTSSYrr, X86::VBROADCASTSSYrm, TB_NO_REVERSE }, + { X86::VBROADCASTSDYrr, X86::VBROADCASTSDYrm, TB_NO_REVERSE }, + { X86::VPABSBrr256, X86::VPABSBrm256, 0 }, + { X86::VPABSDrr256, X86::VPABSDrm256, 0 }, + { X86::VPABSWrr256, X86::VPABSWrm256, 0 }, + { X86::VPBROADCASTBrr, X86::VPBROADCASTBrm, 0 }, + { X86::VPBROADCASTBYrr, X86::VPBROADCASTBYrm, 0 }, + { X86::VPBROADCASTDrr, X86::VPBROADCASTDrm, 0 }, + { X86::VPBROADCASTDYrr, X86::VPBROADCASTDYrm, 0 }, + { X86::VPBROADCASTQrr, X86::VPBROADCASTQrm, 0 }, + { X86::VPBROADCASTQYrr, X86::VPBROADCASTQYrm, 0 }, + { X86::VPBROADCASTWrr, X86::VPBROADCASTWrm, 0 }, + { X86::VPBROADCASTWYrr, X86::VPBROADCASTWYrm, 0 }, + { X86::VPERMPDYri, X86::VPERMPDYmi, 0 }, + { X86::VPERMQYri, X86::VPERMQYmi, 0 }, + { X86::VPMOVSXBDYrr, X86::VPMOVSXBDYrm, 0 }, + { X86::VPMOVSXBQYrr, X86::VPMOVSXBQYrm, 0 }, + { X86::VPMOVSXBWYrr, X86::VPMOVSXBWYrm, 0 }, + { X86::VPMOVSXDQYrr, X86::VPMOVSXDQYrm, 0 }, + { X86::VPMOVSXWDYrr, X86::VPMOVSXWDYrm, 0 }, + { X86::VPMOVSXWQYrr, X86::VPMOVSXWQYrm, 0 }, + { X86::VPMOVZXBDYrr, X86::VPMOVZXBDYrm, 0 }, + { X86::VPMOVZXBQYrr, X86::VPMOVZXBQYrm, 0 }, + { X86::VPMOVZXBWYrr, X86::VPMOVZXBWYrm, 0 }, + { X86::VPMOVZXDQYrr, X86::VPMOVZXDQYrm, 0 }, + { X86::VPMOVZXWDYrr, X86::VPMOVZXWDYrm, 0 }, + { X86::VPMOVZXWQYrr, X86::VPMOVZXWQYrm, 0 }, + { X86::VPSHUFDYri, X86::VPSHUFDYmi, 0 }, + { X86::VPSHUFHWYri, X86::VPSHUFHWYmi, 0 }, + { X86::VPSHUFLWYri, X86::VPSHUFLWYmi, 0 }, + + // XOP foldable instructions + { X86::VFRCZPDrr, X86::VFRCZPDrm, 0 }, + { X86::VFRCZPDrrY, X86::VFRCZPDrmY, 0 }, + { X86::VFRCZPSrr, X86::VFRCZPSrm, 0 }, + { X86::VFRCZPSrrY, X86::VFRCZPSrmY, 0 }, + { X86::VFRCZSDrr, X86::VFRCZSDrm, 0 }, + { X86::VFRCZSSrr, X86::VFRCZSSrm, 0 }, + { X86::VPHADDBDrr, X86::VPHADDBDrm, 0 }, + { X86::VPHADDBQrr, X86::VPHADDBQrm, 0 }, + { X86::VPHADDBWrr, X86::VPHADDBWrm, 0 }, + { X86::VPHADDDQrr, X86::VPHADDDQrm, 0 }, + { X86::VPHADDWDrr, X86::VPHADDWDrm, 0 }, + { X86::VPHADDWQrr, X86::VPHADDWQrm, 0 }, + { X86::VPHADDUBDrr, X86::VPHADDUBDrm, 0 }, + { X86::VPHADDUBQrr, X86::VPHADDUBQrm, 0 }, + { X86::VPHADDUBWrr, X86::VPHADDUBWrm, 0 }, + { X86::VPHADDUDQrr, X86::VPHADDUDQrm, 0 }, + { X86::VPHADDUWDrr, X86::VPHADDUWDrm, 0 }, + { X86::VPHADDUWQrr, X86::VPHADDUWQrm, 0 }, + { X86::VPHSUBBWrr, X86::VPHSUBBWrm, 0 }, + { X86::VPHSUBDQrr, X86::VPHSUBDQrm, 0 }, + { X86::VPHSUBWDrr, X86::VPHSUBWDrm, 0 }, + { X86::VPROTBri, X86::VPROTBmi, 0 }, + { X86::VPROTBrr, X86::VPROTBmr, 0 }, + { X86::VPROTDri, X86::VPROTDmi, 0 }, + { X86::VPROTDrr, X86::VPROTDmr, 0 }, + { X86::VPROTQri, X86::VPROTQmi, 0 }, + { X86::VPROTQrr, X86::VPROTQmr, 0 }, + { X86::VPROTWri, X86::VPROTWmi, 0 }, + { X86::VPROTWrr, X86::VPROTWmr, 0 }, + { X86::VPSHABrr, X86::VPSHABmr, 0 }, + { X86::VPSHADrr, X86::VPSHADmr, 0 }, + { X86::VPSHAQrr, X86::VPSHAQmr, 0 }, + { X86::VPSHAWrr, X86::VPSHAWmr, 0 }, + { X86::VPSHLBrr, X86::VPSHLBmr, 0 }, + { X86::VPSHLDrr, X86::VPSHLDmr, 0 }, + { X86::VPSHLQrr, X86::VPSHLQmr, 0 }, + { X86::VPSHLWrr, X86::VPSHLWmr, 0 }, + + // BMI/BMI2/LZCNT/POPCNT/TBM foldable instructions + { X86::BEXTR32rr, X86::BEXTR32rm, 0 }, + { X86::BEXTR64rr, X86::BEXTR64rm, 0 }, + { X86::BEXTRI32ri, X86::BEXTRI32mi, 0 }, + { X86::BEXTRI64ri, X86::BEXTRI64mi, 0 }, + { X86::BLCFILL32rr, X86::BLCFILL32rm, 0 }, + { X86::BLCFILL64rr, X86::BLCFILL64rm, 0 }, + { X86::BLCI32rr, X86::BLCI32rm, 0 }, + { X86::BLCI64rr, X86::BLCI64rm, 0 }, + { X86::BLCIC32rr, X86::BLCIC32rm, 0 }, + { X86::BLCIC64rr, 
X86::BLCIC64rm, 0 }, + { X86::BLCMSK32rr, X86::BLCMSK32rm, 0 }, + { X86::BLCMSK64rr, X86::BLCMSK64rm, 0 }, + { X86::BLCS32rr, X86::BLCS32rm, 0 }, + { X86::BLCS64rr, X86::BLCS64rm, 0 }, + { X86::BLSFILL32rr, X86::BLSFILL32rm, 0 }, + { X86::BLSFILL64rr, X86::BLSFILL64rm, 0 }, + { X86::BLSI32rr, X86::BLSI32rm, 0 }, + { X86::BLSI64rr, X86::BLSI64rm, 0 }, + { X86::BLSIC32rr, X86::BLSIC32rm, 0 }, + { X86::BLSIC64rr, X86::BLSIC64rm, 0 }, + { X86::BLSMSK32rr, X86::BLSMSK32rm, 0 }, + { X86::BLSMSK64rr, X86::BLSMSK64rm, 0 }, + { X86::BLSR32rr, X86::BLSR32rm, 0 }, + { X86::BLSR64rr, X86::BLSR64rm, 0 }, + { X86::BZHI32rr, X86::BZHI32rm, 0 }, + { X86::BZHI64rr, X86::BZHI64rm, 0 }, + { X86::LZCNT16rr, X86::LZCNT16rm, 0 }, + { X86::LZCNT32rr, X86::LZCNT32rm, 0 }, + { X86::LZCNT64rr, X86::LZCNT64rm, 0 }, + { X86::POPCNT16rr, X86::POPCNT16rm, 0 }, + { X86::POPCNT32rr, X86::POPCNT32rm, 0 }, + { X86::POPCNT64rr, X86::POPCNT64rm, 0 }, + { X86::RORX32ri, X86::RORX32mi, 0 }, + { X86::RORX64ri, X86::RORX64mi, 0 }, + { X86::SARX32rr, X86::SARX32rm, 0 }, + { X86::SARX64rr, X86::SARX64rm, 0 }, + { X86::SHRX32rr, X86::SHRX32rm, 0 }, + { X86::SHRX64rr, X86::SHRX64rm, 0 }, + { X86::SHLX32rr, X86::SHLX32rm, 0 }, + { X86::SHLX64rr, X86::SHLX64rm, 0 }, + { X86::T1MSKC32rr, X86::T1MSKC32rm, 0 }, + { X86::T1MSKC64rr, X86::T1MSKC64rm, 0 }, + { X86::TZCNT16rr, X86::TZCNT16rm, 0 }, + { X86::TZCNT32rr, X86::TZCNT32rm, 0 }, + { X86::TZCNT64rr, X86::TZCNT64rm, 0 }, + { X86::TZMSK32rr, X86::TZMSK32rm, 0 }, + { X86::TZMSK64rr, X86::TZMSK64rm, 0 }, + + // AVX-512 foldable instructions + { X86::VMOV64toPQIZrr, X86::VMOVQI2PQIZrm, 0 }, + { X86::VMOVDI2SSZrr, X86::VMOVDI2SSZrm, 0 }, + { X86::VMOVAPDZrr, X86::VMOVAPDZrm, TB_ALIGN_64 }, + { X86::VMOVAPSZrr, X86::VMOVAPSZrm, TB_ALIGN_64 }, + { X86::VMOVDQA32Zrr, X86::VMOVDQA32Zrm, TB_ALIGN_64 }, + { X86::VMOVDQA64Zrr, X86::VMOVDQA64Zrm, TB_ALIGN_64 }, + { X86::VMOVDQU8Zrr, X86::VMOVDQU8Zrm, 0 }, + { X86::VMOVDQU16Zrr, X86::VMOVDQU16Zrm, 0 }, + { X86::VMOVDQU32Zrr, X86::VMOVDQU32Zrm, 0 }, + { X86::VMOVDQU64Zrr, X86::VMOVDQU64Zrm, 0 }, + { X86::VMOVUPDZrr, X86::VMOVUPDZrm, 0 }, + { X86::VMOVUPSZrr, X86::VMOVUPSZrm, 0 }, + { X86::VPABSDZrr, X86::VPABSDZrm, 0 }, + { X86::VPABSQZrr, X86::VPABSQZrm, 0 }, + { X86::VBROADCASTSSZr, X86::VBROADCASTSSZm, TB_NO_REVERSE }, + { X86::VBROADCASTSDZr, X86::VBROADCASTSDZm, TB_NO_REVERSE }, + + // AVX-512 foldable instructions (256-bit versions) + { X86::VMOVAPDZ256rr, X86::VMOVAPDZ256rm, TB_ALIGN_32 }, + { X86::VMOVAPSZ256rr, X86::VMOVAPSZ256rm, TB_ALIGN_32 }, + { X86::VMOVDQA32Z256rr, X86::VMOVDQA32Z256rm, TB_ALIGN_32 }, + { X86::VMOVDQA64Z256rr, X86::VMOVDQA64Z256rm, TB_ALIGN_32 }, + { X86::VMOVDQU8Z256rr, X86::VMOVDQU8Z256rm, 0 }, + { X86::VMOVDQU16Z256rr, X86::VMOVDQU16Z256rm, 0 }, + { X86::VMOVDQU32Z256rr, X86::VMOVDQU32Z256rm, 0 }, + { X86::VMOVDQU64Z256rr, X86::VMOVDQU64Z256rm, 0 }, + { X86::VMOVUPDZ256rr, X86::VMOVUPDZ256rm, 0 }, + { X86::VMOVUPSZ256rr, X86::VMOVUPSZ256rm, 0 }, + { X86::VBROADCASTSSZ256r, X86::VBROADCASTSSZ256m, TB_NO_REVERSE }, + { X86::VBROADCASTSDZ256r, X86::VBROADCASTSDZ256m, TB_NO_REVERSE }, + + // AVX-512 foldable instructions (128-bit versions) + { X86::VMOVAPDZ128rr, X86::VMOVAPDZ128rm, TB_ALIGN_16 }, + { X86::VMOVAPSZ128rr, X86::VMOVAPSZ128rm, TB_ALIGN_16 }, + { X86::VMOVDQA32Z128rr, X86::VMOVDQA32Z128rm, TB_ALIGN_16 }, + { X86::VMOVDQA64Z128rr, X86::VMOVDQA64Z128rm, TB_ALIGN_16 }, + { X86::VMOVDQU8Z128rr, X86::VMOVDQU8Z128rm, 0 }, + { X86::VMOVDQU16Z128rr, X86::VMOVDQU16Z128rm, 0 }, + { X86::VMOVDQU32Z128rr, 
X86::VMOVDQU32Z128rm, 0 }, + { X86::VMOVDQU64Z128rr, X86::VMOVDQU64Z128rm, 0 }, + { X86::VMOVUPDZ128rr, X86::VMOVUPDZ128rm, 0 }, + { X86::VMOVUPSZ128rr, X86::VMOVUPSZ128rm, 0 }, + { X86::VBROADCASTSSZ128r, X86::VBROADCASTSSZ128m, TB_NO_REVERSE }, + + // F16C foldable instructions + { X86::VCVTPH2PSrr, X86::VCVTPH2PSrm, 0 }, + { X86::VCVTPH2PSYrr, X86::VCVTPH2PSYrm, 0 }, + + // AES foldable instructions + { X86::AESIMCrr, X86::AESIMCrm, TB_ALIGN_16 }, + { X86::AESKEYGENASSIST128rr, X86::AESKEYGENASSIST128rm, TB_ALIGN_16 }, + { X86::VAESIMCrr, X86::VAESIMCrm, 0 }, + { X86::VAESKEYGENASSIST128rr, X86::VAESKEYGENASSIST128rm, 0 } + }; + + for (X86MemoryFoldTableEntry Entry : MemoryFoldTable1) { + AddTableEntry(RegOp2MemOpTable1, MemOp2RegOpTable, + Entry.RegOp, Entry.MemOp, + // Index 1, folded load + Entry.Flags | TB_INDEX_1 | TB_FOLDED_LOAD); + } + + static const X86MemoryFoldTableEntry MemoryFoldTable2[] = { + { X86::ADC32rr, X86::ADC32rm, 0 }, + { X86::ADC64rr, X86::ADC64rm, 0 }, + { X86::ADD16rr, X86::ADD16rm, 0 }, + { X86::ADD16rr_DB, X86::ADD16rm, TB_NO_REVERSE }, + { X86::ADD32rr, X86::ADD32rm, 0 }, + { X86::ADD32rr_DB, X86::ADD32rm, TB_NO_REVERSE }, + { X86::ADD64rr, X86::ADD64rm, 0 }, + { X86::ADD64rr_DB, X86::ADD64rm, TB_NO_REVERSE }, + { X86::ADD8rr, X86::ADD8rm, 0 }, + { X86::ADDPDrr, X86::ADDPDrm, TB_ALIGN_16 }, + { X86::ADDPSrr, X86::ADDPSrm, TB_ALIGN_16 }, + { X86::ADDSDrr, X86::ADDSDrm, 0 }, + { X86::ADDSDrr_Int, X86::ADDSDrm_Int, 0 }, + { X86::ADDSSrr, X86::ADDSSrm, 0 }, + { X86::ADDSSrr_Int, X86::ADDSSrm_Int, 0 }, + { X86::ADDSUBPDrr, X86::ADDSUBPDrm, TB_ALIGN_16 }, + { X86::ADDSUBPSrr, X86::ADDSUBPSrm, TB_ALIGN_16 }, + { X86::AND16rr, X86::AND16rm, 0 }, + { X86::AND32rr, X86::AND32rm, 0 }, + { X86::AND64rr, X86::AND64rm, 0 }, + { X86::AND8rr, X86::AND8rm, 0 }, + { X86::ANDNPDrr, X86::ANDNPDrm, TB_ALIGN_16 }, + { X86::ANDNPSrr, X86::ANDNPSrm, TB_ALIGN_16 }, + { X86::ANDPDrr, X86::ANDPDrm, TB_ALIGN_16 }, + { X86::ANDPSrr, X86::ANDPSrm, TB_ALIGN_16 }, + { X86::BLENDPDrri, X86::BLENDPDrmi, TB_ALIGN_16 }, + { X86::BLENDPSrri, X86::BLENDPSrmi, TB_ALIGN_16 }, + { X86::BLENDVPDrr0, X86::BLENDVPDrm0, TB_ALIGN_16 }, + { X86::BLENDVPSrr0, X86::BLENDVPSrm0, TB_ALIGN_16 }, + { X86::CMOVA16rr, X86::CMOVA16rm, 0 }, + { X86::CMOVA32rr, X86::CMOVA32rm, 0 }, + { X86::CMOVA64rr, X86::CMOVA64rm, 0 }, + { X86::CMOVAE16rr, X86::CMOVAE16rm, 0 }, + { X86::CMOVAE32rr, X86::CMOVAE32rm, 0 }, + { X86::CMOVAE64rr, X86::CMOVAE64rm, 0 }, + { X86::CMOVB16rr, X86::CMOVB16rm, 0 }, + { X86::CMOVB32rr, X86::CMOVB32rm, 0 }, + { X86::CMOVB64rr, X86::CMOVB64rm, 0 }, + { X86::CMOVBE16rr, X86::CMOVBE16rm, 0 }, + { X86::CMOVBE32rr, X86::CMOVBE32rm, 0 }, + { X86::CMOVBE64rr, X86::CMOVBE64rm, 0 }, + { X86::CMOVE16rr, X86::CMOVE16rm, 0 }, + { X86::CMOVE32rr, X86::CMOVE32rm, 0 }, + { X86::CMOVE64rr, X86::CMOVE64rm, 0 }, + { X86::CMOVG16rr, X86::CMOVG16rm, 0 }, + { X86::CMOVG32rr, X86::CMOVG32rm, 0 }, + { X86::CMOVG64rr, X86::CMOVG64rm, 0 }, + { X86::CMOVGE16rr, X86::CMOVGE16rm, 0 }, + { X86::CMOVGE32rr, X86::CMOVGE32rm, 0 }, + { X86::CMOVGE64rr, X86::CMOVGE64rm, 0 }, + { X86::CMOVL16rr, X86::CMOVL16rm, 0 }, + { X86::CMOVL32rr, X86::CMOVL32rm, 0 }, + { X86::CMOVL64rr, X86::CMOVL64rm, 0 }, + { X86::CMOVLE16rr, X86::CMOVLE16rm, 0 }, + { X86::CMOVLE32rr, X86::CMOVLE32rm, 0 }, + { X86::CMOVLE64rr, X86::CMOVLE64rm, 0 }, + { X86::CMOVNE16rr, X86::CMOVNE16rm, 0 }, + { X86::CMOVNE32rr, X86::CMOVNE32rm, 0 }, + { X86::CMOVNE64rr, X86::CMOVNE64rm, 0 }, + { X86::CMOVNO16rr, X86::CMOVNO16rm, 0 }, + { X86::CMOVNO32rr, X86::CMOVNO32rm, 
0 }, + { X86::CMOVNO64rr, X86::CMOVNO64rm, 0 }, + { X86::CMOVNP16rr, X86::CMOVNP16rm, 0 }, + { X86::CMOVNP32rr, X86::CMOVNP32rm, 0 }, + { X86::CMOVNP64rr, X86::CMOVNP64rm, 0 }, + { X86::CMOVNS16rr, X86::CMOVNS16rm, 0 }, + { X86::CMOVNS32rr, X86::CMOVNS32rm, 0 }, + { X86::CMOVNS64rr, X86::CMOVNS64rm, 0 }, + { X86::CMOVO16rr, X86::CMOVO16rm, 0 }, + { X86::CMOVO32rr, X86::CMOVO32rm, 0 }, + { X86::CMOVO64rr, X86::CMOVO64rm, 0 }, + { X86::CMOVP16rr, X86::CMOVP16rm, 0 }, + { X86::CMOVP32rr, X86::CMOVP32rm, 0 }, + { X86::CMOVP64rr, X86::CMOVP64rm, 0 }, + { X86::CMOVS16rr, X86::CMOVS16rm, 0 }, + { X86::CMOVS32rr, X86::CMOVS32rm, 0 }, + { X86::CMOVS64rr, X86::CMOVS64rm, 0 }, + { X86::CMPPDrri, X86::CMPPDrmi, TB_ALIGN_16 }, + { X86::CMPPSrri, X86::CMPPSrmi, TB_ALIGN_16 }, + { X86::CMPSDrr, X86::CMPSDrm, 0 }, + { X86::CMPSSrr, X86::CMPSSrm, 0 }, + { X86::CRC32r32r32, X86::CRC32r32m32, 0 }, + { X86::CRC32r64r64, X86::CRC32r64m64, 0 }, + { X86::DIVPDrr, X86::DIVPDrm, TB_ALIGN_16 }, + { X86::DIVPSrr, X86::DIVPSrm, TB_ALIGN_16 }, + { X86::DIVSDrr, X86::DIVSDrm, 0 }, + { X86::DIVSDrr_Int, X86::DIVSDrm_Int, 0 }, + { X86::DIVSSrr, X86::DIVSSrm, 0 }, + { X86::DIVSSrr_Int, X86::DIVSSrm_Int, 0 }, + { X86::DPPDrri, X86::DPPDrmi, TB_ALIGN_16 }, + { X86::DPPSrri, X86::DPPSrmi, TB_ALIGN_16 }, + + // Do not fold Fs* scalar logical op loads because there are no scalar + // load variants for these instructions. When folded, the load is required + // to be 128-bits, so the load size would not match. + + { X86::FvANDNPDrr, X86::FvANDNPDrm, TB_ALIGN_16 }, + { X86::FvANDNPSrr, X86::FvANDNPSrm, TB_ALIGN_16 }, + { X86::FvANDPDrr, X86::FvANDPDrm, TB_ALIGN_16 }, + { X86::FvANDPSrr, X86::FvANDPSrm, TB_ALIGN_16 }, + { X86::FvORPDrr, X86::FvORPDrm, TB_ALIGN_16 }, + { X86::FvORPSrr, X86::FvORPSrm, TB_ALIGN_16 }, + { X86::FvXORPDrr, X86::FvXORPDrm, TB_ALIGN_16 }, + { X86::FvXORPSrr, X86::FvXORPSrm, TB_ALIGN_16 }, + { X86::HADDPDrr, X86::HADDPDrm, TB_ALIGN_16 }, + { X86::HADDPSrr, X86::HADDPSrm, TB_ALIGN_16 }, + { X86::HSUBPDrr, X86::HSUBPDrm, TB_ALIGN_16 }, + { X86::HSUBPSrr, X86::HSUBPSrm, TB_ALIGN_16 }, + { X86::IMUL16rr, X86::IMUL16rm, 0 }, + { X86::IMUL32rr, X86::IMUL32rm, 0 }, + { X86::IMUL64rr, X86::IMUL64rm, 0 }, + { X86::Int_CMPSDrr, X86::Int_CMPSDrm, 0 }, + { X86::Int_CMPSSrr, X86::Int_CMPSSrm, 0 }, + { X86::Int_CVTSD2SSrr, X86::Int_CVTSD2SSrm, 0 }, + { X86::Int_CVTSI2SD64rr,X86::Int_CVTSI2SD64rm, 0 }, + { X86::Int_CVTSI2SDrr, X86::Int_CVTSI2SDrm, 0 }, + { X86::Int_CVTSI2SS64rr,X86::Int_CVTSI2SS64rm, 0 }, + { X86::Int_CVTSI2SSrr, X86::Int_CVTSI2SSrm, 0 }, + { X86::Int_CVTSS2SDrr, X86::Int_CVTSS2SDrm, 0 }, + { X86::MAXPDrr, X86::MAXPDrm, TB_ALIGN_16 }, + { X86::MAXPSrr, X86::MAXPSrm, TB_ALIGN_16 }, + { X86::MAXSDrr, X86::MAXSDrm, 0 }, + { X86::MAXSDrr_Int, X86::MAXSDrm_Int, 0 }, + { X86::MAXSSrr, X86::MAXSSrm, 0 }, + { X86::MAXSSrr_Int, X86::MAXSSrm_Int, 0 }, + { X86::MINPDrr, X86::MINPDrm, TB_ALIGN_16 }, + { X86::MINPSrr, X86::MINPSrm, TB_ALIGN_16 }, + { X86::MINSDrr, X86::MINSDrm, 0 }, + { X86::MINSDrr_Int, X86::MINSDrm_Int, 0 }, + { X86::MINSSrr, X86::MINSSrm, 0 }, + { X86::MINSSrr_Int, X86::MINSSrm_Int, 0 }, + { X86::MPSADBWrri, X86::MPSADBWrmi, TB_ALIGN_16 }, + { X86::MULPDrr, X86::MULPDrm, TB_ALIGN_16 }, + { X86::MULPSrr, X86::MULPSrm, TB_ALIGN_16 }, + { X86::MULSDrr, X86::MULSDrm, 0 }, + { X86::MULSDrr_Int, X86::MULSDrm_Int, 0 }, + { X86::MULSSrr, X86::MULSSrm, 0 }, + { X86::MULSSrr_Int, X86::MULSSrm_Int, 0 }, + { X86::OR16rr, X86::OR16rm, 0 }, + { X86::OR32rr, X86::OR32rm, 0 }, + { X86::OR64rr, X86::OR64rm, 0 }, + 
{ X86::OR8rr, X86::OR8rm, 0 }, + { X86::ORPDrr, X86::ORPDrm, TB_ALIGN_16 }, + { X86::ORPSrr, X86::ORPSrm, TB_ALIGN_16 }, + { X86::PACKSSDWrr, X86::PACKSSDWrm, TB_ALIGN_16 }, + { X86::PACKSSWBrr, X86::PACKSSWBrm, TB_ALIGN_16 }, + { X86::PACKUSDWrr, X86::PACKUSDWrm, TB_ALIGN_16 }, + { X86::PACKUSWBrr, X86::PACKUSWBrm, TB_ALIGN_16 }, + { X86::PADDBrr, X86::PADDBrm, TB_ALIGN_16 }, + { X86::PADDDrr, X86::PADDDrm, TB_ALIGN_16 }, + { X86::PADDQrr, X86::PADDQrm, TB_ALIGN_16 }, + { X86::PADDSBrr, X86::PADDSBrm, TB_ALIGN_16 }, + { X86::PADDSWrr, X86::PADDSWrm, TB_ALIGN_16 }, + { X86::PADDUSBrr, X86::PADDUSBrm, TB_ALIGN_16 }, + { X86::PADDUSWrr, X86::PADDUSWrm, TB_ALIGN_16 }, + { X86::PADDWrr, X86::PADDWrm, TB_ALIGN_16 }, + { X86::PALIGNR128rr, X86::PALIGNR128rm, TB_ALIGN_16 }, + { X86::PANDNrr, X86::PANDNrm, TB_ALIGN_16 }, + { X86::PANDrr, X86::PANDrm, TB_ALIGN_16 }, + { X86::PAVGBrr, X86::PAVGBrm, TB_ALIGN_16 }, + { X86::PAVGWrr, X86::PAVGWrm, TB_ALIGN_16 }, + { X86::PBLENDVBrr0, X86::PBLENDVBrm0, TB_ALIGN_16 }, + { X86::PBLENDWrri, X86::PBLENDWrmi, TB_ALIGN_16 }, + { X86::PCLMULQDQrr, X86::PCLMULQDQrm, TB_ALIGN_16 }, + { X86::PCMPEQBrr, X86::PCMPEQBrm, TB_ALIGN_16 }, + { X86::PCMPEQDrr, X86::PCMPEQDrm, TB_ALIGN_16 }, + { X86::PCMPEQQrr, X86::PCMPEQQrm, TB_ALIGN_16 }, + { X86::PCMPEQWrr, X86::PCMPEQWrm, TB_ALIGN_16 }, + { X86::PCMPGTBrr, X86::PCMPGTBrm, TB_ALIGN_16 }, + { X86::PCMPGTDrr, X86::PCMPGTDrm, TB_ALIGN_16 }, + { X86::PCMPGTQrr, X86::PCMPGTQrm, TB_ALIGN_16 }, + { X86::PCMPGTWrr, X86::PCMPGTWrm, TB_ALIGN_16 }, + { X86::PHADDDrr, X86::PHADDDrm, TB_ALIGN_16 }, + { X86::PHADDWrr, X86::PHADDWrm, TB_ALIGN_16 }, + { X86::PHADDSWrr128, X86::PHADDSWrm128, TB_ALIGN_16 }, + { X86::PHSUBDrr, X86::PHSUBDrm, TB_ALIGN_16 }, + { X86::PHSUBSWrr128, X86::PHSUBSWrm128, TB_ALIGN_16 }, + { X86::PHSUBWrr, X86::PHSUBWrm, TB_ALIGN_16 }, + { X86::PINSRBrr, X86::PINSRBrm, 0 }, + { X86::PINSRDrr, X86::PINSRDrm, 0 }, + { X86::PINSRQrr, X86::PINSRQrm, 0 }, + { X86::PINSRWrri, X86::PINSRWrmi, 0 }, + { X86::PMADDUBSWrr128, X86::PMADDUBSWrm128, TB_ALIGN_16 }, + { X86::PMADDWDrr, X86::PMADDWDrm, TB_ALIGN_16 }, + { X86::PMAXSWrr, X86::PMAXSWrm, TB_ALIGN_16 }, + { X86::PMAXUBrr, X86::PMAXUBrm, TB_ALIGN_16 }, + { X86::PMINSWrr, X86::PMINSWrm, TB_ALIGN_16 }, + { X86::PMINUBrr, X86::PMINUBrm, TB_ALIGN_16 }, + { X86::PMINSBrr, X86::PMINSBrm, TB_ALIGN_16 }, + { X86::PMINSDrr, X86::PMINSDrm, TB_ALIGN_16 }, + { X86::PMINUDrr, X86::PMINUDrm, TB_ALIGN_16 }, + { X86::PMINUWrr, X86::PMINUWrm, TB_ALIGN_16 }, + { X86::PMAXSBrr, X86::PMAXSBrm, TB_ALIGN_16 }, + { X86::PMAXSDrr, X86::PMAXSDrm, TB_ALIGN_16 }, + { X86::PMAXUDrr, X86::PMAXUDrm, TB_ALIGN_16 }, + { X86::PMAXUWrr, X86::PMAXUWrm, TB_ALIGN_16 }, + { X86::PMULDQrr, X86::PMULDQrm, TB_ALIGN_16 }, + { X86::PMULHRSWrr128, X86::PMULHRSWrm128, TB_ALIGN_16 }, + { X86::PMULHUWrr, X86::PMULHUWrm, TB_ALIGN_16 }, + { X86::PMULHWrr, X86::PMULHWrm, TB_ALIGN_16 }, + { X86::PMULLDrr, X86::PMULLDrm, TB_ALIGN_16 }, + { X86::PMULLWrr, X86::PMULLWrm, TB_ALIGN_16 }, + { X86::PMULUDQrr, X86::PMULUDQrm, TB_ALIGN_16 }, + { X86::PORrr, X86::PORrm, TB_ALIGN_16 }, + { X86::PSADBWrr, X86::PSADBWrm, TB_ALIGN_16 }, + { X86::PSHUFBrr, X86::PSHUFBrm, TB_ALIGN_16 }, + { X86::PSIGNBrr, X86::PSIGNBrm, TB_ALIGN_16 }, + { X86::PSIGNWrr, X86::PSIGNWrm, TB_ALIGN_16 }, + { X86::PSIGNDrr, X86::PSIGNDrm, TB_ALIGN_16 }, + { X86::PSLLDrr, X86::PSLLDrm, TB_ALIGN_16 }, + { X86::PSLLQrr, X86::PSLLQrm, TB_ALIGN_16 }, + { X86::PSLLWrr, X86::PSLLWrm, TB_ALIGN_16 }, + { X86::PSRADrr, X86::PSRADrm, TB_ALIGN_16 }, + { X86::PSRAWrr, 
X86::PSRAWrm, TB_ALIGN_16 }, + { X86::PSRLDrr, X86::PSRLDrm, TB_ALIGN_16 }, + { X86::PSRLQrr, X86::PSRLQrm, TB_ALIGN_16 }, + { X86::PSRLWrr, X86::PSRLWrm, TB_ALIGN_16 }, + { X86::PSUBBrr, X86::PSUBBrm, TB_ALIGN_16 }, + { X86::PSUBDrr, X86::PSUBDrm, TB_ALIGN_16 }, + { X86::PSUBQrr, X86::PSUBQrm, TB_ALIGN_16 }, + { X86::PSUBSBrr, X86::PSUBSBrm, TB_ALIGN_16 }, + { X86::PSUBSWrr, X86::PSUBSWrm, TB_ALIGN_16 }, + { X86::PSUBUSBrr, X86::PSUBUSBrm, TB_ALIGN_16 }, + { X86::PSUBUSWrr, X86::PSUBUSWrm, TB_ALIGN_16 }, + { X86::PSUBWrr, X86::PSUBWrm, TB_ALIGN_16 }, + { X86::PUNPCKHBWrr, X86::PUNPCKHBWrm, TB_ALIGN_16 }, + { X86::PUNPCKHDQrr, X86::PUNPCKHDQrm, TB_ALIGN_16 }, + { X86::PUNPCKHQDQrr, X86::PUNPCKHQDQrm, TB_ALIGN_16 }, + { X86::PUNPCKHWDrr, X86::PUNPCKHWDrm, TB_ALIGN_16 }, + { X86::PUNPCKLBWrr, X86::PUNPCKLBWrm, TB_ALIGN_16 }, + { X86::PUNPCKLDQrr, X86::PUNPCKLDQrm, TB_ALIGN_16 }, + { X86::PUNPCKLQDQrr, X86::PUNPCKLQDQrm, TB_ALIGN_16 }, + { X86::PUNPCKLWDrr, X86::PUNPCKLWDrm, TB_ALIGN_16 }, + { X86::PXORrr, X86::PXORrm, TB_ALIGN_16 }, + { X86::ROUNDSDr, X86::ROUNDSDm, 0 }, + { X86::ROUNDSSr, X86::ROUNDSSm, 0 }, + { X86::SBB32rr, X86::SBB32rm, 0 }, + { X86::SBB64rr, X86::SBB64rm, 0 }, + { X86::SHUFPDrri, X86::SHUFPDrmi, TB_ALIGN_16 }, + { X86::SHUFPSrri, X86::SHUFPSrmi, TB_ALIGN_16 }, + { X86::SUB16rr, X86::SUB16rm, 0 }, + { X86::SUB32rr, X86::SUB32rm, 0 }, + { X86::SUB64rr, X86::SUB64rm, 0 }, + { X86::SUB8rr, X86::SUB8rm, 0 }, + { X86::SUBPDrr, X86::SUBPDrm, TB_ALIGN_16 }, + { X86::SUBPSrr, X86::SUBPSrm, TB_ALIGN_16 }, + { X86::SUBSDrr, X86::SUBSDrm, 0 }, + { X86::SUBSDrr_Int, X86::SUBSDrm_Int, 0 }, + { X86::SUBSSrr, X86::SUBSSrm, 0 }, + { X86::SUBSSrr_Int, X86::SUBSSrm_Int, 0 }, + // FIXME: TEST*rr -> swapped operand of TEST*mr. + { X86::UNPCKHPDrr, X86::UNPCKHPDrm, TB_ALIGN_16 }, + { X86::UNPCKHPSrr, X86::UNPCKHPSrm, TB_ALIGN_16 }, + { X86::UNPCKLPDrr, X86::UNPCKLPDrm, TB_ALIGN_16 }, + { X86::UNPCKLPSrr, X86::UNPCKLPSrm, TB_ALIGN_16 }, + { X86::XOR16rr, X86::XOR16rm, 0 }, + { X86::XOR32rr, X86::XOR32rm, 0 }, + { X86::XOR64rr, X86::XOR64rm, 0 }, + { X86::XOR8rr, X86::XOR8rm, 0 }, + { X86::XORPDrr, X86::XORPDrm, TB_ALIGN_16 }, + { X86::XORPSrr, X86::XORPSrm, TB_ALIGN_16 }, + + // MMX version of foldable instructions + { X86::MMX_CVTPI2PSirr, X86::MMX_CVTPI2PSirm, 0 }, + { X86::MMX_PACKSSDWirr, X86::MMX_PACKSSDWirm, 0 }, + { X86::MMX_PACKSSWBirr, X86::MMX_PACKSSWBirm, 0 }, + { X86::MMX_PACKUSWBirr, X86::MMX_PACKUSWBirm, 0 }, + { X86::MMX_PADDBirr, X86::MMX_PADDBirm, 0 }, + { X86::MMX_PADDDirr, X86::MMX_PADDDirm, 0 }, + { X86::MMX_PADDQirr, X86::MMX_PADDQirm, 0 }, + { X86::MMX_PADDSBirr, X86::MMX_PADDSBirm, 0 }, + { X86::MMX_PADDSWirr, X86::MMX_PADDSWirm, 0 }, + { X86::MMX_PADDUSBirr, X86::MMX_PADDUSBirm, 0 }, + { X86::MMX_PADDUSWirr, X86::MMX_PADDUSWirm, 0 }, + { X86::MMX_PADDWirr, X86::MMX_PADDWirm, 0 }, + { X86::MMX_PALIGNR64irr, X86::MMX_PALIGNR64irm, 0 }, + { X86::MMX_PANDNirr, X86::MMX_PANDNirm, 0 }, + { X86::MMX_PANDirr, X86::MMX_PANDirm, 0 }, + { X86::MMX_PAVGBirr, X86::MMX_PAVGBirm, 0 }, + { X86::MMX_PAVGWirr, X86::MMX_PAVGWirm, 0 }, + { X86::MMX_PCMPEQBirr, X86::MMX_PCMPEQBirm, 0 }, + { X86::MMX_PCMPEQDirr, X86::MMX_PCMPEQDirm, 0 }, + { X86::MMX_PCMPEQWirr, X86::MMX_PCMPEQWirm, 0 }, + { X86::MMX_PCMPGTBirr, X86::MMX_PCMPGTBirm, 0 }, + { X86::MMX_PCMPGTDirr, X86::MMX_PCMPGTDirm, 0 }, + { X86::MMX_PCMPGTWirr, X86::MMX_PCMPGTWirm, 0 }, + { X86::MMX_PHADDSWrr64, X86::MMX_PHADDSWrm64, 0 }, + { X86::MMX_PHADDWrr64, X86::MMX_PHADDWrm64, 0 }, + { X86::MMX_PHADDrr64, X86::MMX_PHADDrm64, 0 }, + 
{ X86::MMX_PHSUBDrr64, X86::MMX_PHSUBDrm64, 0 }, + { X86::MMX_PHSUBSWrr64, X86::MMX_PHSUBSWrm64, 0 }, + { X86::MMX_PHSUBWrr64, X86::MMX_PHSUBWrm64, 0 }, + { X86::MMX_PINSRWirri, X86::MMX_PINSRWirmi, 0 }, + { X86::MMX_PMADDUBSWrr64, X86::MMX_PMADDUBSWrm64, 0 }, + { X86::MMX_PMADDWDirr, X86::MMX_PMADDWDirm, 0 }, + { X86::MMX_PMAXSWirr, X86::MMX_PMAXSWirm, 0 }, + { X86::MMX_PMAXUBirr, X86::MMX_PMAXUBirm, 0 }, + { X86::MMX_PMINSWirr, X86::MMX_PMINSWirm, 0 }, + { X86::MMX_PMINUBirr, X86::MMX_PMINUBirm, 0 }, + { X86::MMX_PMULHRSWrr64, X86::MMX_PMULHRSWrm64, 0 }, + { X86::MMX_PMULHUWirr, X86::MMX_PMULHUWirm, 0 }, + { X86::MMX_PMULHWirr, X86::MMX_PMULHWirm, 0 }, + { X86::MMX_PMULLWirr, X86::MMX_PMULLWirm, 0 }, + { X86::MMX_PMULUDQirr, X86::MMX_PMULUDQirm, 0 }, + { X86::MMX_PORirr, X86::MMX_PORirm, 0 }, + { X86::MMX_PSADBWirr, X86::MMX_PSADBWirm, 0 }, + { X86::MMX_PSHUFBrr64, X86::MMX_PSHUFBrm64, 0 }, + { X86::MMX_PSIGNBrr64, X86::MMX_PSIGNBrm64, 0 }, + { X86::MMX_PSIGNDrr64, X86::MMX_PSIGNDrm64, 0 }, + { X86::MMX_PSIGNWrr64, X86::MMX_PSIGNWrm64, 0 }, + { X86::MMX_PSLLDrr, X86::MMX_PSLLDrm, 0 }, + { X86::MMX_PSLLQrr, X86::MMX_PSLLQrm, 0 }, + { X86::MMX_PSLLWrr, X86::MMX_PSLLWrm, 0 }, + { X86::MMX_PSRADrr, X86::MMX_PSRADrm, 0 }, + { X86::MMX_PSRAWrr, X86::MMX_PSRAWrm, 0 }, + { X86::MMX_PSRLDrr, X86::MMX_PSRLDrm, 0 }, + { X86::MMX_PSRLQrr, X86::MMX_PSRLQrm, 0 }, + { X86::MMX_PSRLWrr, X86::MMX_PSRLWrm, 0 }, + { X86::MMX_PSUBBirr, X86::MMX_PSUBBirm, 0 }, + { X86::MMX_PSUBDirr, X86::MMX_PSUBDirm, 0 }, + { X86::MMX_PSUBQirr, X86::MMX_PSUBQirm, 0 }, + { X86::MMX_PSUBSBirr, X86::MMX_PSUBSBirm, 0 }, + { X86::MMX_PSUBSWirr, X86::MMX_PSUBSWirm, 0 }, + { X86::MMX_PSUBUSBirr, X86::MMX_PSUBUSBirm, 0 }, + { X86::MMX_PSUBUSWirr, X86::MMX_PSUBUSWirm, 0 }, + { X86::MMX_PSUBWirr, X86::MMX_PSUBWirm, 0 }, + { X86::MMX_PUNPCKHBWirr, X86::MMX_PUNPCKHBWirm, 0 }, + { X86::MMX_PUNPCKHDQirr, X86::MMX_PUNPCKHDQirm, 0 }, + { X86::MMX_PUNPCKHWDirr, X86::MMX_PUNPCKHWDirm, 0 }, + { X86::MMX_PUNPCKLBWirr, X86::MMX_PUNPCKLBWirm, 0 }, + { X86::MMX_PUNPCKLDQirr, X86::MMX_PUNPCKLDQirm, 0 }, + { X86::MMX_PUNPCKLWDirr, X86::MMX_PUNPCKLWDirm, 0 }, + { X86::MMX_PXORirr, X86::MMX_PXORirm, 0 }, + + // 3DNow! 
version of foldable instructions + { X86::PAVGUSBrr, X86::PAVGUSBrm, 0 }, + { X86::PFACCrr, X86::PFACCrm, 0 }, + { X86::PFADDrr, X86::PFADDrm, 0 }, + { X86::PFCMPEQrr, X86::PFCMPEQrm, 0 }, + { X86::PFCMPGErr, X86::PFCMPGErm, 0 }, + { X86::PFCMPGTrr, X86::PFCMPGTrm, 0 }, + { X86::PFMAXrr, X86::PFMAXrm, 0 }, + { X86::PFMINrr, X86::PFMINrm, 0 }, + { X86::PFMULrr, X86::PFMULrm, 0 }, + { X86::PFNACCrr, X86::PFNACCrm, 0 }, + { X86::PFPNACCrr, X86::PFPNACCrm, 0 }, + { X86::PFRCPIT1rr, X86::PFRCPIT1rm, 0 }, + { X86::PFRCPIT2rr, X86::PFRCPIT2rm, 0 }, + { X86::PFRSQIT1rr, X86::PFRSQIT1rm, 0 }, + { X86::PFSUBrr, X86::PFSUBrm, 0 }, + { X86::PFSUBRrr, X86::PFSUBRrm, 0 }, + { X86::PMULHRWrr, X86::PMULHRWrm, 0 }, + + // AVX 128-bit versions of foldable instructions + { X86::VCVTSD2SSrr, X86::VCVTSD2SSrm, 0 }, + { X86::Int_VCVTSD2SSrr, X86::Int_VCVTSD2SSrm, 0 }, + { X86::VCVTSI2SD64rr, X86::VCVTSI2SD64rm, 0 }, + { X86::Int_VCVTSI2SD64rr, X86::Int_VCVTSI2SD64rm, 0 }, + { X86::VCVTSI2SDrr, X86::VCVTSI2SDrm, 0 }, + { X86::Int_VCVTSI2SDrr, X86::Int_VCVTSI2SDrm, 0 }, + { X86::VCVTSI2SS64rr, X86::VCVTSI2SS64rm, 0 }, + { X86::Int_VCVTSI2SS64rr, X86::Int_VCVTSI2SS64rm, 0 }, + { X86::VCVTSI2SSrr, X86::VCVTSI2SSrm, 0 }, + { X86::Int_VCVTSI2SSrr, X86::Int_VCVTSI2SSrm, 0 }, + { X86::VCVTSS2SDrr, X86::VCVTSS2SDrm, 0 }, + { X86::Int_VCVTSS2SDrr, X86::Int_VCVTSS2SDrm, 0 }, + { X86::VRCPSSr, X86::VRCPSSm, 0 }, + { X86::VRCPSSr_Int, X86::VRCPSSm_Int, 0 }, + { X86::VRSQRTSSr, X86::VRSQRTSSm, 0 }, + { X86::VRSQRTSSr_Int, X86::VRSQRTSSm_Int, 0 }, + { X86::VSQRTSDr, X86::VSQRTSDm, 0 }, + { X86::VSQRTSDr_Int, X86::VSQRTSDm_Int, 0 }, + { X86::VSQRTSSr, X86::VSQRTSSm, 0 }, + { X86::VSQRTSSr_Int, X86::VSQRTSSm_Int, 0 }, + { X86::VADDPDrr, X86::VADDPDrm, 0 }, + { X86::VADDPSrr, X86::VADDPSrm, 0 }, + { X86::VADDSDrr, X86::VADDSDrm, 0 }, + { X86::VADDSDrr_Int, X86::VADDSDrm_Int, 0 }, + { X86::VADDSSrr, X86::VADDSSrm, 0 }, + { X86::VADDSSrr_Int, X86::VADDSSrm_Int, 0 }, + { X86::VADDSUBPDrr, X86::VADDSUBPDrm, 0 }, + { X86::VADDSUBPSrr, X86::VADDSUBPSrm, 0 }, + { X86::VANDNPDrr, X86::VANDNPDrm, 0 }, + { X86::VANDNPSrr, X86::VANDNPSrm, 0 }, + { X86::VANDPDrr, X86::VANDPDrm, 0 }, + { X86::VANDPSrr, X86::VANDPSrm, 0 }, + { X86::VBLENDPDrri, X86::VBLENDPDrmi, 0 }, + { X86::VBLENDPSrri, X86::VBLENDPSrmi, 0 }, + { X86::VBLENDVPDrr, X86::VBLENDVPDrm, 0 }, + { X86::VBLENDVPSrr, X86::VBLENDVPSrm, 0 }, + { X86::VCMPPDrri, X86::VCMPPDrmi, 0 }, + { X86::VCMPPSrri, X86::VCMPPSrmi, 0 }, + { X86::VCMPSDrr, X86::VCMPSDrm, 0 }, + { X86::VCMPSSrr, X86::VCMPSSrm, 0 }, + { X86::VDIVPDrr, X86::VDIVPDrm, 0 }, + { X86::VDIVPSrr, X86::VDIVPSrm, 0 }, + { X86::VDIVSDrr, X86::VDIVSDrm, 0 }, + { X86::VDIVSDrr_Int, X86::VDIVSDrm_Int, 0 }, + { X86::VDIVSSrr, X86::VDIVSSrm, 0 }, + { X86::VDIVSSrr_Int, X86::VDIVSSrm_Int, 0 }, + { X86::VDPPDrri, X86::VDPPDrmi, 0 }, + { X86::VDPPSrri, X86::VDPPSrmi, 0 }, + // Do not fold VFs* loads because there are no scalar load variants for + // these instructions. When folded, the load is required to be 128-bits, so + // the load size would not match. 
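The "load size would not match" reasoning in the comment above boils down to a width check: the packed memory forms of these logical ops always read a full 128 bits, while the scalar register forms only consume a 32- or 64-bit value, so a folded load could read past the end of a scalar stack slot. A tiny illustrative predicate follows; the name and parameters are hypothetical, not LLVM API.

// Returns true when folding would make the instruction read more bytes than
// the scalar value actually occupies (e.g. a 16-byte packed ANDPS-style load
// used for a 4-byte float).  This is why the scalar Fs*/VFs* logical ops are
// deliberately absent from these tables even though the packed-domain
// Fv*/VFv* forms are listed.
static bool foldWouldOverread(unsigned MemFormLoadBytes, unsigned ScalarBytes) {
  return MemFormLoadBytes > ScalarBytes;
}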
+ { X86::VFvANDNPDrr, X86::VFvANDNPDrm, 0 }, + { X86::VFvANDNPSrr, X86::VFvANDNPSrm, 0 }, + { X86::VFvANDPDrr, X86::VFvANDPDrm, 0 }, + { X86::VFvANDPSrr, X86::VFvANDPSrm, 0 }, + { X86::VFvORPDrr, X86::VFvORPDrm, 0 }, + { X86::VFvORPSrr, X86::VFvORPSrm, 0 }, + { X86::VFvXORPDrr, X86::VFvXORPDrm, 0 }, + { X86::VFvXORPSrr, X86::VFvXORPSrm, 0 }, + { X86::VHADDPDrr, X86::VHADDPDrm, 0 }, + { X86::VHADDPSrr, X86::VHADDPSrm, 0 }, + { X86::VHSUBPDrr, X86::VHSUBPDrm, 0 }, + { X86::VHSUBPSrr, X86::VHSUBPSrm, 0 }, + { X86::Int_VCMPSDrr, X86::Int_VCMPSDrm, 0 }, + { X86::Int_VCMPSSrr, X86::Int_VCMPSSrm, 0 }, + { X86::VMAXPDrr, X86::VMAXPDrm, 0 }, + { X86::VMAXPSrr, X86::VMAXPSrm, 0 }, + { X86::VMAXSDrr, X86::VMAXSDrm, 0 }, + { X86::VMAXSDrr_Int, X86::VMAXSDrm_Int, 0 }, + { X86::VMAXSSrr, X86::VMAXSSrm, 0 }, + { X86::VMAXSSrr_Int, X86::VMAXSSrm_Int, 0 }, + { X86::VMINPDrr, X86::VMINPDrm, 0 }, + { X86::VMINPSrr, X86::VMINPSrm, 0 }, + { X86::VMINSDrr, X86::VMINSDrm, 0 }, + { X86::VMINSDrr_Int, X86::VMINSDrm_Int, 0 }, + { X86::VMINSSrr, X86::VMINSSrm, 0 }, + { X86::VMINSSrr_Int, X86::VMINSSrm_Int, 0 }, + { X86::VMPSADBWrri, X86::VMPSADBWrmi, 0 }, + { X86::VMULPDrr, X86::VMULPDrm, 0 }, + { X86::VMULPSrr, X86::VMULPSrm, 0 }, + { X86::VMULSDrr, X86::VMULSDrm, 0 }, + { X86::VMULSDrr_Int, X86::VMULSDrm_Int, 0 }, + { X86::VMULSSrr, X86::VMULSSrm, 0 }, + { X86::VMULSSrr_Int, X86::VMULSSrm_Int, 0 }, + { X86::VORPDrr, X86::VORPDrm, 0 }, + { X86::VORPSrr, X86::VORPSrm, 0 }, + { X86::VPACKSSDWrr, X86::VPACKSSDWrm, 0 }, + { X86::VPACKSSWBrr, X86::VPACKSSWBrm, 0 }, + { X86::VPACKUSDWrr, X86::VPACKUSDWrm, 0 }, + { X86::VPACKUSWBrr, X86::VPACKUSWBrm, 0 }, + { X86::VPADDBrr, X86::VPADDBrm, 0 }, + { X86::VPADDDrr, X86::VPADDDrm, 0 }, + { X86::VPADDQrr, X86::VPADDQrm, 0 }, + { X86::VPADDSBrr, X86::VPADDSBrm, 0 }, + { X86::VPADDSWrr, X86::VPADDSWrm, 0 }, + { X86::VPADDUSBrr, X86::VPADDUSBrm, 0 }, + { X86::VPADDUSWrr, X86::VPADDUSWrm, 0 }, + { X86::VPADDWrr, X86::VPADDWrm, 0 }, + { X86::VPALIGNR128rr, X86::VPALIGNR128rm, 0 }, + { X86::VPANDNrr, X86::VPANDNrm, 0 }, + { X86::VPANDrr, X86::VPANDrm, 0 }, + { X86::VPAVGBrr, X86::VPAVGBrm, 0 }, + { X86::VPAVGWrr, X86::VPAVGWrm, 0 }, + { X86::VPBLENDVBrr, X86::VPBLENDVBrm, 0 }, + { X86::VPBLENDWrri, X86::VPBLENDWrmi, 0 }, + { X86::VPCLMULQDQrr, X86::VPCLMULQDQrm, 0 }, + { X86::VPCMPEQBrr, X86::VPCMPEQBrm, 0 }, + { X86::VPCMPEQDrr, X86::VPCMPEQDrm, 0 }, + { X86::VPCMPEQQrr, X86::VPCMPEQQrm, 0 }, + { X86::VPCMPEQWrr, X86::VPCMPEQWrm, 0 }, + { X86::VPCMPGTBrr, X86::VPCMPGTBrm, 0 }, + { X86::VPCMPGTDrr, X86::VPCMPGTDrm, 0 }, + { X86::VPCMPGTQrr, X86::VPCMPGTQrm, 0 }, + { X86::VPCMPGTWrr, X86::VPCMPGTWrm, 0 }, + { X86::VPHADDDrr, X86::VPHADDDrm, 0 }, + { X86::VPHADDSWrr128, X86::VPHADDSWrm128, 0 }, + { X86::VPHADDWrr, X86::VPHADDWrm, 0 }, + { X86::VPHSUBDrr, X86::VPHSUBDrm, 0 }, + { X86::VPHSUBSWrr128, X86::VPHSUBSWrm128, 0 }, + { X86::VPHSUBWrr, X86::VPHSUBWrm, 0 }, + { X86::VPERMILPDrr, X86::VPERMILPDrm, 0 }, + { X86::VPERMILPSrr, X86::VPERMILPSrm, 0 }, + { X86::VPINSRBrr, X86::VPINSRBrm, 0 }, + { X86::VPINSRDrr, X86::VPINSRDrm, 0 }, + { X86::VPINSRQrr, X86::VPINSRQrm, 0 }, + { X86::VPINSRWrri, X86::VPINSRWrmi, 0 }, + { X86::VPMADDUBSWrr128, X86::VPMADDUBSWrm128, 0 }, + { X86::VPMADDWDrr, X86::VPMADDWDrm, 0 }, + { X86::VPMAXSWrr, X86::VPMAXSWrm, 0 }, + { X86::VPMAXUBrr, X86::VPMAXUBrm, 0 }, + { X86::VPMINSWrr, X86::VPMINSWrm, 0 }, + { X86::VPMINUBrr, X86::VPMINUBrm, 0 }, + { X86::VPMINSBrr, X86::VPMINSBrm, 0 }, + { X86::VPMINSDrr, X86::VPMINSDrm, 0 }, + { X86::VPMINUDrr, X86::VPMINUDrm, 
0 }, + { X86::VPMINUWrr, X86::VPMINUWrm, 0 }, + { X86::VPMAXSBrr, X86::VPMAXSBrm, 0 }, + { X86::VPMAXSDrr, X86::VPMAXSDrm, 0 }, + { X86::VPMAXUDrr, X86::VPMAXUDrm, 0 }, + { X86::VPMAXUWrr, X86::VPMAXUWrm, 0 }, + { X86::VPMULDQrr, X86::VPMULDQrm, 0 }, + { X86::VPMULHRSWrr128, X86::VPMULHRSWrm128, 0 }, + { X86::VPMULHUWrr, X86::VPMULHUWrm, 0 }, + { X86::VPMULHWrr, X86::VPMULHWrm, 0 }, + { X86::VPMULLDrr, X86::VPMULLDrm, 0 }, + { X86::VPMULLWrr, X86::VPMULLWrm, 0 }, + { X86::VPMULUDQrr, X86::VPMULUDQrm, 0 }, + { X86::VPORrr, X86::VPORrm, 0 }, + { X86::VPSADBWrr, X86::VPSADBWrm, 0 }, + { X86::VPSHUFBrr, X86::VPSHUFBrm, 0 }, + { X86::VPSIGNBrr, X86::VPSIGNBrm, 0 }, + { X86::VPSIGNWrr, X86::VPSIGNWrm, 0 }, + { X86::VPSIGNDrr, X86::VPSIGNDrm, 0 }, + { X86::VPSLLDrr, X86::VPSLLDrm, 0 }, + { X86::VPSLLQrr, X86::VPSLLQrm, 0 }, + { X86::VPSLLWrr, X86::VPSLLWrm, 0 }, + { X86::VPSRADrr, X86::VPSRADrm, 0 }, + { X86::VPSRAWrr, X86::VPSRAWrm, 0 }, + { X86::VPSRLDrr, X86::VPSRLDrm, 0 }, + { X86::VPSRLQrr, X86::VPSRLQrm, 0 }, + { X86::VPSRLWrr, X86::VPSRLWrm, 0 }, + { X86::VPSUBBrr, X86::VPSUBBrm, 0 }, + { X86::VPSUBDrr, X86::VPSUBDrm, 0 }, + { X86::VPSUBQrr, X86::VPSUBQrm, 0 }, + { X86::VPSUBSBrr, X86::VPSUBSBrm, 0 }, + { X86::VPSUBSWrr, X86::VPSUBSWrm, 0 }, + { X86::VPSUBUSBrr, X86::VPSUBUSBrm, 0 }, + { X86::VPSUBUSWrr, X86::VPSUBUSWrm, 0 }, + { X86::VPSUBWrr, X86::VPSUBWrm, 0 }, + { X86::VPUNPCKHBWrr, X86::VPUNPCKHBWrm, 0 }, + { X86::VPUNPCKHDQrr, X86::VPUNPCKHDQrm, 0 }, + { X86::VPUNPCKHQDQrr, X86::VPUNPCKHQDQrm, 0 }, + { X86::VPUNPCKHWDrr, X86::VPUNPCKHWDrm, 0 }, + { X86::VPUNPCKLBWrr, X86::VPUNPCKLBWrm, 0 }, + { X86::VPUNPCKLDQrr, X86::VPUNPCKLDQrm, 0 }, + { X86::VPUNPCKLQDQrr, X86::VPUNPCKLQDQrm, 0 }, + { X86::VPUNPCKLWDrr, X86::VPUNPCKLWDrm, 0 }, + { X86::VPXORrr, X86::VPXORrm, 0 }, + { X86::VROUNDSDr, X86::VROUNDSDm, 0 }, + { X86::VROUNDSSr, X86::VROUNDSSm, 0 }, + { X86::VSHUFPDrri, X86::VSHUFPDrmi, 0 }, + { X86::VSHUFPSrri, X86::VSHUFPSrmi, 0 }, + { X86::VSUBPDrr, X86::VSUBPDrm, 0 }, + { X86::VSUBPSrr, X86::VSUBPSrm, 0 }, + { X86::VSUBSDrr, X86::VSUBSDrm, 0 }, + { X86::VSUBSDrr_Int, X86::VSUBSDrm_Int, 0 }, + { X86::VSUBSSrr, X86::VSUBSSrm, 0 }, + { X86::VSUBSSrr_Int, X86::VSUBSSrm_Int, 0 }, + { X86::VUNPCKHPDrr, X86::VUNPCKHPDrm, 0 }, + { X86::VUNPCKHPSrr, X86::VUNPCKHPSrm, 0 }, + { X86::VUNPCKLPDrr, X86::VUNPCKLPDrm, 0 }, + { X86::VUNPCKLPSrr, X86::VUNPCKLPSrm, 0 }, + { X86::VXORPDrr, X86::VXORPDrm, 0 }, + { X86::VXORPSrr, X86::VXORPSrm, 0 }, + + // AVX 256-bit foldable instructions + { X86::VADDPDYrr, X86::VADDPDYrm, 0 }, + { X86::VADDPSYrr, X86::VADDPSYrm, 0 }, + { X86::VADDSUBPDYrr, X86::VADDSUBPDYrm, 0 }, + { X86::VADDSUBPSYrr, X86::VADDSUBPSYrm, 0 }, + { X86::VANDNPDYrr, X86::VANDNPDYrm, 0 }, + { X86::VANDNPSYrr, X86::VANDNPSYrm, 0 }, + { X86::VANDPDYrr, X86::VANDPDYrm, 0 }, + { X86::VANDPSYrr, X86::VANDPSYrm, 0 }, + { X86::VBLENDPDYrri, X86::VBLENDPDYrmi, 0 }, + { X86::VBLENDPSYrri, X86::VBLENDPSYrmi, 0 }, + { X86::VBLENDVPDYrr, X86::VBLENDVPDYrm, 0 }, + { X86::VBLENDVPSYrr, X86::VBLENDVPSYrm, 0 }, + { X86::VCMPPDYrri, X86::VCMPPDYrmi, 0 }, + { X86::VCMPPSYrri, X86::VCMPPSYrmi, 0 }, + { X86::VDIVPDYrr, X86::VDIVPDYrm, 0 }, + { X86::VDIVPSYrr, X86::VDIVPSYrm, 0 }, + { X86::VDPPSYrri, X86::VDPPSYrmi, 0 }, + { X86::VHADDPDYrr, X86::VHADDPDYrm, 0 }, + { X86::VHADDPSYrr, X86::VHADDPSYrm, 0 }, + { X86::VHSUBPDYrr, X86::VHSUBPDYrm, 0 }, + { X86::VHSUBPSYrr, X86::VHSUBPSYrm, 0 }, + { X86::VINSERTF128rr, X86::VINSERTF128rm, 0 }, + { X86::VMAXPDYrr, X86::VMAXPDYrm, 0 }, + { X86::VMAXPSYrr, 
X86::VMAXPSYrm, 0 }, + { X86::VMINPDYrr, X86::VMINPDYrm, 0 }, + { X86::VMINPSYrr, X86::VMINPSYrm, 0 }, + { X86::VMULPDYrr, X86::VMULPDYrm, 0 }, + { X86::VMULPSYrr, X86::VMULPSYrm, 0 }, + { X86::VORPDYrr, X86::VORPDYrm, 0 }, + { X86::VORPSYrr, X86::VORPSYrm, 0 }, + { X86::VPERM2F128rr, X86::VPERM2F128rm, 0 }, + { X86::VPERMILPDYrr, X86::VPERMILPDYrm, 0 }, + { X86::VPERMILPSYrr, X86::VPERMILPSYrm, 0 }, + { X86::VSHUFPDYrri, X86::VSHUFPDYrmi, 0 }, + { X86::VSHUFPSYrri, X86::VSHUFPSYrmi, 0 }, + { X86::VSUBPDYrr, X86::VSUBPDYrm, 0 }, + { X86::VSUBPSYrr, X86::VSUBPSYrm, 0 }, + { X86::VUNPCKHPDYrr, X86::VUNPCKHPDYrm, 0 }, + { X86::VUNPCKHPSYrr, X86::VUNPCKHPSYrm, 0 }, + { X86::VUNPCKLPDYrr, X86::VUNPCKLPDYrm, 0 }, + { X86::VUNPCKLPSYrr, X86::VUNPCKLPSYrm, 0 }, + { X86::VXORPDYrr, X86::VXORPDYrm, 0 }, + { X86::VXORPSYrr, X86::VXORPSYrm, 0 }, + + // AVX2 foldable instructions + { X86::VINSERTI128rr, X86::VINSERTI128rm, 0 }, + { X86::VPACKSSDWYrr, X86::VPACKSSDWYrm, 0 }, + { X86::VPACKSSWBYrr, X86::VPACKSSWBYrm, 0 }, + { X86::VPACKUSDWYrr, X86::VPACKUSDWYrm, 0 }, + { X86::VPACKUSWBYrr, X86::VPACKUSWBYrm, 0 }, + { X86::VPADDBYrr, X86::VPADDBYrm, 0 }, + { X86::VPADDDYrr, X86::VPADDDYrm, 0 }, + { X86::VPADDQYrr, X86::VPADDQYrm, 0 }, + { X86::VPADDSBYrr, X86::VPADDSBYrm, 0 }, + { X86::VPADDSWYrr, X86::VPADDSWYrm, 0 }, + { X86::VPADDUSBYrr, X86::VPADDUSBYrm, 0 }, + { X86::VPADDUSWYrr, X86::VPADDUSWYrm, 0 }, + { X86::VPADDWYrr, X86::VPADDWYrm, 0 }, + { X86::VPALIGNR256rr, X86::VPALIGNR256rm, 0 }, + { X86::VPANDNYrr, X86::VPANDNYrm, 0 }, + { X86::VPANDYrr, X86::VPANDYrm, 0 }, + { X86::VPAVGBYrr, X86::VPAVGBYrm, 0 }, + { X86::VPAVGWYrr, X86::VPAVGWYrm, 0 }, + { X86::VPBLENDDrri, X86::VPBLENDDrmi, 0 }, + { X86::VPBLENDDYrri, X86::VPBLENDDYrmi, 0 }, + { X86::VPBLENDVBYrr, X86::VPBLENDVBYrm, 0 }, + { X86::VPBLENDWYrri, X86::VPBLENDWYrmi, 0 }, + { X86::VPCMPEQBYrr, X86::VPCMPEQBYrm, 0 }, + { X86::VPCMPEQDYrr, X86::VPCMPEQDYrm, 0 }, + { X86::VPCMPEQQYrr, X86::VPCMPEQQYrm, 0 }, + { X86::VPCMPEQWYrr, X86::VPCMPEQWYrm, 0 }, + { X86::VPCMPGTBYrr, X86::VPCMPGTBYrm, 0 }, + { X86::VPCMPGTDYrr, X86::VPCMPGTDYrm, 0 }, + { X86::VPCMPGTQYrr, X86::VPCMPGTQYrm, 0 }, + { X86::VPCMPGTWYrr, X86::VPCMPGTWYrm, 0 }, + { X86::VPERM2I128rr, X86::VPERM2I128rm, 0 }, + { X86::VPERMDYrr, X86::VPERMDYrm, 0 }, + { X86::VPERMPSYrr, X86::VPERMPSYrm, 0 }, + { X86::VPHADDDYrr, X86::VPHADDDYrm, 0 }, + { X86::VPHADDSWrr256, X86::VPHADDSWrm256, 0 }, + { X86::VPHADDWYrr, X86::VPHADDWYrm, 0 }, + { X86::VPHSUBDYrr, X86::VPHSUBDYrm, 0 }, + { X86::VPHSUBSWrr256, X86::VPHSUBSWrm256, 0 }, + { X86::VPHSUBWYrr, X86::VPHSUBWYrm, 0 }, + { X86::VPMADDUBSWrr256, X86::VPMADDUBSWrm256, 0 }, + { X86::VPMADDWDYrr, X86::VPMADDWDYrm, 0 }, + { X86::VPMAXSWYrr, X86::VPMAXSWYrm, 0 }, + { X86::VPMAXUBYrr, X86::VPMAXUBYrm, 0 }, + { X86::VPMINSWYrr, X86::VPMINSWYrm, 0 }, + { X86::VPMINUBYrr, X86::VPMINUBYrm, 0 }, + { X86::VPMINSBYrr, X86::VPMINSBYrm, 0 }, + { X86::VPMINSDYrr, X86::VPMINSDYrm, 0 }, + { X86::VPMINUDYrr, X86::VPMINUDYrm, 0 }, + { X86::VPMINUWYrr, X86::VPMINUWYrm, 0 }, + { X86::VPMAXSBYrr, X86::VPMAXSBYrm, 0 }, + { X86::VPMAXSDYrr, X86::VPMAXSDYrm, 0 }, + { X86::VPMAXUDYrr, X86::VPMAXUDYrm, 0 }, + { X86::VPMAXUWYrr, X86::VPMAXUWYrm, 0 }, + { X86::VMPSADBWYrri, X86::VMPSADBWYrmi, 0 }, + { X86::VPMULDQYrr, X86::VPMULDQYrm, 0 }, + { X86::VPMULHRSWrr256, X86::VPMULHRSWrm256, 0 }, + { X86::VPMULHUWYrr, X86::VPMULHUWYrm, 0 }, + { X86::VPMULHWYrr, X86::VPMULHWYrm, 0 }, + { X86::VPMULLDYrr, X86::VPMULLDYrm, 0 }, + { X86::VPMULLWYrr, X86::VPMULLWYrm, 0 }, + { 
X86::VPMULUDQYrr, X86::VPMULUDQYrm, 0 }, + { X86::VPORYrr, X86::VPORYrm, 0 }, + { X86::VPSADBWYrr, X86::VPSADBWYrm, 0 }, + { X86::VPSHUFBYrr, X86::VPSHUFBYrm, 0 }, + { X86::VPSIGNBYrr, X86::VPSIGNBYrm, 0 }, + { X86::VPSIGNWYrr, X86::VPSIGNWYrm, 0 }, + { X86::VPSIGNDYrr, X86::VPSIGNDYrm, 0 }, + { X86::VPSLLDYrr, X86::VPSLLDYrm, 0 }, + { X86::VPSLLQYrr, X86::VPSLLQYrm, 0 }, + { X86::VPSLLWYrr, X86::VPSLLWYrm, 0 }, + { X86::VPSLLVDrr, X86::VPSLLVDrm, 0 }, + { X86::VPSLLVDYrr, X86::VPSLLVDYrm, 0 }, + { X86::VPSLLVQrr, X86::VPSLLVQrm, 0 }, + { X86::VPSLLVQYrr, X86::VPSLLVQYrm, 0 }, + { X86::VPSRADYrr, X86::VPSRADYrm, 0 }, + { X86::VPSRAWYrr, X86::VPSRAWYrm, 0 }, + { X86::VPSRAVDrr, X86::VPSRAVDrm, 0 }, + { X86::VPSRAVDYrr, X86::VPSRAVDYrm, 0 }, + { X86::VPSRLDYrr, X86::VPSRLDYrm, 0 }, + { X86::VPSRLQYrr, X86::VPSRLQYrm, 0 }, + { X86::VPSRLWYrr, X86::VPSRLWYrm, 0 }, + { X86::VPSRLVDrr, X86::VPSRLVDrm, 0 }, + { X86::VPSRLVDYrr, X86::VPSRLVDYrm, 0 }, + { X86::VPSRLVQrr, X86::VPSRLVQrm, 0 }, + { X86::VPSRLVQYrr, X86::VPSRLVQYrm, 0 }, + { X86::VPSUBBYrr, X86::VPSUBBYrm, 0 }, + { X86::VPSUBDYrr, X86::VPSUBDYrm, 0 }, + { X86::VPSUBQYrr, X86::VPSUBQYrm, 0 }, + { X86::VPSUBSBYrr, X86::VPSUBSBYrm, 0 }, + { X86::VPSUBSWYrr, X86::VPSUBSWYrm, 0 }, + { X86::VPSUBUSBYrr, X86::VPSUBUSBYrm, 0 }, + { X86::VPSUBUSWYrr, X86::VPSUBUSWYrm, 0 }, + { X86::VPSUBWYrr, X86::VPSUBWYrm, 0 }, + { X86::VPUNPCKHBWYrr, X86::VPUNPCKHBWYrm, 0 }, + { X86::VPUNPCKHDQYrr, X86::VPUNPCKHDQYrm, 0 }, + { X86::VPUNPCKHQDQYrr, X86::VPUNPCKHQDQYrm, 0 }, + { X86::VPUNPCKHWDYrr, X86::VPUNPCKHWDYrm, 0 }, + { X86::VPUNPCKLBWYrr, X86::VPUNPCKLBWYrm, 0 }, + { X86::VPUNPCKLDQYrr, X86::VPUNPCKLDQYrm, 0 }, + { X86::VPUNPCKLQDQYrr, X86::VPUNPCKLQDQYrm, 0 }, + { X86::VPUNPCKLWDYrr, X86::VPUNPCKLWDYrm, 0 }, + { X86::VPXORYrr, X86::VPXORYrm, 0 }, + + // FMA4 foldable patterns + { X86::VFMADDSS4rr, X86::VFMADDSS4mr, TB_ALIGN_NONE }, + { X86::VFMADDSD4rr, X86::VFMADDSD4mr, TB_ALIGN_NONE }, + { X86::VFMADDPS4rr, X86::VFMADDPS4mr, TB_ALIGN_NONE }, + { X86::VFMADDPD4rr, X86::VFMADDPD4mr, TB_ALIGN_NONE }, + { X86::VFMADDPS4rrY, X86::VFMADDPS4mrY, TB_ALIGN_NONE }, + { X86::VFMADDPD4rrY, X86::VFMADDPD4mrY, TB_ALIGN_NONE }, + { X86::VFNMADDSS4rr, X86::VFNMADDSS4mr, TB_ALIGN_NONE }, + { X86::VFNMADDSD4rr, X86::VFNMADDSD4mr, TB_ALIGN_NONE }, + { X86::VFNMADDPS4rr, X86::VFNMADDPS4mr, TB_ALIGN_NONE }, + { X86::VFNMADDPD4rr, X86::VFNMADDPD4mr, TB_ALIGN_NONE }, + { X86::VFNMADDPS4rrY, X86::VFNMADDPS4mrY, TB_ALIGN_NONE }, + { X86::VFNMADDPD4rrY, X86::VFNMADDPD4mrY, TB_ALIGN_NONE }, + { X86::VFMSUBSS4rr, X86::VFMSUBSS4mr, TB_ALIGN_NONE }, + { X86::VFMSUBSD4rr, X86::VFMSUBSD4mr, TB_ALIGN_NONE }, + { X86::VFMSUBPS4rr, X86::VFMSUBPS4mr, TB_ALIGN_NONE }, + { X86::VFMSUBPD4rr, X86::VFMSUBPD4mr, TB_ALIGN_NONE }, + { X86::VFMSUBPS4rrY, X86::VFMSUBPS4mrY, TB_ALIGN_NONE }, + { X86::VFMSUBPD4rrY, X86::VFMSUBPD4mrY, TB_ALIGN_NONE }, + { X86::VFNMSUBSS4rr, X86::VFNMSUBSS4mr, TB_ALIGN_NONE }, + { X86::VFNMSUBSD4rr, X86::VFNMSUBSD4mr, TB_ALIGN_NONE }, + { X86::VFNMSUBPS4rr, X86::VFNMSUBPS4mr, TB_ALIGN_NONE }, + { X86::VFNMSUBPD4rr, X86::VFNMSUBPD4mr, TB_ALIGN_NONE }, + { X86::VFNMSUBPS4rrY, X86::VFNMSUBPS4mrY, TB_ALIGN_NONE }, + { X86::VFNMSUBPD4rrY, X86::VFNMSUBPD4mrY, TB_ALIGN_NONE }, + { X86::VFMADDSUBPS4rr, X86::VFMADDSUBPS4mr, TB_ALIGN_NONE }, + { X86::VFMADDSUBPD4rr, X86::VFMADDSUBPD4mr, TB_ALIGN_NONE }, + { X86::VFMADDSUBPS4rrY, X86::VFMADDSUBPS4mrY, TB_ALIGN_NONE }, + { X86::VFMADDSUBPD4rrY, X86::VFMADDSUBPD4mrY, TB_ALIGN_NONE }, + { X86::VFMSUBADDPS4rr, 
X86::VFMSUBADDPS4mr, TB_ALIGN_NONE }, + { X86::VFMSUBADDPD4rr, X86::VFMSUBADDPD4mr, TB_ALIGN_NONE }, + { X86::VFMSUBADDPS4rrY, X86::VFMSUBADDPS4mrY, TB_ALIGN_NONE }, + { X86::VFMSUBADDPD4rrY, X86::VFMSUBADDPD4mrY, TB_ALIGN_NONE }, + + // XOP foldable instructions + { X86::VPCMOVrr, X86::VPCMOVmr, 0 }, + { X86::VPCMOVrrY, X86::VPCMOVmrY, 0 }, + { X86::VPCOMBri, X86::VPCOMBmi, 0 }, + { X86::VPCOMDri, X86::VPCOMDmi, 0 }, + { X86::VPCOMQri, X86::VPCOMQmi, 0 }, + { X86::VPCOMWri, X86::VPCOMWmi, 0 }, + { X86::VPCOMUBri, X86::VPCOMUBmi, 0 }, + { X86::VPCOMUDri, X86::VPCOMUDmi, 0 }, + { X86::VPCOMUQri, X86::VPCOMUQmi, 0 }, + { X86::VPCOMUWri, X86::VPCOMUWmi, 0 }, + { X86::VPERMIL2PDrr, X86::VPERMIL2PDmr, 0 }, + { X86::VPERMIL2PDrrY, X86::VPERMIL2PDmrY, 0 }, + { X86::VPERMIL2PSrr, X86::VPERMIL2PSmr, 0 }, + { X86::VPERMIL2PSrrY, X86::VPERMIL2PSmrY, 0 }, + { X86::VPMACSDDrr, X86::VPMACSDDrm, 0 }, + { X86::VPMACSDQHrr, X86::VPMACSDQHrm, 0 }, + { X86::VPMACSDQLrr, X86::VPMACSDQLrm, 0 }, + { X86::VPMACSSDDrr, X86::VPMACSSDDrm, 0 }, + { X86::VPMACSSDQHrr, X86::VPMACSSDQHrm, 0 }, + { X86::VPMACSSDQLrr, X86::VPMACSSDQLrm, 0 }, + { X86::VPMACSSWDrr, X86::VPMACSSWDrm, 0 }, + { X86::VPMACSSWWrr, X86::VPMACSSWWrm, 0 }, + { X86::VPMACSWDrr, X86::VPMACSWDrm, 0 }, + { X86::VPMACSWWrr, X86::VPMACSWWrm, 0 }, + { X86::VPMADCSSWDrr, X86::VPMADCSSWDrm, 0 }, + { X86::VPMADCSWDrr, X86::VPMADCSWDrm, 0 }, + { X86::VPPERMrr, X86::VPPERMmr, 0 }, + { X86::VPROTBrr, X86::VPROTBrm, 0 }, + { X86::VPROTDrr, X86::VPROTDrm, 0 }, + { X86::VPROTQrr, X86::VPROTQrm, 0 }, + { X86::VPROTWrr, X86::VPROTWrm, 0 }, + { X86::VPSHABrr, X86::VPSHABrm, 0 }, + { X86::VPSHADrr, X86::VPSHADrm, 0 }, + { X86::VPSHAQrr, X86::VPSHAQrm, 0 }, + { X86::VPSHAWrr, X86::VPSHAWrm, 0 }, + { X86::VPSHLBrr, X86::VPSHLBrm, 0 }, + { X86::VPSHLDrr, X86::VPSHLDrm, 0 }, + { X86::VPSHLQrr, X86::VPSHLQrm, 0 }, + { X86::VPSHLWrr, X86::VPSHLWrm, 0 }, + + // BMI/BMI2 foldable instructions + { X86::ANDN32rr, X86::ANDN32rm, 0 }, + { X86::ANDN64rr, X86::ANDN64rm, 0 }, + { X86::MULX32rr, X86::MULX32rm, 0 }, + { X86::MULX64rr, X86::MULX64rm, 0 }, + { X86::PDEP32rr, X86::PDEP32rm, 0 }, + { X86::PDEP64rr, X86::PDEP64rm, 0 }, + { X86::PEXT32rr, X86::PEXT32rm, 0 }, + { X86::PEXT64rr, X86::PEXT64rm, 0 }, + + // ADX foldable instructions + { X86::ADCX32rr, X86::ADCX32rm, 0 }, + { X86::ADCX64rr, X86::ADCX64rm, 0 }, + { X86::ADOX32rr, X86::ADOX32rm, 0 }, + { X86::ADOX64rr, X86::ADOX64rm, 0 }, + + // AVX-512 foldable instructions + { X86::VADDPSZrr, X86::VADDPSZrm, 0 }, + { X86::VADDPDZrr, X86::VADDPDZrm, 0 }, + { X86::VSUBPSZrr, X86::VSUBPSZrm, 0 }, + { X86::VSUBPDZrr, X86::VSUBPDZrm, 0 }, + { X86::VMULPSZrr, X86::VMULPSZrm, 0 }, + { X86::VMULPDZrr, X86::VMULPDZrm, 0 }, + { X86::VDIVPSZrr, X86::VDIVPSZrm, 0 }, + { X86::VDIVPDZrr, X86::VDIVPDZrm, 0 }, + { X86::VMINPSZrr, X86::VMINPSZrm, 0 }, + { X86::VMINPDZrr, X86::VMINPDZrm, 0 }, + { X86::VMAXPSZrr, X86::VMAXPSZrm, 0 }, + { X86::VMAXPDZrr, X86::VMAXPDZrm, 0 }, + { X86::VPADDDZrr, X86::VPADDDZrm, 0 }, + { X86::VPADDQZrr, X86::VPADDQZrm, 0 }, + { X86::VPERMPDZri, X86::VPERMPDZmi, 0 }, + { X86::VPERMPSZrr, X86::VPERMPSZrm, 0 }, + { X86::VPMAXSDZrr, X86::VPMAXSDZrm, 0 }, + { X86::VPMAXSQZrr, X86::VPMAXSQZrm, 0 }, + { X86::VPMAXUDZrr, X86::VPMAXUDZrm, 0 }, + { X86::VPMAXUQZrr, X86::VPMAXUQZrm, 0 }, + { X86::VPMINSDZrr, X86::VPMINSDZrm, 0 }, + { X86::VPMINSQZrr, X86::VPMINSQZrm, 0 }, + { X86::VPMINUDZrr, X86::VPMINUDZrm, 0 }, + { X86::VPMINUQZrr, X86::VPMINUQZrm, 0 }, + { X86::VPMULDQZrr, X86::VPMULDQZrm, 0 }, + { X86::VPSLLVDZrr, 
X86::VPSLLVDZrm, 0 }, + { X86::VPSLLVQZrr, X86::VPSLLVQZrm, 0 }, + { X86::VPSRAVDZrr, X86::VPSRAVDZrm, 0 }, + { X86::VPSRLVDZrr, X86::VPSRLVDZrm, 0 }, + { X86::VPSRLVQZrr, X86::VPSRLVQZrm, 0 }, + { X86::VPSUBDZrr, X86::VPSUBDZrm, 0 }, + { X86::VPSUBQZrr, X86::VPSUBQZrm, 0 }, + { X86::VSHUFPDZrri, X86::VSHUFPDZrmi, 0 }, + { X86::VSHUFPSZrri, X86::VSHUFPSZrmi, 0 }, + { X86::VALIGNQZrri, X86::VALIGNQZrmi, 0 }, + { X86::VALIGNDZrri, X86::VALIGNDZrmi, 0 }, + { X86::VPMULUDQZrr, X86::VPMULUDQZrm, 0 }, + { X86::VBROADCASTSSZrkz, X86::VBROADCASTSSZmkz, TB_NO_REVERSE }, + { X86::VBROADCASTSDZrkz, X86::VBROADCASTSDZmkz, TB_NO_REVERSE }, + + // AVX-512{F,VL} foldable instructions + { X86::VBROADCASTSSZ256rkz, X86::VBROADCASTSSZ256mkz, TB_NO_REVERSE }, + { X86::VBROADCASTSDZ256rkz, X86::VBROADCASTSDZ256mkz, TB_NO_REVERSE }, + { X86::VBROADCASTSSZ128rkz, X86::VBROADCASTSSZ128mkz, TB_NO_REVERSE }, + + // AVX-512{F,VL} foldable instructions + { X86::VADDPDZ128rr, X86::VADDPDZ128rm, 0 }, + { X86::VADDPDZ256rr, X86::VADDPDZ256rm, 0 }, + { X86::VADDPSZ128rr, X86::VADDPSZ128rm, 0 }, + { X86::VADDPSZ256rr, X86::VADDPSZ256rm, 0 }, + + // AES foldable instructions + { X86::AESDECLASTrr, X86::AESDECLASTrm, TB_ALIGN_16 }, + { X86::AESDECrr, X86::AESDECrm, TB_ALIGN_16 }, + { X86::AESENCLASTrr, X86::AESENCLASTrm, TB_ALIGN_16 }, + { X86::AESENCrr, X86::AESENCrm, TB_ALIGN_16 }, + { X86::VAESDECLASTrr, X86::VAESDECLASTrm, 0 }, + { X86::VAESDECrr, X86::VAESDECrm, 0 }, + { X86::VAESENCLASTrr, X86::VAESENCLASTrm, 0 }, + { X86::VAESENCrr, X86::VAESENCrm, 0 }, + + // SHA foldable instructions + { X86::SHA1MSG1rr, X86::SHA1MSG1rm, TB_ALIGN_16 }, + { X86::SHA1MSG2rr, X86::SHA1MSG2rm, TB_ALIGN_16 }, + { X86::SHA1NEXTErr, X86::SHA1NEXTErm, TB_ALIGN_16 }, + { X86::SHA1RNDS4rri, X86::SHA1RNDS4rmi, TB_ALIGN_16 }, + { X86::SHA256MSG1rr, X86::SHA256MSG1rm, TB_ALIGN_16 }, + { X86::SHA256MSG2rr, X86::SHA256MSG2rm, TB_ALIGN_16 }, + { X86::SHA256RNDS2rr, X86::SHA256RNDS2rm, TB_ALIGN_16 } + }; + + for (X86MemoryFoldTableEntry Entry : MemoryFoldTable2) { + AddTableEntry(RegOp2MemOpTable2, MemOp2RegOpTable, + Entry.RegOp, Entry.MemOp, + // Index 2, folded load + Entry.Flags | TB_INDEX_2 | TB_FOLDED_LOAD); + } + + static const X86MemoryFoldTableEntry MemoryFoldTable3[] = { + // FMA foldable instructions + { X86::VFMADDSSr231r, X86::VFMADDSSr231m, TB_ALIGN_NONE }, + { X86::VFMADDSSr231r_Int, X86::VFMADDSSr231m_Int, TB_ALIGN_NONE }, + { X86::VFMADDSDr231r, X86::VFMADDSDr231m, TB_ALIGN_NONE }, + { X86::VFMADDSDr231r_Int, X86::VFMADDSDr231m_Int, TB_ALIGN_NONE }, + { X86::VFMADDSSr132r, X86::VFMADDSSr132m, TB_ALIGN_NONE }, + { X86::VFMADDSSr132r_Int, X86::VFMADDSSr132m_Int, TB_ALIGN_NONE }, + { X86::VFMADDSDr132r, X86::VFMADDSDr132m, TB_ALIGN_NONE }, + { X86::VFMADDSDr132r_Int, X86::VFMADDSDr132m_Int, TB_ALIGN_NONE }, + { X86::VFMADDSSr213r, X86::VFMADDSSr213m, TB_ALIGN_NONE }, + { X86::VFMADDSSr213r_Int, X86::VFMADDSSr213m_Int, TB_ALIGN_NONE }, + { X86::VFMADDSDr213r, X86::VFMADDSDr213m, TB_ALIGN_NONE }, + { X86::VFMADDSDr213r_Int, X86::VFMADDSDr213m_Int, TB_ALIGN_NONE }, + + { X86::VFMADDPSr231r, X86::VFMADDPSr231m, TB_ALIGN_NONE }, + { X86::VFMADDPDr231r, X86::VFMADDPDr231m, TB_ALIGN_NONE }, + { X86::VFMADDPSr132r, X86::VFMADDPSr132m, TB_ALIGN_NONE }, + { X86::VFMADDPDr132r, X86::VFMADDPDr132m, TB_ALIGN_NONE }, + { X86::VFMADDPSr213r, X86::VFMADDPSr213m, TB_ALIGN_NONE }, + { X86::VFMADDPDr213r, X86::VFMADDPDr213m, TB_ALIGN_NONE }, + { X86::VFMADDPSr231rY, X86::VFMADDPSr231mY, TB_ALIGN_NONE }, + { X86::VFMADDPDr231rY, X86::VFMADDPDr231mY, 
TB_ALIGN_NONE }, + { X86::VFMADDPSr132rY, X86::VFMADDPSr132mY, TB_ALIGN_NONE }, + { X86::VFMADDPDr132rY, X86::VFMADDPDr132mY, TB_ALIGN_NONE }, + { X86::VFMADDPSr213rY, X86::VFMADDPSr213mY, TB_ALIGN_NONE }, + { X86::VFMADDPDr213rY, X86::VFMADDPDr213mY, TB_ALIGN_NONE }, + + { X86::VFNMADDSSr231r, X86::VFNMADDSSr231m, TB_ALIGN_NONE }, + { X86::VFNMADDSSr231r_Int, X86::VFNMADDSSr231m_Int, TB_ALIGN_NONE }, + { X86::VFNMADDSDr231r, X86::VFNMADDSDr231m, TB_ALIGN_NONE }, + { X86::VFNMADDSDr231r_Int, X86::VFNMADDSDr231m_Int, TB_ALIGN_NONE }, + { X86::VFNMADDSSr132r, X86::VFNMADDSSr132m, TB_ALIGN_NONE }, + { X86::VFNMADDSSr132r_Int, X86::VFNMADDSSr132m_Int, TB_ALIGN_NONE }, + { X86::VFNMADDSDr132r, X86::VFNMADDSDr132m, TB_ALIGN_NONE }, + { X86::VFNMADDSDr132r_Int, X86::VFNMADDSDr132m_Int, TB_ALIGN_NONE }, + { X86::VFNMADDSSr213r, X86::VFNMADDSSr213m, TB_ALIGN_NONE }, + { X86::VFNMADDSSr213r_Int, X86::VFNMADDSSr213m_Int, TB_ALIGN_NONE }, + { X86::VFNMADDSDr213r, X86::VFNMADDSDr213m, TB_ALIGN_NONE }, + { X86::VFNMADDSDr213r_Int, X86::VFNMADDSDr213m_Int, TB_ALIGN_NONE }, + + { X86::VFNMADDPSr231r, X86::VFNMADDPSr231m, TB_ALIGN_NONE }, + { X86::VFNMADDPDr231r, X86::VFNMADDPDr231m, TB_ALIGN_NONE }, + { X86::VFNMADDPSr132r, X86::VFNMADDPSr132m, TB_ALIGN_NONE }, + { X86::VFNMADDPDr132r, X86::VFNMADDPDr132m, TB_ALIGN_NONE }, + { X86::VFNMADDPSr213r, X86::VFNMADDPSr213m, TB_ALIGN_NONE }, + { X86::VFNMADDPDr213r, X86::VFNMADDPDr213m, TB_ALIGN_NONE }, + { X86::VFNMADDPSr231rY, X86::VFNMADDPSr231mY, TB_ALIGN_NONE }, + { X86::VFNMADDPDr231rY, X86::VFNMADDPDr231mY, TB_ALIGN_NONE }, + { X86::VFNMADDPSr132rY, X86::VFNMADDPSr132mY, TB_ALIGN_NONE }, + { X86::VFNMADDPDr132rY, X86::VFNMADDPDr132mY, TB_ALIGN_NONE }, + { X86::VFNMADDPSr213rY, X86::VFNMADDPSr213mY, TB_ALIGN_NONE }, + { X86::VFNMADDPDr213rY, X86::VFNMADDPDr213mY, TB_ALIGN_NONE }, + + { X86::VFMSUBSSr231r, X86::VFMSUBSSr231m, TB_ALIGN_NONE }, + { X86::VFMSUBSSr231r_Int, X86::VFMSUBSSr231m_Int, TB_ALIGN_NONE }, + { X86::VFMSUBSDr231r, X86::VFMSUBSDr231m, TB_ALIGN_NONE }, + { X86::VFMSUBSDr231r_Int, X86::VFMSUBSDr231m_Int, TB_ALIGN_NONE }, + { X86::VFMSUBSSr132r, X86::VFMSUBSSr132m, TB_ALIGN_NONE }, + { X86::VFMSUBSSr132r_Int, X86::VFMSUBSSr132m_Int, TB_ALIGN_NONE }, + { X86::VFMSUBSDr132r, X86::VFMSUBSDr132m, TB_ALIGN_NONE }, + { X86::VFMSUBSDr132r_Int, X86::VFMSUBSDr132m_Int, TB_ALIGN_NONE }, + { X86::VFMSUBSSr213r, X86::VFMSUBSSr213m, TB_ALIGN_NONE }, + { X86::VFMSUBSSr213r_Int, X86::VFMSUBSSr213m_Int, TB_ALIGN_NONE }, + { X86::VFMSUBSDr213r, X86::VFMSUBSDr213m, TB_ALIGN_NONE }, + { X86::VFMSUBSDr213r_Int, X86::VFMSUBSDr213m_Int, TB_ALIGN_NONE }, + + { X86::VFMSUBPSr231r, X86::VFMSUBPSr231m, TB_ALIGN_NONE }, + { X86::VFMSUBPDr231r, X86::VFMSUBPDr231m, TB_ALIGN_NONE }, + { X86::VFMSUBPSr132r, X86::VFMSUBPSr132m, TB_ALIGN_NONE }, + { X86::VFMSUBPDr132r, X86::VFMSUBPDr132m, TB_ALIGN_NONE }, + { X86::VFMSUBPSr213r, X86::VFMSUBPSr213m, TB_ALIGN_NONE }, + { X86::VFMSUBPDr213r, X86::VFMSUBPDr213m, TB_ALIGN_NONE }, + { X86::VFMSUBPSr231rY, X86::VFMSUBPSr231mY, TB_ALIGN_NONE }, + { X86::VFMSUBPDr231rY, X86::VFMSUBPDr231mY, TB_ALIGN_NONE }, + { X86::VFMSUBPSr132rY, X86::VFMSUBPSr132mY, TB_ALIGN_NONE }, + { X86::VFMSUBPDr132rY, X86::VFMSUBPDr132mY, TB_ALIGN_NONE }, + { X86::VFMSUBPSr213rY, X86::VFMSUBPSr213mY, TB_ALIGN_NONE }, + { X86::VFMSUBPDr213rY, X86::VFMSUBPDr213mY, TB_ALIGN_NONE }, + + { X86::VFNMSUBSSr231r, X86::VFNMSUBSSr231m, TB_ALIGN_NONE }, + { X86::VFNMSUBSSr231r_Int, X86::VFNMSUBSSr231m_Int, TB_ALIGN_NONE }, + { X86::VFNMSUBSDr231r, X86::VFNMSUBSDr231m, 
TB_ALIGN_NONE }, + { X86::VFNMSUBSDr231r_Int, X86::VFNMSUBSDr231m_Int, TB_ALIGN_NONE }, + { X86::VFNMSUBSSr132r, X86::VFNMSUBSSr132m, TB_ALIGN_NONE }, + { X86::VFNMSUBSSr132r_Int, X86::VFNMSUBSSr132m_Int, TB_ALIGN_NONE }, + { X86::VFNMSUBSDr132r, X86::VFNMSUBSDr132m, TB_ALIGN_NONE }, + { X86::VFNMSUBSDr132r_Int, X86::VFNMSUBSDr132m_Int, TB_ALIGN_NONE }, + { X86::VFNMSUBSSr213r, X86::VFNMSUBSSr213m, TB_ALIGN_NONE }, + { X86::VFNMSUBSSr213r_Int, X86::VFNMSUBSSr213m_Int, TB_ALIGN_NONE }, + { X86::VFNMSUBSDr213r, X86::VFNMSUBSDr213m, TB_ALIGN_NONE }, + { X86::VFNMSUBSDr213r_Int, X86::VFNMSUBSDr213m_Int, TB_ALIGN_NONE }, + + { X86::VFNMSUBPSr231r, X86::VFNMSUBPSr231m, TB_ALIGN_NONE }, + { X86::VFNMSUBPDr231r, X86::VFNMSUBPDr231m, TB_ALIGN_NONE }, + { X86::VFNMSUBPSr132r, X86::VFNMSUBPSr132m, TB_ALIGN_NONE }, + { X86::VFNMSUBPDr132r, X86::VFNMSUBPDr132m, TB_ALIGN_NONE }, + { X86::VFNMSUBPSr213r, X86::VFNMSUBPSr213m, TB_ALIGN_NONE }, + { X86::VFNMSUBPDr213r, X86::VFNMSUBPDr213m, TB_ALIGN_NONE }, + { X86::VFNMSUBPSr231rY, X86::VFNMSUBPSr231mY, TB_ALIGN_NONE }, + { X86::VFNMSUBPDr231rY, X86::VFNMSUBPDr231mY, TB_ALIGN_NONE }, + { X86::VFNMSUBPSr132rY, X86::VFNMSUBPSr132mY, TB_ALIGN_NONE }, + { X86::VFNMSUBPDr132rY, X86::VFNMSUBPDr132mY, TB_ALIGN_NONE }, + { X86::VFNMSUBPSr213rY, X86::VFNMSUBPSr213mY, TB_ALIGN_NONE }, + { X86::VFNMSUBPDr213rY, X86::VFNMSUBPDr213mY, TB_ALIGN_NONE }, + + { X86::VFMADDSUBPSr231r, X86::VFMADDSUBPSr231m, TB_ALIGN_NONE }, + { X86::VFMADDSUBPDr231r, X86::VFMADDSUBPDr231m, TB_ALIGN_NONE }, + { X86::VFMADDSUBPSr132r, X86::VFMADDSUBPSr132m, TB_ALIGN_NONE }, + { X86::VFMADDSUBPDr132r, X86::VFMADDSUBPDr132m, TB_ALIGN_NONE }, + { X86::VFMADDSUBPSr213r, X86::VFMADDSUBPSr213m, TB_ALIGN_NONE }, + { X86::VFMADDSUBPDr213r, X86::VFMADDSUBPDr213m, TB_ALIGN_NONE }, + { X86::VFMADDSUBPSr231rY, X86::VFMADDSUBPSr231mY, TB_ALIGN_NONE }, + { X86::VFMADDSUBPDr231rY, X86::VFMADDSUBPDr231mY, TB_ALIGN_NONE }, + { X86::VFMADDSUBPSr132rY, X86::VFMADDSUBPSr132mY, TB_ALIGN_NONE }, + { X86::VFMADDSUBPDr132rY, X86::VFMADDSUBPDr132mY, TB_ALIGN_NONE }, + { X86::VFMADDSUBPSr213rY, X86::VFMADDSUBPSr213mY, TB_ALIGN_NONE }, + { X86::VFMADDSUBPDr213rY, X86::VFMADDSUBPDr213mY, TB_ALIGN_NONE }, + + { X86::VFMSUBADDPSr231r, X86::VFMSUBADDPSr231m, TB_ALIGN_NONE }, + { X86::VFMSUBADDPDr231r, X86::VFMSUBADDPDr231m, TB_ALIGN_NONE }, + { X86::VFMSUBADDPSr132r, X86::VFMSUBADDPSr132m, TB_ALIGN_NONE }, + { X86::VFMSUBADDPDr132r, X86::VFMSUBADDPDr132m, TB_ALIGN_NONE }, + { X86::VFMSUBADDPSr213r, X86::VFMSUBADDPSr213m, TB_ALIGN_NONE }, + { X86::VFMSUBADDPDr213r, X86::VFMSUBADDPDr213m, TB_ALIGN_NONE }, + { X86::VFMSUBADDPSr231rY, X86::VFMSUBADDPSr231mY, TB_ALIGN_NONE }, + { X86::VFMSUBADDPDr231rY, X86::VFMSUBADDPDr231mY, TB_ALIGN_NONE }, + { X86::VFMSUBADDPSr132rY, X86::VFMSUBADDPSr132mY, TB_ALIGN_NONE }, + { X86::VFMSUBADDPDr132rY, X86::VFMSUBADDPDr132mY, TB_ALIGN_NONE }, + { X86::VFMSUBADDPSr213rY, X86::VFMSUBADDPSr213mY, TB_ALIGN_NONE }, + { X86::VFMSUBADDPDr213rY, X86::VFMSUBADDPDr213mY, TB_ALIGN_NONE }, + + // FMA4 foldable patterns + { X86::VFMADDSS4rr, X86::VFMADDSS4rm, TB_ALIGN_NONE }, + { X86::VFMADDSD4rr, X86::VFMADDSD4rm, TB_ALIGN_NONE }, + { X86::VFMADDPS4rr, X86::VFMADDPS4rm, TB_ALIGN_NONE }, + { X86::VFMADDPD4rr, X86::VFMADDPD4rm, TB_ALIGN_NONE }, + { X86::VFMADDPS4rrY, X86::VFMADDPS4rmY, TB_ALIGN_NONE }, + { X86::VFMADDPD4rrY, X86::VFMADDPD4rmY, TB_ALIGN_NONE }, + { X86::VFNMADDSS4rr, X86::VFNMADDSS4rm, TB_ALIGN_NONE }, + { X86::VFNMADDSD4rr, X86::VFNMADDSD4rm, TB_ALIGN_NONE }, + { X86::VFNMADDPS4rr, 
X86::VFNMADDPS4rm, TB_ALIGN_NONE }, + { X86::VFNMADDPD4rr, X86::VFNMADDPD4rm, TB_ALIGN_NONE }, + { X86::VFNMADDPS4rrY, X86::VFNMADDPS4rmY, TB_ALIGN_NONE }, + { X86::VFNMADDPD4rrY, X86::VFNMADDPD4rmY, TB_ALIGN_NONE }, + { X86::VFMSUBSS4rr, X86::VFMSUBSS4rm, TB_ALIGN_NONE }, + { X86::VFMSUBSD4rr, X86::VFMSUBSD4rm, TB_ALIGN_NONE }, + { X86::VFMSUBPS4rr, X86::VFMSUBPS4rm, TB_ALIGN_NONE }, + { X86::VFMSUBPD4rr, X86::VFMSUBPD4rm, TB_ALIGN_NONE }, + { X86::VFMSUBPS4rrY, X86::VFMSUBPS4rmY, TB_ALIGN_NONE }, + { X86::VFMSUBPD4rrY, X86::VFMSUBPD4rmY, TB_ALIGN_NONE }, + { X86::VFNMSUBSS4rr, X86::VFNMSUBSS4rm, TB_ALIGN_NONE }, + { X86::VFNMSUBSD4rr, X86::VFNMSUBSD4rm, TB_ALIGN_NONE }, + { X86::VFNMSUBPS4rr, X86::VFNMSUBPS4rm, TB_ALIGN_NONE }, + { X86::VFNMSUBPD4rr, X86::VFNMSUBPD4rm, TB_ALIGN_NONE }, + { X86::VFNMSUBPS4rrY, X86::VFNMSUBPS4rmY, TB_ALIGN_NONE }, + { X86::VFNMSUBPD4rrY, X86::VFNMSUBPD4rmY, TB_ALIGN_NONE }, + { X86::VFMADDSUBPS4rr, X86::VFMADDSUBPS4rm, TB_ALIGN_NONE }, + { X86::VFMADDSUBPD4rr, X86::VFMADDSUBPD4rm, TB_ALIGN_NONE }, + { X86::VFMADDSUBPS4rrY, X86::VFMADDSUBPS4rmY, TB_ALIGN_NONE }, + { X86::VFMADDSUBPD4rrY, X86::VFMADDSUBPD4rmY, TB_ALIGN_NONE }, + { X86::VFMSUBADDPS4rr, X86::VFMSUBADDPS4rm, TB_ALIGN_NONE }, + { X86::VFMSUBADDPD4rr, X86::VFMSUBADDPD4rm, TB_ALIGN_NONE }, + { X86::VFMSUBADDPS4rrY, X86::VFMSUBADDPS4rmY, TB_ALIGN_NONE }, + { X86::VFMSUBADDPD4rrY, X86::VFMSUBADDPD4rmY, TB_ALIGN_NONE }, + + // XOP foldable instructions + { X86::VPCMOVrr, X86::VPCMOVrm, 0 }, + { X86::VPCMOVrrY, X86::VPCMOVrmY, 0 }, + { X86::VPERMIL2PDrr, X86::VPERMIL2PDrm, 0 }, + { X86::VPERMIL2PDrrY, X86::VPERMIL2PDrmY, 0 }, + { X86::VPERMIL2PSrr, X86::VPERMIL2PSrm, 0 }, + { X86::VPERMIL2PSrrY, X86::VPERMIL2PSrmY, 0 }, + { X86::VPPERMrr, X86::VPPERMrm, 0 }, + + // AVX-512 VPERMI instructions with 3 source operands. 
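// [Editor's note, not part of the patch] As with the FMA3 entries above, each
// pair below maps the register form (left) to the variant that folds operand 3
// into a load (right); the registration loop at the end of this table tags
// them with TB_INDEX_3 | TB_FOLDED_LOAD.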
+ { X86::VPERMI2Drr, X86::VPERMI2Drm, 0 }, + { X86::VPERMI2Qrr, X86::VPERMI2Qrm, 0 }, + { X86::VPERMI2PSrr, X86::VPERMI2PSrm, 0 }, + { X86::VPERMI2PDrr, X86::VPERMI2PDrm, 0 }, + { X86::VBLENDMPDZrr, X86::VBLENDMPDZrm, 0 }, + { X86::VBLENDMPSZrr, X86::VBLENDMPSZrm, 0 }, + { X86::VPBLENDMDZrr, X86::VPBLENDMDZrm, 0 }, + { X86::VPBLENDMQZrr, X86::VPBLENDMQZrm, 0 }, + { X86::VBROADCASTSSZrk, X86::VBROADCASTSSZmk, TB_NO_REVERSE }, + { X86::VBROADCASTSDZrk, X86::VBROADCASTSDZmk, TB_NO_REVERSE }, + { X86::VBROADCASTSSZ256rk, X86::VBROADCASTSSZ256mk, TB_NO_REVERSE }, + { X86::VBROADCASTSDZ256rk, X86::VBROADCASTSDZ256mk, TB_NO_REVERSE }, + { X86::VBROADCASTSSZ128rk, X86::VBROADCASTSSZ128mk, TB_NO_REVERSE }, + // AVX-512 arithmetic instructions + { X86::VADDPSZrrkz, X86::VADDPSZrmkz, 0 }, + { X86::VADDPDZrrkz, X86::VADDPDZrmkz, 0 }, + { X86::VSUBPSZrrkz, X86::VSUBPSZrmkz, 0 }, + { X86::VSUBPDZrrkz, X86::VSUBPDZrmkz, 0 }, + { X86::VMULPSZrrkz, X86::VMULPSZrmkz, 0 }, + { X86::VMULPDZrrkz, X86::VMULPDZrmkz, 0 }, + { X86::VDIVPSZrrkz, X86::VDIVPSZrmkz, 0 }, + { X86::VDIVPDZrrkz, X86::VDIVPDZrmkz, 0 }, + { X86::VMINPSZrrkz, X86::VMINPSZrmkz, 0 }, + { X86::VMINPDZrrkz, X86::VMINPDZrmkz, 0 }, + { X86::VMAXPSZrrkz, X86::VMAXPSZrmkz, 0 }, + { X86::VMAXPDZrrkz, X86::VMAXPDZrmkz, 0 }, + // AVX-512{F,VL} arithmetic instructions 256-bit + { X86::VADDPSZ256rrkz, X86::VADDPSZ256rmkz, 0 }, + { X86::VADDPDZ256rrkz, X86::VADDPDZ256rmkz, 0 }, + { X86::VSUBPSZ256rrkz, X86::VSUBPSZ256rmkz, 0 }, + { X86::VSUBPDZ256rrkz, X86::VSUBPDZ256rmkz, 0 }, + { X86::VMULPSZ256rrkz, X86::VMULPSZ256rmkz, 0 }, + { X86::VMULPDZ256rrkz, X86::VMULPDZ256rmkz, 0 }, + { X86::VDIVPSZ256rrkz, X86::VDIVPSZ256rmkz, 0 }, + { X86::VDIVPDZ256rrkz, X86::VDIVPDZ256rmkz, 0 }, + { X86::VMINPSZ256rrkz, X86::VMINPSZ256rmkz, 0 }, + { X86::VMINPDZ256rrkz, X86::VMINPDZ256rmkz, 0 }, + { X86::VMAXPSZ256rrkz, X86::VMAXPSZ256rmkz, 0 }, + { X86::VMAXPDZ256rrkz, X86::VMAXPDZ256rmkz, 0 }, + // AVX-512{F,VL} arithmetic instructions 128-bit + { X86::VADDPSZ128rrkz, X86::VADDPSZ128rmkz, 0 }, + { X86::VADDPDZ128rrkz, X86::VADDPDZ128rmkz, 0 }, + { X86::VSUBPSZ128rrkz, X86::VSUBPSZ128rmkz, 0 }, + { X86::VSUBPDZ128rrkz, X86::VSUBPDZ128rmkz, 0 }, + { X86::VMULPSZ128rrkz, X86::VMULPSZ128rmkz, 0 }, + { X86::VMULPDZ128rrkz, X86::VMULPDZ128rmkz, 0 }, + { X86::VDIVPSZ128rrkz, X86::VDIVPSZ128rmkz, 0 }, + { X86::VDIVPDZ128rrkz, X86::VDIVPDZ128rmkz, 0 }, + { X86::VMINPSZ128rrkz, X86::VMINPSZ128rmkz, 0 }, + { X86::VMINPDZ128rrkz, X86::VMINPDZ128rmkz, 0 }, + { X86::VMAXPSZ128rrkz, X86::VMAXPSZ128rmkz, 0 }, + { X86::VMAXPDZ128rrkz, X86::VMAXPDZ128rmkz, 0 } + }; + + for (X86MemoryFoldTableEntry Entry : MemoryFoldTable3) { + AddTableEntry(RegOp2MemOpTable3, MemOp2RegOpTable, + Entry.RegOp, Entry.MemOp, + // Index 3, folded load + Entry.Flags | TB_INDEX_3 | TB_FOLDED_LOAD); + } + + static const X86MemoryFoldTableEntry MemoryFoldTable4[] = { + // AVX-512 foldable instructions + { X86::VADDPSZrrk, X86::VADDPSZrmk, 0 }, + { X86::VADDPDZrrk, X86::VADDPDZrmk, 0 }, + { X86::VSUBPSZrrk, X86::VSUBPSZrmk, 0 }, + { X86::VSUBPDZrrk, X86::VSUBPDZrmk, 0 }, + { X86::VMULPSZrrk, X86::VMULPSZrmk, 0 }, + { X86::VMULPDZrrk, X86::VMULPDZrmk, 0 }, + { X86::VDIVPSZrrk, X86::VDIVPSZrmk, 0 }, + { X86::VDIVPDZrrk, X86::VDIVPDZrmk, 0 }, + { X86::VMINPSZrrk, X86::VMINPSZrmk, 0 }, + { X86::VMINPDZrrk, X86::VMINPDZrmk, 0 }, + { X86::VMAXPSZrrk, X86::VMAXPSZrmk, 0 }, + { X86::VMAXPDZrrk, X86::VMAXPDZrmk, 0 }, + // AVX-512{F,VL} foldable instructions 256-bit + { X86::VADDPSZ256rrk, X86::VADDPSZ256rmk, 0 }, + { 
X86::VADDPDZ256rrk, X86::VADDPDZ256rmk, 0 }, + { X86::VSUBPSZ256rrk, X86::VSUBPSZ256rmk, 0 }, + { X86::VSUBPDZ256rrk, X86::VSUBPDZ256rmk, 0 }, + { X86::VMULPSZ256rrk, X86::VMULPSZ256rmk, 0 }, + { X86::VMULPDZ256rrk, X86::VMULPDZ256rmk, 0 }, + { X86::VDIVPSZ256rrk, X86::VDIVPSZ256rmk, 0 }, + { X86::VDIVPDZ256rrk, X86::VDIVPDZ256rmk, 0 }, + { X86::VMINPSZ256rrk, X86::VMINPSZ256rmk, 0 }, + { X86::VMINPDZ256rrk, X86::VMINPDZ256rmk, 0 }, + { X86::VMAXPSZ256rrk, X86::VMAXPSZ256rmk, 0 }, + { X86::VMAXPDZ256rrk, X86::VMAXPDZ256rmk, 0 }, + // AVX-512{F,VL} foldable instructions 128-bit + { X86::VADDPSZ128rrk, X86::VADDPSZ128rmk, 0 }, + { X86::VADDPDZ128rrk, X86::VADDPDZ128rmk, 0 }, + { X86::VSUBPSZ128rrk, X86::VSUBPSZ128rmk, 0 }, + { X86::VSUBPDZ128rrk, X86::VSUBPDZ128rmk, 0 }, + { X86::VMULPSZ128rrk, X86::VMULPSZ128rmk, 0 }, + { X86::VMULPDZ128rrk, X86::VMULPDZ128rmk, 0 }, + { X86::VDIVPSZ128rrk, X86::VDIVPSZ128rmk, 0 }, + { X86::VDIVPDZ128rrk, X86::VDIVPDZ128rmk, 0 }, + { X86::VMINPSZ128rrk, X86::VMINPSZ128rmk, 0 }, + { X86::VMINPDZ128rrk, X86::VMINPDZ128rmk, 0 }, + { X86::VMAXPSZ128rrk, X86::VMAXPSZ128rmk, 0 }, + { X86::VMAXPDZ128rrk, X86::VMAXPDZ128rmk, 0 } + }; + + for (X86MemoryFoldTableEntry Entry : MemoryFoldTable4) { + AddTableEntry(RegOp2MemOpTable4, MemOp2RegOpTable, + Entry.RegOp, Entry.MemOp, + // Index 4, folded load + Entry.Flags | TB_INDEX_4 | TB_FOLDED_LOAD); + } +} + +void +X86InstrInfo::AddTableEntry(RegOp2MemOpTableType &R2MTable, + MemOp2RegOpTableType &M2RTable, + unsigned RegOp, unsigned MemOp, unsigned Flags) { + if ((Flags & TB_NO_FORWARD) == 0) { + assert(!R2MTable.count(RegOp) && "Duplicate entry!"); + R2MTable[RegOp] = std::make_pair(MemOp, Flags); + } + if ((Flags & TB_NO_REVERSE) == 0) { + assert(!M2RTable.count(MemOp) && + "Duplicated entries in unfolding maps?"); + M2RTable[MemOp] = std::make_pair(RegOp, Flags); + } +} + +bool +X86InstrInfo::isCoalescableExtInstr(const MachineInstr &MI, + unsigned &SrcReg, unsigned &DstReg, + unsigned &SubIdx) const { + switch (MI.getOpcode()) { + default: break; + case X86::MOVSX16rr8: + case X86::MOVZX16rr8: + case X86::MOVSX32rr8: + case X86::MOVZX32rr8: + case X86::MOVSX64rr8: + if (!Subtarget.is64Bit()) + // It's not always legal to reference the low 8-bit of the larger + // register in 32-bit mode. + return false; + case X86::MOVSX32rr16: + case X86::MOVZX32rr16: + case X86::MOVSX64rr16: + case X86::MOVSX64rr32: { + if (MI.getOperand(0).getSubReg() || MI.getOperand(1).getSubReg()) + // Be conservative. 
+ return false; + SrcReg = MI.getOperand(1).getReg(); + DstReg = MI.getOperand(0).getReg(); + switch (MI.getOpcode()) { + default: llvm_unreachable("Unreachable!"); + case X86::MOVSX16rr8: + case X86::MOVZX16rr8: + case X86::MOVSX32rr8: + case X86::MOVZX32rr8: + case X86::MOVSX64rr8: + SubIdx = X86::sub_8bit; + break; + case X86::MOVSX32rr16: + case X86::MOVZX32rr16: + case X86::MOVSX64rr16: + SubIdx = X86::sub_16bit; + break; + case X86::MOVSX64rr32: + SubIdx = X86::sub_32bit; + break; + } + return true; + } + } + return false; +} + +int X86InstrInfo::getSPAdjust(const MachineInstr *MI) const { + const MachineFunction *MF = MI->getParent()->getParent(); + const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering(); + + if (MI->getOpcode() == getCallFrameSetupOpcode() || + MI->getOpcode() == getCallFrameDestroyOpcode()) { + unsigned StackAlign = TFI->getStackAlignment(); + int SPAdj = (MI->getOperand(0).getImm() + StackAlign - 1) / StackAlign * + StackAlign; + + SPAdj -= MI->getOperand(1).getImm(); + + if (MI->getOpcode() == getCallFrameSetupOpcode()) + return SPAdj; + else + return -SPAdj; + } + + // To know whether a call adjusts the stack, we need information + // that is bound to the following ADJCALLSTACKUP pseudo. + // Look for the next ADJCALLSTACKUP that follows the call. + if (MI->isCall()) { + const MachineBasicBlock* MBB = MI->getParent(); + auto I = ++MachineBasicBlock::const_iterator(MI); + for (auto E = MBB->end(); I != E; ++I) { + if (I->getOpcode() == getCallFrameDestroyOpcode() || + I->isCall()) + break; + } + + // If we could not find a frame destroy opcode, then it has already + // been simplified, so we don't care. + if (I->getOpcode() != getCallFrameDestroyOpcode()) + return 0; + + return -(I->getOperand(1).getImm()); + } + + // Currently handle only PUSHes we can reasonably expect to see + // in call sequences + switch (MI->getOpcode()) { + default: + return 0; + case X86::PUSH32i8: + case X86::PUSH32r: + case X86::PUSH32rmm: + case X86::PUSH32rmr: + case X86::PUSHi32: + return 4; + } +} + +/// Return true and the FrameIndex if the specified +/// operand and follow operands form a reference to the stack frame. 
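// [Editor's note, not part of the patch] The call-frame handling in
// getSPAdjust() above first rounds the ADJCALLSTACKDOWN amount up to the stack
// alignment and then subtracts the portion already applied via the pseudo's
// second operand. A minimal standalone sketch of just that rounding step,
// assuming only a positive integer alignment (the helper name is
// hypothetical):
static int64_t roundUpToStackAlign(int64_t Amount, int64_t StackAlign) {
  // Same integer arithmetic as above: the next multiple of StackAlign that is
  // greater than or equal to Amount.
  return (Amount + StackAlign - 1) / StackAlign * StackAlign;
}
// For example, roundUpToStackAlign(20, 16) == 32; the frame-destroy pseudo
// then yields the negated adjustment.
// (End of editor's note; the function documented above follows.)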
+bool X86InstrInfo::isFrameOperand(const MachineInstr *MI, unsigned int Op, + int &FrameIndex) const { + if (MI->getOperand(Op+X86::AddrBaseReg).isFI() && + MI->getOperand(Op+X86::AddrScaleAmt).isImm() && + MI->getOperand(Op+X86::AddrIndexReg).isReg() && + MI->getOperand(Op+X86::AddrDisp).isImm() && + MI->getOperand(Op+X86::AddrScaleAmt).getImm() == 1 && + MI->getOperand(Op+X86::AddrIndexReg).getReg() == 0 && + MI->getOperand(Op+X86::AddrDisp).getImm() == 0) { + FrameIndex = MI->getOperand(Op+X86::AddrBaseReg).getIndex(); + return true; + } + return false; +} + +static bool isFrameLoadOpcode(int Opcode) { + switch (Opcode) { + default: + return false; + case X86::MOV8rm: + case X86::MOV16rm: + case X86::MOV32rm: + case X86::MOV64rm: + case X86::LD_Fp64m: + case X86::MOVSSrm: + case X86::MOVSDrm: + case X86::MOVAPSrm: + case X86::MOVAPDrm: + case X86::MOVDQArm: + case X86::VMOVSSrm: + case X86::VMOVSDrm: + case X86::VMOVAPSrm: + case X86::VMOVAPDrm: + case X86::VMOVDQArm: + case X86::VMOVUPSYrm: + case X86::VMOVAPSYrm: + case X86::VMOVUPDYrm: + case X86::VMOVAPDYrm: + case X86::VMOVDQUYrm: + case X86::VMOVDQAYrm: + case X86::MMX_MOVD64rm: + case X86::MMX_MOVQ64rm: + case X86::VMOVAPSZrm: + case X86::VMOVUPSZrm: + return true; + } +} + +static bool isFrameStoreOpcode(int Opcode) { + switch (Opcode) { + default: break; + case X86::MOV8mr: + case X86::MOV16mr: + case X86::MOV32mr: + case X86::MOV64mr: + case X86::ST_FpP64m: + case X86::MOVSSmr: + case X86::MOVSDmr: + case X86::MOVAPSmr: + case X86::MOVAPDmr: + case X86::MOVDQAmr: + case X86::VMOVSSmr: + case X86::VMOVSDmr: + case X86::VMOVAPSmr: + case X86::VMOVAPDmr: + case X86::VMOVDQAmr: + case X86::VMOVUPSYmr: + case X86::VMOVAPSYmr: + case X86::VMOVUPDYmr: + case X86::VMOVAPDYmr: + case X86::VMOVDQUYmr: + case X86::VMOVDQAYmr: + case X86::VMOVUPSZmr: + case X86::VMOVAPSZmr: + case X86::MMX_MOVD64mr: + case X86::MMX_MOVQ64mr: + case X86::MMX_MOVNTQmr: + return true; + } + return false; +} + +unsigned X86InstrInfo::isLoadFromStackSlot(const MachineInstr *MI, + int &FrameIndex) const { + if (isFrameLoadOpcode(MI->getOpcode())) + if (MI->getOperand(0).getSubReg() == 0 && isFrameOperand(MI, 1, FrameIndex)) + return MI->getOperand(0).getReg(); + return 0; +} + +unsigned X86InstrInfo::isLoadFromStackSlotPostFE(const MachineInstr *MI, + int &FrameIndex) const { + if (isFrameLoadOpcode(MI->getOpcode())) { + unsigned Reg; + if ((Reg = isLoadFromStackSlot(MI, FrameIndex))) + return Reg; + // Check for post-frame index elimination operations + const MachineMemOperand *Dummy; + return hasLoadFromStackSlot(MI, Dummy, FrameIndex); + } + return 0; +} + +unsigned X86InstrInfo::isStoreToStackSlot(const MachineInstr *MI, + int &FrameIndex) const { + if (isFrameStoreOpcode(MI->getOpcode())) + if (MI->getOperand(X86::AddrNumOperands).getSubReg() == 0 && + isFrameOperand(MI, 0, FrameIndex)) + return MI->getOperand(X86::AddrNumOperands).getReg(); + return 0; +} + +unsigned X86InstrInfo::isStoreToStackSlotPostFE(const MachineInstr *MI, + int &FrameIndex) const { + if (isFrameStoreOpcode(MI->getOpcode())) { + unsigned Reg; + if ((Reg = isStoreToStackSlot(MI, FrameIndex))) + return Reg; + // Check for post-frame index elimination operations + const MachineMemOperand *Dummy; + return hasStoreToStackSlot(MI, Dummy, FrameIndex); + } + return 0; +} + +/// Return true if register is PIC base; i.e.g defined by X86::MOVPC32r. +static bool regIsPICBase(unsigned BaseReg, const MachineRegisterInfo &MRI) { + // Don't waste compile time scanning use-def chains of physregs. 
+ if (!TargetRegisterInfo::isVirtualRegister(BaseReg)) + return false; + bool isPICBase = false; + for (MachineRegisterInfo::def_instr_iterator I = MRI.def_instr_begin(BaseReg), + E = MRI.def_instr_end(); I != E; ++I) { + MachineInstr *DefMI = &*I; + if (DefMI->getOpcode() != X86::MOVPC32r) + return false; + assert(!isPICBase && "More than one PIC base?"); + isPICBase = true; + } + return isPICBase; +} + +bool +X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr *MI, + AliasAnalysis *AA) const { + switch (MI->getOpcode()) { + default: break; + case X86::MOV8rm: + case X86::MOV16rm: + case X86::MOV32rm: + case X86::MOV64rm: + case X86::LD_Fp64m: + case X86::MOVSSrm: + case X86::MOVSDrm: + case X86::MOVAPSrm: + case X86::MOVUPSrm: + case X86::MOVAPDrm: + case X86::MOVDQArm: + case X86::MOVDQUrm: + case X86::VMOVSSrm: + case X86::VMOVSDrm: + case X86::VMOVAPSrm: + case X86::VMOVUPSrm: + case X86::VMOVAPDrm: + case X86::VMOVDQArm: + case X86::VMOVDQUrm: + case X86::VMOVAPSYrm: + case X86::VMOVUPSYrm: + case X86::VMOVAPDYrm: + case X86::VMOVDQAYrm: + case X86::VMOVDQUYrm: + case X86::MMX_MOVD64rm: + case X86::MMX_MOVQ64rm: + case X86::FsVMOVAPSrm: + case X86::FsVMOVAPDrm: + case X86::FsMOVAPSrm: + case X86::FsMOVAPDrm: + // AVX-512 + case X86::VMOVAPDZ128rm: + case X86::VMOVAPDZ256rm: + case X86::VMOVAPDZrm: + case X86::VMOVAPSZ128rm: + case X86::VMOVAPSZ256rm: + case X86::VMOVAPSZrm: + case X86::VMOVDQA32Z128rm: + case X86::VMOVDQA32Z256rm: + case X86::VMOVDQA32Zrm: + case X86::VMOVDQA64Z128rm: + case X86::VMOVDQA64Z256rm: + case X86::VMOVDQA64Zrm: + case X86::VMOVDQU16Z128rm: + case X86::VMOVDQU16Z256rm: + case X86::VMOVDQU16Zrm: + case X86::VMOVDQU32Z128rm: + case X86::VMOVDQU32Z256rm: + case X86::VMOVDQU32Zrm: + case X86::VMOVDQU64Z128rm: + case X86::VMOVDQU64Z256rm: + case X86::VMOVDQU64Zrm: + case X86::VMOVDQU8Z128rm: + case X86::VMOVDQU8Z256rm: + case X86::VMOVDQU8Zrm: + case X86::VMOVUPSZ128rm: + case X86::VMOVUPSZ256rm: + case X86::VMOVUPSZrm: { + // Loads from constant pools are trivially rematerializable. + if (MI->getOperand(1+X86::AddrBaseReg).isReg() && + MI->getOperand(1+X86::AddrScaleAmt).isImm() && + MI->getOperand(1+X86::AddrIndexReg).isReg() && + MI->getOperand(1+X86::AddrIndexReg).getReg() == 0 && + MI->isInvariantLoad(AA)) { + unsigned BaseReg = MI->getOperand(1+X86::AddrBaseReg).getReg(); + if (BaseReg == 0 || BaseReg == X86::RIP) + return true; + // Allow re-materialization of PIC load. + if (!ReMatPICStubLoad && MI->getOperand(1+X86::AddrDisp).isGlobal()) + return false; + const MachineFunction &MF = *MI->getParent()->getParent(); + const MachineRegisterInfo &MRI = MF.getRegInfo(); + return regIsPICBase(BaseReg, MRI); + } + return false; + } + + case X86::LEA32r: + case X86::LEA64r: { + if (MI->getOperand(1+X86::AddrScaleAmt).isImm() && + MI->getOperand(1+X86::AddrIndexReg).isReg() && + MI->getOperand(1+X86::AddrIndexReg).getReg() == 0 && + !MI->getOperand(1+X86::AddrDisp).isReg()) { + // lea fi#, lea GV, etc. are all rematerializable. + if (!MI->getOperand(1+X86::AddrBaseReg).isReg()) + return true; + unsigned BaseReg = MI->getOperand(1+X86::AddrBaseReg).getReg(); + if (BaseReg == 0) + return true; + // Allow re-materialization of lea PICBase + x. + const MachineFunction &MF = *MI->getParent()->getParent(); + const MachineRegisterInfo &MRI = MF.getRegInfo(); + return regIsPICBase(BaseReg, MRI); + } + return false; + } + } + + // All other instructions marked M_REMATERIALIZABLE are always trivially + // rematerializable. 
+ return true; +} + +bool X86InstrInfo::isSafeToClobberEFLAGS(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const { + MachineBasicBlock::iterator E = MBB.end(); + + // For compile time consideration, if we are not able to determine the + // safety after visiting 4 instructions in each direction, we will assume + // it's not safe. + MachineBasicBlock::iterator Iter = I; + for (unsigned i = 0; Iter != E && i < 4; ++i) { + bool SeenDef = false; + for (unsigned j = 0, e = Iter->getNumOperands(); j != e; ++j) { + MachineOperand &MO = Iter->getOperand(j); + if (MO.isRegMask() && MO.clobbersPhysReg(X86::EFLAGS)) + SeenDef = true; + if (!MO.isReg()) + continue; + if (MO.getReg() == X86::EFLAGS) { + if (MO.isUse()) + return false; + SeenDef = true; + } + } + + if (SeenDef) + // This instruction defines EFLAGS, no need to look any further. + return true; + ++Iter; + // Skip over DBG_VALUE. + while (Iter != E && Iter->isDebugValue()) + ++Iter; + } + + // It is safe to clobber EFLAGS at the end of a block of no successor has it + // live in. + if (Iter == E) { + for (MachineBasicBlock *S : MBB.successors()) + if (S->isLiveIn(X86::EFLAGS)) + return false; + return true; + } + + MachineBasicBlock::iterator B = MBB.begin(); + Iter = I; + for (unsigned i = 0; i < 4; ++i) { + // If we make it to the beginning of the block, it's safe to clobber + // EFLAGS iff EFLAGS is not live-in. + if (Iter == B) + return !MBB.isLiveIn(X86::EFLAGS); + + --Iter; + // Skip over DBG_VALUE. + while (Iter != B && Iter->isDebugValue()) + --Iter; + + bool SawKill = false; + for (unsigned j = 0, e = Iter->getNumOperands(); j != e; ++j) { + MachineOperand &MO = Iter->getOperand(j); + // A register mask may clobber EFLAGS, but we should still look for a + // live EFLAGS def. + if (MO.isRegMask() && MO.clobbersPhysReg(X86::EFLAGS)) + SawKill = true; + if (MO.isReg() && MO.getReg() == X86::EFLAGS) { + if (MO.isDef()) return MO.isDead(); + if (MO.isKill()) SawKill = true; + } + } + + if (SawKill) + // This instruction kills EFLAGS and doesn't redefine it, so + // there's no need to look further. + return true; + } + + // Conservative answer. + return false; +} + +void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + unsigned DestReg, unsigned SubIdx, + const MachineInstr *Orig, + const TargetRegisterInfo &TRI) const { + bool ClobbersEFLAGS = false; + for (const MachineOperand &MO : Orig->operands()) { + if (MO.isReg() && MO.isDef() && MO.getReg() == X86::EFLAGS) { + ClobbersEFLAGS = true; + break; + } + } + + if (ClobbersEFLAGS && !isSafeToClobberEFLAGS(MBB, I)) { + // The instruction clobbers EFLAGS. Re-materialize as MOV32ri to avoid side + // effects. + int Value; + switch (Orig->getOpcode()) { + case X86::MOV32r0: Value = 0; break; + case X86::MOV32r1: Value = 1; break; + case X86::MOV32r_1: Value = -1; break; + default: + llvm_unreachable("Unexpected instruction!"); + } + + DebugLoc DL = Orig->getDebugLoc(); + BuildMI(MBB, I, DL, get(X86::MOV32ri)).addOperand(Orig->getOperand(0)) + .addImm(Value); + } else { + MachineInstr *MI = MBB.getParent()->CloneMachineInstr(Orig); + MBB.insert(I, MI); + } + + MachineInstr *NewMI = std::prev(I); + NewMI->substituteRegister(Orig->getOperand(0).getReg(), DestReg, SubIdx, TRI); +} + +/// True if MI has a condition code def, e.g. EFLAGS, that is not marked dead. 
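// [Editor's note, not part of the patch] isSafeToClobberEFLAGS() above decides
// whether EFLAGS may be redefined at a point by scanning at most four
// instructions in each direction. Below is a deliberately simplified,
// standalone model of the forward half only (hypothetical names; the real
// routine also scans backwards and, at a block boundary, checks whether any
// successor has EFLAGS live-in):
struct ToyInst {
  bool ReadsFlags;  // models a use of EFLAGS
  bool WritesFlags; // models a def or regmask clobber of EFLAGS
};
static bool toyForwardScanSaysSafe(const ToyInst *Insts, unsigned NumInsts,
                                   unsigned Start) {
  for (unsigned Step = 0; Step != 4 && Start != NumInsts; ++Step, ++Start) {
    if (Insts[Start].ReadsFlags)
      return false; // a later reader still needs the current value
    if (Insts[Start].WritesFlags)
      return true;  // flags fully redefined before any use
  }
  return false;     // budget exhausted: be conservative
}
// (End of editor's note; the member function documented above follows.)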
+bool X86InstrInfo::hasLiveCondCodeDef(MachineInstr *MI) const { + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + MachineOperand &MO = MI->getOperand(i); + if (MO.isReg() && MO.isDef() && + MO.getReg() == X86::EFLAGS && !MO.isDead()) { + return true; + } + } + return false; +} + +/// Check whether the shift count for a machine operand is non-zero. +inline static unsigned getTruncatedShiftCount(MachineInstr *MI, + unsigned ShiftAmtOperandIdx) { + // The shift count is six bits with the REX.W prefix and five bits without. + unsigned ShiftCountMask = (MI->getDesc().TSFlags & X86II::REX_W) ? 63 : 31; + unsigned Imm = MI->getOperand(ShiftAmtOperandIdx).getImm(); + return Imm & ShiftCountMask; +} + +/// Check whether the given shift count is appropriate +/// can be represented by a LEA instruction. +inline static bool isTruncatedShiftCountForLEA(unsigned ShAmt) { + // Left shift instructions can be transformed into load-effective-address + // instructions if we can encode them appropriately. + // A LEA instruction utilizes a SIB byte to encode its scale factor. + // The SIB.scale field is two bits wide which means that we can encode any + // shift amount less than 4. + return ShAmt < 4 && ShAmt > 0; +} + +bool X86InstrInfo::classifyLEAReg(MachineInstr *MI, const MachineOperand &Src, + unsigned Opc, bool AllowSP, + unsigned &NewSrc, bool &isKill, bool &isUndef, + MachineOperand &ImplicitOp) const { + MachineFunction &MF = *MI->getParent()->getParent(); + const TargetRegisterClass *RC; + if (AllowSP) { + RC = Opc != X86::LEA32r ? &X86::GR64RegClass : &X86::GR32RegClass; + } else { + RC = Opc != X86::LEA32r ? + &X86::GR64_NOSPRegClass : &X86::GR32_NOSPRegClass; + } + unsigned SrcReg = Src.getReg(); + + // For both LEA64 and LEA32 the register already has essentially the right + // type (32-bit or 64-bit) we may just need to forbid SP. + if (Opc != X86::LEA64_32r) { + NewSrc = SrcReg; + isKill = Src.isKill(); + isUndef = Src.isUndef(); + + if (TargetRegisterInfo::isVirtualRegister(NewSrc) && + !MF.getRegInfo().constrainRegClass(NewSrc, RC)) + return false; + + return true; + } + + // This is for an LEA64_32r and incoming registers are 32-bit. One way or + // another we need to add 64-bit registers to the final MI. + if (TargetRegisterInfo::isPhysicalRegister(SrcReg)) { + ImplicitOp = Src; + ImplicitOp.setImplicit(); + + NewSrc = getX86SubSuperRegister(Src.getReg(), 64); + MachineBasicBlock::LivenessQueryResult LQR = + MI->getParent()->computeRegisterLiveness(&getRegisterInfo(), NewSrc, MI); + + switch (LQR) { + case MachineBasicBlock::LQR_Unknown: + // We can't give sane liveness flags to the instruction, abandon LEA + // formation. + return false; + case MachineBasicBlock::LQR_Live: + isKill = MI->killsRegister(SrcReg); + isUndef = false; + break; + default: + // The physreg itself is dead, so we have to use it as an <undef>. + isKill = false; + isUndef = true; + break; + } + } else { + // Virtual register of the wrong class, we have to create a temporary 64-bit + // vreg to feed into the LEA. + NewSrc = MF.getRegInfo().createVirtualRegister(RC); + BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), + get(TargetOpcode::COPY)) + .addReg(NewSrc, RegState::Define | RegState::Undef, X86::sub_32bit) + .addOperand(Src); + + // Which is obviously going to be dead after we're done with it. + isKill = true; + isUndef = false; + } + + // We've set all the parameters without issue. 
+ return true; +} + +/// Helper for convertToThreeAddress when 16-bit LEA is disabled, use 32-bit +/// LEA to form 3-address code by promoting to a 32-bit superregister and then +/// truncating back down to a 16-bit subregister. +MachineInstr * +X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc, + MachineFunction::iterator &MFI, + MachineBasicBlock::iterator &MBBI, + LiveVariables *LV) const { + MachineInstr *MI = MBBI; + unsigned Dest = MI->getOperand(0).getReg(); + unsigned Src = MI->getOperand(1).getReg(); + bool isDead = MI->getOperand(0).isDead(); + bool isKill = MI->getOperand(1).isKill(); + + MachineRegisterInfo &RegInfo = MFI->getParent()->getRegInfo(); + unsigned leaOutReg = RegInfo.createVirtualRegister(&X86::GR32RegClass); + unsigned Opc, leaInReg; + if (Subtarget.is64Bit()) { + Opc = X86::LEA64_32r; + leaInReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass); + } else { + Opc = X86::LEA32r; + leaInReg = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass); + } + + // Build and insert into an implicit UNDEF value. This is OK because + // well be shifting and then extracting the lower 16-bits. + // This has the potential to cause partial register stall. e.g. + // movw (%rbp,%rcx,2), %dx + // leal -65(%rdx), %esi + // But testing has shown this *does* help performance in 64-bit mode (at + // least on modern x86 machines). + BuildMI(*MFI, MBBI, MI->getDebugLoc(), get(X86::IMPLICIT_DEF), leaInReg); + MachineInstr *InsMI = + BuildMI(*MFI, MBBI, MI->getDebugLoc(), get(TargetOpcode::COPY)) + .addReg(leaInReg, RegState::Define, X86::sub_16bit) + .addReg(Src, getKillRegState(isKill)); + + MachineInstrBuilder MIB = BuildMI(*MFI, MBBI, MI->getDebugLoc(), + get(Opc), leaOutReg); + switch (MIOpc) { + default: llvm_unreachable("Unreachable!"); + case X86::SHL16ri: { + unsigned ShAmt = MI->getOperand(2).getImm(); + MIB.addReg(0).addImm(1 << ShAmt) + .addReg(leaInReg, RegState::Kill).addImm(0).addReg(0); + break; + } + case X86::INC16r: + addRegOffset(MIB, leaInReg, true, 1); + break; + case X86::DEC16r: + addRegOffset(MIB, leaInReg, true, -1); + break; + case X86::ADD16ri: + case X86::ADD16ri8: + case X86::ADD16ri_DB: + case X86::ADD16ri8_DB: + addRegOffset(MIB, leaInReg, true, MI->getOperand(2).getImm()); + break; + case X86::ADD16rr: + case X86::ADD16rr_DB: { + unsigned Src2 = MI->getOperand(2).getReg(); + bool isKill2 = MI->getOperand(2).isKill(); + unsigned leaInReg2 = 0; + MachineInstr *InsMI2 = nullptr; + if (Src == Src2) { + // ADD16rr %reg1028<kill>, %reg1028 + // just a single insert_subreg. + addRegReg(MIB, leaInReg, true, leaInReg, false); + } else { + if (Subtarget.is64Bit()) + leaInReg2 = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass); + else + leaInReg2 = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass); + // Build and insert into an implicit UNDEF value. This is OK because + // well be shifting and then extracting the lower 16-bits. 
+ BuildMI(*MFI, &*MIB, MI->getDebugLoc(), get(X86::IMPLICIT_DEF),leaInReg2); + InsMI2 = + BuildMI(*MFI, &*MIB, MI->getDebugLoc(), get(TargetOpcode::COPY)) + .addReg(leaInReg2, RegState::Define, X86::sub_16bit) + .addReg(Src2, getKillRegState(isKill2)); + addRegReg(MIB, leaInReg, true, leaInReg2, true); + } + if (LV && isKill2 && InsMI2) + LV->replaceKillInstruction(Src2, MI, InsMI2); + break; + } + } + + MachineInstr *NewMI = MIB; + MachineInstr *ExtMI = + BuildMI(*MFI, MBBI, MI->getDebugLoc(), get(TargetOpcode::COPY)) + .addReg(Dest, RegState::Define | getDeadRegState(isDead)) + .addReg(leaOutReg, RegState::Kill, X86::sub_16bit); + + if (LV) { + // Update live variables + LV->getVarInfo(leaInReg).Kills.push_back(NewMI); + LV->getVarInfo(leaOutReg).Kills.push_back(ExtMI); + if (isKill) + LV->replaceKillInstruction(Src, MI, InsMI); + if (isDead) + LV->replaceKillInstruction(Dest, MI, ExtMI); + } + + return ExtMI; +} + +/// This method must be implemented by targets that +/// set the M_CONVERTIBLE_TO_3_ADDR flag. When this flag is set, the target +/// may be able to convert a two-address instruction into a true +/// three-address instruction on demand. This allows the X86 target (for +/// example) to convert ADD and SHL instructions into LEA instructions if they +/// would require register copies due to two-addressness. +/// +/// This method returns a null pointer if the transformation cannot be +/// performed, otherwise it returns the new instruction. +/// +MachineInstr * +X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, + MachineBasicBlock::iterator &MBBI, + LiveVariables *LV) const { + MachineInstr *MI = MBBI; + + // The following opcodes also sets the condition code register(s). Only + // convert them to equivalent lea if the condition code register def's + // are dead! + if (hasLiveCondCodeDef(MI)) + return nullptr; + + MachineFunction &MF = *MI->getParent()->getParent(); + // All instructions input are two-addr instructions. Get the known operands. + const MachineOperand &Dest = MI->getOperand(0); + const MachineOperand &Src = MI->getOperand(1); + + MachineInstr *NewMI = nullptr; + // FIXME: 16-bit LEA's are really slow on Athlons, but not bad on P4's. When + // we have better subtarget support, enable the 16-bit LEA generation here. + // 16-bit LEA is also slow on Core2. + bool DisableLEA16 = true; + bool is64Bit = Subtarget.is64Bit(); + + unsigned MIOpc = MI->getOpcode(); + switch (MIOpc) { + default: return nullptr; + case X86::SHL64ri: { + assert(MI->getNumOperands() >= 3 && "Unknown shift instruction!"); + unsigned ShAmt = getTruncatedShiftCount(MI, 2); + if (!isTruncatedShiftCountForLEA(ShAmt)) return nullptr; + + // LEA can't handle RSP. + if (TargetRegisterInfo::isVirtualRegister(Src.getReg()) && + !MF.getRegInfo().constrainRegClass(Src.getReg(), + &X86::GR64_NOSPRegClass)) + return nullptr; + + NewMI = BuildMI(MF, MI->getDebugLoc(), get(X86::LEA64r)) + .addOperand(Dest) + .addReg(0).addImm(1 << ShAmt).addOperand(Src).addImm(0).addReg(0); + break; + } + case X86::SHL32ri: { + assert(MI->getNumOperands() >= 3 && "Unknown shift instruction!"); + unsigned ShAmt = getTruncatedShiftCount(MI, 2); + if (!isTruncatedShiftCountForLEA(ShAmt)) return nullptr; + + unsigned Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r; + + // LEA can't handle ESP. 
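// [Editor's note, not part of the patch] classifyLEAReg() below either
// constrains the 32-bit source away from SP (AllowSP is false here) or, for
// LEA64_32r, produces a 64-bit register to feed the LEA: the super-register of
// a physical source (with the original register added back as an implicit
// operand) or a fresh 64-bit vreg copied from a virtual source.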
+ bool isKill, isUndef; + unsigned SrcReg; + MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); + if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false, + SrcReg, isKill, isUndef, ImplicitOp)) + return nullptr; + + MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc)) + .addOperand(Dest) + .addReg(0).addImm(1 << ShAmt) + .addReg(SrcReg, getKillRegState(isKill) | getUndefRegState(isUndef)) + .addImm(0).addReg(0); + if (ImplicitOp.getReg() != 0) + MIB.addOperand(ImplicitOp); + NewMI = MIB; + + break; + } + case X86::SHL16ri: { + assert(MI->getNumOperands() >= 3 && "Unknown shift instruction!"); + unsigned ShAmt = getTruncatedShiftCount(MI, 2); + if (!isTruncatedShiftCountForLEA(ShAmt)) return nullptr; + + if (DisableLEA16) + return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : nullptr; + NewMI = BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r)) + .addOperand(Dest) + .addReg(0).addImm(1 << ShAmt).addOperand(Src).addImm(0).addReg(0); + break; + } + case X86::INC64r: + case X86::INC32r: { + assert(MI->getNumOperands() >= 2 && "Unknown inc instruction!"); + unsigned Opc = MIOpc == X86::INC64r ? X86::LEA64r + : (is64Bit ? X86::LEA64_32r : X86::LEA32r); + bool isKill, isUndef; + unsigned SrcReg; + MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); + if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false, + SrcReg, isKill, isUndef, ImplicitOp)) + return nullptr; + + MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc)) + .addOperand(Dest) + .addReg(SrcReg, getKillRegState(isKill) | getUndefRegState(isUndef)); + if (ImplicitOp.getReg() != 0) + MIB.addOperand(ImplicitOp); + + NewMI = addOffset(MIB, 1); + break; + } + case X86::INC16r: + if (DisableLEA16) + return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) + : nullptr; + assert(MI->getNumOperands() >= 2 && "Unknown inc instruction!"); + NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r)) + .addOperand(Dest).addOperand(Src), 1); + break; + case X86::DEC64r: + case X86::DEC32r: { + assert(MI->getNumOperands() >= 2 && "Unknown dec instruction!"); + unsigned Opc = MIOpc == X86::DEC64r ? X86::LEA64r + : (is64Bit ? X86::LEA64_32r : X86::LEA32r); + + bool isKill, isUndef; + unsigned SrcReg; + MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); + if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false, + SrcReg, isKill, isUndef, ImplicitOp)) + return nullptr; + + MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc)) + .addOperand(Dest) + .addReg(SrcReg, getUndefRegState(isUndef) | getKillRegState(isKill)); + if (ImplicitOp.getReg() != 0) + MIB.addOperand(ImplicitOp); + + NewMI = addOffset(MIB, -1); + + break; + } + case X86::DEC16r: + if (DisableLEA16) + return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) + : nullptr; + assert(MI->getNumOperands() >= 2 && "Unknown dec instruction!"); + NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r)) + .addOperand(Dest).addOperand(Src), -1); + break; + case X86::ADD64rr: + case X86::ADD64rr_DB: + case X86::ADD32rr: + case X86::ADD32rr_DB: { + assert(MI->getNumOperands() >= 3 && "Unknown add instruction!"); + unsigned Opc; + if (MIOpc == X86::ADD64rr || MIOpc == X86::ADD64rr_DB) + Opc = X86::LEA64r; + else + Opc = is64Bit ? 
X86::LEA64_32r : X86::LEA32r; + + bool isKill, isUndef; + unsigned SrcReg; + MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); + if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true, + SrcReg, isKill, isUndef, ImplicitOp)) + return nullptr; + + const MachineOperand &Src2 = MI->getOperand(2); + bool isKill2, isUndef2; + unsigned SrcReg2; + MachineOperand ImplicitOp2 = MachineOperand::CreateReg(0, false); + if (!classifyLEAReg(MI, Src2, Opc, /*AllowSP=*/ false, + SrcReg2, isKill2, isUndef2, ImplicitOp2)) + return nullptr; + + MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc)) + .addOperand(Dest); + if (ImplicitOp.getReg() != 0) + MIB.addOperand(ImplicitOp); + if (ImplicitOp2.getReg() != 0) + MIB.addOperand(ImplicitOp2); + + NewMI = addRegReg(MIB, SrcReg, isKill, SrcReg2, isKill2); + + // Preserve undefness of the operands. + NewMI->getOperand(1).setIsUndef(isUndef); + NewMI->getOperand(3).setIsUndef(isUndef2); + + if (LV && Src2.isKill()) + LV->replaceKillInstruction(SrcReg2, MI, NewMI); + break; + } + case X86::ADD16rr: + case X86::ADD16rr_DB: { + if (DisableLEA16) + return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) + : nullptr; + assert(MI->getNumOperands() >= 3 && "Unknown add instruction!"); + unsigned Src2 = MI->getOperand(2).getReg(); + bool isKill2 = MI->getOperand(2).isKill(); + NewMI = addRegReg(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r)) + .addOperand(Dest), + Src.getReg(), Src.isKill(), Src2, isKill2); + + // Preserve undefness of the operands. + bool isUndef = MI->getOperand(1).isUndef(); + bool isUndef2 = MI->getOperand(2).isUndef(); + NewMI->getOperand(1).setIsUndef(isUndef); + NewMI->getOperand(3).setIsUndef(isUndef2); + + if (LV && isKill2) + LV->replaceKillInstruction(Src2, MI, NewMI); + break; + } + case X86::ADD64ri32: + case X86::ADD64ri8: + case X86::ADD64ri32_DB: + case X86::ADD64ri8_DB: + assert(MI->getNumOperands() >= 3 && "Unknown add instruction!"); + NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA64r)) + .addOperand(Dest).addOperand(Src), + MI->getOperand(2).getImm()); + break; + case X86::ADD32ri: + case X86::ADD32ri8: + case X86::ADD32ri_DB: + case X86::ADD32ri8_DB: { + assert(MI->getNumOperands() >= 3 && "Unknown add instruction!"); + unsigned Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r; + + bool isKill, isUndef; + unsigned SrcReg; + MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); + if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true, + SrcReg, isKill, isUndef, ImplicitOp)) + return nullptr; + + MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc)) + .addOperand(Dest) + .addReg(SrcReg, getUndefRegState(isUndef) | getKillRegState(isKill)); + if (ImplicitOp.getReg() != 0) + MIB.addOperand(ImplicitOp); + + NewMI = addOffset(MIB, MI->getOperand(2).getImm()); + break; + } + case X86::ADD16ri: + case X86::ADD16ri8: + case X86::ADD16ri_DB: + case X86::ADD16ri8_DB: + if (DisableLEA16) + return is64Bit ? 
convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) + : nullptr; + assert(MI->getNumOperands() >= 3 && "Unknown add instruction!"); + NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r)) + .addOperand(Dest).addOperand(Src), + MI->getOperand(2).getImm()); + break; + } + + if (!NewMI) return nullptr; + + if (LV) { // Update live variables + if (Src.isKill()) + LV->replaceKillInstruction(Src.getReg(), MI, NewMI); + if (Dest.isDead()) + LV->replaceKillInstruction(Dest.getReg(), MI, NewMI); + } + + MFI->insert(MBBI, NewMI); // Insert the new inst + return NewMI; +} + +/// Returns true if the given instruction opcode is FMA3. +/// Otherwise, returns false. +/// The second parameter is optional and is used as the second return from +/// the function. It is set to true if the given instruction has FMA3 opcode +/// that is used for lowering of scalar FMA intrinsics, and it is set to false +/// otherwise. +static bool isFMA3(unsigned Opcode, bool *IsIntrinsic = nullptr) { + if (IsIntrinsic) + *IsIntrinsic = false; + + switch (Opcode) { + case X86::VFMADDSDr132r: case X86::VFMADDSDr132m: + case X86::VFMADDSSr132r: case X86::VFMADDSSr132m: + case X86::VFMSUBSDr132r: case X86::VFMSUBSDr132m: + case X86::VFMSUBSSr132r: case X86::VFMSUBSSr132m: + case X86::VFNMADDSDr132r: case X86::VFNMADDSDr132m: + case X86::VFNMADDSSr132r: case X86::VFNMADDSSr132m: + case X86::VFNMSUBSDr132r: case X86::VFNMSUBSDr132m: + case X86::VFNMSUBSSr132r: case X86::VFNMSUBSSr132m: + + case X86::VFMADDSDr213r: case X86::VFMADDSDr213m: + case X86::VFMADDSSr213r: case X86::VFMADDSSr213m: + case X86::VFMSUBSDr213r: case X86::VFMSUBSDr213m: + case X86::VFMSUBSSr213r: case X86::VFMSUBSSr213m: + case X86::VFNMADDSDr213r: case X86::VFNMADDSDr213m: + case X86::VFNMADDSSr213r: case X86::VFNMADDSSr213m: + case X86::VFNMSUBSDr213r: case X86::VFNMSUBSDr213m: + case X86::VFNMSUBSSr213r: case X86::VFNMSUBSSr213m: + + case X86::VFMADDSDr231r: case X86::VFMADDSDr231m: + case X86::VFMADDSSr231r: case X86::VFMADDSSr231m: + case X86::VFMSUBSDr231r: case X86::VFMSUBSDr231m: + case X86::VFMSUBSSr231r: case X86::VFMSUBSSr231m: + case X86::VFNMADDSDr231r: case X86::VFNMADDSDr231m: + case X86::VFNMADDSSr231r: case X86::VFNMADDSSr231m: + case X86::VFNMSUBSDr231r: case X86::VFNMSUBSDr231m: + case X86::VFNMSUBSSr231r: case X86::VFNMSUBSSr231m: + + case X86::VFMADDSUBPDr132r: case X86::VFMADDSUBPDr132m: + case X86::VFMADDSUBPSr132r: case X86::VFMADDSUBPSr132m: + case X86::VFMSUBADDPDr132r: case X86::VFMSUBADDPDr132m: + case X86::VFMSUBADDPSr132r: case X86::VFMSUBADDPSr132m: + case X86::VFMADDSUBPDr132rY: case X86::VFMADDSUBPDr132mY: + case X86::VFMADDSUBPSr132rY: case X86::VFMADDSUBPSr132mY: + case X86::VFMSUBADDPDr132rY: case X86::VFMSUBADDPDr132mY: + case X86::VFMSUBADDPSr132rY: case X86::VFMSUBADDPSr132mY: + + case X86::VFMADDPDr132r: case X86::VFMADDPDr132m: + case X86::VFMADDPSr132r: case X86::VFMADDPSr132m: + case X86::VFMSUBPDr132r: case X86::VFMSUBPDr132m: + case X86::VFMSUBPSr132r: case X86::VFMSUBPSr132m: + case X86::VFNMADDPDr132r: case X86::VFNMADDPDr132m: + case X86::VFNMADDPSr132r: case X86::VFNMADDPSr132m: + case X86::VFNMSUBPDr132r: case X86::VFNMSUBPDr132m: + case X86::VFNMSUBPSr132r: case X86::VFNMSUBPSr132m: + case X86::VFMADDPDr132rY: case X86::VFMADDPDr132mY: + case X86::VFMADDPSr132rY: case X86::VFMADDPSr132mY: + case X86::VFMSUBPDr132rY: case X86::VFMSUBPDr132mY: + case X86::VFMSUBPSr132rY: case X86::VFMSUBPSr132mY: + case X86::VFNMADDPDr132rY: case X86::VFNMADDPDr132mY: + case X86::VFNMADDPSr132rY: case 
X86::VFNMADDPSr132mY: + case X86::VFNMSUBPDr132rY: case X86::VFNMSUBPDr132mY: + case X86::VFNMSUBPSr132rY: case X86::VFNMSUBPSr132mY: + + case X86::VFMADDSUBPDr213r: case X86::VFMADDSUBPDr213m: + case X86::VFMADDSUBPSr213r: case X86::VFMADDSUBPSr213m: + case X86::VFMSUBADDPDr213r: case X86::VFMSUBADDPDr213m: + case X86::VFMSUBADDPSr213r: case X86::VFMSUBADDPSr213m: + case X86::VFMADDSUBPDr213rY: case X86::VFMADDSUBPDr213mY: + case X86::VFMADDSUBPSr213rY: case X86::VFMADDSUBPSr213mY: + case X86::VFMSUBADDPDr213rY: case X86::VFMSUBADDPDr213mY: + case X86::VFMSUBADDPSr213rY: case X86::VFMSUBADDPSr213mY: + + case X86::VFMADDPDr213r: case X86::VFMADDPDr213m: + case X86::VFMADDPSr213r: case X86::VFMADDPSr213m: + case X86::VFMSUBPDr213r: case X86::VFMSUBPDr213m: + case X86::VFMSUBPSr213r: case X86::VFMSUBPSr213m: + case X86::VFNMADDPDr213r: case X86::VFNMADDPDr213m: + case X86::VFNMADDPSr213r: case X86::VFNMADDPSr213m: + case X86::VFNMSUBPDr213r: case X86::VFNMSUBPDr213m: + case X86::VFNMSUBPSr213r: case X86::VFNMSUBPSr213m: + case X86::VFMADDPDr213rY: case X86::VFMADDPDr213mY: + case X86::VFMADDPSr213rY: case X86::VFMADDPSr213mY: + case X86::VFMSUBPDr213rY: case X86::VFMSUBPDr213mY: + case X86::VFMSUBPSr213rY: case X86::VFMSUBPSr213mY: + case X86::VFNMADDPDr213rY: case X86::VFNMADDPDr213mY: + case X86::VFNMADDPSr213rY: case X86::VFNMADDPSr213mY: + case X86::VFNMSUBPDr213rY: case X86::VFNMSUBPDr213mY: + case X86::VFNMSUBPSr213rY: case X86::VFNMSUBPSr213mY: + + case X86::VFMADDSUBPDr231r: case X86::VFMADDSUBPDr231m: + case X86::VFMADDSUBPSr231r: case X86::VFMADDSUBPSr231m: + case X86::VFMSUBADDPDr231r: case X86::VFMSUBADDPDr231m: + case X86::VFMSUBADDPSr231r: case X86::VFMSUBADDPSr231m: + case X86::VFMADDSUBPDr231rY: case X86::VFMADDSUBPDr231mY: + case X86::VFMADDSUBPSr231rY: case X86::VFMADDSUBPSr231mY: + case X86::VFMSUBADDPDr231rY: case X86::VFMSUBADDPDr231mY: + case X86::VFMSUBADDPSr231rY: case X86::VFMSUBADDPSr231mY: + + case X86::VFMADDPDr231r: case X86::VFMADDPDr231m: + case X86::VFMADDPSr231r: case X86::VFMADDPSr231m: + case X86::VFMSUBPDr231r: case X86::VFMSUBPDr231m: + case X86::VFMSUBPSr231r: case X86::VFMSUBPSr231m: + case X86::VFNMADDPDr231r: case X86::VFNMADDPDr231m: + case X86::VFNMADDPSr231r: case X86::VFNMADDPSr231m: + case X86::VFNMSUBPDr231r: case X86::VFNMSUBPDr231m: + case X86::VFNMSUBPSr231r: case X86::VFNMSUBPSr231m: + case X86::VFMADDPDr231rY: case X86::VFMADDPDr231mY: + case X86::VFMADDPSr231rY: case X86::VFMADDPSr231mY: + case X86::VFMSUBPDr231rY: case X86::VFMSUBPDr231mY: + case X86::VFMSUBPSr231rY: case X86::VFMSUBPSr231mY: + case X86::VFNMADDPDr231rY: case X86::VFNMADDPDr231mY: + case X86::VFNMADDPSr231rY: case X86::VFNMADDPSr231mY: + case X86::VFNMSUBPDr231rY: case X86::VFNMSUBPDr231mY: + case X86::VFNMSUBPSr231rY: case X86::VFNMSUBPSr231mY: + return true; + + case X86::VFMADDSDr132r_Int: case X86::VFMADDSDr132m_Int: + case X86::VFMADDSSr132r_Int: case X86::VFMADDSSr132m_Int: + case X86::VFMSUBSDr132r_Int: case X86::VFMSUBSDr132m_Int: + case X86::VFMSUBSSr132r_Int: case X86::VFMSUBSSr132m_Int: + case X86::VFNMADDSDr132r_Int: case X86::VFNMADDSDr132m_Int: + case X86::VFNMADDSSr132r_Int: case X86::VFNMADDSSr132m_Int: + case X86::VFNMSUBSDr132r_Int: case X86::VFNMSUBSDr132m_Int: + case X86::VFNMSUBSSr132r_Int: case X86::VFNMSUBSSr132m_Int: + + case X86::VFMADDSDr213r_Int: case X86::VFMADDSDr213m_Int: + case X86::VFMADDSSr213r_Int: case X86::VFMADDSSr213m_Int: + case X86::VFMSUBSDr213r_Int: case X86::VFMSUBSDr213m_Int: + case X86::VFMSUBSSr213r_Int: case 
X86::VFMSUBSSr213m_Int: + case X86::VFNMADDSDr213r_Int: case X86::VFNMADDSDr213m_Int: + case X86::VFNMADDSSr213r_Int: case X86::VFNMADDSSr213m_Int: + case X86::VFNMSUBSDr213r_Int: case X86::VFNMSUBSDr213m_Int: + case X86::VFNMSUBSSr213r_Int: case X86::VFNMSUBSSr213m_Int: + + case X86::VFMADDSDr231r_Int: case X86::VFMADDSDr231m_Int: + case X86::VFMADDSSr231r_Int: case X86::VFMADDSSr231m_Int: + case X86::VFMSUBSDr231r_Int: case X86::VFMSUBSDr231m_Int: + case X86::VFMSUBSSr231r_Int: case X86::VFMSUBSSr231m_Int: + case X86::VFNMADDSDr231r_Int: case X86::VFNMADDSDr231m_Int: + case X86::VFNMADDSSr231r_Int: case X86::VFNMADDSSr231m_Int: + case X86::VFNMSUBSDr231r_Int: case X86::VFNMSUBSDr231m_Int: + case X86::VFNMSUBSSr231r_Int: case X86::VFNMSUBSSr231m_Int: + if (IsIntrinsic) + *IsIntrinsic = true; + return true; + default: + return false; + } + llvm_unreachable("Opcode not handled by the switch"); +} + +MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr *MI, + bool NewMI, + unsigned OpIdx1, + unsigned OpIdx2) const { + switch (MI->getOpcode()) { + case X86::SHRD16rri8: // A = SHRD16rri8 B, C, I -> A = SHLD16rri8 C, B, (16-I) + case X86::SHLD16rri8: // A = SHLD16rri8 B, C, I -> A = SHRD16rri8 C, B, (16-I) + case X86::SHRD32rri8: // A = SHRD32rri8 B, C, I -> A = SHLD32rri8 C, B, (32-I) + case X86::SHLD32rri8: // A = SHLD32rri8 B, C, I -> A = SHRD32rri8 C, B, (32-I) + case X86::SHRD64rri8: // A = SHRD64rri8 B, C, I -> A = SHLD64rri8 C, B, (64-I) + case X86::SHLD64rri8:{// A = SHLD64rri8 B, C, I -> A = SHRD64rri8 C, B, (64-I) + unsigned Opc; + unsigned Size; + switch (MI->getOpcode()) { + default: llvm_unreachable("Unreachable!"); + case X86::SHRD16rri8: Size = 16; Opc = X86::SHLD16rri8; break; + case X86::SHLD16rri8: Size = 16; Opc = X86::SHRD16rri8; break; + case X86::SHRD32rri8: Size = 32; Opc = X86::SHLD32rri8; break; + case X86::SHLD32rri8: Size = 32; Opc = X86::SHRD32rri8; break; + case X86::SHRD64rri8: Size = 64; Opc = X86::SHLD64rri8; break; + case X86::SHLD64rri8: Size = 64; Opc = X86::SHRD64rri8; break; + } + unsigned Amt = MI->getOperand(3).getImm(); + if (NewMI) { + MachineFunction &MF = *MI->getParent()->getParent(); + MI = MF.CloneMachineInstr(MI); + NewMI = false; + } + MI->setDesc(get(Opc)); + MI->getOperand(3).setImm(Size-Amt); + return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2); + } + case X86::BLENDPDrri: + case X86::BLENDPSrri: + case X86::PBLENDWrri: + case X86::VBLENDPDrri: + case X86::VBLENDPSrri: + case X86::VBLENDPDYrri: + case X86::VBLENDPSYrri: + case X86::VPBLENDDrri: + case X86::VPBLENDWrri: + case X86::VPBLENDDYrri: + case X86::VPBLENDWYrri:{ + unsigned Mask; + switch (MI->getOpcode()) { + default: llvm_unreachable("Unreachable!"); + case X86::BLENDPDrri: Mask = 0x03; break; + case X86::BLENDPSrri: Mask = 0x0F; break; + case X86::PBLENDWrri: Mask = 0xFF; break; + case X86::VBLENDPDrri: Mask = 0x03; break; + case X86::VBLENDPSrri: Mask = 0x0F; break; + case X86::VBLENDPDYrri: Mask = 0x0F; break; + case X86::VBLENDPSYrri: Mask = 0xFF; break; + case X86::VPBLENDDrri: Mask = 0x0F; break; + case X86::VPBLENDWrri: Mask = 0xFF; break; + case X86::VPBLENDDYrri: Mask = 0xFF; break; + case X86::VPBLENDWYrri: Mask = 0xFF; break; + } + // Only the least significant bits of Imm are used. 
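// [Editor's note, not part of the patch] Swapping the two vector sources
// selects the opposite source in every lane the blend controls, so the new
// immediate is the mask-limited complement computed below as Mask ^ Imm.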
+ unsigned Imm = MI->getOperand(3).getImm() & Mask; + if (NewMI) { + MachineFunction &MF = *MI->getParent()->getParent(); + MI = MF.CloneMachineInstr(MI); + NewMI = false; + } + MI->getOperand(3).setImm(Mask ^ Imm); + return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2); + } + case X86::PCLMULQDQrr: + case X86::VPCLMULQDQrr:{ + // SRC1 64bits = Imm[0] ? SRC1[127:64] : SRC1[63:0] + // SRC2 64bits = Imm[4] ? SRC2[127:64] : SRC2[63:0] + unsigned Imm = MI->getOperand(3).getImm(); + unsigned Src1Hi = Imm & 0x01; + unsigned Src2Hi = Imm & 0x10; + if (NewMI) { + MachineFunction &MF = *MI->getParent()->getParent(); + MI = MF.CloneMachineInstr(MI); + NewMI = false; + } + MI->getOperand(3).setImm((Src1Hi << 4) | (Src2Hi >> 4)); + return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2); + } + case X86::CMPPDrri: + case X86::CMPPSrri: + case X86::VCMPPDrri: + case X86::VCMPPSrri: + case X86::VCMPPDYrri: + case X86::VCMPPSYrri: { + // Float comparison can be safely commuted for + // Ordered/Unordered/Equal/NotEqual tests + unsigned Imm = MI->getOperand(3).getImm() & 0x7; + switch (Imm) { + case 0x00: // EQUAL + case 0x03: // UNORDERED + case 0x04: // NOT EQUAL + case 0x07: // ORDERED + if (NewMI) { + MachineFunction &MF = *MI->getParent()->getParent(); + MI = MF.CloneMachineInstr(MI); + NewMI = false; + } + return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2); + default: + return nullptr; + } + } + case X86::VPCOMBri: case X86::VPCOMUBri: + case X86::VPCOMDri: case X86::VPCOMUDri: + case X86::VPCOMQri: case X86::VPCOMUQri: + case X86::VPCOMWri: case X86::VPCOMUWri: { + // Flip comparison mode immediate (if necessary). + unsigned Imm = MI->getOperand(3).getImm() & 0x7; + switch (Imm) { + case 0x00: Imm = 0x02; break; // LT -> GT + case 0x01: Imm = 0x03; break; // LE -> GE + case 0x02: Imm = 0x00; break; // GT -> LT + case 0x03: Imm = 0x01; break; // GE -> LE + case 0x04: // EQ + case 0x05: // NE + case 0x06: // FALSE + case 0x07: // TRUE + default: + break; + } + if (NewMI) { + MachineFunction &MF = *MI->getParent()->getParent(); + MI = MF.CloneMachineInstr(MI); + NewMI = false; + } + MI->getOperand(3).setImm(Imm); + return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2); + } + case X86::CMOVB16rr: case X86::CMOVB32rr: case X86::CMOVB64rr: + case X86::CMOVAE16rr: case X86::CMOVAE32rr: case X86::CMOVAE64rr: + case X86::CMOVE16rr: case X86::CMOVE32rr: case X86::CMOVE64rr: + case X86::CMOVNE16rr: case X86::CMOVNE32rr: case X86::CMOVNE64rr: + case X86::CMOVBE16rr: case X86::CMOVBE32rr: case X86::CMOVBE64rr: + case X86::CMOVA16rr: case X86::CMOVA32rr: case X86::CMOVA64rr: + case X86::CMOVL16rr: case X86::CMOVL32rr: case X86::CMOVL64rr: + case X86::CMOVGE16rr: case X86::CMOVGE32rr: case X86::CMOVGE64rr: + case X86::CMOVLE16rr: case X86::CMOVLE32rr: case X86::CMOVLE64rr: + case X86::CMOVG16rr: case X86::CMOVG32rr: case X86::CMOVG64rr: + case X86::CMOVS16rr: case X86::CMOVS32rr: case X86::CMOVS64rr: + case X86::CMOVNS16rr: case X86::CMOVNS32rr: case X86::CMOVNS64rr: + case X86::CMOVP16rr: case X86::CMOVP32rr: case X86::CMOVP64rr: + case X86::CMOVNP16rr: case X86::CMOVNP32rr: case X86::CMOVNP64rr: + case X86::CMOVO16rr: case X86::CMOVO32rr: case X86::CMOVO64rr: + case X86::CMOVNO16rr: case X86::CMOVNO32rr: case X86::CMOVNO64rr: { + unsigned Opc; + switch (MI->getOpcode()) { + default: llvm_unreachable("Unreachable!"); + case X86::CMOVB16rr: Opc = X86::CMOVAE16rr; break; + case X86::CMOVB32rr: Opc = X86::CMOVAE32rr; break; + case 
X86::CMOVB64rr: Opc = X86::CMOVAE64rr; break; + case X86::CMOVAE16rr: Opc = X86::CMOVB16rr; break; + case X86::CMOVAE32rr: Opc = X86::CMOVB32rr; break; + case X86::CMOVAE64rr: Opc = X86::CMOVB64rr; break; + case X86::CMOVE16rr: Opc = X86::CMOVNE16rr; break; + case X86::CMOVE32rr: Opc = X86::CMOVNE32rr; break; + case X86::CMOVE64rr: Opc = X86::CMOVNE64rr; break; + case X86::CMOVNE16rr: Opc = X86::CMOVE16rr; break; + case X86::CMOVNE32rr: Opc = X86::CMOVE32rr; break; + case X86::CMOVNE64rr: Opc = X86::CMOVE64rr; break; + case X86::CMOVBE16rr: Opc = X86::CMOVA16rr; break; + case X86::CMOVBE32rr: Opc = X86::CMOVA32rr; break; + case X86::CMOVBE64rr: Opc = X86::CMOVA64rr; break; + case X86::CMOVA16rr: Opc = X86::CMOVBE16rr; break; + case X86::CMOVA32rr: Opc = X86::CMOVBE32rr; break; + case X86::CMOVA64rr: Opc = X86::CMOVBE64rr; break; + case X86::CMOVL16rr: Opc = X86::CMOVGE16rr; break; + case X86::CMOVL32rr: Opc = X86::CMOVGE32rr; break; + case X86::CMOVL64rr: Opc = X86::CMOVGE64rr; break; + case X86::CMOVGE16rr: Opc = X86::CMOVL16rr; break; + case X86::CMOVGE32rr: Opc = X86::CMOVL32rr; break; + case X86::CMOVGE64rr: Opc = X86::CMOVL64rr; break; + case X86::CMOVLE16rr: Opc = X86::CMOVG16rr; break; + case X86::CMOVLE32rr: Opc = X86::CMOVG32rr; break; + case X86::CMOVLE64rr: Opc = X86::CMOVG64rr; break; + case X86::CMOVG16rr: Opc = X86::CMOVLE16rr; break; + case X86::CMOVG32rr: Opc = X86::CMOVLE32rr; break; + case X86::CMOVG64rr: Opc = X86::CMOVLE64rr; break; + case X86::CMOVS16rr: Opc = X86::CMOVNS16rr; break; + case X86::CMOVS32rr: Opc = X86::CMOVNS32rr; break; + case X86::CMOVS64rr: Opc = X86::CMOVNS64rr; break; + case X86::CMOVNS16rr: Opc = X86::CMOVS16rr; break; + case X86::CMOVNS32rr: Opc = X86::CMOVS32rr; break; + case X86::CMOVNS64rr: Opc = X86::CMOVS64rr; break; + case X86::CMOVP16rr: Opc = X86::CMOVNP16rr; break; + case X86::CMOVP32rr: Opc = X86::CMOVNP32rr; break; + case X86::CMOVP64rr: Opc = X86::CMOVNP64rr; break; + case X86::CMOVNP16rr: Opc = X86::CMOVP16rr; break; + case X86::CMOVNP32rr: Opc = X86::CMOVP32rr; break; + case X86::CMOVNP64rr: Opc = X86::CMOVP64rr; break; + case X86::CMOVO16rr: Opc = X86::CMOVNO16rr; break; + case X86::CMOVO32rr: Opc = X86::CMOVNO32rr; break; + case X86::CMOVO64rr: Opc = X86::CMOVNO64rr; break; + case X86::CMOVNO16rr: Opc = X86::CMOVO16rr; break; + case X86::CMOVNO32rr: Opc = X86::CMOVO32rr; break; + case X86::CMOVNO64rr: Opc = X86::CMOVO64rr; break; + } + if (NewMI) { + MachineFunction &MF = *MI->getParent()->getParent(); + MI = MF.CloneMachineInstr(MI); + NewMI = false; + } + MI->setDesc(get(Opc)); + // Fallthrough intended. + } + default: + if (isFMA3(MI->getOpcode())) { + unsigned Opc = getFMA3OpcodeToCommuteOperands(MI, OpIdx1, OpIdx2); + if (Opc == 0) + return nullptr; + if (NewMI) { + MachineFunction &MF = *MI->getParent()->getParent(); + MI = MF.CloneMachineInstr(MI); + NewMI = false; + } + MI->setDesc(get(Opc)); + } + return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2); + } +} + +bool X86InstrInfo::findFMA3CommutedOpIndices(MachineInstr *MI, + unsigned &SrcOpIdx1, + unsigned &SrcOpIdx2) const { + + unsigned RegOpsNum = isMem(MI, 3) ? 2 : 3; + + // Only the first RegOpsNum operands are commutable. + // Also, the value 'CommuteAnyOperandIndex' is valid here as it means + // that the operand is not specified/fixed. 
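+ // For example (illustrative): a memory-form FMA (isMem(MI, 3)) has only two
+ // commutable register operands (1 and 2), so RegOpsNum == 2; the register
+ // form exposes all three source operands. Callers may also pass
+ // CommuteAnyOperandIndex to let this routine pick the operands itself,
+ // which is handled further below.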
+ if (SrcOpIdx1 != CommuteAnyOperandIndex && + (SrcOpIdx1 < 1 || SrcOpIdx1 > RegOpsNum)) + return false; + if (SrcOpIdx2 != CommuteAnyOperandIndex && + (SrcOpIdx2 < 1 || SrcOpIdx2 > RegOpsNum)) + return false; + + // Look for two different register operands assumed to be commutable + // regardless of the FMA opcode. The FMA opcode is adjusted later. + if (SrcOpIdx1 == CommuteAnyOperandIndex || + SrcOpIdx2 == CommuteAnyOperandIndex) { + unsigned CommutableOpIdx1 = SrcOpIdx1; + unsigned CommutableOpIdx2 = SrcOpIdx2; + + // At least one of the operands to be commuted is not specified and + // this method is free to choose appropriate commutable operands. + if (SrcOpIdx1 == SrcOpIdx2) + // Neither operand is fixed. By default, set one of the commutable + // operands to the last register operand of the instruction. + CommutableOpIdx2 = RegOpsNum; + else if (SrcOpIdx2 == CommuteAnyOperandIndex) + // Only one of the operands is not fixed. + CommutableOpIdx2 = SrcOpIdx1; + + // CommutableOpIdx2 is well defined now. Let's choose another commutable + // operand and assign its index to CommutableOpIdx1. + unsigned Op2Reg = MI->getOperand(CommutableOpIdx2).getReg(); + for (CommutableOpIdx1 = RegOpsNum; CommutableOpIdx1 > 0; CommutableOpIdx1--) { + // The commuted operands must have different registers. + // Otherwise, the commute transformation does not change anything and + // would be useless. + if (Op2Reg != MI->getOperand(CommutableOpIdx1).getReg()) + break; + } + + // No appropriate commutable operands were found. + if (CommutableOpIdx1 == 0) + return false; + + // Assign the found pair of commutable indices to SrcOpIdx1 and SrcOpIdx2 + // to return those values. + if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, + CommutableOpIdx1, CommutableOpIdx2)) + return false; + } + + // Check if we can adjust the opcode to preserve the semantics when + // commuting the register operands. + return getFMA3OpcodeToCommuteOperands(MI, SrcOpIdx1, SrcOpIdx2) != 0; +} + +unsigned X86InstrInfo::getFMA3OpcodeToCommuteOperands(MachineInstr *MI, + unsigned SrcOpIdx1, + unsigned SrcOpIdx2) const { + unsigned Opc = MI->getOpcode(); + + // Define the array that holds FMA opcodes in groups + // of 3 opcodes (132, 213, 231) in each group.
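+ // Background (annotation, following the usual FMA3 operand-order naming):
+ //   FMA132: dst = dst  * src3 + src2
+ //   FMA213: dst = src2 * dst  + src3
+ //   FMA231: dst = src2 * src3 + dst
+ // Commuting two source operands therefore corresponds to switching between
+ // these three forms, which is what the tables and FormMapping below encode.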
+ static const unsigned RegularOpcodeGroups[][3] = { + { X86::VFMADDSSr132r, X86::VFMADDSSr213r, X86::VFMADDSSr231r }, + { X86::VFMADDSDr132r, X86::VFMADDSDr213r, X86::VFMADDSDr231r }, + { X86::VFMADDPSr132r, X86::VFMADDPSr213r, X86::VFMADDPSr231r }, + { X86::VFMADDPDr132r, X86::VFMADDPDr213r, X86::VFMADDPDr231r }, + { X86::VFMADDPSr132rY, X86::VFMADDPSr213rY, X86::VFMADDPSr231rY }, + { X86::VFMADDPDr132rY, X86::VFMADDPDr213rY, X86::VFMADDPDr231rY }, + { X86::VFMADDSSr132m, X86::VFMADDSSr213m, X86::VFMADDSSr231m }, + { X86::VFMADDSDr132m, X86::VFMADDSDr213m, X86::VFMADDSDr231m }, + { X86::VFMADDPSr132m, X86::VFMADDPSr213m, X86::VFMADDPSr231m }, + { X86::VFMADDPDr132m, X86::VFMADDPDr213m, X86::VFMADDPDr231m }, + { X86::VFMADDPSr132mY, X86::VFMADDPSr213mY, X86::VFMADDPSr231mY }, + { X86::VFMADDPDr132mY, X86::VFMADDPDr213mY, X86::VFMADDPDr231mY }, + + { X86::VFMSUBSSr132r, X86::VFMSUBSSr213r, X86::VFMSUBSSr231r }, + { X86::VFMSUBSDr132r, X86::VFMSUBSDr213r, X86::VFMSUBSDr231r }, + { X86::VFMSUBPSr132r, X86::VFMSUBPSr213r, X86::VFMSUBPSr231r }, + { X86::VFMSUBPDr132r, X86::VFMSUBPDr213r, X86::VFMSUBPDr231r }, + { X86::VFMSUBPSr132rY, X86::VFMSUBPSr213rY, X86::VFMSUBPSr231rY }, + { X86::VFMSUBPDr132rY, X86::VFMSUBPDr213rY, X86::VFMSUBPDr231rY }, + { X86::VFMSUBSSr132m, X86::VFMSUBSSr213m, X86::VFMSUBSSr231m }, + { X86::VFMSUBSDr132m, X86::VFMSUBSDr213m, X86::VFMSUBSDr231m }, + { X86::VFMSUBPSr132m, X86::VFMSUBPSr213m, X86::VFMSUBPSr231m }, + { X86::VFMSUBPDr132m, X86::VFMSUBPDr213m, X86::VFMSUBPDr231m }, + { X86::VFMSUBPSr132mY, X86::VFMSUBPSr213mY, X86::VFMSUBPSr231mY }, + { X86::VFMSUBPDr132mY, X86::VFMSUBPDr213mY, X86::VFMSUBPDr231mY }, + + { X86::VFNMADDSSr132r, X86::VFNMADDSSr213r, X86::VFNMADDSSr231r }, + { X86::VFNMADDSDr132r, X86::VFNMADDSDr213r, X86::VFNMADDSDr231r }, + { X86::VFNMADDPSr132r, X86::VFNMADDPSr213r, X86::VFNMADDPSr231r }, + { X86::VFNMADDPDr132r, X86::VFNMADDPDr213r, X86::VFNMADDPDr231r }, + { X86::VFNMADDPSr132rY, X86::VFNMADDPSr213rY, X86::VFNMADDPSr231rY }, + { X86::VFNMADDPDr132rY, X86::VFNMADDPDr213rY, X86::VFNMADDPDr231rY }, + { X86::VFNMADDSSr132m, X86::VFNMADDSSr213m, X86::VFNMADDSSr231m }, + { X86::VFNMADDSDr132m, X86::VFNMADDSDr213m, X86::VFNMADDSDr231m }, + { X86::VFNMADDPSr132m, X86::VFNMADDPSr213m, X86::VFNMADDPSr231m }, + { X86::VFNMADDPDr132m, X86::VFNMADDPDr213m, X86::VFNMADDPDr231m }, + { X86::VFNMADDPSr132mY, X86::VFNMADDPSr213mY, X86::VFNMADDPSr231mY }, + { X86::VFNMADDPDr132mY, X86::VFNMADDPDr213mY, X86::VFNMADDPDr231mY }, + + { X86::VFNMSUBSSr132r, X86::VFNMSUBSSr213r, X86::VFNMSUBSSr231r }, + { X86::VFNMSUBSDr132r, X86::VFNMSUBSDr213r, X86::VFNMSUBSDr231r }, + { X86::VFNMSUBPSr132r, X86::VFNMSUBPSr213r, X86::VFNMSUBPSr231r }, + { X86::VFNMSUBPDr132r, X86::VFNMSUBPDr213r, X86::VFNMSUBPDr231r }, + { X86::VFNMSUBPSr132rY, X86::VFNMSUBPSr213rY, X86::VFNMSUBPSr231rY }, + { X86::VFNMSUBPDr132rY, X86::VFNMSUBPDr213rY, X86::VFNMSUBPDr231rY }, + { X86::VFNMSUBSSr132m, X86::VFNMSUBSSr213m, X86::VFNMSUBSSr231m }, + { X86::VFNMSUBSDr132m, X86::VFNMSUBSDr213m, X86::VFNMSUBSDr231m }, + { X86::VFNMSUBPSr132m, X86::VFNMSUBPSr213m, X86::VFNMSUBPSr231m }, + { X86::VFNMSUBPDr132m, X86::VFNMSUBPDr213m, X86::VFNMSUBPDr231m }, + { X86::VFNMSUBPSr132mY, X86::VFNMSUBPSr213mY, X86::VFNMSUBPSr231mY }, + { X86::VFNMSUBPDr132mY, X86::VFNMSUBPDr213mY, X86::VFNMSUBPDr231mY }, + + { X86::VFMADDSUBPSr132r, X86::VFMADDSUBPSr213r, X86::VFMADDSUBPSr231r }, + { X86::VFMADDSUBPDr132r, X86::VFMADDSUBPDr213r, X86::VFMADDSUBPDr231r }, + { X86::VFMADDSUBPSr132rY, X86::VFMADDSUBPSr213rY, 
X86::VFMADDSUBPSr231rY }, + { X86::VFMADDSUBPDr132rY, X86::VFMADDSUBPDr213rY, X86::VFMADDSUBPDr231rY }, + { X86::VFMADDSUBPSr132m, X86::VFMADDSUBPSr213m, X86::VFMADDSUBPSr231m }, + { X86::VFMADDSUBPDr132m, X86::VFMADDSUBPDr213m, X86::VFMADDSUBPDr231m }, + { X86::VFMADDSUBPSr132mY, X86::VFMADDSUBPSr213mY, X86::VFMADDSUBPSr231mY }, + { X86::VFMADDSUBPDr132mY, X86::VFMADDSUBPDr213mY, X86::VFMADDSUBPDr231mY }, + + { X86::VFMSUBADDPSr132r, X86::VFMSUBADDPSr213r, X86::VFMSUBADDPSr231r }, + { X86::VFMSUBADDPDr132r, X86::VFMSUBADDPDr213r, X86::VFMSUBADDPDr231r }, + { X86::VFMSUBADDPSr132rY, X86::VFMSUBADDPSr213rY, X86::VFMSUBADDPSr231rY }, + { X86::VFMSUBADDPDr132rY, X86::VFMSUBADDPDr213rY, X86::VFMSUBADDPDr231rY }, + { X86::VFMSUBADDPSr132m, X86::VFMSUBADDPSr213m, X86::VFMSUBADDPSr231m }, + { X86::VFMSUBADDPDr132m, X86::VFMSUBADDPDr213m, X86::VFMSUBADDPDr231m }, + { X86::VFMSUBADDPSr132mY, X86::VFMSUBADDPSr213mY, X86::VFMSUBADDPSr231mY }, + { X86::VFMSUBADDPDr132mY, X86::VFMSUBADDPDr213mY, X86::VFMSUBADDPDr231mY } + }; + + // Define the array that holds FMA*_Int opcodes in groups + // of 3 opcodes(132, 213, 231) in each group. + static const unsigned IntrinOpcodeGroups[][3] = { + { X86::VFMADDSSr132r_Int, X86::VFMADDSSr213r_Int, X86::VFMADDSSr231r_Int }, + { X86::VFMADDSDr132r_Int, X86::VFMADDSDr213r_Int, X86::VFMADDSDr231r_Int }, + { X86::VFMADDSSr132m_Int, X86::VFMADDSSr213m_Int, X86::VFMADDSSr231m_Int }, + { X86::VFMADDSDr132m_Int, X86::VFMADDSDr213m_Int, X86::VFMADDSDr231m_Int }, + + { X86::VFMSUBSSr132r_Int, X86::VFMSUBSSr213r_Int, X86::VFMSUBSSr231r_Int }, + { X86::VFMSUBSDr132r_Int, X86::VFMSUBSDr213r_Int, X86::VFMSUBSDr231r_Int }, + { X86::VFMSUBSSr132m_Int, X86::VFMSUBSSr213m_Int, X86::VFMSUBSSr231m_Int }, + { X86::VFMSUBSDr132m_Int, X86::VFMSUBSDr213m_Int, X86::VFMSUBSDr231m_Int }, + + { X86::VFNMADDSSr132r_Int, X86::VFNMADDSSr213r_Int, X86::VFNMADDSSr231r_Int }, + { X86::VFNMADDSDr132r_Int, X86::VFNMADDSDr213r_Int, X86::VFNMADDSDr231r_Int }, + { X86::VFNMADDSSr132m_Int, X86::VFNMADDSSr213m_Int, X86::VFNMADDSSr231m_Int }, + { X86::VFNMADDSDr132m_Int, X86::VFNMADDSDr213m_Int, X86::VFNMADDSDr231m_Int }, + + { X86::VFNMSUBSSr132r_Int, X86::VFNMSUBSSr213r_Int, X86::VFNMSUBSSr231r_Int }, + { X86::VFNMSUBSDr132r_Int, X86::VFNMSUBSDr213r_Int, X86::VFNMSUBSDr231r_Int }, + { X86::VFNMSUBSSr132m_Int, X86::VFNMSUBSSr213m_Int, X86::VFNMSUBSSr231m_Int }, + { X86::VFNMSUBSDr132m_Int, X86::VFNMSUBSDr213m_Int, X86::VFNMSUBSDr231m_Int }, + }; + + const unsigned Form132Index = 0; + const unsigned Form213Index = 1; + const unsigned Form231Index = 2; + const unsigned FormsNum = 3; + + bool IsIntrinOpcode; + isFMA3(Opc, &IsIntrinOpcode); + + size_t GroupsNum; + const unsigned (*OpcodeGroups)[3]; + if (IsIntrinOpcode) { + GroupsNum = array_lengthof(IntrinOpcodeGroups); + OpcodeGroups = IntrinOpcodeGroups; + } else { + GroupsNum = array_lengthof(RegularOpcodeGroups); + OpcodeGroups = RegularOpcodeGroups; + } + + const unsigned *FoundOpcodesGroup = nullptr; + size_t FormIndex; + + // Look for the input opcode in the corresponding opcodes table. + for (size_t GroupIndex = 0; GroupIndex < GroupsNum && !FoundOpcodesGroup; + ++GroupIndex) { + for (FormIndex = 0; FormIndex < FormsNum; ++FormIndex) { + if (OpcodeGroups[GroupIndex][FormIndex] == Opc) { + FoundOpcodesGroup = OpcodeGroups[GroupIndex]; + break; + } + } + } + + // The input opcode does not match with any of the opcodes from the tables. + // The unsupported FMA opcode must be added to one of the two opcode groups + // defined above. 
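+ // Note (illustrative): in this file the routine is only reached for opcodes
+ // that isFMA3() has already accepted, so a missing entry means the tables
+ // above are out of sync with isFMA3(); hence the assert rather than a
+ // "return 0".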
+ assert(FoundOpcodesGroup != nullptr && "Unexpected FMA3 opcode"); + + // Put the lowest index to SrcOpIdx1 to simplify the checks below. + if (SrcOpIdx1 > SrcOpIdx2) + std::swap(SrcOpIdx1, SrcOpIdx2); + + // TODO: Commuting the 1st operand of FMA*_Int requires some additional + // analysis. The commute optimization is legal only if all users of FMA*_Int + // use only the lowest element of the FMA*_Int instruction. Such analysis are + // not implemented yet. So, just return 0 in that case. + // When such analysis are available this place will be the right place for + // calling it. + if (IsIntrinOpcode && SrcOpIdx1 == 1) + return 0; + + unsigned Case; + if (SrcOpIdx1 == 1 && SrcOpIdx2 == 2) + Case = 0; + else if (SrcOpIdx1 == 1 && SrcOpIdx2 == 3) + Case = 1; + else if (SrcOpIdx1 == 2 && SrcOpIdx2 == 3) + Case = 2; + else + return 0; + + // Define the FMA forms mapping array that helps to map input FMA form + // to output FMA form to preserve the operation semantics after + // commuting the operands. + static const unsigned FormMapping[][3] = { + // 0: SrcOpIdx1 == 1 && SrcOpIdx2 == 2; + // FMA132 A, C, b; ==> FMA231 C, A, b; + // FMA213 B, A, c; ==> FMA213 A, B, c; + // FMA231 C, A, b; ==> FMA132 A, C, b; + { Form231Index, Form213Index, Form132Index }, + // 1: SrcOpIdx1 == 1 && SrcOpIdx2 == 3; + // FMA132 A, c, B; ==> FMA132 B, c, A; + // FMA213 B, a, C; ==> FMA231 C, a, B; + // FMA231 C, a, B; ==> FMA213 B, a, C; + { Form132Index, Form231Index, Form213Index }, + // 2: SrcOpIdx1 == 2 && SrcOpIdx2 == 3; + // FMA132 a, C, B; ==> FMA213 a, B, C; + // FMA213 b, A, C; ==> FMA132 b, C, A; + // FMA231 c, A, B; ==> FMA231 c, B, A; + { Form213Index, Form132Index, Form231Index } + }; + + // Everything is ready, just adjust the FMA opcode and return it. + FormIndex = FormMapping[Case][FormIndex]; + return FoundOpcodesGroup[FormIndex]; +} + +bool X86InstrInfo::findCommutedOpIndices(MachineInstr *MI, + unsigned &SrcOpIdx1, + unsigned &SrcOpIdx2) const { + switch (MI->getOpcode()) { + case X86::CMPPDrri: + case X86::CMPPSrri: + case X86::VCMPPDrri: + case X86::VCMPPSrri: + case X86::VCMPPDYrri: + case X86::VCMPPSYrri: { + // Float comparison can be safely commuted for + // Ordered/Unordered/Equal/NotEqual tests + unsigned Imm = MI->getOperand(3).getImm() & 0x7; + switch (Imm) { + case 0x00: // EQUAL + case 0x03: // UNORDERED + case 0x04: // NOT EQUAL + case 0x07: // ORDERED + // The indices of the commutable operands are 1 and 2. + // Assign them to the returned operand indices here. 
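+ // Rationale (illustrative): these four predicates are symmetric, e.g.
+ // "cmpeqps a, b" yields the same mask as "cmpeqps b, a", so the two source
+ // operands can be swapped freely; non-symmetric predicates such as LT/LE
+ // fall through to the "return false" below.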
+ return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 1, 2); + } + return false; + } + default: + if (isFMA3(MI->getOpcode())) + return findFMA3CommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2); + return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2); + } + return false; +} + +static X86::CondCode getCondFromBranchOpc(unsigned BrOpc) { + switch (BrOpc) { + default: return X86::COND_INVALID; + case X86::JE_1: return X86::COND_E; + case X86::JNE_1: return X86::COND_NE; + case X86::JL_1: return X86::COND_L; + case X86::JLE_1: return X86::COND_LE; + case X86::JG_1: return X86::COND_G; + case X86::JGE_1: return X86::COND_GE; + case X86::JB_1: return X86::COND_B; + case X86::JBE_1: return X86::COND_BE; + case X86::JA_1: return X86::COND_A; + case X86::JAE_1: return X86::COND_AE; + case X86::JS_1: return X86::COND_S; + case X86::JNS_1: return X86::COND_NS; + case X86::JP_1: return X86::COND_P; + case X86::JNP_1: return X86::COND_NP; + case X86::JO_1: return X86::COND_O; + case X86::JNO_1: return X86::COND_NO; + } +} + +/// Return condition code of a SET opcode. +static X86::CondCode getCondFromSETOpc(unsigned Opc) { + switch (Opc) { + default: return X86::COND_INVALID; + case X86::SETAr: case X86::SETAm: return X86::COND_A; + case X86::SETAEr: case X86::SETAEm: return X86::COND_AE; + case X86::SETBr: case X86::SETBm: return X86::COND_B; + case X86::SETBEr: case X86::SETBEm: return X86::COND_BE; + case X86::SETEr: case X86::SETEm: return X86::COND_E; + case X86::SETGr: case X86::SETGm: return X86::COND_G; + case X86::SETGEr: case X86::SETGEm: return X86::COND_GE; + case X86::SETLr: case X86::SETLm: return X86::COND_L; + case X86::SETLEr: case X86::SETLEm: return X86::COND_LE; + case X86::SETNEr: case X86::SETNEm: return X86::COND_NE; + case X86::SETNOr: case X86::SETNOm: return X86::COND_NO; + case X86::SETNPr: case X86::SETNPm: return X86::COND_NP; + case X86::SETNSr: case X86::SETNSm: return X86::COND_NS; + case X86::SETOr: case X86::SETOm: return X86::COND_O; + case X86::SETPr: case X86::SETPm: return X86::COND_P; + case X86::SETSr: case X86::SETSm: return X86::COND_S; + } +} + +/// Return condition code of a CMov opcode. 
+X86::CondCode X86::getCondFromCMovOpc(unsigned Opc) { + switch (Opc) { + default: return X86::COND_INVALID; + case X86::CMOVA16rm: case X86::CMOVA16rr: case X86::CMOVA32rm: + case X86::CMOVA32rr: case X86::CMOVA64rm: case X86::CMOVA64rr: + return X86::COND_A; + case X86::CMOVAE16rm: case X86::CMOVAE16rr: case X86::CMOVAE32rm: + case X86::CMOVAE32rr: case X86::CMOVAE64rm: case X86::CMOVAE64rr: + return X86::COND_AE; + case X86::CMOVB16rm: case X86::CMOVB16rr: case X86::CMOVB32rm: + case X86::CMOVB32rr: case X86::CMOVB64rm: case X86::CMOVB64rr: + return X86::COND_B; + case X86::CMOVBE16rm: case X86::CMOVBE16rr: case X86::CMOVBE32rm: + case X86::CMOVBE32rr: case X86::CMOVBE64rm: case X86::CMOVBE64rr: + return X86::COND_BE; + case X86::CMOVE16rm: case X86::CMOVE16rr: case X86::CMOVE32rm: + case X86::CMOVE32rr: case X86::CMOVE64rm: case X86::CMOVE64rr: + return X86::COND_E; + case X86::CMOVG16rm: case X86::CMOVG16rr: case X86::CMOVG32rm: + case X86::CMOVG32rr: case X86::CMOVG64rm: case X86::CMOVG64rr: + return X86::COND_G; + case X86::CMOVGE16rm: case X86::CMOVGE16rr: case X86::CMOVGE32rm: + case X86::CMOVGE32rr: case X86::CMOVGE64rm: case X86::CMOVGE64rr: + return X86::COND_GE; + case X86::CMOVL16rm: case X86::CMOVL16rr: case X86::CMOVL32rm: + case X86::CMOVL32rr: case X86::CMOVL64rm: case X86::CMOVL64rr: + return X86::COND_L; + case X86::CMOVLE16rm: case X86::CMOVLE16rr: case X86::CMOVLE32rm: + case X86::CMOVLE32rr: case X86::CMOVLE64rm: case X86::CMOVLE64rr: + return X86::COND_LE; + case X86::CMOVNE16rm: case X86::CMOVNE16rr: case X86::CMOVNE32rm: + case X86::CMOVNE32rr: case X86::CMOVNE64rm: case X86::CMOVNE64rr: + return X86::COND_NE; + case X86::CMOVNO16rm: case X86::CMOVNO16rr: case X86::CMOVNO32rm: + case X86::CMOVNO32rr: case X86::CMOVNO64rm: case X86::CMOVNO64rr: + return X86::COND_NO; + case X86::CMOVNP16rm: case X86::CMOVNP16rr: case X86::CMOVNP32rm: + case X86::CMOVNP32rr: case X86::CMOVNP64rm: case X86::CMOVNP64rr: + return X86::COND_NP; + case X86::CMOVNS16rm: case X86::CMOVNS16rr: case X86::CMOVNS32rm: + case X86::CMOVNS32rr: case X86::CMOVNS64rm: case X86::CMOVNS64rr: + return X86::COND_NS; + case X86::CMOVO16rm: case X86::CMOVO16rr: case X86::CMOVO32rm: + case X86::CMOVO32rr: case X86::CMOVO64rm: case X86::CMOVO64rr: + return X86::COND_O; + case X86::CMOVP16rm: case X86::CMOVP16rr: case X86::CMOVP32rm: + case X86::CMOVP32rr: case X86::CMOVP64rm: case X86::CMOVP64rr: + return X86::COND_P; + case X86::CMOVS16rm: case X86::CMOVS16rr: case X86::CMOVS32rm: + case X86::CMOVS32rr: case X86::CMOVS64rm: case X86::CMOVS64rr: + return X86::COND_S; + } +} + +unsigned X86::GetCondBranchFromCond(X86::CondCode CC) { + switch (CC) { + default: llvm_unreachable("Illegal condition code!"); + case X86::COND_E: return X86::JE_1; + case X86::COND_NE: return X86::JNE_1; + case X86::COND_L: return X86::JL_1; + case X86::COND_LE: return X86::JLE_1; + case X86::COND_G: return X86::JG_1; + case X86::COND_GE: return X86::JGE_1; + case X86::COND_B: return X86::JB_1; + case X86::COND_BE: return X86::JBE_1; + case X86::COND_A: return X86::JA_1; + case X86::COND_AE: return X86::JAE_1; + case X86::COND_S: return X86::JS_1; + case X86::COND_NS: return X86::JNS_1; + case X86::COND_P: return X86::JP_1; + case X86::COND_NP: return X86::JNP_1; + case X86::COND_O: return X86::JO_1; + case X86::COND_NO: return X86::JNO_1; + } +} + +/// Return the inverse of the specified condition, +/// e.g. turning COND_E to COND_NE. 
+X86::CondCode X86::GetOppositeBranchCondition(X86::CondCode CC) { + switch (CC) { + default: llvm_unreachable("Illegal condition code!"); + case X86::COND_E: return X86::COND_NE; + case X86::COND_NE: return X86::COND_E; + case X86::COND_L: return X86::COND_GE; + case X86::COND_LE: return X86::COND_G; + case X86::COND_G: return X86::COND_LE; + case X86::COND_GE: return X86::COND_L; + case X86::COND_B: return X86::COND_AE; + case X86::COND_BE: return X86::COND_A; + case X86::COND_A: return X86::COND_BE; + case X86::COND_AE: return X86::COND_B; + case X86::COND_S: return X86::COND_NS; + case X86::COND_NS: return X86::COND_S; + case X86::COND_P: return X86::COND_NP; + case X86::COND_NP: return X86::COND_P; + case X86::COND_O: return X86::COND_NO; + case X86::COND_NO: return X86::COND_O; + } +} + +/// Assuming the flags are set by MI(a,b), return the condition code if we +/// modify the instructions such that flags are set by MI(b,a). +static X86::CondCode getSwappedCondition(X86::CondCode CC) { + switch (CC) { + default: return X86::COND_INVALID; + case X86::COND_E: return X86::COND_E; + case X86::COND_NE: return X86::COND_NE; + case X86::COND_L: return X86::COND_G; + case X86::COND_LE: return X86::COND_GE; + case X86::COND_G: return X86::COND_L; + case X86::COND_GE: return X86::COND_LE; + case X86::COND_B: return X86::COND_A; + case X86::COND_BE: return X86::COND_AE; + case X86::COND_A: return X86::COND_B; + case X86::COND_AE: return X86::COND_BE; + } +} + +/// Return a set opcode for the given condition and +/// whether it has memory operand. +unsigned X86::getSETFromCond(CondCode CC, bool HasMemoryOperand) { + static const uint16_t Opc[16][2] = { + { X86::SETAr, X86::SETAm }, + { X86::SETAEr, X86::SETAEm }, + { X86::SETBr, X86::SETBm }, + { X86::SETBEr, X86::SETBEm }, + { X86::SETEr, X86::SETEm }, + { X86::SETGr, X86::SETGm }, + { X86::SETGEr, X86::SETGEm }, + { X86::SETLr, X86::SETLm }, + { X86::SETLEr, X86::SETLEm }, + { X86::SETNEr, X86::SETNEm }, + { X86::SETNOr, X86::SETNOm }, + { X86::SETNPr, X86::SETNPm }, + { X86::SETNSr, X86::SETNSm }, + { X86::SETOr, X86::SETOm }, + { X86::SETPr, X86::SETPm }, + { X86::SETSr, X86::SETSm } + }; + + assert(CC <= LAST_VALID_COND && "Can only handle standard cond codes"); + return Opc[CC][HasMemoryOperand ? 1 : 0]; +} + +/// Return a cmov opcode for the given condition, +/// register size in bytes, and operand type. 
+unsigned X86::getCMovFromCond(CondCode CC, unsigned RegBytes, + bool HasMemoryOperand) { + static const uint16_t Opc[32][3] = { + { X86::CMOVA16rr, X86::CMOVA32rr, X86::CMOVA64rr }, + { X86::CMOVAE16rr, X86::CMOVAE32rr, X86::CMOVAE64rr }, + { X86::CMOVB16rr, X86::CMOVB32rr, X86::CMOVB64rr }, + { X86::CMOVBE16rr, X86::CMOVBE32rr, X86::CMOVBE64rr }, + { X86::CMOVE16rr, X86::CMOVE32rr, X86::CMOVE64rr }, + { X86::CMOVG16rr, X86::CMOVG32rr, X86::CMOVG64rr }, + { X86::CMOVGE16rr, X86::CMOVGE32rr, X86::CMOVGE64rr }, + { X86::CMOVL16rr, X86::CMOVL32rr, X86::CMOVL64rr }, + { X86::CMOVLE16rr, X86::CMOVLE32rr, X86::CMOVLE64rr }, + { X86::CMOVNE16rr, X86::CMOVNE32rr, X86::CMOVNE64rr }, + { X86::CMOVNO16rr, X86::CMOVNO32rr, X86::CMOVNO64rr }, + { X86::CMOVNP16rr, X86::CMOVNP32rr, X86::CMOVNP64rr }, + { X86::CMOVNS16rr, X86::CMOVNS32rr, X86::CMOVNS64rr }, + { X86::CMOVO16rr, X86::CMOVO32rr, X86::CMOVO64rr }, + { X86::CMOVP16rr, X86::CMOVP32rr, X86::CMOVP64rr }, + { X86::CMOVS16rr, X86::CMOVS32rr, X86::CMOVS64rr }, + { X86::CMOVA16rm, X86::CMOVA32rm, X86::CMOVA64rm }, + { X86::CMOVAE16rm, X86::CMOVAE32rm, X86::CMOVAE64rm }, + { X86::CMOVB16rm, X86::CMOVB32rm, X86::CMOVB64rm }, + { X86::CMOVBE16rm, X86::CMOVBE32rm, X86::CMOVBE64rm }, + { X86::CMOVE16rm, X86::CMOVE32rm, X86::CMOVE64rm }, + { X86::CMOVG16rm, X86::CMOVG32rm, X86::CMOVG64rm }, + { X86::CMOVGE16rm, X86::CMOVGE32rm, X86::CMOVGE64rm }, + { X86::CMOVL16rm, X86::CMOVL32rm, X86::CMOVL64rm }, + { X86::CMOVLE16rm, X86::CMOVLE32rm, X86::CMOVLE64rm }, + { X86::CMOVNE16rm, X86::CMOVNE32rm, X86::CMOVNE64rm }, + { X86::CMOVNO16rm, X86::CMOVNO32rm, X86::CMOVNO64rm }, + { X86::CMOVNP16rm, X86::CMOVNP32rm, X86::CMOVNP64rm }, + { X86::CMOVNS16rm, X86::CMOVNS32rm, X86::CMOVNS64rm }, + { X86::CMOVO16rm, X86::CMOVO32rm, X86::CMOVO64rm }, + { X86::CMOVP16rm, X86::CMOVP32rm, X86::CMOVP64rm }, + { X86::CMOVS16rm, X86::CMOVS32rm, X86::CMOVS64rm } + }; + + assert(CC < 16 && "Can only handle standard cond codes"); + unsigned Idx = HasMemoryOperand ? 16+CC : CC; + switch(RegBytes) { + default: llvm_unreachable("Illegal register size!"); + case 2: return Opc[Idx][0]; + case 4: return Opc[Idx][1]; + case 8: return Opc[Idx][2]; + } +} + +bool X86InstrInfo::isUnpredicatedTerminator(const MachineInstr *MI) const { + if (!MI->isTerminator()) return false; + + // Conditional branch is a special case. + if (MI->isBranch() && !MI->isBarrier()) + return true; + if (!MI->isPredicable()) + return true; + return !isPredicated(MI); +} + +bool X86InstrInfo::AnalyzeBranchImpl( + MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, + SmallVectorImpl<MachineOperand> &Cond, + SmallVectorImpl<MachineInstr *> &CondBranches, bool AllowModify) const { + + // Start from the bottom of the block and work up, examining the + // terminator instructions. + MachineBasicBlock::iterator I = MBB.end(); + MachineBasicBlock::iterator UnCondBrIter = MBB.end(); + while (I != MBB.begin()) { + --I; + if (I->isDebugValue()) + continue; + + // Working from the bottom, when we see a non-terminator instruction, we're + // done. + if (!isUnpredicatedTerminator(I)) + break; + + // A terminator that isn't a branch can't easily be handled by this + // analysis. + if (!I->isBranch()) + return true; + + // Handle unconditional branches. + if (I->getOpcode() == X86::JMP_1) { + UnCondBrIter = I; + + if (!AllowModify) { + TBB = I->getOperand(0).getMBB(); + continue; + } + + // If the block has any instructions after a JMP, delete them. 
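+ // (They are dead: JMP_1 is an unconditional branch, and AllowModify was
+ // checked above, so it is safe to erase them here.)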
+ while (std::next(I) != MBB.end()) + std::next(I)->eraseFromParent(); + + Cond.clear(); + FBB = nullptr; + + // Delete the JMP if it's equivalent to a fall-through. + if (MBB.isLayoutSuccessor(I->getOperand(0).getMBB())) { + TBB = nullptr; + I->eraseFromParent(); + I = MBB.end(); + UnCondBrIter = MBB.end(); + continue; + } + + // TBB is used to indicate the unconditional destination. + TBB = I->getOperand(0).getMBB(); + continue; + } + + // Handle conditional branches. + X86::CondCode BranchCode = getCondFromBranchOpc(I->getOpcode()); + if (BranchCode == X86::COND_INVALID) + return true; // Can't handle indirect branch. + + // Working from the bottom, handle the first conditional branch. + if (Cond.empty()) { + MachineBasicBlock *TargetBB = I->getOperand(0).getMBB(); + if (AllowModify && UnCondBrIter != MBB.end() && + MBB.isLayoutSuccessor(TargetBB)) { + // If we can modify the code and it ends in something like: + // + // jCC L1 + // jmp L2 + // L1: + // ... + // L2: + // + // Then we can change this to: + // + // jnCC L2 + // L1: + // ... + // L2: + // + // Which is a bit more efficient. + // We conditionally jump to the fall-through block. + BranchCode = GetOppositeBranchCondition(BranchCode); + unsigned JNCC = GetCondBranchFromCond(BranchCode); + MachineBasicBlock::iterator OldInst = I; + + BuildMI(MBB, UnCondBrIter, MBB.findDebugLoc(I), get(JNCC)) + .addMBB(UnCondBrIter->getOperand(0).getMBB()); + BuildMI(MBB, UnCondBrIter, MBB.findDebugLoc(I), get(X86::JMP_1)) + .addMBB(TargetBB); + + OldInst->eraseFromParent(); + UnCondBrIter->eraseFromParent(); + + // Restart the analysis. + UnCondBrIter = MBB.end(); + I = MBB.end(); + continue; + } + + FBB = TBB; + TBB = I->getOperand(0).getMBB(); + Cond.push_back(MachineOperand::CreateImm(BranchCode)); + CondBranches.push_back(I); + continue; + } + + // Handle subsequent conditional branches. Only handle the case where all + // conditional branches branch to the same destination and their condition + // opcodes fit one of the special multi-branch idioms. + assert(Cond.size() == 1); + assert(TBB); + + // Only handle the case where all conditional branches branch to the same + // destination. + if (TBB != I->getOperand(0).getMBB()) + return true; + + // If the conditions are the same, we can leave them alone. + X86::CondCode OldBranchCode = (X86::CondCode)Cond[0].getImm(); + if (OldBranchCode == BranchCode) + continue; + + // If they differ, see if they fit one of the known patterns. Theoretically, + // we could handle more patterns here, but we shouldn't expect to see them + // if instruction selection has done a reasonable job. + if ((OldBranchCode == X86::COND_NP && + BranchCode == X86::COND_E) || + (OldBranchCode == X86::COND_E && + BranchCode == X86::COND_NP)) + BranchCode = X86::COND_NP_OR_E; + else if ((OldBranchCode == X86::COND_P && + BranchCode == X86::COND_NE) || + (OldBranchCode == X86::COND_NE && + BranchCode == X86::COND_P)) + BranchCode = X86::COND_NE_OR_P; + else + return true; + + // Update the MachineOperand. 
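+ // (Illustrative: such branch pairs typically come from floating-point
+ // comparisons, where the unordered case reported in PF has to be combined
+ // with ZF, so instruction selection emits two conditional branches to the
+ // same target.)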
+ Cond[0].setImm(BranchCode); + CondBranches.push_back(I); + } + + return false; +} + +bool X86InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, + MachineBasicBlock *&TBB, + MachineBasicBlock *&FBB, + SmallVectorImpl<MachineOperand> &Cond, + bool AllowModify) const { + SmallVector<MachineInstr *, 4> CondBranches; + return AnalyzeBranchImpl(MBB, TBB, FBB, Cond, CondBranches, AllowModify); +} + +bool X86InstrInfo::AnalyzeBranchPredicate(MachineBasicBlock &MBB, + MachineBranchPredicate &MBP, + bool AllowModify) const { + using namespace std::placeholders; + + SmallVector<MachineOperand, 4> Cond; + SmallVector<MachineInstr *, 4> CondBranches; + if (AnalyzeBranchImpl(MBB, MBP.TrueDest, MBP.FalseDest, Cond, CondBranches, + AllowModify)) + return true; + + if (Cond.size() != 1) + return true; + + assert(MBP.TrueDest && "expected!"); + + if (!MBP.FalseDest) + MBP.FalseDest = MBB.getNextNode(); + + const TargetRegisterInfo *TRI = &getRegisterInfo(); + + MachineInstr *ConditionDef = nullptr; + bool SingleUseCondition = true; + + for (auto I = std::next(MBB.rbegin()), E = MBB.rend(); I != E; ++I) { + if (I->modifiesRegister(X86::EFLAGS, TRI)) { + ConditionDef = &*I; + break; + } + + if (I->readsRegister(X86::EFLAGS, TRI)) + SingleUseCondition = false; + } + + if (!ConditionDef) + return true; + + if (SingleUseCondition) { + for (auto *Succ : MBB.successors()) + if (Succ->isLiveIn(X86::EFLAGS)) + SingleUseCondition = false; + } + + MBP.ConditionDef = ConditionDef; + MBP.SingleUseCondition = SingleUseCondition; + + // Currently we only recognize the simple pattern: + // + // test %reg, %reg + // je %label + // + const unsigned TestOpcode = + Subtarget.is64Bit() ? X86::TEST64rr : X86::TEST32rr; + + if (ConditionDef->getOpcode() == TestOpcode && + ConditionDef->getNumOperands() == 3 && + ConditionDef->getOperand(0).isIdenticalTo(ConditionDef->getOperand(1)) && + (Cond[0].getImm() == X86::COND_NE || Cond[0].getImm() == X86::COND_E)) { + MBP.LHS = ConditionDef->getOperand(0); + MBP.RHS = MachineOperand::CreateImm(0); + MBP.Predicate = Cond[0].getImm() == X86::COND_NE + ? MachineBranchPredicate::PRED_NE + : MachineBranchPredicate::PRED_EQ; + return false; + } + + return true; +} + +unsigned X86InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { + MachineBasicBlock::iterator I = MBB.end(); + unsigned Count = 0; + + while (I != MBB.begin()) { + --I; + if (I->isDebugValue()) + continue; + if (I->getOpcode() != X86::JMP_1 && + getCondFromBranchOpc(I->getOpcode()) == X86::COND_INVALID) + break; + // Remove the branch. + I->eraseFromParent(); + I = MBB.end(); + ++Count; + } + + return Count; +} + +unsigned +X86InstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, + MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond, + DebugLoc DL) const { + // Shouldn't be a fall through. + assert(TBB && "InsertBranch must not be told to insert a fallthrough"); + assert((Cond.size() == 1 || Cond.size() == 0) && + "X86 branch conditions have one component!"); + + if (Cond.empty()) { + // Unconditional branch? + assert(!FBB && "Unconditional branch with multiple successors!"); + BuildMI(&MBB, DL, get(X86::JMP_1)).addMBB(TBB); + return 1; + } + + // Conditional branch. + unsigned Count = 0; + X86::CondCode CC = (X86::CondCode)Cond[0].getImm(); + switch (CC) { + case X86::COND_NP_OR_E: + // Synthesize NP_OR_E with two branches. 
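+ // Emitted sequence (both branches share the target): "jnp TBB; je TBB",
+ // so the branch is taken if either condition holds.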
+ BuildMI(&MBB, DL, get(X86::JNP_1)).addMBB(TBB); + ++Count; + BuildMI(&MBB, DL, get(X86::JE_1)).addMBB(TBB); + ++Count; + break; + case X86::COND_NE_OR_P: + // Synthesize NE_OR_P with two branches. + BuildMI(&MBB, DL, get(X86::JNE_1)).addMBB(TBB); + ++Count; + BuildMI(&MBB, DL, get(X86::JP_1)).addMBB(TBB); + ++Count; + break; + default: { + unsigned Opc = GetCondBranchFromCond(CC); + BuildMI(&MBB, DL, get(Opc)).addMBB(TBB); + ++Count; + } + } + if (FBB) { + // Two-way Conditional branch. Insert the second branch. + BuildMI(&MBB, DL, get(X86::JMP_1)).addMBB(FBB); + ++Count; + } + return Count; +} + +bool X86InstrInfo:: +canInsertSelect(const MachineBasicBlock &MBB, + ArrayRef<MachineOperand> Cond, + unsigned TrueReg, unsigned FalseReg, + int &CondCycles, int &TrueCycles, int &FalseCycles) const { + // Not all subtargets have cmov instructions. + if (!Subtarget.hasCMov()) + return false; + if (Cond.size() != 1) + return false; + // We cannot do the composite conditions, at least not in SSA form. + if ((X86::CondCode)Cond[0].getImm() > X86::COND_S) + return false; + + // Check register classes. + const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + const TargetRegisterClass *RC = + RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg)); + if (!RC) + return false; + + // We have cmov instructions for 16, 32, and 64 bit general purpose registers. + if (X86::GR16RegClass.hasSubClassEq(RC) || + X86::GR32RegClass.hasSubClassEq(RC) || + X86::GR64RegClass.hasSubClassEq(RC)) { + // This latency applies to Pentium M, Merom, Wolfdale, Nehalem, and Sandy + // Bridge. Probably Ivy Bridge as well. + CondCycles = 2; + TrueCycles = 2; + FalseCycles = 2; + return true; + } + + // Can't do vectors. + return false; +} + +void X86InstrInfo::insertSelect(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, DebugLoc DL, + unsigned DstReg, ArrayRef<MachineOperand> Cond, + unsigned TrueReg, unsigned FalseReg) const { + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + assert(Cond.size() == 1 && "Invalid Cond array"); + unsigned Opc = getCMovFromCond((X86::CondCode)Cond[0].getImm(), + MRI.getRegClass(DstReg)->getSize(), + false/*HasMemoryOperand*/); + BuildMI(MBB, I, DL, get(Opc), DstReg).addReg(FalseReg).addReg(TrueReg); +} + +/// Test if the given register is a physical h register. +static bool isHReg(unsigned Reg) { + return X86::GR8_ABCD_HRegClass.contains(Reg); +} + +// Try and copy between VR128/VR64 and GR64 registers. +static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg, + const X86Subtarget &Subtarget) { + + // SrcReg(VR128) -> DestReg(GR64) + // SrcReg(VR64) -> DestReg(GR64) + // SrcReg(GR64) -> DestReg(VR128) + // SrcReg(GR64) -> DestReg(VR64) + + bool HasAVX = Subtarget.hasAVX(); + bool HasAVX512 = Subtarget.hasAVX512(); + if (X86::GR64RegClass.contains(DestReg)) { + if (X86::VR128XRegClass.contains(SrcReg)) + // Copy from a VR128 register to a GR64 register. + return HasAVX512 ? X86::VMOVPQIto64Zrr: (HasAVX ? X86::VMOVPQIto64rr : + X86::MOVPQIto64rr); + if (X86::VR64RegClass.contains(SrcReg)) + // Copy from a VR64 register to a GR64 register. + return X86::MMX_MOVD64from64rr; + } else if (X86::GR64RegClass.contains(SrcReg)) { + // Copy from a GR64 register to a VR128 register. + if (X86::VR128XRegClass.contains(DestReg)) + return HasAVX512 ? X86::VMOV64toPQIZrr: (HasAVX ? X86::VMOV64toPQIrr : + X86::MOV64toPQIrr); + // Copy from a GR64 register to a VR64 register. 
+ if (X86::VR64RegClass.contains(DestReg)) + return X86::MMX_MOVD64to64rr; + } + + // SrcReg(FR32) -> DestReg(GR32) + // SrcReg(GR32) -> DestReg(FR32) + + if (X86::GR32RegClass.contains(DestReg) && X86::FR32XRegClass.contains(SrcReg)) + // Copy from a FR32 register to a GR32 register. + return HasAVX512 ? X86::VMOVSS2DIZrr : (HasAVX ? X86::VMOVSS2DIrr : X86::MOVSS2DIrr); + + if (X86::FR32XRegClass.contains(DestReg) && X86::GR32RegClass.contains(SrcReg)) + // Copy from a GR32 register to a FR32 register. + return HasAVX512 ? X86::VMOVDI2SSZrr : (HasAVX ? X86::VMOVDI2SSrr : X86::MOVDI2SSrr); + return 0; +} + +static bool MaskRegClassContains(unsigned Reg) { + return X86::VK8RegClass.contains(Reg) || + X86::VK16RegClass.contains(Reg) || + X86::VK32RegClass.contains(Reg) || + X86::VK64RegClass.contains(Reg) || + X86::VK1RegClass.contains(Reg); +} + +static bool GRRegClassContains(unsigned Reg) { + return X86::GR64RegClass.contains(Reg) || + X86::GR32RegClass.contains(Reg) || + X86::GR16RegClass.contains(Reg) || + X86::GR8RegClass.contains(Reg); +} +static +unsigned copyPhysRegOpcode_AVX512_DQ(unsigned& DestReg, unsigned& SrcReg) { + if (MaskRegClassContains(SrcReg) && X86::GR8RegClass.contains(DestReg)) { + DestReg = getX86SubSuperRegister(DestReg, 32); + return X86::KMOVBrk; + } + if (MaskRegClassContains(DestReg) && X86::GR8RegClass.contains(SrcReg)) { + SrcReg = getX86SubSuperRegister(SrcReg, 32); + return X86::KMOVBkr; + } + return 0; +} + +static +unsigned copyPhysRegOpcode_AVX512_BW(unsigned& DestReg, unsigned& SrcReg) { + if (MaskRegClassContains(SrcReg) && MaskRegClassContains(DestReg)) + return X86::KMOVQkk; + if (MaskRegClassContains(SrcReg) && X86::GR32RegClass.contains(DestReg)) + return X86::KMOVDrk; + if (MaskRegClassContains(SrcReg) && X86::GR64RegClass.contains(DestReg)) + return X86::KMOVQrk; + if (MaskRegClassContains(DestReg) && X86::GR32RegClass.contains(SrcReg)) + return X86::KMOVDkr; + if (MaskRegClassContains(DestReg) && X86::GR64RegClass.contains(SrcReg)) + return X86::KMOVQkr; + return 0; +} + +static +unsigned copyPhysRegOpcode_AVX512(unsigned& DestReg, unsigned& SrcReg, + const X86Subtarget &Subtarget) +{ + if (Subtarget.hasDQI()) + if (auto Opc = copyPhysRegOpcode_AVX512_DQ(DestReg, SrcReg)) + return Opc; + if (Subtarget.hasBWI()) + if (auto Opc = copyPhysRegOpcode_AVX512_BW(DestReg, SrcReg)) + return Opc; + if (X86::VR128XRegClass.contains(DestReg, SrcReg) || + X86::VR256XRegClass.contains(DestReg, SrcReg) || + X86::VR512RegClass.contains(DestReg, SrcReg)) { + DestReg = get512BitSuperRegister(DestReg); + SrcReg = get512BitSuperRegister(SrcReg); + return X86::VMOVAPSZrr; + } + if (MaskRegClassContains(DestReg) && MaskRegClassContains(SrcReg)) + return X86::KMOVWkk; + if (MaskRegClassContains(DestReg) && GRRegClassContains(SrcReg)) { + SrcReg = getX86SubSuperRegister(SrcReg, 32); + return X86::KMOVWkr; + } + if (GRRegClassContains(DestReg) && MaskRegClassContains(SrcReg)) { + DestReg = getX86SubSuperRegister(DestReg, 32); + return X86::KMOVWrk; + } + return 0; +} + +void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const { + // First deal with the normal symmetric copies. 
+ bool HasAVX = Subtarget.hasAVX(); + bool HasAVX512 = Subtarget.hasAVX512(); + unsigned Opc = 0; + if (X86::GR64RegClass.contains(DestReg, SrcReg)) + Opc = X86::MOV64rr; + else if (X86::GR32RegClass.contains(DestReg, SrcReg)) + Opc = X86::MOV32rr; + else if (X86::GR16RegClass.contains(DestReg, SrcReg)) + Opc = X86::MOV16rr; + else if (X86::GR8RegClass.contains(DestReg, SrcReg)) { + // Copying to or from a physical H register on x86-64 requires a NOREX + // move. Otherwise use a normal move. + if ((isHReg(DestReg) || isHReg(SrcReg)) && + Subtarget.is64Bit()) { + Opc = X86::MOV8rr_NOREX; + // Both operands must be encodable without an REX prefix. + assert(X86::GR8_NOREXRegClass.contains(SrcReg, DestReg) && + "8-bit H register can not be copied outside GR8_NOREX"); + } else + Opc = X86::MOV8rr; + } + else if (X86::VR64RegClass.contains(DestReg, SrcReg)) + Opc = X86::MMX_MOVQ64rr; + else if (HasAVX512) + Opc = copyPhysRegOpcode_AVX512(DestReg, SrcReg, Subtarget); + else if (X86::VR128RegClass.contains(DestReg, SrcReg)) + Opc = HasAVX ? X86::VMOVAPSrr : X86::MOVAPSrr; + else if (X86::VR256RegClass.contains(DestReg, SrcReg)) + Opc = X86::VMOVAPSYrr; + if (!Opc) + Opc = CopyToFromAsymmetricReg(DestReg, SrcReg, Subtarget); + + if (Opc) { + BuildMI(MBB, MI, DL, get(Opc), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + return; + } + + bool FromEFLAGS = SrcReg == X86::EFLAGS; + bool ToEFLAGS = DestReg == X86::EFLAGS; + int Reg = FromEFLAGS ? DestReg : SrcReg; + bool is32 = X86::GR32RegClass.contains(Reg); + bool is64 = X86::GR64RegClass.contains(Reg); + + if ((FromEFLAGS || ToEFLAGS) && (is32 || is64)) { + int Mov = is64 ? X86::MOV64rr : X86::MOV32rr; + int Push = is64 ? X86::PUSH64r : X86::PUSH32r; + int PushF = is64 ? X86::PUSHF64 : X86::PUSHF32; + int Pop = is64 ? X86::POP64r : X86::POP32r; + int PopF = is64 ? X86::POPF64 : X86::POPF32; + int AX = is64 ? X86::RAX : X86::EAX; + + if (!Subtarget.hasLAHFSAHF()) { + assert(Subtarget.is64Bit() && + "Not having LAHF/SAHF only happens on 64-bit."); + // Moving EFLAGS to / from another register requires a push and a pop. + // Notice that we have to adjust the stack if we don't want to clobber the + // first frame index. See X86FrameLowering.cpp - usesTheStack. + if (FromEFLAGS) { + BuildMI(MBB, MI, DL, get(PushF)); + BuildMI(MBB, MI, DL, get(Pop), DestReg); + } + if (ToEFLAGS) { + BuildMI(MBB, MI, DL, get(Push)) + .addReg(SrcReg, getKillRegState(KillSrc)); + BuildMI(MBB, MI, DL, get(PopF)); + } + return; + } + + // The flags need to be saved, but saving EFLAGS with PUSHF/POPF is + // inefficient. Instead: + // - Save the overflow flag OF into AL using SETO, and restore it using a + // signed 8-bit addition of AL and INT8_MAX. + // - Save/restore the bottom 8 EFLAGS bits (CF, PF, AF, ZF, SF) to/from AH + // using LAHF/SAHF. + // - When RAX/EAX is live and isn't the destination register, make sure it + // isn't clobbered by PUSH/POP'ing it before and after saving/restoring + // the flags. + // This approach is ~2.25x faster than using PUSHF/POPF. + // + // This is still somewhat inefficient because we don't know which flags are + // actually live inside EFLAGS. Were we able to do a single SETcc instead of + // SETO+LAHF / ADDB+SAHF the code could be 1.02x faster. + // + // PUSHF/POPF is also potentially incorrect because it affects other flags + // such as TF/IF/DF, which LLVM doesn't model. + // + // Notice that we have to adjust the stack if we don't want to clobber the + // first frame index. 
+ // See X86ISelLowering.cpp - X86::hasCopyImplyingStackAdjustment. + + + bool AXDead = (Reg == AX) || + (MachineBasicBlock::LQR_Dead == + MBB.computeRegisterLiveness(&getRegisterInfo(), AX, MI)); + if (!AXDead) { + // FIXME: If computeRegisterLiveness() reported LQR_Unknown then AX may + // actually be dead. This is not a problem for correctness as we are just + // (unnecessarily) saving+restoring a dead register. However the + // MachineVerifier expects operands that read from dead registers + // to be marked with the "undef" flag. + // An example of this can be found in + // test/CodeGen/X86/peephole-na-phys-copy-folding.ll and + // test/CodeGen/X86/cmpxchg-clobber-flags.ll when using + // -verify-machineinstrs. + BuildMI(MBB, MI, DL, get(Push)).addReg(AX, getKillRegState(true)); + } + if (FromEFLAGS) { + BuildMI(MBB, MI, DL, get(X86::SETOr), X86::AL); + BuildMI(MBB, MI, DL, get(X86::LAHF)); + BuildMI(MBB, MI, DL, get(Mov), Reg).addReg(AX); + } + if (ToEFLAGS) { + BuildMI(MBB, MI, DL, get(Mov), AX).addReg(Reg, getKillRegState(KillSrc)); + BuildMI(MBB, MI, DL, get(X86::ADD8ri), X86::AL) + .addReg(X86::AL) + .addImm(INT8_MAX); + BuildMI(MBB, MI, DL, get(X86::SAHF)); + } + if (!AXDead) + BuildMI(MBB, MI, DL, get(Pop), AX); + return; + } + + DEBUG(dbgs() << "Cannot copy " << RI.getName(SrcReg) + << " to " << RI.getName(DestReg) << '\n'); + llvm_unreachable("Cannot emit physreg copy instruction"); +} + +static unsigned getLoadStoreRegOpcode(unsigned Reg, + const TargetRegisterClass *RC, + bool isStackAligned, + const X86Subtarget &STI, + bool load) { + if (STI.hasAVX512()) { + if (X86::VK8RegClass.hasSubClassEq(RC) || + X86::VK16RegClass.hasSubClassEq(RC)) + return load ? X86::KMOVWkm : X86::KMOVWmk; + if (RC->getSize() == 4 && X86::FR32XRegClass.hasSubClassEq(RC)) + return load ? X86::VMOVSSZrm : X86::VMOVSSZmr; + if (RC->getSize() == 8 && X86::FR64XRegClass.hasSubClassEq(RC)) + return load ? X86::VMOVSDZrm : X86::VMOVSDZmr; + if (X86::VR512RegClass.hasSubClassEq(RC)) + return load ? X86::VMOVUPSZrm : X86::VMOVUPSZmr; + } + + bool HasAVX = STI.hasAVX(); + switch (RC->getSize()) { + default: + llvm_unreachable("Unknown spill size"); + case 1: + assert(X86::GR8RegClass.hasSubClassEq(RC) && "Unknown 1-byte regclass"); + if (STI.is64Bit()) + // Copying to or from a physical H register on x86-64 requires a NOREX + // move. Otherwise use a normal move. + if (isHReg(Reg) || X86::GR8_ABCD_HRegClass.hasSubClassEq(RC)) + return load ? X86::MOV8rm_NOREX : X86::MOV8mr_NOREX; + return load ? X86::MOV8rm : X86::MOV8mr; + case 2: + assert(X86::GR16RegClass.hasSubClassEq(RC) && "Unknown 2-byte regclass"); + return load ? X86::MOV16rm : X86::MOV16mr; + case 4: + if (X86::GR32RegClass.hasSubClassEq(RC)) + return load ? X86::MOV32rm : X86::MOV32mr; + if (X86::FR32RegClass.hasSubClassEq(RC)) + return load ? + (HasAVX ? X86::VMOVSSrm : X86::MOVSSrm) : + (HasAVX ? X86::VMOVSSmr : X86::MOVSSmr); + if (X86::RFP32RegClass.hasSubClassEq(RC)) + return load ? X86::LD_Fp32m : X86::ST_Fp32m; + llvm_unreachable("Unknown 4-byte regclass"); + case 8: + if (X86::GR64RegClass.hasSubClassEq(RC)) + return load ? X86::MOV64rm : X86::MOV64mr; + if (X86::FR64RegClass.hasSubClassEq(RC)) + return load ? + (HasAVX ? X86::VMOVSDrm : X86::MOVSDrm) : + (HasAVX ? X86::VMOVSDmr : X86::MOVSDmr); + if (X86::VR64RegClass.hasSubClassEq(RC)) + return load ? X86::MMX_MOVQ64rm : X86::MMX_MOVQ64mr; + if (X86::RFP64RegClass.hasSubClassEq(RC)) + return load ? 
X86::LD_Fp64m : X86::ST_Fp64m; + llvm_unreachable("Unknown 8-byte regclass"); + case 10: + assert(X86::RFP80RegClass.hasSubClassEq(RC) && "Unknown 10-byte regclass"); + return load ? X86::LD_Fp80m : X86::ST_FpP80m; + case 16: { + assert((X86::VR128RegClass.hasSubClassEq(RC) || + X86::VR128XRegClass.hasSubClassEq(RC))&& "Unknown 16-byte regclass"); + // If stack is realigned we can use aligned stores. + if (isStackAligned) + return load ? + (HasAVX ? X86::VMOVAPSrm : X86::MOVAPSrm) : + (HasAVX ? X86::VMOVAPSmr : X86::MOVAPSmr); + else + return load ? + (HasAVX ? X86::VMOVUPSrm : X86::MOVUPSrm) : + (HasAVX ? X86::VMOVUPSmr : X86::MOVUPSmr); + } + case 32: + assert((X86::VR256RegClass.hasSubClassEq(RC) || + X86::VR256XRegClass.hasSubClassEq(RC)) && "Unknown 32-byte regclass"); + // If stack is realigned we can use aligned stores. + if (isStackAligned) + return load ? X86::VMOVAPSYrm : X86::VMOVAPSYmr; + else + return load ? X86::VMOVUPSYrm : X86::VMOVUPSYmr; + case 64: + assert(X86::VR512RegClass.hasSubClassEq(RC) && "Unknown 64-byte regclass"); + if (isStackAligned) + return load ? X86::VMOVAPSZrm : X86::VMOVAPSZmr; + else + return load ? X86::VMOVUPSZrm : X86::VMOVUPSZmr; + } +} + +bool X86InstrInfo::getMemOpBaseRegImmOfs(MachineInstr *MemOp, unsigned &BaseReg, + unsigned &Offset, + const TargetRegisterInfo *TRI) const { + const MCInstrDesc &Desc = MemOp->getDesc(); + int MemRefBegin = X86II::getMemoryOperandNo(Desc.TSFlags, MemOp->getOpcode()); + if (MemRefBegin < 0) + return false; + + MemRefBegin += X86II::getOperandBias(Desc); + + BaseReg = MemOp->getOperand(MemRefBegin + X86::AddrBaseReg).getReg(); + if (MemOp->getOperand(MemRefBegin + X86::AddrScaleAmt).getImm() != 1) + return false; + + if (MemOp->getOperand(MemRefBegin + X86::AddrIndexReg).getReg() != + X86::NoRegister) + return false; + + const MachineOperand &DispMO = MemOp->getOperand(MemRefBegin + X86::AddrDisp); + + // Displacement can be symbolic + if (!DispMO.isImm()) + return false; + + Offset = DispMO.getImm(); + + return (MemOp->getOperand(MemRefBegin + X86::AddrIndexReg).getReg() == + X86::NoRegister); +} + +static unsigned getStoreRegOpcode(unsigned SrcReg, + const TargetRegisterClass *RC, + bool isStackAligned, + const X86Subtarget &STI) { + return getLoadStoreRegOpcode(SrcReg, RC, isStackAligned, STI, false); +} + + +static unsigned getLoadRegOpcode(unsigned DestReg, + const TargetRegisterClass *RC, + bool isStackAligned, + const X86Subtarget &STI) { + return getLoadStoreRegOpcode(DestReg, RC, isStackAligned, STI, true); +} + +void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned SrcReg, bool isKill, int FrameIdx, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const { + const MachineFunction &MF = *MBB.getParent(); + assert(MF.getFrameInfo()->getObjectSize(FrameIdx) >= RC->getSize() && + "Stack slot too small for store"); + unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16); + bool isAligned = + (Subtarget.getFrameLowering()->getStackAlignment() >= Alignment) || + RI.canRealignStack(MF); + unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, Subtarget); + DebugLoc DL = MBB.findDebugLoc(MI); + addFrameReference(BuildMI(MBB, MI, DL, get(Opc)), FrameIdx) + .addReg(SrcReg, getKillRegState(isKill)); +} + +void X86InstrInfo::storeRegToAddr(MachineFunction &MF, unsigned SrcReg, + bool isKill, + SmallVectorImpl<MachineOperand> &Addr, + const TargetRegisterClass *RC, + MachineInstr::mmo_iterator MMOBegin, + MachineInstr::mmo_iterator MMOEnd, + 
SmallVectorImpl<MachineInstr*> &NewMIs) const { + unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16); + bool isAligned = MMOBegin != MMOEnd && + (*MMOBegin)->getAlignment() >= Alignment; + unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, Subtarget); + DebugLoc DL; + MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc)); + for (unsigned i = 0, e = Addr.size(); i != e; ++i) + MIB.addOperand(Addr[i]); + MIB.addReg(SrcReg, getKillRegState(isKill)); + (*MIB).setMemRefs(MMOBegin, MMOEnd); + NewMIs.push_back(MIB); +} + + +void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned DestReg, int FrameIdx, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const { + const MachineFunction &MF = *MBB.getParent(); + unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16); + bool isAligned = + (Subtarget.getFrameLowering()->getStackAlignment() >= Alignment) || + RI.canRealignStack(MF); + unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, Subtarget); + DebugLoc DL = MBB.findDebugLoc(MI); + addFrameReference(BuildMI(MBB, MI, DL, get(Opc), DestReg), FrameIdx); +} + +void X86InstrInfo::loadRegFromAddr(MachineFunction &MF, unsigned DestReg, + SmallVectorImpl<MachineOperand> &Addr, + const TargetRegisterClass *RC, + MachineInstr::mmo_iterator MMOBegin, + MachineInstr::mmo_iterator MMOEnd, + SmallVectorImpl<MachineInstr*> &NewMIs) const { + unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16); + bool isAligned = MMOBegin != MMOEnd && + (*MMOBegin)->getAlignment() >= Alignment; + unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, Subtarget); + DebugLoc DL; + MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc), DestReg); + for (unsigned i = 0, e = Addr.size(); i != e; ++i) + MIB.addOperand(Addr[i]); + (*MIB).setMemRefs(MMOBegin, MMOEnd); + NewMIs.push_back(MIB); +} + +bool X86InstrInfo:: +analyzeCompare(const MachineInstr *MI, unsigned &SrcReg, unsigned &SrcReg2, + int &CmpMask, int &CmpValue) const { + switch (MI->getOpcode()) { + default: break; + case X86::CMP64ri32: + case X86::CMP64ri8: + case X86::CMP32ri: + case X86::CMP32ri8: + case X86::CMP16ri: + case X86::CMP16ri8: + case X86::CMP8ri: + SrcReg = MI->getOperand(0).getReg(); + SrcReg2 = 0; + CmpMask = ~0; + CmpValue = MI->getOperand(1).getImm(); + return true; + // A SUB can be used to perform comparison. + case X86::SUB64rm: + case X86::SUB32rm: + case X86::SUB16rm: + case X86::SUB8rm: + SrcReg = MI->getOperand(1).getReg(); + SrcReg2 = 0; + CmpMask = ~0; + CmpValue = 0; + return true; + case X86::SUB64rr: + case X86::SUB32rr: + case X86::SUB16rr: + case X86::SUB8rr: + SrcReg = MI->getOperand(1).getReg(); + SrcReg2 = MI->getOperand(2).getReg(); + CmpMask = ~0; + CmpValue = 0; + return true; + case X86::SUB64ri32: + case X86::SUB64ri8: + case X86::SUB32ri: + case X86::SUB32ri8: + case X86::SUB16ri: + case X86::SUB16ri8: + case X86::SUB8ri: + SrcReg = MI->getOperand(1).getReg(); + SrcReg2 = 0; + CmpMask = ~0; + CmpValue = MI->getOperand(2).getImm(); + return true; + case X86::CMP64rr: + case X86::CMP32rr: + case X86::CMP16rr: + case X86::CMP8rr: + SrcReg = MI->getOperand(0).getReg(); + SrcReg2 = MI->getOperand(1).getReg(); + CmpMask = ~0; + CmpValue = 0; + return true; + case X86::TEST8rr: + case X86::TEST16rr: + case X86::TEST32rr: + case X86::TEST64rr: + SrcReg = MI->getOperand(0).getReg(); + if (MI->getOperand(1).getReg() != SrcReg) return false; + // Compare against zero. 
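+ // ("test %reg, %reg" ANDs the register with itself, so ZF and SF describe a
+ // comparison of the register against zero; report it that way.)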
+ SrcReg2 = 0; + CmpMask = ~0; + CmpValue = 0; + return true; + } + return false; +} + +/// Check whether the first instruction, whose only +/// purpose is to update flags, can be made redundant. +/// CMPrr can be made redundant by SUBrr if the operands are the same. +/// This function can be extended later on. +/// SrcReg, SrcRegs: register operands for FlagI. +/// ImmValue: immediate for FlagI if it takes an immediate. +inline static bool isRedundantFlagInstr(MachineInstr *FlagI, unsigned SrcReg, + unsigned SrcReg2, int ImmValue, + MachineInstr *OI) { + if (((FlagI->getOpcode() == X86::CMP64rr && + OI->getOpcode() == X86::SUB64rr) || + (FlagI->getOpcode() == X86::CMP32rr && + OI->getOpcode() == X86::SUB32rr)|| + (FlagI->getOpcode() == X86::CMP16rr && + OI->getOpcode() == X86::SUB16rr)|| + (FlagI->getOpcode() == X86::CMP8rr && + OI->getOpcode() == X86::SUB8rr)) && + ((OI->getOperand(1).getReg() == SrcReg && + OI->getOperand(2).getReg() == SrcReg2) || + (OI->getOperand(1).getReg() == SrcReg2 && + OI->getOperand(2).getReg() == SrcReg))) + return true; + + if (((FlagI->getOpcode() == X86::CMP64ri32 && + OI->getOpcode() == X86::SUB64ri32) || + (FlagI->getOpcode() == X86::CMP64ri8 && + OI->getOpcode() == X86::SUB64ri8) || + (FlagI->getOpcode() == X86::CMP32ri && + OI->getOpcode() == X86::SUB32ri) || + (FlagI->getOpcode() == X86::CMP32ri8 && + OI->getOpcode() == X86::SUB32ri8) || + (FlagI->getOpcode() == X86::CMP16ri && + OI->getOpcode() == X86::SUB16ri) || + (FlagI->getOpcode() == X86::CMP16ri8 && + OI->getOpcode() == X86::SUB16ri8) || + (FlagI->getOpcode() == X86::CMP8ri && + OI->getOpcode() == X86::SUB8ri)) && + OI->getOperand(1).getReg() == SrcReg && + OI->getOperand(2).getImm() == ImmValue) + return true; + return false; +} + +/// Check whether the definition can be converted +/// to remove a comparison against zero. +inline static bool isDefConvertible(MachineInstr *MI) { + switch (MI->getOpcode()) { + default: return false; + + // The shift instructions only modify ZF if their shift count is non-zero. + // N.B.: The processor truncates the shift count depending on the encoding. + case X86::SAR8ri: case X86::SAR16ri: case X86::SAR32ri:case X86::SAR64ri: + case X86::SHR8ri: case X86::SHR16ri: case X86::SHR32ri:case X86::SHR64ri: + return getTruncatedShiftCount(MI, 2) != 0; + + // Some left shift instructions can be turned into LEA instructions but only + // if their flags aren't used. Avoid transforming such instructions. 
+ case X86::SHL8ri: case X86::SHL16ri: case X86::SHL32ri:case X86::SHL64ri:{ + unsigned ShAmt = getTruncatedShiftCount(MI, 2); + if (isTruncatedShiftCountForLEA(ShAmt)) return false; + return ShAmt != 0; + } + + case X86::SHRD16rri8:case X86::SHRD32rri8:case X86::SHRD64rri8: + case X86::SHLD16rri8:case X86::SHLD32rri8:case X86::SHLD64rri8: + return getTruncatedShiftCount(MI, 3) != 0; + + case X86::SUB64ri32: case X86::SUB64ri8: case X86::SUB32ri: + case X86::SUB32ri8: case X86::SUB16ri: case X86::SUB16ri8: + case X86::SUB8ri: case X86::SUB64rr: case X86::SUB32rr: + case X86::SUB16rr: case X86::SUB8rr: case X86::SUB64rm: + case X86::SUB32rm: case X86::SUB16rm: case X86::SUB8rm: + case X86::DEC64r: case X86::DEC32r: case X86::DEC16r: case X86::DEC8r: + case X86::ADD64ri32: case X86::ADD64ri8: case X86::ADD32ri: + case X86::ADD32ri8: case X86::ADD16ri: case X86::ADD16ri8: + case X86::ADD8ri: case X86::ADD64rr: case X86::ADD32rr: + case X86::ADD16rr: case X86::ADD8rr: case X86::ADD64rm: + case X86::ADD32rm: case X86::ADD16rm: case X86::ADD8rm: + case X86::INC64r: case X86::INC32r: case X86::INC16r: case X86::INC8r: + case X86::AND64ri32: case X86::AND64ri8: case X86::AND32ri: + case X86::AND32ri8: case X86::AND16ri: case X86::AND16ri8: + case X86::AND8ri: case X86::AND64rr: case X86::AND32rr: + case X86::AND16rr: case X86::AND8rr: case X86::AND64rm: + case X86::AND32rm: case X86::AND16rm: case X86::AND8rm: + case X86::XOR64ri32: case X86::XOR64ri8: case X86::XOR32ri: + case X86::XOR32ri8: case X86::XOR16ri: case X86::XOR16ri8: + case X86::XOR8ri: case X86::XOR64rr: case X86::XOR32rr: + case X86::XOR16rr: case X86::XOR8rr: case X86::XOR64rm: + case X86::XOR32rm: case X86::XOR16rm: case X86::XOR8rm: + case X86::OR64ri32: case X86::OR64ri8: case X86::OR32ri: + case X86::OR32ri8: case X86::OR16ri: case X86::OR16ri8: + case X86::OR8ri: case X86::OR64rr: case X86::OR32rr: + case X86::OR16rr: case X86::OR8rr: case X86::OR64rm: + case X86::OR32rm: case X86::OR16rm: case X86::OR8rm: + case X86::NEG8r: case X86::NEG16r: case X86::NEG32r: case X86::NEG64r: + case X86::SAR8r1: case X86::SAR16r1: case X86::SAR32r1:case X86::SAR64r1: + case X86::SHR8r1: case X86::SHR16r1: case X86::SHR32r1:case X86::SHR64r1: + case X86::SHL8r1: case X86::SHL16r1: case X86::SHL32r1:case X86::SHL64r1: + case X86::ADC32ri: case X86::ADC32ri8: + case X86::ADC32rr: case X86::ADC64ri32: + case X86::ADC64ri8: case X86::ADC64rr: + case X86::SBB32ri: case X86::SBB32ri8: + case X86::SBB32rr: case X86::SBB64ri32: + case X86::SBB64ri8: case X86::SBB64rr: + case X86::ANDN32rr: case X86::ANDN32rm: + case X86::ANDN64rr: case X86::ANDN64rm: + case X86::BEXTR32rr: case X86::BEXTR64rr: + case X86::BEXTR32rm: case X86::BEXTR64rm: + case X86::BLSI32rr: case X86::BLSI32rm: + case X86::BLSI64rr: case X86::BLSI64rm: + case X86::BLSMSK32rr:case X86::BLSMSK32rm: + case X86::BLSMSK64rr:case X86::BLSMSK64rm: + case X86::BLSR32rr: case X86::BLSR32rm: + case X86::BLSR64rr: case X86::BLSR64rm: + case X86::BZHI32rr: case X86::BZHI32rm: + case X86::BZHI64rr: case X86::BZHI64rm: + case X86::LZCNT16rr: case X86::LZCNT16rm: + case X86::LZCNT32rr: case X86::LZCNT32rm: + case X86::LZCNT64rr: case X86::LZCNT64rm: + case X86::POPCNT16rr:case X86::POPCNT16rm: + case X86::POPCNT32rr:case X86::POPCNT32rm: + case X86::POPCNT64rr:case X86::POPCNT64rm: + case X86::TZCNT16rr: case X86::TZCNT16rm: + case X86::TZCNT32rr: case X86::TZCNT32rm: + case X86::TZCNT64rr: case X86::TZCNT64rm: + return true; + } +} + +/// Check whether the use can be converted to remove a 
comparison against zero. +static X86::CondCode isUseDefConvertible(MachineInstr *MI) { + switch (MI->getOpcode()) { + default: return X86::COND_INVALID; + case X86::LZCNT16rr: case X86::LZCNT16rm: + case X86::LZCNT32rr: case X86::LZCNT32rm: + case X86::LZCNT64rr: case X86::LZCNT64rm: + return X86::COND_B; + case X86::POPCNT16rr:case X86::POPCNT16rm: + case X86::POPCNT32rr:case X86::POPCNT32rm: + case X86::POPCNT64rr:case X86::POPCNT64rm: + return X86::COND_E; + case X86::TZCNT16rr: case X86::TZCNT16rm: + case X86::TZCNT32rr: case X86::TZCNT32rm: + case X86::TZCNT64rr: case X86::TZCNT64rm: + return X86::COND_B; + } +} + +/// Check if there exists an earlier instruction that +/// operates on the same source operands and sets flags in the same way as +/// Compare; remove Compare if possible. +bool X86InstrInfo:: +optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2, + int CmpMask, int CmpValue, + const MachineRegisterInfo *MRI) const { + // Check whether we can replace SUB with CMP. + unsigned NewOpcode = 0; + switch (CmpInstr->getOpcode()) { + default: break; + case X86::SUB64ri32: + case X86::SUB64ri8: + case X86::SUB32ri: + case X86::SUB32ri8: + case X86::SUB16ri: + case X86::SUB16ri8: + case X86::SUB8ri: + case X86::SUB64rm: + case X86::SUB32rm: + case X86::SUB16rm: + case X86::SUB8rm: + case X86::SUB64rr: + case X86::SUB32rr: + case X86::SUB16rr: + case X86::SUB8rr: { + if (!MRI->use_nodbg_empty(CmpInstr->getOperand(0).getReg())) + return false; + // There is no use of the destination register, we can replace SUB with CMP. + switch (CmpInstr->getOpcode()) { + default: llvm_unreachable("Unreachable!"); + case X86::SUB64rm: NewOpcode = X86::CMP64rm; break; + case X86::SUB32rm: NewOpcode = X86::CMP32rm; break; + case X86::SUB16rm: NewOpcode = X86::CMP16rm; break; + case X86::SUB8rm: NewOpcode = X86::CMP8rm; break; + case X86::SUB64rr: NewOpcode = X86::CMP64rr; break; + case X86::SUB32rr: NewOpcode = X86::CMP32rr; break; + case X86::SUB16rr: NewOpcode = X86::CMP16rr; break; + case X86::SUB8rr: NewOpcode = X86::CMP8rr; break; + case X86::SUB64ri32: NewOpcode = X86::CMP64ri32; break; + case X86::SUB64ri8: NewOpcode = X86::CMP64ri8; break; + case X86::SUB32ri: NewOpcode = X86::CMP32ri; break; + case X86::SUB32ri8: NewOpcode = X86::CMP32ri8; break; + case X86::SUB16ri: NewOpcode = X86::CMP16ri; break; + case X86::SUB16ri8: NewOpcode = X86::CMP16ri8; break; + case X86::SUB8ri: NewOpcode = X86::CMP8ri; break; + } + CmpInstr->setDesc(get(NewOpcode)); + CmpInstr->RemoveOperand(0); + // Fall through to optimize Cmp if Cmp is CMPrr or CMPri. + if (NewOpcode == X86::CMP64rm || NewOpcode == X86::CMP32rm || + NewOpcode == X86::CMP16rm || NewOpcode == X86::CMP8rm) + return false; + } + } + + // Get the unique definition of SrcReg. + MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg); + if (!MI) return false; + + // CmpInstr is the first instruction of the BB. + MachineBasicBlock::iterator I = CmpInstr, Def = MI; + + // If we are comparing against zero, check whether we can use MI to update + // EFLAGS. If MI is not in the same BB as CmpInstr, do not optimize. + bool IsCmpZero = (SrcReg2 == 0 && CmpValue == 0); + if (IsCmpZero && MI->getParent() != CmpInstr->getParent()) + return false; + + // If we have a use of the source register between the def and our compare + // instruction we can eliminate the compare iff the use sets EFLAGS in the + // right way. 
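+  // A sketch of the intended pattern (illustrative registers): if %reg is
+  // defined by a plain copy, later read by "popcnt %reg, %tmp", and finally
+  // tested with "test %reg, %reg; je ...", the popcnt already sets ZF exactly
+  // when %reg is zero, so the test can be dropped and the branch keyed off
+  // the popcnt's flags (see isUseDefConvertible above).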
+ bool ShouldUpdateCC = false; + X86::CondCode NewCC = X86::COND_INVALID; + if (IsCmpZero && !isDefConvertible(MI)) { + // Scan forward from the use until we hit the use we're looking for or the + // compare instruction. + for (MachineBasicBlock::iterator J = MI;; ++J) { + // Do we have a convertible instruction? + NewCC = isUseDefConvertible(J); + if (NewCC != X86::COND_INVALID && J->getOperand(1).isReg() && + J->getOperand(1).getReg() == SrcReg) { + assert(J->definesRegister(X86::EFLAGS) && "Must be an EFLAGS def!"); + ShouldUpdateCC = true; // Update CC later on. + // This is not a def of SrcReg, but still a def of EFLAGS. Keep going + // with the new def. + MI = Def = J; + break; + } + + if (J == I) + return false; + } + } + + // We are searching for an earlier instruction that can make CmpInstr + // redundant and that instruction will be saved in Sub. + MachineInstr *Sub = nullptr; + const TargetRegisterInfo *TRI = &getRegisterInfo(); + + // We iterate backward, starting from the instruction before CmpInstr and + // stop when reaching the definition of a source register or done with the BB. + // RI points to the instruction before CmpInstr. + // If the definition is in this basic block, RE points to the definition; + // otherwise, RE is the rend of the basic block. + MachineBasicBlock::reverse_iterator + RI = MachineBasicBlock::reverse_iterator(I), + RE = CmpInstr->getParent() == MI->getParent() ? + MachineBasicBlock::reverse_iterator(++Def) /* points to MI */ : + CmpInstr->getParent()->rend(); + MachineInstr *Movr0Inst = nullptr; + for (; RI != RE; ++RI) { + MachineInstr *Instr = &*RI; + // Check whether CmpInstr can be made redundant by the current instruction. + if (!IsCmpZero && + isRedundantFlagInstr(CmpInstr, SrcReg, SrcReg2, CmpValue, Instr)) { + Sub = Instr; + break; + } + + if (Instr->modifiesRegister(X86::EFLAGS, TRI) || + Instr->readsRegister(X86::EFLAGS, TRI)) { + // This instruction modifies or uses EFLAGS. + + // MOV32r0 etc. are implemented with xor which clobbers condition code. + // They are safe to move up, if the definition to EFLAGS is dead and + // earlier instructions do not read or write EFLAGS. + if (!Movr0Inst && Instr->getOpcode() == X86::MOV32r0 && + Instr->registerDefIsDead(X86::EFLAGS, TRI)) { + Movr0Inst = Instr; + continue; + } + + // We can't remove CmpInstr. + return false; + } + } + + // Return false if no candidates exist. + if (!IsCmpZero && !Sub) + return false; + + bool IsSwapped = (SrcReg2 != 0 && Sub->getOperand(1).getReg() == SrcReg2 && + Sub->getOperand(2).getReg() == SrcReg); + + // Scan forward from the instruction after CmpInstr for uses of EFLAGS. + // It is safe to remove CmpInstr if EFLAGS is redefined or killed. + // If we are done with the basic block, we need to check whether EFLAGS is + // live-out. + bool IsSafe = false; + SmallVector<std::pair<MachineInstr*, unsigned /*NewOpc*/>, 4> OpsToUpdate; + MachineBasicBlock::iterator E = CmpInstr->getParent()->end(); + for (++I; I != E; ++I) { + const MachineInstr &Instr = *I; + bool ModifyEFLAGS = Instr.modifiesRegister(X86::EFLAGS, TRI); + bool UseEFLAGS = Instr.readsRegister(X86::EFLAGS, TRI); + // We should check the usage if this instruction uses and updates EFLAGS. + if (!UseEFLAGS && ModifyEFLAGS) { + // It is safe to remove CmpInstr if EFLAGS is updated again. + IsSafe = true; + break; + } + if (!UseEFLAGS && !ModifyEFLAGS) + continue; + + // EFLAGS is used by this instruction. 
+ X86::CondCode OldCC = X86::COND_INVALID; + bool OpcIsSET = false; + if (IsCmpZero || IsSwapped) { + // We decode the condition code from opcode. + if (Instr.isBranch()) + OldCC = getCondFromBranchOpc(Instr.getOpcode()); + else { + OldCC = getCondFromSETOpc(Instr.getOpcode()); + if (OldCC != X86::COND_INVALID) + OpcIsSET = true; + else + OldCC = X86::getCondFromCMovOpc(Instr.getOpcode()); + } + if (OldCC == X86::COND_INVALID) return false; + } + if (IsCmpZero) { + switch (OldCC) { + default: break; + case X86::COND_A: case X86::COND_AE: + case X86::COND_B: case X86::COND_BE: + case X86::COND_G: case X86::COND_GE: + case X86::COND_L: case X86::COND_LE: + case X86::COND_O: case X86::COND_NO: + // CF and OF are used, we can't perform this optimization. + return false; + } + + // If we're updating the condition code check if we have to reverse the + // condition. + if (ShouldUpdateCC) + switch (OldCC) { + default: + return false; + case X86::COND_E: + break; + case X86::COND_NE: + NewCC = GetOppositeBranchCondition(NewCC); + break; + } + } else if (IsSwapped) { + // If we have SUB(r1, r2) and CMP(r2, r1), the condition code needs + // to be changed from r2 > r1 to r1 < r2, from r2 < r1 to r1 > r2, etc. + // We swap the condition code and synthesize the new opcode. + NewCC = getSwappedCondition(OldCC); + if (NewCC == X86::COND_INVALID) return false; + } + + if ((ShouldUpdateCC || IsSwapped) && NewCC != OldCC) { + // Synthesize the new opcode. + bool HasMemoryOperand = Instr.hasOneMemOperand(); + unsigned NewOpc; + if (Instr.isBranch()) + NewOpc = GetCondBranchFromCond(NewCC); + else if(OpcIsSET) + NewOpc = getSETFromCond(NewCC, HasMemoryOperand); + else { + unsigned DstReg = Instr.getOperand(0).getReg(); + NewOpc = getCMovFromCond(NewCC, MRI->getRegClass(DstReg)->getSize(), + HasMemoryOperand); + } + + // Push the MachineInstr to OpsToUpdate. + // If it is safe to remove CmpInstr, the condition code of these + // instructions will be modified. + OpsToUpdate.push_back(std::make_pair(&*I, NewOpc)); + } + if (ModifyEFLAGS || Instr.killsRegister(X86::EFLAGS, TRI)) { + // It is safe to remove CmpInstr if EFLAGS is updated again or killed. + IsSafe = true; + break; + } + } + + // If EFLAGS is not killed nor re-defined, we should check whether it is + // live-out. If it is live-out, do not optimize. + if ((IsCmpZero || IsSwapped) && !IsSafe) { + MachineBasicBlock *MBB = CmpInstr->getParent(); + for (MachineBasicBlock *Successor : MBB->successors()) + if (Successor->isLiveIn(X86::EFLAGS)) + return false; + } + + // The instruction to be updated is either Sub or MI. + Sub = IsCmpZero ? MI : Sub; + // Move Movr0Inst to the appropriate place before Sub. + if (Movr0Inst) { + // Look backwards until we find a def that doesn't use the current EFLAGS. + Def = Sub; + MachineBasicBlock::reverse_iterator + InsertI = MachineBasicBlock::reverse_iterator(++Def), + InsertE = Sub->getParent()->rend(); + for (; InsertI != InsertE; ++InsertI) { + MachineInstr *Instr = &*InsertI; + if (!Instr->readsRegister(X86::EFLAGS, TRI) && + Instr->modifiesRegister(X86::EFLAGS, TRI)) { + Sub->getParent()->remove(Movr0Inst); + Instr->getParent()->insert(MachineBasicBlock::iterator(Instr), + Movr0Inst); + break; + } + } + if (InsertI == InsertE) + return false; + } + + // Make sure Sub instruction defines EFLAGS and mark the def live. 
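+  // (The EFLAGS def on that instruction may still be marked dead because
+  //  nothing used its flags before; once CmpInstr is erased, the former users
+  //  of CmpInstr's flags depend on it, so the loop below clears the dead
+  //  marker.)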
+ unsigned i = 0, e = Sub->getNumOperands(); + for (; i != e; ++i) { + MachineOperand &MO = Sub->getOperand(i); + if (MO.isReg() && MO.isDef() && MO.getReg() == X86::EFLAGS) { + MO.setIsDead(false); + break; + } + } + assert(i != e && "Unable to locate a def EFLAGS operand"); + + CmpInstr->eraseFromParent(); + + // Modify the condition code of instructions in OpsToUpdate. + for (auto &Op : OpsToUpdate) + Op.first->setDesc(get(Op.second)); + return true; +} + +/// Try to remove the load by folding it to a register +/// operand at the use. We fold the load instructions if load defines a virtual +/// register, the virtual register is used once in the same BB, and the +/// instructions in-between do not load or store, and have no side effects. +MachineInstr *X86InstrInfo::optimizeLoadInstr(MachineInstr *MI, + const MachineRegisterInfo *MRI, + unsigned &FoldAsLoadDefReg, + MachineInstr *&DefMI) const { + if (FoldAsLoadDefReg == 0) + return nullptr; + // To be conservative, if there exists another load, clear the load candidate. + if (MI->mayLoad()) { + FoldAsLoadDefReg = 0; + return nullptr; + } + + // Check whether we can move DefMI here. + DefMI = MRI->getVRegDef(FoldAsLoadDefReg); + assert(DefMI); + bool SawStore = false; + if (!DefMI->isSafeToMove(nullptr, SawStore)) + return nullptr; + + // Collect information about virtual register operands of MI. + unsigned SrcOperandId = 0; + bool FoundSrcOperand = false; + for (unsigned i = 0, e = MI->getDesc().getNumOperands(); i != e; ++i) { + MachineOperand &MO = MI->getOperand(i); + if (!MO.isReg()) + continue; + unsigned Reg = MO.getReg(); + if (Reg != FoldAsLoadDefReg) + continue; + // Do not fold if we have a subreg use or a def or multiple uses. + if (MO.getSubReg() || MO.isDef() || FoundSrcOperand) + return nullptr; + + SrcOperandId = i; + FoundSrcOperand = true; + } + if (!FoundSrcOperand) + return nullptr; + + // Check whether we can fold the def into SrcOperandId. + if (MachineInstr *FoldMI = foldMemoryOperand(MI, SrcOperandId, DefMI)) { + FoldAsLoadDefReg = 0; + return FoldMI; + } + + return nullptr; +} + +/// Expand a single-def pseudo instruction to a two-addr +/// instruction with two undef reads of the register being defined. +/// This is used for mapping: +/// %xmm4 = V_SET0 +/// to: +/// %xmm4 = PXORrr %xmm4<undef>, %xmm4<undef> +/// +static bool Expand2AddrUndef(MachineInstrBuilder &MIB, + const MCInstrDesc &Desc) { + assert(Desc.getNumOperands() == 3 && "Expected two-addr instruction."); + unsigned Reg = MIB->getOperand(0).getReg(); + MIB->setDesc(Desc); + + // MachineInstr::addOperand() will insert explicit operands before any + // implicit operands. + MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef); + // But we don't trust that. + assert(MIB->getOperand(1).getReg() == Reg && + MIB->getOperand(2).getReg() == Reg && "Misplaced operand"); + return true; +} + +/// Expand a single-def pseudo instruction to a two-addr +/// instruction with two %k0 reads. 
+/// This is used for mapping: +/// %k4 = K_SET1 +/// to: +/// %k4 = KXNORrr %k0, %k0 +static bool Expand2AddrKreg(MachineInstrBuilder &MIB, + const MCInstrDesc &Desc, unsigned Reg) { + assert(Desc.getNumOperands() == 3 && "Expected two-addr instruction."); + MIB->setDesc(Desc); + MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef); + return true; +} + +static bool expandMOV32r1(MachineInstrBuilder &MIB, const TargetInstrInfo &TII, + bool MinusOne) { + MachineBasicBlock &MBB = *MIB->getParent(); + DebugLoc DL = MIB->getDebugLoc(); + unsigned Reg = MIB->getOperand(0).getReg(); + + // Insert the XOR. + BuildMI(MBB, MIB.getInstr(), DL, TII.get(X86::XOR32rr), Reg) + .addReg(Reg, RegState::Undef) + .addReg(Reg, RegState::Undef); + + // Turn the pseudo into an INC or DEC. + MIB->setDesc(TII.get(MinusOne ? X86::DEC32r : X86::INC32r)); + MIB.addReg(Reg); + + return true; +} + +// LoadStackGuard has so far only been implemented for 64-bit MachO. Different +// code sequence is needed for other targets. +static void expandLoadStackGuard(MachineInstrBuilder &MIB, + const TargetInstrInfo &TII) { + MachineBasicBlock &MBB = *MIB->getParent(); + DebugLoc DL = MIB->getDebugLoc(); + unsigned Reg = MIB->getOperand(0).getReg(); + const GlobalValue *GV = + cast<GlobalValue>((*MIB->memoperands_begin())->getValue()); + unsigned Flag = MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant; + MachineMemOperand *MMO = MBB.getParent()->getMachineMemOperand( + MachinePointerInfo::getGOT(*MBB.getParent()), Flag, 8, 8); + MachineBasicBlock::iterator I = MIB.getInstr(); + + BuildMI(MBB, I, DL, TII.get(X86::MOV64rm), Reg).addReg(X86::RIP).addImm(1) + .addReg(0).addGlobalAddress(GV, 0, X86II::MO_GOTPCREL).addReg(0) + .addMemOperand(MMO); + MIB->setDebugLoc(DL); + MIB->setDesc(TII.get(X86::MOV64rm)); + MIB.addReg(Reg, RegState::Kill).addImm(1).addReg(0).addImm(0).addReg(0); +} + +bool X86InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { + bool HasAVX = Subtarget.hasAVX(); + MachineInstrBuilder MIB(*MI->getParent()->getParent(), MI); + switch (MI->getOpcode()) { + case X86::MOV32r0: + return Expand2AddrUndef(MIB, get(X86::XOR32rr)); + case X86::MOV32r1: + return expandMOV32r1(MIB, *this, /*MinusOne=*/ false); + case X86::MOV32r_1: + return expandMOV32r1(MIB, *this, /*MinusOne=*/ true); + case X86::SETB_C8r: + return Expand2AddrUndef(MIB, get(X86::SBB8rr)); + case X86::SETB_C16r: + return Expand2AddrUndef(MIB, get(X86::SBB16rr)); + case X86::SETB_C32r: + return Expand2AddrUndef(MIB, get(X86::SBB32rr)); + case X86::SETB_C64r: + return Expand2AddrUndef(MIB, get(X86::SBB64rr)); + case X86::V_SET0: + case X86::FsFLD0SS: + case X86::FsFLD0SD: + return Expand2AddrUndef(MIB, get(HasAVX ? X86::VXORPSrr : X86::XORPSrr)); + case X86::AVX_SET0: + assert(HasAVX && "AVX not supported"); + return Expand2AddrUndef(MIB, get(X86::VXORPSYrr)); + case X86::AVX512_512_SET0: + return Expand2AddrUndef(MIB, get(X86::VPXORDZrr)); + case X86::V_SETALLONES: + return Expand2AddrUndef(MIB, get(HasAVX ? X86::VPCMPEQDrr : X86::PCMPEQDrr)); + case X86::AVX2_SETALLONES: + return Expand2AddrUndef(MIB, get(X86::VPCMPEQDYrr)); + case X86::TEST8ri_NOREX: + MI->setDesc(get(X86::TEST8ri)); + return true; + case X86::MOV32ri64: + MI->setDesc(get(X86::MOV32ri)); + return true; + + // KNL does not recognize dependency-breaking idioms for mask registers, + // so kxnor %k1, %k1, %k2 has a RAW dependence on %k1. 
+ // Using %k0 as the undef input register is a performance heuristic based + // on the assumption that %k0 is used less frequently than the other mask + // registers, since it is not usable as a write mask. + // FIXME: A more advanced approach would be to choose the best input mask + // register based on context. + case X86::KSET0B: + case X86::KSET0W: return Expand2AddrKreg(MIB, get(X86::KXORWrr), X86::K0); + case X86::KSET0D: return Expand2AddrKreg(MIB, get(X86::KXORDrr), X86::K0); + case X86::KSET0Q: return Expand2AddrKreg(MIB, get(X86::KXORQrr), X86::K0); + case X86::KSET1B: + case X86::KSET1W: return Expand2AddrKreg(MIB, get(X86::KXNORWrr), X86::K0); + case X86::KSET1D: return Expand2AddrKreg(MIB, get(X86::KXNORDrr), X86::K0); + case X86::KSET1Q: return Expand2AddrKreg(MIB, get(X86::KXNORQrr), X86::K0); + case TargetOpcode::LOAD_STACK_GUARD: + expandLoadStackGuard(MIB, *this); + return true; + } + return false; +} + +static void addOperands(MachineInstrBuilder &MIB, ArrayRef<MachineOperand> MOs, + int PtrOffset = 0) { + unsigned NumAddrOps = MOs.size(); + + if (NumAddrOps < 4) { + // FrameIndex only - add an immediate offset (whether its zero or not). + for (unsigned i = 0; i != NumAddrOps; ++i) + MIB.addOperand(MOs[i]); + addOffset(MIB, PtrOffset); + } else { + // General Memory Addressing - we need to add any offset to an existing + // offset. + assert(MOs.size() == 5 && "Unexpected memory operand list length"); + for (unsigned i = 0; i != NumAddrOps; ++i) { + const MachineOperand &MO = MOs[i]; + if (i == 3 && PtrOffset != 0) { + MIB.addDisp(MO, PtrOffset); + } else { + MIB.addOperand(MO); + } + } + } +} + +static MachineInstr *FuseTwoAddrInst(MachineFunction &MF, unsigned Opcode, + ArrayRef<MachineOperand> MOs, + MachineBasicBlock::iterator InsertPt, + MachineInstr *MI, + const TargetInstrInfo &TII) { + // Create the base instruction with the memory operand as the first part. + // Omit the implicit operands, something BuildMI can't do. + MachineInstr *NewMI = MF.CreateMachineInstr(TII.get(Opcode), + MI->getDebugLoc(), true); + MachineInstrBuilder MIB(MF, NewMI); + addOperands(MIB, MOs); + + // Loop over the rest of the ri operands, converting them over. + unsigned NumOps = MI->getDesc().getNumOperands()-2; + for (unsigned i = 0; i != NumOps; ++i) { + MachineOperand &MO = MI->getOperand(i+2); + MIB.addOperand(MO); + } + for (unsigned i = NumOps+2, e = MI->getNumOperands(); i != e; ++i) { + MachineOperand &MO = MI->getOperand(i); + MIB.addOperand(MO); + } + + MachineBasicBlock *MBB = InsertPt->getParent(); + MBB->insert(InsertPt, NewMI); + + return MIB; +} + +static MachineInstr *FuseInst(MachineFunction &MF, unsigned Opcode, + unsigned OpNo, ArrayRef<MachineOperand> MOs, + MachineBasicBlock::iterator InsertPt, + MachineInstr *MI, const TargetInstrInfo &TII, + int PtrOffset = 0) { + // Omit the implicit operands, something BuildMI can't do. 
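+  // (The trailing 'true' argument to CreateMachineInstr below is what asks
+  //  for the form without implicit operands.)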
+ MachineInstr *NewMI = MF.CreateMachineInstr(TII.get(Opcode), + MI->getDebugLoc(), true); + MachineInstrBuilder MIB(MF, NewMI); + + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + MachineOperand &MO = MI->getOperand(i); + if (i == OpNo) { + assert(MO.isReg() && "Expected to fold into reg operand!"); + addOperands(MIB, MOs, PtrOffset); + } else { + MIB.addOperand(MO); + } + } + + MachineBasicBlock *MBB = InsertPt->getParent(); + MBB->insert(InsertPt, NewMI); + + return MIB; +} + +static MachineInstr *MakeM0Inst(const TargetInstrInfo &TII, unsigned Opcode, + ArrayRef<MachineOperand> MOs, + MachineBasicBlock::iterator InsertPt, + MachineInstr *MI) { + MachineInstrBuilder MIB = BuildMI(*InsertPt->getParent(), InsertPt, + MI->getDebugLoc(), TII.get(Opcode)); + addOperands(MIB, MOs); + return MIB.addImm(0); +} + +MachineInstr *X86InstrInfo::foldMemoryOperandCustom( + MachineFunction &MF, MachineInstr *MI, unsigned OpNum, + ArrayRef<MachineOperand> MOs, MachineBasicBlock::iterator InsertPt, + unsigned Size, unsigned Align) const { + switch (MI->getOpcode()) { + case X86::INSERTPSrr: + case X86::VINSERTPSrr: + // Attempt to convert the load of inserted vector into a fold load + // of a single float. + if (OpNum == 2) { + unsigned Imm = MI->getOperand(MI->getNumOperands() - 1).getImm(); + unsigned ZMask = Imm & 15; + unsigned DstIdx = (Imm >> 4) & 3; + unsigned SrcIdx = (Imm >> 6) & 3; + + unsigned RCSize = getRegClass(MI->getDesc(), OpNum, &RI, MF)->getSize(); + if (Size <= RCSize && 4 <= Align) { + int PtrOffset = SrcIdx * 4; + unsigned NewImm = (DstIdx << 4) | ZMask; + unsigned NewOpCode = + (MI->getOpcode() == X86::VINSERTPSrr ? X86::VINSERTPSrm + : X86::INSERTPSrm); + MachineInstr *NewMI = + FuseInst(MF, NewOpCode, OpNum, MOs, InsertPt, MI, *this, PtrOffset); + NewMI->getOperand(NewMI->getNumOperands() - 1).setImm(NewImm); + return NewMI; + } + } + break; + }; + + return nullptr; +} + +MachineInstr *X86InstrInfo::foldMemoryOperandImpl( + MachineFunction &MF, MachineInstr *MI, unsigned OpNum, + ArrayRef<MachineOperand> MOs, MachineBasicBlock::iterator InsertPt, + unsigned Size, unsigned Align, bool AllowCommute) const { + const DenseMap<unsigned, + std::pair<unsigned,unsigned> > *OpcodeTablePtr = nullptr; + bool isCallRegIndirect = Subtarget.callRegIndirect(); + bool isTwoAddrFold = false; + + // For CPUs that favor the register form of a call or push, + // do not fold loads into calls or pushes, unless optimizing for size + // aggressively. + if (isCallRegIndirect && !MF.getFunction()->optForMinSize() && + (MI->getOpcode() == X86::CALL32r || MI->getOpcode() == X86::CALL64r || + MI->getOpcode() == X86::PUSH16r || MI->getOpcode() == X86::PUSH32r || + MI->getOpcode() == X86::PUSH64r)) + return nullptr; + + unsigned NumOps = MI->getDesc().getNumOperands(); + bool isTwoAddr = NumOps > 1 && + MI->getDesc().getOperandConstraint(1, MCOI::TIED_TO) != -1; + + // FIXME: AsmPrinter doesn't know how to handle + // X86II::MO_GOT_ABSOLUTE_ADDRESS after folding. + if (MI->getOpcode() == X86::ADD32ri && + MI->getOperand(2).getTargetFlags() == X86II::MO_GOT_ABSOLUTE_ADDRESS) + return nullptr; + + MachineInstr *NewMI = nullptr; + + // Attempt to fold any custom cases we have. + if (MachineInstr *CustomMI = + foldMemoryOperandCustom(MF, MI, OpNum, MOs, InsertPt, Size, Align)) + return CustomMI; + + // Folding a memory location into the two-address part of a two-address + // instruction is different than folding it other places. 
It requires + // replacing the *two* registers with the memory location. + if (isTwoAddr && NumOps >= 2 && OpNum < 2 && + MI->getOperand(0).isReg() && + MI->getOperand(1).isReg() && + MI->getOperand(0).getReg() == MI->getOperand(1).getReg()) { + OpcodeTablePtr = &RegOp2MemOpTable2Addr; + isTwoAddrFold = true; + } else if (OpNum == 0) { + if (MI->getOpcode() == X86::MOV32r0) { + NewMI = MakeM0Inst(*this, X86::MOV32mi, MOs, InsertPt, MI); + if (NewMI) + return NewMI; + } + + OpcodeTablePtr = &RegOp2MemOpTable0; + } else if (OpNum == 1) { + OpcodeTablePtr = &RegOp2MemOpTable1; + } else if (OpNum == 2) { + OpcodeTablePtr = &RegOp2MemOpTable2; + } else if (OpNum == 3) { + OpcodeTablePtr = &RegOp2MemOpTable3; + } else if (OpNum == 4) { + OpcodeTablePtr = &RegOp2MemOpTable4; + } + + // If table selected... + if (OpcodeTablePtr) { + // Find the Opcode to fuse + DenseMap<unsigned, std::pair<unsigned,unsigned> >::const_iterator I = + OpcodeTablePtr->find(MI->getOpcode()); + if (I != OpcodeTablePtr->end()) { + unsigned Opcode = I->second.first; + unsigned MinAlign = (I->second.second & TB_ALIGN_MASK) >> TB_ALIGN_SHIFT; + if (Align < MinAlign) + return nullptr; + bool NarrowToMOV32rm = false; + if (Size) { + unsigned RCSize = getRegClass(MI->getDesc(), OpNum, &RI, MF)->getSize(); + if (Size < RCSize) { + // Check if it's safe to fold the load. If the size of the object is + // narrower than the load width, then it's not. + if (Opcode != X86::MOV64rm || RCSize != 8 || Size != 4) + return nullptr; + // If this is a 64-bit load, but the spill slot is 32, then we can do + // a 32-bit load which is implicitly zero-extended. This likely is + // due to live interval analysis remat'ing a load from stack slot. + if (MI->getOperand(0).getSubReg() || MI->getOperand(1).getSubReg()) + return nullptr; + Opcode = X86::MOV32rm; + NarrowToMOV32rm = true; + } + } + + if (isTwoAddrFold) + NewMI = FuseTwoAddrInst(MF, Opcode, MOs, InsertPt, MI, *this); + else + NewMI = FuseInst(MF, Opcode, OpNum, MOs, InsertPt, MI, *this); + + if (NarrowToMOV32rm) { + // If this is the special case where we use a MOV32rm to load a 32-bit + // value and zero-extend the top bits. Change the destination register + // to a 32-bit one. + unsigned DstReg = NewMI->getOperand(0).getReg(); + if (TargetRegisterInfo::isPhysicalRegister(DstReg)) + NewMI->getOperand(0).setReg(RI.getSubReg(DstReg, X86::sub_32bit)); + else + NewMI->getOperand(0).setSubReg(X86::sub_32bit); + } + return NewMI; + } + } + + // If the instruction and target operand are commutable, commute the + // instruction and try again. + if (AllowCommute) { + unsigned CommuteOpIdx1 = OpNum, CommuteOpIdx2 = CommuteAnyOperandIndex; + if (findCommutedOpIndices(MI, CommuteOpIdx1, CommuteOpIdx2)) { + bool HasDef = MI->getDesc().getNumDefs(); + unsigned Reg0 = HasDef ? MI->getOperand(0).getReg() : 0; + unsigned Reg1 = MI->getOperand(CommuteOpIdx1).getReg(); + unsigned Reg2 = MI->getOperand(CommuteOpIdx2).getReg(); + bool Tied1 = + 0 == MI->getDesc().getOperandConstraint(CommuteOpIdx1, MCOI::TIED_TO); + bool Tied2 = + 0 == MI->getDesc().getOperandConstraint(CommuteOpIdx2, MCOI::TIED_TO); + + // If either of the commutable operands are tied to the destination + // then we can not commute + fold. + if ((HasDef && Reg0 == Reg1 && Tied1) || + (HasDef && Reg0 == Reg2 && Tied2)) + return nullptr; + + MachineInstr *CommutedMI = + commuteInstruction(MI, false, CommuteOpIdx1, CommuteOpIdx2); + if (!CommutedMI) { + // Unable to commute. 
+ return nullptr; + } + if (CommutedMI != MI) { + // New instruction. We can't fold from this. + CommutedMI->eraseFromParent(); + return nullptr; + } + + // Attempt to fold with the commuted version of the instruction. + NewMI = foldMemoryOperandImpl(MF, MI, CommuteOpIdx2, MOs, InsertPt, + Size, Align, /*AllowCommute=*/false); + if (NewMI) + return NewMI; + + // Folding failed again - undo the commute before returning. + MachineInstr *UncommutedMI = + commuteInstruction(MI, false, CommuteOpIdx1, CommuteOpIdx2); + if (!UncommutedMI) { + // Unable to commute. + return nullptr; + } + if (UncommutedMI != MI) { + // New instruction. It doesn't need to be kept. + UncommutedMI->eraseFromParent(); + return nullptr; + } + + // Return here to prevent duplicate fuse failure report. + return nullptr; + } + } + + // No fusion + if (PrintFailedFusing && !MI->isCopy()) + dbgs() << "We failed to fuse operand " << OpNum << " in " << *MI; + return nullptr; +} + +/// Return true for all instructions that only update +/// the first 32 or 64-bits of the destination register and leave the rest +/// unmodified. This can be used to avoid folding loads if the instructions +/// only update part of the destination register, and the non-updated part is +/// not needed. e.g. cvtss2sd, sqrtss. Unfolding the load from these +/// instructions breaks the partial register dependency and it can improve +/// performance. e.g.: +/// +/// movss (%rdi), %xmm0 +/// cvtss2sd %xmm0, %xmm0 +/// +/// Instead of +/// cvtss2sd (%rdi), %xmm0 +/// +/// FIXME: This should be turned into a TSFlags. +/// +static bool hasPartialRegUpdate(unsigned Opcode) { + switch (Opcode) { + case X86::CVTSI2SSrr: + case X86::CVTSI2SSrm: + case X86::CVTSI2SS64rr: + case X86::CVTSI2SS64rm: + case X86::CVTSI2SDrr: + case X86::CVTSI2SDrm: + case X86::CVTSI2SD64rr: + case X86::CVTSI2SD64rm: + case X86::CVTSD2SSrr: + case X86::CVTSD2SSrm: + case X86::Int_CVTSD2SSrr: + case X86::Int_CVTSD2SSrm: + case X86::CVTSS2SDrr: + case X86::CVTSS2SDrm: + case X86::Int_CVTSS2SDrr: + case X86::Int_CVTSS2SDrm: + case X86::RCPSSr: + case X86::RCPSSm: + case X86::RCPSSr_Int: + case X86::RCPSSm_Int: + case X86::ROUNDSDr: + case X86::ROUNDSDm: + case X86::ROUNDSDr_Int: + case X86::ROUNDSSr: + case X86::ROUNDSSm: + case X86::ROUNDSSr_Int: + case X86::RSQRTSSr: + case X86::RSQRTSSm: + case X86::RSQRTSSr_Int: + case X86::RSQRTSSm_Int: + case X86::SQRTSSr: + case X86::SQRTSSm: + case X86::SQRTSSr_Int: + case X86::SQRTSSm_Int: + case X86::SQRTSDr: + case X86::SQRTSDm: + case X86::SQRTSDr_Int: + case X86::SQRTSDm_Int: + return true; + } + + return false; +} + +/// Inform the ExeDepsFix pass how many idle +/// instructions we would like before a partial register update. +unsigned X86InstrInfo:: +getPartialRegUpdateClearance(const MachineInstr *MI, unsigned OpNum, + const TargetRegisterInfo *TRI) const { + if (OpNum != 0 || !hasPartialRegUpdate(MI->getOpcode())) + return 0; + + // If MI is marked as reading Reg, the partial register update is wanted. + const MachineOperand &MO = MI->getOperand(0); + unsigned Reg = MO.getReg(); + if (TargetRegisterInfo::isVirtualRegister(Reg)) { + if (MO.readsReg() || MI->readsVirtualRegister(Reg)) + return 0; + } else { + if (MI->readsRegister(Reg, TRI)) + return 0; + } + + // If any of the preceding 16 instructions are reading Reg, insert a + // dependency breaking instruction. The magic number is based on a few + // Nehalem experiments. 
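+  // As a sketch of the effect: for "sqrtss %xmm1, %xmm0", if nothing has
+  // written %xmm0 within the last ~16 instructions, ExeDepsFix is expected to
+  // break the false dependence (see breakPartialRegDependency below) instead
+  // of letting the sqrtss wait on the stale upper bits of %xmm0.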
+ return 16; +} + +// Return true for any instruction the copies the high bits of the first source +// operand into the unused high bits of the destination operand. +static bool hasUndefRegUpdate(unsigned Opcode) { + switch (Opcode) { + case X86::VCVTSI2SSrr: + case X86::VCVTSI2SSrm: + case X86::Int_VCVTSI2SSrr: + case X86::Int_VCVTSI2SSrm: + case X86::VCVTSI2SS64rr: + case X86::VCVTSI2SS64rm: + case X86::Int_VCVTSI2SS64rr: + case X86::Int_VCVTSI2SS64rm: + case X86::VCVTSI2SDrr: + case X86::VCVTSI2SDrm: + case X86::Int_VCVTSI2SDrr: + case X86::Int_VCVTSI2SDrm: + case X86::VCVTSI2SD64rr: + case X86::VCVTSI2SD64rm: + case X86::Int_VCVTSI2SD64rr: + case X86::Int_VCVTSI2SD64rm: + case X86::VCVTSD2SSrr: + case X86::VCVTSD2SSrm: + case X86::Int_VCVTSD2SSrr: + case X86::Int_VCVTSD2SSrm: + case X86::VCVTSS2SDrr: + case X86::VCVTSS2SDrm: + case X86::Int_VCVTSS2SDrr: + case X86::Int_VCVTSS2SDrm: + case X86::VRCPSSr: + case X86::VRCPSSm: + case X86::VRCPSSm_Int: + case X86::VROUNDSDr: + case X86::VROUNDSDm: + case X86::VROUNDSDr_Int: + case X86::VROUNDSSr: + case X86::VROUNDSSm: + case X86::VROUNDSSr_Int: + case X86::VRSQRTSSr: + case X86::VRSQRTSSm: + case X86::VRSQRTSSm_Int: + case X86::VSQRTSSr: + case X86::VSQRTSSm: + case X86::VSQRTSSm_Int: + case X86::VSQRTSDr: + case X86::VSQRTSDm: + case X86::VSQRTSDm_Int: + // AVX-512 + case X86::VCVTSD2SSZrr: + case X86::VCVTSD2SSZrm: + case X86::VCVTSS2SDZrr: + case X86::VCVTSS2SDZrm: + return true; + } + + return false; +} + +/// Inform the ExeDepsFix pass how many idle instructions we would like before +/// certain undef register reads. +/// +/// This catches the VCVTSI2SD family of instructions: +/// +/// vcvtsi2sdq %rax, %xmm0<undef>, %xmm14 +/// +/// We should to be careful *not* to catch VXOR idioms which are presumably +/// handled specially in the pipeline: +/// +/// vxorps %xmm1<undef>, %xmm1<undef>, %xmm1 +/// +/// Like getPartialRegUpdateClearance, this makes a strong assumption that the +/// high bits that are passed-through are not live. +unsigned X86InstrInfo:: +getUndefRegClearance(const MachineInstr *MI, unsigned &OpNum, + const TargetRegisterInfo *TRI) const { + if (!hasUndefRegUpdate(MI->getOpcode())) + return 0; + + // Set the OpNum parameter to the first source operand. + OpNum = 1; + + const MachineOperand &MO = MI->getOperand(OpNum); + if (MO.isUndef() && TargetRegisterInfo::isPhysicalRegister(MO.getReg())) { + // Use the same magic number as getPartialRegUpdateClearance. + return 16; + } + return 0; +} + +void X86InstrInfo:: +breakPartialRegDependency(MachineBasicBlock::iterator MI, unsigned OpNum, + const TargetRegisterInfo *TRI) const { + unsigned Reg = MI->getOperand(OpNum).getReg(); + // If MI kills this register, the false dependence is already broken. + if (MI->killsRegister(Reg, TRI)) + return; + + if (X86::VR128RegClass.contains(Reg)) { + // These instructions are all floating point domain, so xorps is the best + // choice. + unsigned Opc = Subtarget.hasAVX() ? X86::VXORPSrr : X86::XORPSrr; + BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), get(Opc), Reg) + .addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef); + MI->addRegisterKilled(Reg, TRI, true); + } else if (X86::VR256RegClass.contains(Reg)) { + // Use vxorps to clear the full ymm register. + // It wants to read and write the xmm sub-register. 
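+    // For example (illustrative), for %ymm2 this emits
+    //   vxorps %xmm2<undef>, %xmm2<undef>, %xmm2
+    // and adds %ymm2 as an implicit def so the full register is treated as
+    // redefined.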
+ unsigned XReg = TRI->getSubReg(Reg, X86::sub_xmm); + BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), get(X86::VXORPSrr), XReg) + .addReg(XReg, RegState::Undef).addReg(XReg, RegState::Undef) + .addReg(Reg, RegState::ImplicitDefine); + MI->addRegisterKilled(Reg, TRI, true); + } +} + +MachineInstr *X86InstrInfo::foldMemoryOperandImpl( + MachineFunction &MF, MachineInstr *MI, ArrayRef<unsigned> Ops, + MachineBasicBlock::iterator InsertPt, int FrameIndex) const { + // Check switch flag + if (NoFusing) + return nullptr; + + // Unless optimizing for size, don't fold to avoid partial + // register update stalls + if (!MF.getFunction()->optForSize() && hasPartialRegUpdate(MI->getOpcode())) + return nullptr; + + const MachineFrameInfo *MFI = MF.getFrameInfo(); + unsigned Size = MFI->getObjectSize(FrameIndex); + unsigned Alignment = MFI->getObjectAlignment(FrameIndex); + // If the function stack isn't realigned we don't want to fold instructions + // that need increased alignment. + if (!RI.needsStackRealignment(MF)) + Alignment = + std::min(Alignment, Subtarget.getFrameLowering()->getStackAlignment()); + if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) { + unsigned NewOpc = 0; + unsigned RCSize = 0; + switch (MI->getOpcode()) { + default: return nullptr; + case X86::TEST8rr: NewOpc = X86::CMP8ri; RCSize = 1; break; + case X86::TEST16rr: NewOpc = X86::CMP16ri8; RCSize = 2; break; + case X86::TEST32rr: NewOpc = X86::CMP32ri8; RCSize = 4; break; + case X86::TEST64rr: NewOpc = X86::CMP64ri8; RCSize = 8; break; + } + // Check if it's safe to fold the load. If the size of the object is + // narrower than the load width, then it's not. + if (Size < RCSize) + return nullptr; + // Change to CMPXXri r, 0 first. + MI->setDesc(get(NewOpc)); + MI->getOperand(1).ChangeToImmediate(0); + } else if (Ops.size() != 1) + return nullptr; + + return foldMemoryOperandImpl(MF, MI, Ops[0], + MachineOperand::CreateFI(FrameIndex), InsertPt, + Size, Alignment, /*AllowCommute=*/true); +} + +/// Check if \p LoadMI is a partial register load that we can't fold into \p MI +/// because the latter uses contents that wouldn't be defined in the folded +/// version. For instance, this transformation isn't legal: +/// movss (%rdi), %xmm0 +/// addps %xmm0, %xmm0 +/// -> +/// addps (%rdi), %xmm0 +/// +/// But this one is: +/// movss (%rdi), %xmm0 +/// addss %xmm0, %xmm0 +/// -> +/// addss (%rdi), %xmm0 +/// +static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI, + const MachineInstr &UserMI, + const MachineFunction &MF) { + unsigned Opc = LoadMI.getOpcode(); + unsigned UserOpc = UserMI.getOpcode(); + unsigned RegSize = + MF.getRegInfo().getRegClass(LoadMI.getOperand(0).getReg())->getSize(); + + if ((Opc == X86::MOVSSrm || Opc == X86::VMOVSSrm) && RegSize > 4) { + // These instructions only load 32 bits, we can't fold them if the + // destination register is wider than 32 bits (4 bytes), and its user + // instruction isn't scalar (SS). 
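+    // The scalar *_Int opcodes listed below only read the low element, so the
+    // narrow load can still be folded into them; they return false here
+    // (i.e. the fold is not blocked).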
+ switch (UserOpc) { + case X86::ADDSSrr_Int: case X86::VADDSSrr_Int: + case X86::DIVSSrr_Int: case X86::VDIVSSrr_Int: + case X86::MULSSrr_Int: case X86::VMULSSrr_Int: + case X86::SUBSSrr_Int: case X86::VSUBSSrr_Int: + case X86::VFMADDSSr132r_Int: case X86::VFNMADDSSr132r_Int: + case X86::VFMADDSSr213r_Int: case X86::VFNMADDSSr213r_Int: + case X86::VFMADDSSr231r_Int: case X86::VFNMADDSSr231r_Int: + case X86::VFMSUBSSr132r_Int: case X86::VFNMSUBSSr132r_Int: + case X86::VFMSUBSSr213r_Int: case X86::VFNMSUBSSr213r_Int: + case X86::VFMSUBSSr231r_Int: case X86::VFNMSUBSSr231r_Int: + return false; + default: + return true; + } + } + + if ((Opc == X86::MOVSDrm || Opc == X86::VMOVSDrm) && RegSize > 8) { + // These instructions only load 64 bits, we can't fold them if the + // destination register is wider than 64 bits (8 bytes), and its user + // instruction isn't scalar (SD). + switch (UserOpc) { + case X86::ADDSDrr_Int: case X86::VADDSDrr_Int: + case X86::DIVSDrr_Int: case X86::VDIVSDrr_Int: + case X86::MULSDrr_Int: case X86::VMULSDrr_Int: + case X86::SUBSDrr_Int: case X86::VSUBSDrr_Int: + case X86::VFMADDSDr132r_Int: case X86::VFNMADDSDr132r_Int: + case X86::VFMADDSDr213r_Int: case X86::VFNMADDSDr213r_Int: + case X86::VFMADDSDr231r_Int: case X86::VFNMADDSDr231r_Int: + case X86::VFMSUBSDr132r_Int: case X86::VFNMSUBSDr132r_Int: + case X86::VFMSUBSDr213r_Int: case X86::VFNMSUBSDr213r_Int: + case X86::VFMSUBSDr231r_Int: case X86::VFNMSUBSDr231r_Int: + return false; + default: + return true; + } + } + + return false; +} + +MachineInstr *X86InstrInfo::foldMemoryOperandImpl( + MachineFunction &MF, MachineInstr *MI, ArrayRef<unsigned> Ops, + MachineBasicBlock::iterator InsertPt, MachineInstr *LoadMI) const { + // If loading from a FrameIndex, fold directly from the FrameIndex. + unsigned NumOps = LoadMI->getDesc().getNumOperands(); + int FrameIndex; + if (isLoadFromStackSlot(LoadMI, FrameIndex)) { + if (isNonFoldablePartialRegisterLoad(*LoadMI, *MI, MF)) + return nullptr; + return foldMemoryOperandImpl(MF, MI, Ops, InsertPt, FrameIndex); + } + + // Check switch flag + if (NoFusing) return nullptr; + + // Avoid partial register update stalls unless optimizing for size. + if (!MF.getFunction()->optForSize() && hasPartialRegUpdate(MI->getOpcode())) + return nullptr; + + // Determine the alignment of the load. + unsigned Alignment = 0; + if (LoadMI->hasOneMemOperand()) + Alignment = (*LoadMI->memoperands_begin())->getAlignment(); + else + switch (LoadMI->getOpcode()) { + case X86::AVX2_SETALLONES: + case X86::AVX_SET0: + Alignment = 32; + break; + case X86::V_SET0: + case X86::V_SETALLONES: + Alignment = 16; + break; + case X86::FsFLD0SD: + Alignment = 8; + break; + case X86::FsFLD0SS: + Alignment = 4; + break; + default: + return nullptr; + } + if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) { + unsigned NewOpc = 0; + switch (MI->getOpcode()) { + default: return nullptr; + case X86::TEST8rr: NewOpc = X86::CMP8ri; break; + case X86::TEST16rr: NewOpc = X86::CMP16ri8; break; + case X86::TEST32rr: NewOpc = X86::CMP32ri8; break; + case X86::TEST64rr: NewOpc = X86::CMP64ri8; break; + } + // Change to CMPXXri r, 0 first. + MI->setDesc(get(NewOpc)); + MI->getOperand(1).ChangeToImmediate(0); + } else if (Ops.size() != 1) + return nullptr; + + // Make sure the subregisters match. + // Otherwise we risk changing the size of the load. 
+ if (LoadMI->getOperand(0).getSubReg() != MI->getOperand(Ops[0]).getSubReg()) + return nullptr; + + SmallVector<MachineOperand,X86::AddrNumOperands> MOs; + switch (LoadMI->getOpcode()) { + case X86::V_SET0: + case X86::V_SETALLONES: + case X86::AVX2_SETALLONES: + case X86::AVX_SET0: + case X86::FsFLD0SD: + case X86::FsFLD0SS: { + // Folding a V_SET0 or V_SETALLONES as a load, to ease register pressure. + // Create a constant-pool entry and operands to load from it. + + // Medium and large mode can't fold loads this way. + if (MF.getTarget().getCodeModel() != CodeModel::Small && + MF.getTarget().getCodeModel() != CodeModel::Kernel) + return nullptr; + + // x86-32 PIC requires a PIC base register for constant pools. + unsigned PICBase = 0; + if (MF.getTarget().getRelocationModel() == Reloc::PIC_) { + if (Subtarget.is64Bit()) + PICBase = X86::RIP; + else + // FIXME: PICBase = getGlobalBaseReg(&MF); + // This doesn't work for several reasons. + // 1. GlobalBaseReg may have been spilled. + // 2. It may not be live at MI. + return nullptr; + } + + // Create a constant-pool entry. + MachineConstantPool &MCP = *MF.getConstantPool(); + Type *Ty; + unsigned Opc = LoadMI->getOpcode(); + if (Opc == X86::FsFLD0SS) + Ty = Type::getFloatTy(MF.getFunction()->getContext()); + else if (Opc == X86::FsFLD0SD) + Ty = Type::getDoubleTy(MF.getFunction()->getContext()); + else if (Opc == X86::AVX2_SETALLONES || Opc == X86::AVX_SET0) + Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()), 8); + else + Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()), 4); + + bool IsAllOnes = (Opc == X86::V_SETALLONES || Opc == X86::AVX2_SETALLONES); + const Constant *C = IsAllOnes ? Constant::getAllOnesValue(Ty) : + Constant::getNullValue(Ty); + unsigned CPI = MCP.getConstantPoolIndex(C, Alignment); + + // Create operands to load from the constant pool entry. + MOs.push_back(MachineOperand::CreateReg(PICBase, false)); + MOs.push_back(MachineOperand::CreateImm(1)); + MOs.push_back(MachineOperand::CreateReg(0, false)); + MOs.push_back(MachineOperand::CreateCPI(CPI, 0)); + MOs.push_back(MachineOperand::CreateReg(0, false)); + break; + } + default: { + if (isNonFoldablePartialRegisterLoad(*LoadMI, *MI, MF)) + return nullptr; + + // Folding a normal load. Just copy the load's address operands. + MOs.append(LoadMI->operands_begin() + NumOps - X86::AddrNumOperands, + LoadMI->operands_begin() + NumOps); + break; + } + } + return foldMemoryOperandImpl(MF, MI, Ops[0], MOs, InsertPt, + /*Size=*/0, Alignment, /*AllowCommute=*/true); +} + +bool X86InstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI, + unsigned Reg, bool UnfoldLoad, bool UnfoldStore, + SmallVectorImpl<MachineInstr*> &NewMIs) const { + DenseMap<unsigned, std::pair<unsigned,unsigned> >::const_iterator I = + MemOp2RegOpTable.find(MI->getOpcode()); + if (I == MemOp2RegOpTable.end()) + return false; + unsigned Opc = I->second.first; + unsigned Index = I->second.second & TB_INDEX_MASK; + bool FoldedLoad = I->second.second & TB_FOLDED_LOAD; + bool FoldedStore = I->second.second & TB_FOLDED_STORE; + if (UnfoldLoad && !FoldedLoad) + return false; + UnfoldLoad &= FoldedLoad; + if (UnfoldStore && !FoldedStore) + return false; + UnfoldStore &= FoldedStore; + + const MCInstrDesc &MCID = get(Opc); + const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI, MF); + // TODO: Check if 32-byte or greater accesses are slow too? 
+ if (!MI->hasOneMemOperand() && + RC == &X86::VR128RegClass && + Subtarget.isUnalignedMem16Slow()) + // Without memoperands, loadRegFromAddr and storeRegToStackSlot will + // conservatively assume the address is unaligned. That's bad for + // performance. + return false; + SmallVector<MachineOperand, X86::AddrNumOperands> AddrOps; + SmallVector<MachineOperand,2> BeforeOps; + SmallVector<MachineOperand,2> AfterOps; + SmallVector<MachineOperand,4> ImpOps; + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + MachineOperand &Op = MI->getOperand(i); + if (i >= Index && i < Index + X86::AddrNumOperands) + AddrOps.push_back(Op); + else if (Op.isReg() && Op.isImplicit()) + ImpOps.push_back(Op); + else if (i < Index) + BeforeOps.push_back(Op); + else if (i > Index) + AfterOps.push_back(Op); + } + + // Emit the load instruction. + if (UnfoldLoad) { + std::pair<MachineInstr::mmo_iterator, + MachineInstr::mmo_iterator> MMOs = + MF.extractLoadMemRefs(MI->memoperands_begin(), + MI->memoperands_end()); + loadRegFromAddr(MF, Reg, AddrOps, RC, MMOs.first, MMOs.second, NewMIs); + if (UnfoldStore) { + // Address operands cannot be marked isKill. + for (unsigned i = 1; i != 1 + X86::AddrNumOperands; ++i) { + MachineOperand &MO = NewMIs[0]->getOperand(i); + if (MO.isReg()) + MO.setIsKill(false); + } + } + } + + // Emit the data processing instruction. + MachineInstr *DataMI = MF.CreateMachineInstr(MCID, MI->getDebugLoc(), true); + MachineInstrBuilder MIB(MF, DataMI); + + if (FoldedStore) + MIB.addReg(Reg, RegState::Define); + for (MachineOperand &BeforeOp : BeforeOps) + MIB.addOperand(BeforeOp); + if (FoldedLoad) + MIB.addReg(Reg); + for (MachineOperand &AfterOp : AfterOps) + MIB.addOperand(AfterOp); + for (MachineOperand &ImpOp : ImpOps) { + MIB.addReg(ImpOp.getReg(), + getDefRegState(ImpOp.isDef()) | + RegState::Implicit | + getKillRegState(ImpOp.isKill()) | + getDeadRegState(ImpOp.isDead()) | + getUndefRegState(ImpOp.isUndef())); + } + // Change CMP32ri r, 0 back to TEST32rr r, r, etc. + switch (DataMI->getOpcode()) { + default: break; + case X86::CMP64ri32: + case X86::CMP64ri8: + case X86::CMP32ri: + case X86::CMP32ri8: + case X86::CMP16ri: + case X86::CMP16ri8: + case X86::CMP8ri: { + MachineOperand &MO0 = DataMI->getOperand(0); + MachineOperand &MO1 = DataMI->getOperand(1); + if (MO1.getImm() == 0) { + unsigned NewOpc; + switch (DataMI->getOpcode()) { + default: llvm_unreachable("Unreachable!"); + case X86::CMP64ri8: + case X86::CMP64ri32: NewOpc = X86::TEST64rr; break; + case X86::CMP32ri8: + case X86::CMP32ri: NewOpc = X86::TEST32rr; break; + case X86::CMP16ri8: + case X86::CMP16ri: NewOpc = X86::TEST16rr; break; + case X86::CMP8ri: NewOpc = X86::TEST8rr; break; + } + DataMI->setDesc(get(NewOpc)); + MO1.ChangeToRegister(MO0.getReg(), false); + } + } + } + NewMIs.push_back(DataMI); + + // Emit the store instruction. 
+ if (UnfoldStore) { + const TargetRegisterClass *DstRC = getRegClass(MCID, 0, &RI, MF); + std::pair<MachineInstr::mmo_iterator, + MachineInstr::mmo_iterator> MMOs = + MF.extractStoreMemRefs(MI->memoperands_begin(), + MI->memoperands_end()); + storeRegToAddr(MF, Reg, true, AddrOps, DstRC, MMOs.first, MMOs.second, NewMIs); + } + + return true; +} + +bool +X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N, + SmallVectorImpl<SDNode*> &NewNodes) const { + if (!N->isMachineOpcode()) + return false; + + DenseMap<unsigned, std::pair<unsigned,unsigned> >::const_iterator I = + MemOp2RegOpTable.find(N->getMachineOpcode()); + if (I == MemOp2RegOpTable.end()) + return false; + unsigned Opc = I->second.first; + unsigned Index = I->second.second & TB_INDEX_MASK; + bool FoldedLoad = I->second.second & TB_FOLDED_LOAD; + bool FoldedStore = I->second.second & TB_FOLDED_STORE; + const MCInstrDesc &MCID = get(Opc); + MachineFunction &MF = DAG.getMachineFunction(); + const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI, MF); + unsigned NumDefs = MCID.NumDefs; + std::vector<SDValue> AddrOps; + std::vector<SDValue> BeforeOps; + std::vector<SDValue> AfterOps; + SDLoc dl(N); + unsigned NumOps = N->getNumOperands(); + for (unsigned i = 0; i != NumOps-1; ++i) { + SDValue Op = N->getOperand(i); + if (i >= Index-NumDefs && i < Index-NumDefs + X86::AddrNumOperands) + AddrOps.push_back(Op); + else if (i < Index-NumDefs) + BeforeOps.push_back(Op); + else if (i > Index-NumDefs) + AfterOps.push_back(Op); + } + SDValue Chain = N->getOperand(NumOps-1); + AddrOps.push_back(Chain); + + // Emit the load instruction. + SDNode *Load = nullptr; + if (FoldedLoad) { + EVT VT = *RC->vt_begin(); + std::pair<MachineInstr::mmo_iterator, + MachineInstr::mmo_iterator> MMOs = + MF.extractLoadMemRefs(cast<MachineSDNode>(N)->memoperands_begin(), + cast<MachineSDNode>(N)->memoperands_end()); + if (!(*MMOs.first) && + RC == &X86::VR128RegClass && + Subtarget.isUnalignedMem16Slow()) + // Do not introduce a slow unaligned load. + return false; + // FIXME: If a VR128 can have size 32, we should be checking if a 32-byte + // memory access is slow above. + unsigned Alignment = RC->getSize() == 32 ? 32 : 16; + bool isAligned = (*MMOs.first) && + (*MMOs.first)->getAlignment() >= Alignment; + Load = DAG.getMachineNode(getLoadRegOpcode(0, RC, isAligned, Subtarget), dl, + VT, MVT::Other, AddrOps); + NewNodes.push_back(Load); + + // Preserve memory reference information. + cast<MachineSDNode>(Load)->setMemRefs(MMOs.first, MMOs.second); + } + + // Emit the data processing instruction. + std::vector<EVT> VTs; + const TargetRegisterClass *DstRC = nullptr; + if (MCID.getNumDefs() > 0) { + DstRC = getRegClass(MCID, 0, &RI, MF); + VTs.push_back(*DstRC->vt_begin()); + } + for (unsigned i = 0, e = N->getNumValues(); i != e; ++i) { + EVT VT = N->getValueType(i); + if (VT != MVT::Other && i >= (unsigned)MCID.getNumDefs()) + VTs.push_back(VT); + } + if (Load) + BeforeOps.push_back(SDValue(Load, 0)); + BeforeOps.insert(BeforeOps.end(), AfterOps.begin(), AfterOps.end()); + SDNode *NewNode= DAG.getMachineNode(Opc, dl, VTs, BeforeOps); + NewNodes.push_back(NewNode); + + // Emit the store instruction. 
+ if (FoldedStore) { + AddrOps.pop_back(); + AddrOps.push_back(SDValue(NewNode, 0)); + AddrOps.push_back(Chain); + std::pair<MachineInstr::mmo_iterator, + MachineInstr::mmo_iterator> MMOs = + MF.extractStoreMemRefs(cast<MachineSDNode>(N)->memoperands_begin(), + cast<MachineSDNode>(N)->memoperands_end()); + if (!(*MMOs.first) && + RC == &X86::VR128RegClass && + Subtarget.isUnalignedMem16Slow()) + // Do not introduce a slow unaligned store. + return false; + // FIXME: If a VR128 can have size 32, we should be checking if a 32-byte + // memory access is slow above. + unsigned Alignment = RC->getSize() == 32 ? 32 : 16; + bool isAligned = (*MMOs.first) && + (*MMOs.first)->getAlignment() >= Alignment; + SDNode *Store = + DAG.getMachineNode(getStoreRegOpcode(0, DstRC, isAligned, Subtarget), + dl, MVT::Other, AddrOps); + NewNodes.push_back(Store); + + // Preserve memory reference information. + cast<MachineSDNode>(Store)->setMemRefs(MMOs.first, MMOs.second); + } + + return true; +} + +unsigned X86InstrInfo::getOpcodeAfterMemoryUnfold(unsigned Opc, + bool UnfoldLoad, bool UnfoldStore, + unsigned *LoadRegIndex) const { + DenseMap<unsigned, std::pair<unsigned,unsigned> >::const_iterator I = + MemOp2RegOpTable.find(Opc); + if (I == MemOp2RegOpTable.end()) + return 0; + bool FoldedLoad = I->second.second & TB_FOLDED_LOAD; + bool FoldedStore = I->second.second & TB_FOLDED_STORE; + if (UnfoldLoad && !FoldedLoad) + return 0; + if (UnfoldStore && !FoldedStore) + return 0; + if (LoadRegIndex) + *LoadRegIndex = I->second.second & TB_INDEX_MASK; + return I->second.first; +} + +bool +X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, + int64_t &Offset1, int64_t &Offset2) const { + if (!Load1->isMachineOpcode() || !Load2->isMachineOpcode()) + return false; + unsigned Opc1 = Load1->getMachineOpcode(); + unsigned Opc2 = Load2->getMachineOpcode(); + switch (Opc1) { + default: return false; + case X86::MOV8rm: + case X86::MOV16rm: + case X86::MOV32rm: + case X86::MOV64rm: + case X86::LD_Fp32m: + case X86::LD_Fp64m: + case X86::LD_Fp80m: + case X86::MOVSSrm: + case X86::MOVSDrm: + case X86::MMX_MOVD64rm: + case X86::MMX_MOVQ64rm: + case X86::FsMOVAPSrm: + case X86::FsMOVAPDrm: + case X86::MOVAPSrm: + case X86::MOVUPSrm: + case X86::MOVAPDrm: + case X86::MOVDQArm: + case X86::MOVDQUrm: + // AVX load instructions + case X86::VMOVSSrm: + case X86::VMOVSDrm: + case X86::FsVMOVAPSrm: + case X86::FsVMOVAPDrm: + case X86::VMOVAPSrm: + case X86::VMOVUPSrm: + case X86::VMOVAPDrm: + case X86::VMOVDQArm: + case X86::VMOVDQUrm: + case X86::VMOVAPSYrm: + case X86::VMOVUPSYrm: + case X86::VMOVAPDYrm: + case X86::VMOVDQAYrm: + case X86::VMOVDQUYrm: + break; + } + switch (Opc2) { + default: return false; + case X86::MOV8rm: + case X86::MOV16rm: + case X86::MOV32rm: + case X86::MOV64rm: + case X86::LD_Fp32m: + case X86::LD_Fp64m: + case X86::LD_Fp80m: + case X86::MOVSSrm: + case X86::MOVSDrm: + case X86::MMX_MOVD64rm: + case X86::MMX_MOVQ64rm: + case X86::FsMOVAPSrm: + case X86::FsMOVAPDrm: + case X86::MOVAPSrm: + case X86::MOVUPSrm: + case X86::MOVAPDrm: + case X86::MOVDQArm: + case X86::MOVDQUrm: + // AVX load instructions + case X86::VMOVSSrm: + case X86::VMOVSDrm: + case X86::FsVMOVAPSrm: + case X86::FsVMOVAPDrm: + case X86::VMOVAPSrm: + case X86::VMOVUPSrm: + case X86::VMOVAPDrm: + case X86::VMOVDQArm: + case X86::VMOVDQUrm: + case X86::VMOVAPSYrm: + case X86::VMOVUPSYrm: + case X86::VMOVAPDYrm: + case X86::VMOVDQAYrm: + case X86::VMOVDQUYrm: + break; + } + + // Check if chain operands and base addresses match. 
+ if (Load1->getOperand(0) != Load2->getOperand(0) || + Load1->getOperand(5) != Load2->getOperand(5)) + return false; + // Segment operands should match as well. + if (Load1->getOperand(4) != Load2->getOperand(4)) + return false; + // Scale should be 1, Index should be Reg0. + if (Load1->getOperand(1) == Load2->getOperand(1) && + Load1->getOperand(2) == Load2->getOperand(2)) { + if (cast<ConstantSDNode>(Load1->getOperand(1))->getZExtValue() != 1) + return false; + + // Now let's examine the displacements. + if (isa<ConstantSDNode>(Load1->getOperand(3)) && + isa<ConstantSDNode>(Load2->getOperand(3))) { + Offset1 = cast<ConstantSDNode>(Load1->getOperand(3))->getSExtValue(); + Offset2 = cast<ConstantSDNode>(Load2->getOperand(3))->getSExtValue(); + return true; + } + } + return false; +} + +bool X86InstrInfo::shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2, + int64_t Offset1, int64_t Offset2, + unsigned NumLoads) const { + assert(Offset2 > Offset1); + if ((Offset2 - Offset1) / 8 > 64) + return false; + + unsigned Opc1 = Load1->getMachineOpcode(); + unsigned Opc2 = Load2->getMachineOpcode(); + if (Opc1 != Opc2) + return false; // FIXME: overly conservative? + + switch (Opc1) { + default: break; + case X86::LD_Fp32m: + case X86::LD_Fp64m: + case X86::LD_Fp80m: + case X86::MMX_MOVD64rm: + case X86::MMX_MOVQ64rm: + return false; + } + + EVT VT = Load1->getValueType(0); + switch (VT.getSimpleVT().SimpleTy) { + default: + // XMM registers. In 64-bit mode we can be a bit more aggressive since we + // have 16 of them to play with. + if (Subtarget.is64Bit()) { + if (NumLoads >= 3) + return false; + } else if (NumLoads) { + return false; + } + break; + case MVT::i8: + case MVT::i16: + case MVT::i32: + case MVT::i64: + case MVT::f32: + case MVT::f64: + if (NumLoads) + return false; + break; + } + + return true; +} + +bool X86InstrInfo::shouldScheduleAdjacent(MachineInstr* First, + MachineInstr *Second) const { + // Check if this processor supports macro-fusion. Since this is a minor + // heuristic, we haven't specifically reserved a feature. hasAVX is a decent + // proxy for SandyBridge+. 
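Before the macro-fusion check below, the load-clustering policy just implemented can be summarized in a self-contained model: identical opcodes, offsets no more than a few hundred bytes apart, and a cap on how many loads are already clustered, relaxed for XMM loads in 64-bit mode where sixteen registers are available. This is a sketch of the policy, not the TargetInstrInfo hook itself.

#include <cstdint>

enum class LoadClass { Scalar, Xmm };

// Standalone model of the shouldScheduleLoadsNear decision above.
bool shouldClusterLoads(unsigned Opc1, unsigned Opc2,
                        int64_t Offset1, int64_t Offset2,
                        unsigned NumLoads, LoadClass Class, bool Is64Bit) {
  // Offsets arrive pre-sorted; reject loads that are too far apart.
  if (Offset2 <= Offset1 || (Offset2 - Offset1) / 8 > 64)
    return false;
  // Conservatively require matching opcodes, as the code above does.
  if (Opc1 != Opc2)
    return false;
  if (Class == LoadClass::Xmm)
    return Is64Bit ? NumLoads < 3 : NumLoads == 0; // more XMM registers in 64-bit mode
  return NumLoads == 0;                            // scalar loads: pair at most two
}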
+ if (!Subtarget.hasAVX()) + return false; + + enum { + FuseTest, + FuseCmp, + FuseInc + } FuseKind; + + switch(Second->getOpcode()) { + default: + return false; + case X86::JE_1: + case X86::JNE_1: + case X86::JL_1: + case X86::JLE_1: + case X86::JG_1: + case X86::JGE_1: + FuseKind = FuseInc; + break; + case X86::JB_1: + case X86::JBE_1: + case X86::JA_1: + case X86::JAE_1: + FuseKind = FuseCmp; + break; + case X86::JS_1: + case X86::JNS_1: + case X86::JP_1: + case X86::JNP_1: + case X86::JO_1: + case X86::JNO_1: + FuseKind = FuseTest; + break; + } + switch (First->getOpcode()) { + default: + return false; + case X86::TEST8rr: + case X86::TEST16rr: + case X86::TEST32rr: + case X86::TEST64rr: + case X86::TEST8ri: + case X86::TEST16ri: + case X86::TEST32ri: + case X86::TEST32i32: + case X86::TEST64i32: + case X86::TEST64ri32: + case X86::TEST8rm: + case X86::TEST16rm: + case X86::TEST32rm: + case X86::TEST64rm: + case X86::TEST8ri_NOREX: + case X86::AND16i16: + case X86::AND16ri: + case X86::AND16ri8: + case X86::AND16rm: + case X86::AND16rr: + case X86::AND32i32: + case X86::AND32ri: + case X86::AND32ri8: + case X86::AND32rm: + case X86::AND32rr: + case X86::AND64i32: + case X86::AND64ri32: + case X86::AND64ri8: + case X86::AND64rm: + case X86::AND64rr: + case X86::AND8i8: + case X86::AND8ri: + case X86::AND8rm: + case X86::AND8rr: + return true; + case X86::CMP16i16: + case X86::CMP16ri: + case X86::CMP16ri8: + case X86::CMP16rm: + case X86::CMP16rr: + case X86::CMP32i32: + case X86::CMP32ri: + case X86::CMP32ri8: + case X86::CMP32rm: + case X86::CMP32rr: + case X86::CMP64i32: + case X86::CMP64ri32: + case X86::CMP64ri8: + case X86::CMP64rm: + case X86::CMP64rr: + case X86::CMP8i8: + case X86::CMP8ri: + case X86::CMP8rm: + case X86::CMP8rr: + case X86::ADD16i16: + case X86::ADD16ri: + case X86::ADD16ri8: + case X86::ADD16ri8_DB: + case X86::ADD16ri_DB: + case X86::ADD16rm: + case X86::ADD16rr: + case X86::ADD16rr_DB: + case X86::ADD32i32: + case X86::ADD32ri: + case X86::ADD32ri8: + case X86::ADD32ri8_DB: + case X86::ADD32ri_DB: + case X86::ADD32rm: + case X86::ADD32rr: + case X86::ADD32rr_DB: + case X86::ADD64i32: + case X86::ADD64ri32: + case X86::ADD64ri32_DB: + case X86::ADD64ri8: + case X86::ADD64ri8_DB: + case X86::ADD64rm: + case X86::ADD64rr: + case X86::ADD64rr_DB: + case X86::ADD8i8: + case X86::ADD8mi: + case X86::ADD8mr: + case X86::ADD8ri: + case X86::ADD8rm: + case X86::ADD8rr: + case X86::SUB16i16: + case X86::SUB16ri: + case X86::SUB16ri8: + case X86::SUB16rm: + case X86::SUB16rr: + case X86::SUB32i32: + case X86::SUB32ri: + case X86::SUB32ri8: + case X86::SUB32rm: + case X86::SUB32rr: + case X86::SUB64i32: + case X86::SUB64ri32: + case X86::SUB64ri8: + case X86::SUB64rm: + case X86::SUB64rr: + case X86::SUB8i8: + case X86::SUB8ri: + case X86::SUB8rm: + case X86::SUB8rr: + return FuseKind == FuseCmp || FuseKind == FuseInc; + case X86::INC16r: + case X86::INC32r: + case X86::INC64r: + case X86::INC8r: + case X86::DEC16r: + case X86::DEC32r: + case X86::DEC64r: + case X86::DEC8r: + return FuseKind == FuseInc; + } +} + +bool X86InstrInfo:: +ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const { + assert(Cond.size() == 1 && "Invalid X86 branch condition!"); + X86::CondCode CC = static_cast<X86::CondCode>(Cond[0].getImm()); + if (CC == X86::COND_NE_OR_P || CC == X86::COND_NP_OR_E) + return true; + Cond[0].setImm(GetOppositeBranchCondition(CC)); + return false; +} + +bool X86InstrInfo:: +isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const { + // FIXME: 
Return false for x87 stack register classes for now. We can't + // allow any loads of these registers before FpGet_ST0_80. + return !(RC == &X86::CCRRegClass || RC == &X86::RFP32RegClass || + RC == &X86::RFP64RegClass || RC == &X86::RFP80RegClass); +} + +/// Return a virtual register initialized with the +/// the global base register value. Output instructions required to +/// initialize the register in the function entry block, if necessary. +/// +/// TODO: Eliminate this and move the code to X86MachineFunctionInfo. +/// +unsigned X86InstrInfo::getGlobalBaseReg(MachineFunction *MF) const { + assert(!Subtarget.is64Bit() && + "X86-64 PIC uses RIP relative addressing"); + + X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>(); + unsigned GlobalBaseReg = X86FI->getGlobalBaseReg(); + if (GlobalBaseReg != 0) + return GlobalBaseReg; + + // Create the register. The code to initialize it is inserted + // later, by the CGBR pass (below). + MachineRegisterInfo &RegInfo = MF->getRegInfo(); + GlobalBaseReg = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass); + X86FI->setGlobalBaseReg(GlobalBaseReg); + return GlobalBaseReg; +} + +// These are the replaceable SSE instructions. Some of these have Int variants +// that we don't include here. We don't want to replace instructions selected +// by intrinsics. +static const uint16_t ReplaceableInstrs[][3] = { + //PackedSingle PackedDouble PackedInt + { X86::MOVAPSmr, X86::MOVAPDmr, X86::MOVDQAmr }, + { X86::MOVAPSrm, X86::MOVAPDrm, X86::MOVDQArm }, + { X86::MOVAPSrr, X86::MOVAPDrr, X86::MOVDQArr }, + { X86::MOVUPSmr, X86::MOVUPDmr, X86::MOVDQUmr }, + { X86::MOVUPSrm, X86::MOVUPDrm, X86::MOVDQUrm }, + { X86::MOVLPSmr, X86::MOVLPDmr, X86::MOVPQI2QImr }, + { X86::MOVNTPSmr, X86::MOVNTPDmr, X86::MOVNTDQmr }, + { X86::ANDNPSrm, X86::ANDNPDrm, X86::PANDNrm }, + { X86::ANDNPSrr, X86::ANDNPDrr, X86::PANDNrr }, + { X86::ANDPSrm, X86::ANDPDrm, X86::PANDrm }, + { X86::ANDPSrr, X86::ANDPDrr, X86::PANDrr }, + { X86::ORPSrm, X86::ORPDrm, X86::PORrm }, + { X86::ORPSrr, X86::ORPDrr, X86::PORrr }, + { X86::XORPSrm, X86::XORPDrm, X86::PXORrm }, + { X86::XORPSrr, X86::XORPDrr, X86::PXORrr }, + // AVX 128-bit support + { X86::VMOVAPSmr, X86::VMOVAPDmr, X86::VMOVDQAmr }, + { X86::VMOVAPSrm, X86::VMOVAPDrm, X86::VMOVDQArm }, + { X86::VMOVAPSrr, X86::VMOVAPDrr, X86::VMOVDQArr }, + { X86::VMOVUPSmr, X86::VMOVUPDmr, X86::VMOVDQUmr }, + { X86::VMOVUPSrm, X86::VMOVUPDrm, X86::VMOVDQUrm }, + { X86::VMOVLPSmr, X86::VMOVLPDmr, X86::VMOVPQI2QImr }, + { X86::VMOVNTPSmr, X86::VMOVNTPDmr, X86::VMOVNTDQmr }, + { X86::VANDNPSrm, X86::VANDNPDrm, X86::VPANDNrm }, + { X86::VANDNPSrr, X86::VANDNPDrr, X86::VPANDNrr }, + { X86::VANDPSrm, X86::VANDPDrm, X86::VPANDrm }, + { X86::VANDPSrr, X86::VANDPDrr, X86::VPANDrr }, + { X86::VORPSrm, X86::VORPDrm, X86::VPORrm }, + { X86::VORPSrr, X86::VORPDrr, X86::VPORrr }, + { X86::VXORPSrm, X86::VXORPDrm, X86::VPXORrm }, + { X86::VXORPSrr, X86::VXORPDrr, X86::VPXORrr }, + // AVX 256-bit support + { X86::VMOVAPSYmr, X86::VMOVAPDYmr, X86::VMOVDQAYmr }, + { X86::VMOVAPSYrm, X86::VMOVAPDYrm, X86::VMOVDQAYrm }, + { X86::VMOVAPSYrr, X86::VMOVAPDYrr, X86::VMOVDQAYrr }, + { X86::VMOVUPSYmr, X86::VMOVUPDYmr, X86::VMOVDQUYmr }, + { X86::VMOVUPSYrm, X86::VMOVUPDYrm, X86::VMOVDQUYrm }, + { X86::VMOVNTPSYmr, X86::VMOVNTPDYmr, X86::VMOVNTDQYmr } +}; + +static const uint16_t ReplaceableInstrsAVX2[][3] = { + //PackedSingle PackedDouble PackedInt + { X86::VANDNPSYrm, X86::VANDNPDYrm, X86::VPANDNYrm }, + { X86::VANDNPSYrr, X86::VANDNPDYrr, X86::VPANDNYrr }, + 
{ X86::VANDPSYrm, X86::VANDPDYrm, X86::VPANDYrm }, + { X86::VANDPSYrr, X86::VANDPDYrr, X86::VPANDYrr }, + { X86::VORPSYrm, X86::VORPDYrm, X86::VPORYrm }, + { X86::VORPSYrr, X86::VORPDYrr, X86::VPORYrr }, + { X86::VXORPSYrm, X86::VXORPDYrm, X86::VPXORYrm }, + { X86::VXORPSYrr, X86::VXORPDYrr, X86::VPXORYrr }, + { X86::VEXTRACTF128mr, X86::VEXTRACTF128mr, X86::VEXTRACTI128mr }, + { X86::VEXTRACTF128rr, X86::VEXTRACTF128rr, X86::VEXTRACTI128rr }, + { X86::VINSERTF128rm, X86::VINSERTF128rm, X86::VINSERTI128rm }, + { X86::VINSERTF128rr, X86::VINSERTF128rr, X86::VINSERTI128rr }, + { X86::VPERM2F128rm, X86::VPERM2F128rm, X86::VPERM2I128rm }, + { X86::VPERM2F128rr, X86::VPERM2F128rr, X86::VPERM2I128rr }, + { X86::VBROADCASTSSrm, X86::VBROADCASTSSrm, X86::VPBROADCASTDrm}, + { X86::VBROADCASTSSrr, X86::VBROADCASTSSrr, X86::VPBROADCASTDrr}, + { X86::VBROADCASTSSYrr, X86::VBROADCASTSSYrr, X86::VPBROADCASTDYrr}, + { X86::VBROADCASTSSYrm, X86::VBROADCASTSSYrm, X86::VPBROADCASTDYrm}, + { X86::VBROADCASTSDYrr, X86::VBROADCASTSDYrr, X86::VPBROADCASTQYrr}, + { X86::VBROADCASTSDYrm, X86::VBROADCASTSDYrm, X86::VPBROADCASTQYrm} +}; + +// FIXME: Some shuffle and unpack instructions have equivalents in different +// domains, but they require a bit more work than just switching opcodes. + +static const uint16_t *lookup(unsigned opcode, unsigned domain) { + for (const uint16_t (&Row)[3] : ReplaceableInstrs) + if (Row[domain-1] == opcode) + return Row; + return nullptr; +} + +static const uint16_t *lookupAVX2(unsigned opcode, unsigned domain) { + for (const uint16_t (&Row)[3] : ReplaceableInstrsAVX2) + if (Row[domain-1] == opcode) + return Row; + return nullptr; +} + +std::pair<uint16_t, uint16_t> +X86InstrInfo::getExecutionDomain(const MachineInstr *MI) const { + uint16_t domain = (MI->getDesc().TSFlags >> X86II::SSEDomainShift) & 3; + bool hasAVX2 = Subtarget.hasAVX2(); + uint16_t validDomains = 0; + if (domain && lookup(MI->getOpcode(), domain)) + validDomains = 0xe; + else if (domain && lookupAVX2(MI->getOpcode(), domain)) + validDomains = hasAVX2 ? 0xe : 0x6; + return std::make_pair(domain, validDomains); +} + +void X86InstrInfo::setExecutionDomain(MachineInstr *MI, unsigned Domain) const { + assert(Domain>0 && Domain<4 && "Invalid execution domain"); + uint16_t dom = (MI->getDesc().TSFlags >> X86II::SSEDomainShift) & 3; + assert(dom && "Not an SSE instruction"); + const uint16_t *table = lookup(MI->getOpcode(), dom); + if (!table) { // try the other table + assert((Subtarget.hasAVX2() || Domain < 3) && + "256-bit vector operations only available in AVX2"); + table = lookupAVX2(MI->getOpcode(), dom); + } + assert(table && "Cannot change domain"); + MI->setDesc(get(table[Domain-1])); +} + +/// Return the noop instruction to use for a noop. +void X86InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const { + NopInst.setOpcode(X86::NOOP); +} + +// This code must remain in sync with getJumpInstrTableEntryBound in this class! +// In particular, getJumpInstrTableEntryBound must always return an upper bound +// on the encoding lengths of the instructions generated by +// getUnconditionalBranch and getTrap. +void X86InstrInfo::getUnconditionalBranch( + MCInst &Branch, const MCSymbolRefExpr *BranchTarget) const { + Branch.setOpcode(X86::JMP_1); + Branch.addOperand(MCOperand::createExpr(BranchTarget)); +} + +// This code must remain in sync with getJumpInstrTableEntryBound in this class! 
+// In particular, getJumpInstrTableEntryBound must always return an upper bound +// on the encoding lengths of the instructions generated by +// getUnconditionalBranch and getTrap. +void X86InstrInfo::getTrap(MCInst &MI) const { + MI.setOpcode(X86::TRAP); +} + +// See getTrap and getUnconditionalBranch for conditions on the value returned +// by this function. +unsigned X86InstrInfo::getJumpInstrTableEntryBound() const { + // 5 bytes suffice: JMP_4 Symbol@PLT is uses 1 byte (E9) for the JMP_4 and 4 + // bytes for the symbol offset. And TRAP is ud2, which is two bytes (0F 0B). + return 5; +} + +bool X86InstrInfo::isHighLatencyDef(int opc) const { + switch (opc) { + default: return false; + case X86::DIVSDrm: + case X86::DIVSDrm_Int: + case X86::DIVSDrr: + case X86::DIVSDrr_Int: + case X86::DIVSSrm: + case X86::DIVSSrm_Int: + case X86::DIVSSrr: + case X86::DIVSSrr_Int: + case X86::SQRTPDm: + case X86::SQRTPDr: + case X86::SQRTPSm: + case X86::SQRTPSr: + case X86::SQRTSDm: + case X86::SQRTSDm_Int: + case X86::SQRTSDr: + case X86::SQRTSDr_Int: + case X86::SQRTSSm: + case X86::SQRTSSm_Int: + case X86::SQRTSSr: + case X86::SQRTSSr_Int: + // AVX instructions with high latency + case X86::VDIVSDrm: + case X86::VDIVSDrm_Int: + case X86::VDIVSDrr: + case X86::VDIVSDrr_Int: + case X86::VDIVSSrm: + case X86::VDIVSSrm_Int: + case X86::VDIVSSrr: + case X86::VDIVSSrr_Int: + case X86::VSQRTPDm: + case X86::VSQRTPDr: + case X86::VSQRTPSm: + case X86::VSQRTPSr: + case X86::VSQRTSDm: + case X86::VSQRTSDm_Int: + case X86::VSQRTSDr: + case X86::VSQRTSSm: + case X86::VSQRTSSm_Int: + case X86::VSQRTSSr: + case X86::VSQRTPDZm: + case X86::VSQRTPDZr: + case X86::VSQRTPSZm: + case X86::VSQRTPSZr: + case X86::VSQRTSDZm: + case X86::VSQRTSDZm_Int: + case X86::VSQRTSDZr: + case X86::VSQRTSSZm_Int: + case X86::VSQRTSSZr: + case X86::VSQRTSSZm: + case X86::VDIVSDZrm: + case X86::VDIVSDZrr: + case X86::VDIVSSZrm: + case X86::VDIVSSZrr: + + case X86::VGATHERQPSZrm: + case X86::VGATHERQPDZrm: + case X86::VGATHERDPDZrm: + case X86::VGATHERDPSZrm: + case X86::VPGATHERQDZrm: + case X86::VPGATHERQQZrm: + case X86::VPGATHERDDZrm: + case X86::VPGATHERDQZrm: + case X86::VSCATTERQPDZmr: + case X86::VSCATTERQPSZmr: + case X86::VSCATTERDPDZmr: + case X86::VSCATTERDPSZmr: + case X86::VPSCATTERQDZmr: + case X86::VPSCATTERQQZmr: + case X86::VPSCATTERDDZmr: + case X86::VPSCATTERDQZmr: + return true; + } +} + +bool X86InstrInfo:: +hasHighOperandLatency(const TargetSchedModel &SchedModel, + const MachineRegisterInfo *MRI, + const MachineInstr *DefMI, unsigned DefIdx, + const MachineInstr *UseMI, unsigned UseIdx) const { + return isHighLatencyDef(DefMI->getOpcode()); +} + +bool X86InstrInfo::hasReassociableOperands(const MachineInstr &Inst, + const MachineBasicBlock *MBB) const { + assert((Inst.getNumOperands() == 3 || Inst.getNumOperands() == 4) && + "Reassociation needs binary operators"); + + // Integer binary math/logic instructions have a third source operand: + // the EFLAGS register. That operand must be both defined here and never + // used; ie, it must be dead. If the EFLAGS operand is live, then we can + // not change anything because rearranging the operands could affect other + // instructions that depend on the exact status flags (zero, sign, etc.) + // that are set by using these particular operands with this operation. 
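The rule just described is enforced by the check that follows; in isolation it amounts to the following. This is a sketch with a toy operand model, not the MachineInstr/MachineOperand API.

#include <vector>

// Just enough structure to express the rule: a 4-operand integer op carries
// EFLAGS as its last operand, and reassociation is only legal when that def
// is dead.
struct OperandModel {
  bool IsEFLAGS;
  bool IsDead;
};

struct InstModel {
  std::vector<OperandModel> Ops;
};

bool eflagsAllowsReassociation(const InstModel &I) {
  if (I.Ops.size() != 4)
    return true;                          // FP forms carry no EFLAGS operand
  const OperandModel &Flags = I.Ops[3];
  return Flags.IsEFLAGS && Flags.IsDead;  // live flags would be corrupted by reordering
}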
+ if (Inst.getNumOperands() == 4) { + assert(Inst.getOperand(3).isReg() && + Inst.getOperand(3).getReg() == X86::EFLAGS && + "Unexpected operand in reassociable instruction"); + if (!Inst.getOperand(3).isDead()) + return false; + } + + return TargetInstrInfo::hasReassociableOperands(Inst, MBB); +} + +// TODO: There are many more machine instruction opcodes to match: +// 1. Other data types (integer, vectors) +// 2. Other math / logic operations (xor, or) +// 3. Other forms of the same operation (intrinsics and other variants) +bool X86InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst) const { + switch (Inst.getOpcode()) { + case X86::AND8rr: + case X86::AND16rr: + case X86::AND32rr: + case X86::AND64rr: + case X86::OR8rr: + case X86::OR16rr: + case X86::OR32rr: + case X86::OR64rr: + case X86::XOR8rr: + case X86::XOR16rr: + case X86::XOR32rr: + case X86::XOR64rr: + case X86::IMUL16rr: + case X86::IMUL32rr: + case X86::IMUL64rr: + case X86::PANDrr: + case X86::PORrr: + case X86::PXORrr: + case X86::VPANDrr: + case X86::VPANDYrr: + case X86::VPORrr: + case X86::VPORYrr: + case X86::VPXORrr: + case X86::VPXORYrr: + // Normal min/max instructions are not commutative because of NaN and signed + // zero semantics, but these are. Thus, there's no need to check for global + // relaxed math; the instructions themselves have the properties we need. + case X86::MAXCPDrr: + case X86::MAXCPSrr: + case X86::MAXCSDrr: + case X86::MAXCSSrr: + case X86::MINCPDrr: + case X86::MINCPSrr: + case X86::MINCSDrr: + case X86::MINCSSrr: + case X86::VMAXCPDrr: + case X86::VMAXCPSrr: + case X86::VMAXCPDYrr: + case X86::VMAXCPSYrr: + case X86::VMAXCSDrr: + case X86::VMAXCSSrr: + case X86::VMINCPDrr: + case X86::VMINCPSrr: + case X86::VMINCPDYrr: + case X86::VMINCPSYrr: + case X86::VMINCSDrr: + case X86::VMINCSSrr: + return true; + case X86::ADDPDrr: + case X86::ADDPSrr: + case X86::ADDSDrr: + case X86::ADDSSrr: + case X86::MULPDrr: + case X86::MULPSrr: + case X86::MULSDrr: + case X86::MULSSrr: + case X86::VADDPDrr: + case X86::VADDPSrr: + case X86::VADDPDYrr: + case X86::VADDPSYrr: + case X86::VADDSDrr: + case X86::VADDSSrr: + case X86::VMULPDrr: + case X86::VMULPSrr: + case X86::VMULPDYrr: + case X86::VMULPSYrr: + case X86::VMULSDrr: + case X86::VMULSSrr: + return Inst.getParent()->getParent()->getTarget().Options.UnsafeFPMath; + default: + return false; + } +} + +/// This is an architecture-specific helper function of reassociateOps. +/// Set special operand attributes for new instructions after reassociation. +void X86InstrInfo::setSpecialOperandAttr(MachineInstr &OldMI1, + MachineInstr &OldMI2, + MachineInstr &NewMI1, + MachineInstr &NewMI2) const { + // Integer instructions define an implicit EFLAGS source register operand as + // the third source (fourth total) operand. 
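Before that operand handling continues below, the classification in isAssociativeAndCommutative above is worth restating without its opcode lists: it reduces to three buckets. A sketch of that reduction; mapping an opcode to its bucket would come from the elided switch.

// Three buckets from the switch above: integer/logic ops and the commutative
// "C" min/max forms always qualify; plain FP add/mul only qualify under
// unsafe-fp-math.
enum class ReassocClass { NotReassociable, AlwaysReassociable, FPNeedsUnsafeMath };

bool isAssociativeAndCommutativeModel(ReassocClass C, bool UnsafeFPMath) {
  switch (C) {
  case ReassocClass::AlwaysReassociable: return true;
  case ReassocClass::FPNeedsUnsafeMath:  return UnsafeFPMath;
  case ReassocClass::NotReassociable:    return false;
  }
  return false;
}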
+ if (OldMI1.getNumOperands() != 4 || OldMI2.getNumOperands() != 4) + return; + + assert(NewMI1.getNumOperands() == 4 && NewMI2.getNumOperands() == 4 && + "Unexpected instruction type for reassociation"); + + MachineOperand &OldOp1 = OldMI1.getOperand(3); + MachineOperand &OldOp2 = OldMI2.getOperand(3); + MachineOperand &NewOp1 = NewMI1.getOperand(3); + MachineOperand &NewOp2 = NewMI2.getOperand(3); + + assert(OldOp1.isReg() && OldOp1.getReg() == X86::EFLAGS && OldOp1.isDead() && + "Must have dead EFLAGS operand in reassociable instruction"); + assert(OldOp2.isReg() && OldOp2.getReg() == X86::EFLAGS && OldOp2.isDead() && + "Must have dead EFLAGS operand in reassociable instruction"); + + (void)OldOp1; + (void)OldOp2; + + assert(NewOp1.isReg() && NewOp1.getReg() == X86::EFLAGS && + "Unexpected operand in reassociable instruction"); + assert(NewOp2.isReg() && NewOp2.getReg() == X86::EFLAGS && + "Unexpected operand in reassociable instruction"); + + // Mark the new EFLAGS operands as dead to be helpful to subsequent iterations + // of this pass or other passes. The EFLAGS operands must be dead in these new + // instructions because the EFLAGS operands in the original instructions must + // be dead in order for reassociation to occur. + NewOp1.setIsDead(); + NewOp2.setIsDead(); +} + +std::pair<unsigned, unsigned> +X86InstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const { + return std::make_pair(TF, 0u); +} + +ArrayRef<std::pair<unsigned, const char *>> +X86InstrInfo::getSerializableDirectMachineOperandTargetFlags() const { + using namespace X86II; + static const std::pair<unsigned, const char *> TargetFlags[] = { + {MO_GOT_ABSOLUTE_ADDRESS, "x86-got-absolute-address"}, + {MO_PIC_BASE_OFFSET, "x86-pic-base-offset"}, + {MO_GOT, "x86-got"}, + {MO_GOTOFF, "x86-gotoff"}, + {MO_GOTPCREL, "x86-gotpcrel"}, + {MO_PLT, "x86-plt"}, + {MO_TLSGD, "x86-tlsgd"}, + {MO_TLSLD, "x86-tlsld"}, + {MO_TLSLDM, "x86-tlsldm"}, + {MO_GOTTPOFF, "x86-gottpoff"}, + {MO_INDNTPOFF, "x86-indntpoff"}, + {MO_TPOFF, "x86-tpoff"}, + {MO_DTPOFF, "x86-dtpoff"}, + {MO_NTPOFF, "x86-ntpoff"}, + {MO_GOTNTPOFF, "x86-gotntpoff"}, + {MO_DLLIMPORT, "x86-dllimport"}, + {MO_DARWIN_STUB, "x86-darwin-stub"}, + {MO_DARWIN_NONLAZY, "x86-darwin-nonlazy"}, + {MO_DARWIN_NONLAZY_PIC_BASE, "x86-darwin-nonlazy-pic-base"}, + {MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE, "x86-darwin-hidden-nonlazy-pic-base"}, + {MO_TLVP, "x86-tlvp"}, + {MO_TLVP_PIC_BASE, "x86-tlvp-pic-base"}, + {MO_SECREL, "x86-secrel"}}; + return makeArrayRef(TargetFlags); +} + +namespace { + /// Create Global Base Reg pass. This initializes the PIC + /// global base register for x86-32. + struct CGBR : public MachineFunctionPass { + static char ID; + CGBR() : MachineFunctionPass(ID) {} + + bool runOnMachineFunction(MachineFunction &MF) override { + const X86TargetMachine *TM = + static_cast<const X86TargetMachine *>(&MF.getTarget()); + const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>(); + + // Don't do anything if this is 64-bit as 64-bit PIC + // uses RIP relative addressing. + if (STI.is64Bit()) + return false; + + // Only emit a global base reg in PIC mode. + if (TM->getRelocationModel() != Reloc::PIC_) + return false; + + X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); + unsigned GlobalBaseReg = X86FI->getGlobalBaseReg(); + + // If we didn't need a GlobalBaseReg, don't insert code. 
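Together with the 64-bit and relocation-model early-exits above, the GlobalBaseReg test below completes a three-part gate. A minimal standalone model of that gate; the struct and field names are invented for the sketch.

// Minimal model of when the CGBR pass inserts code: 32-bit target, PIC
// relocation model, and a global base register actually requested earlier
// by getGlobalBaseReg.
struct PICStateModel {
  bool Is64Bit;
  bool IsPIC;
  unsigned GlobalBaseReg; // 0 means never requested
};

bool needsGlobalBaseRegInit(const PICStateModel &S) {
  if (S.Is64Bit)
    return false;           // x86-64 PIC uses RIP-relative addressing instead
  if (!S.IsPIC)
    return false;           // only PIC code needs a pic/GOT base
  return S.GlobalBaseReg != 0;
}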
+ if (GlobalBaseReg == 0) + return false; + + // Insert the set of GlobalBaseReg into the first MBB of the function + MachineBasicBlock &FirstMBB = MF.front(); + MachineBasicBlock::iterator MBBI = FirstMBB.begin(); + DebugLoc DL = FirstMBB.findDebugLoc(MBBI); + MachineRegisterInfo &RegInfo = MF.getRegInfo(); + const X86InstrInfo *TII = STI.getInstrInfo(); + + unsigned PC; + if (STI.isPICStyleGOT()) + PC = RegInfo.createVirtualRegister(&X86::GR32RegClass); + else + PC = GlobalBaseReg; + + // Operand of MovePCtoStack is completely ignored by asm printer. It's + // only used in JIT code emission as displacement to pc. + BuildMI(FirstMBB, MBBI, DL, TII->get(X86::MOVPC32r), PC).addImm(0); + + // If we're using vanilla 'GOT' PIC style, we should use relative addressing + // not to pc, but to _GLOBAL_OFFSET_TABLE_ external. + if (STI.isPICStyleGOT()) { + // Generate addl $__GLOBAL_OFFSET_TABLE_ + [.-piclabel], %some_register + BuildMI(FirstMBB, MBBI, DL, TII->get(X86::ADD32ri), GlobalBaseReg) + .addReg(PC).addExternalSymbol("_GLOBAL_OFFSET_TABLE_", + X86II::MO_GOT_ABSOLUTE_ADDRESS); + } + + return true; + } + + const char *getPassName() const override { + return "X86 PIC Global Base Reg Initialization"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } + }; +} + +char CGBR::ID = 0; +FunctionPass* +llvm::createX86GlobalBaseRegPass() { return new CGBR(); } + +namespace { + struct LDTLSCleanup : public MachineFunctionPass { + static char ID; + LDTLSCleanup() : MachineFunctionPass(ID) {} + + bool runOnMachineFunction(MachineFunction &MF) override { + X86MachineFunctionInfo* MFI = MF.getInfo<X86MachineFunctionInfo>(); + if (MFI->getNumLocalDynamicTLSAccesses() < 2) { + // No point folding accesses if there isn't at least two. + return false; + } + + MachineDominatorTree *DT = &getAnalysis<MachineDominatorTree>(); + return VisitNode(DT->getRootNode(), 0); + } + + // Visit the dominator subtree rooted at Node in pre-order. + // If TLSBaseAddrReg is non-null, then use that to replace any + // TLS_base_addr instructions. Otherwise, create the register + // when the first such instruction is seen, and then use it + // as we encounter more instructions. + bool VisitNode(MachineDomTreeNode *Node, unsigned TLSBaseAddrReg) { + MachineBasicBlock *BB = Node->getBlock(); + bool Changed = false; + + // Traverse the current block. + for (MachineBasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; + ++I) { + switch (I->getOpcode()) { + case X86::TLS_base_addr32: + case X86::TLS_base_addr64: + if (TLSBaseAddrReg) + I = ReplaceTLSBaseAddrCall(I, TLSBaseAddrReg); + else + I = SetRegister(I, &TLSBaseAddrReg); + Changed = true; + break; + default: + break; + } + } + + // Visit the children of this block in the dominator tree. + for (MachineDomTreeNode::iterator I = Node->begin(), E = Node->end(); + I != E; ++I) { + Changed |= VisitNode(*I, TLSBaseAddrReg); + } + + return Changed; + } + + // Replace the TLS_base_addr instruction I with a copy from + // TLSBaseAddrReg, returning the new instruction. + MachineInstr *ReplaceTLSBaseAddrCall(MachineInstr *I, + unsigned TLSBaseAddrReg) { + MachineFunction *MF = I->getParent()->getParent(); + const X86Subtarget &STI = MF->getSubtarget<X86Subtarget>(); + const bool is64Bit = STI.is64Bit(); + const X86InstrInfo *TII = STI.getInstrInfo(); + + // Insert a Copy from TLSBaseAddrReg to RAX/EAX. 
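Before the copy is built below, the shape of the walk that got us here is worth seeing on its own: the first TLS_base_addr encountered creates the base register, and every occurrence in a dominated block reuses it. A toy model of that traversal, not the MachineDominatorTree API.

#include <vector>

// Toy dominator-tree node: TLSCalls counts TLS_base_addr-like instructions in
// the corresponding block. Only the lazy-create-then-reuse walk is modeled.
struct DomNodeModel {
  unsigned TLSCalls = 0;
  std::vector<DomNodeModel *> Children;
};

// BaseReg == 0 means "not created yet"; NextVReg hands out fresh registers.
bool visitTLS(DomNodeModel *N, unsigned BaseReg, unsigned &NextVReg) {
  bool Changed = false;
  for (unsigned i = 0; i < N->TLSCalls; ++i) {
    if (BaseReg == 0)
      BaseReg = NextVReg++; // first occurrence: keep the base address in a new register
    // every occurrence (first or later) gets rewritten, so report a change
    Changed = true;
  }
  for (DomNodeModel *Child : N->Children)
    Changed |= visitTLS(Child, BaseReg, NextVReg); // children inherit BaseReg by value
  return Changed;
}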
+ MachineInstr *Copy = BuildMI(*I->getParent(), I, I->getDebugLoc(), + TII->get(TargetOpcode::COPY), + is64Bit ? X86::RAX : X86::EAX) + .addReg(TLSBaseAddrReg); + + // Erase the TLS_base_addr instruction. + I->eraseFromParent(); + + return Copy; + } + + // Create a virtal register in *TLSBaseAddrReg, and populate it by + // inserting a copy instruction after I. Returns the new instruction. + MachineInstr *SetRegister(MachineInstr *I, unsigned *TLSBaseAddrReg) { + MachineFunction *MF = I->getParent()->getParent(); + const X86Subtarget &STI = MF->getSubtarget<X86Subtarget>(); + const bool is64Bit = STI.is64Bit(); + const X86InstrInfo *TII = STI.getInstrInfo(); + + // Create a virtual register for the TLS base address. + MachineRegisterInfo &RegInfo = MF->getRegInfo(); + *TLSBaseAddrReg = RegInfo.createVirtualRegister(is64Bit + ? &X86::GR64RegClass + : &X86::GR32RegClass); + + // Insert a copy from RAX/EAX to TLSBaseAddrReg. + MachineInstr *Next = I->getNextNode(); + MachineInstr *Copy = BuildMI(*I->getParent(), Next, I->getDebugLoc(), + TII->get(TargetOpcode::COPY), + *TLSBaseAddrReg) + .addReg(is64Bit ? X86::RAX : X86::EAX); + + return Copy; + } + + const char *getPassName() const override { + return "Local Dynamic TLS Access Clean-up"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addRequired<MachineDominatorTree>(); + MachineFunctionPass::getAnalysisUsage(AU); + } + }; +} + +char LDTLSCleanup::ID = 0; +FunctionPass* +llvm::createCleanupLocalDynamicTLSPass() { return new LDTLSCleanup(); } diff --git a/contrib/llvm/lib/Target/X86/X86InstrInfo.h b/contrib/llvm/lib/Target/X86/X86InstrInfo.h new file mode 100644 index 0000000..edd09d6 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86InstrInfo.h @@ -0,0 +1,571 @@ +//===-- X86InstrInfo.h - X86 Instruction Information ------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the X86 implementation of the TargetInstrInfo class. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_X86_X86INSTRINFO_H +#define LLVM_LIB_TARGET_X86_X86INSTRINFO_H + +#include "MCTargetDesc/X86BaseInfo.h" +#include "X86RegisterInfo.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/Target/TargetInstrInfo.h" + +#define GET_INSTRINFO_HEADER +#include "X86GenInstrInfo.inc" + +namespace llvm { + class X86RegisterInfo; + class X86Subtarget; + +namespace X86 { + // X86 specific condition code. These correspond to X86_*_COND in + // X86InstrInfo.td. They must be kept in synch. + enum CondCode { + COND_A = 0, + COND_AE = 1, + COND_B = 2, + COND_BE = 3, + COND_E = 4, + COND_G = 5, + COND_GE = 6, + COND_L = 7, + COND_LE = 8, + COND_NE = 9, + COND_NO = 10, + COND_NP = 11, + COND_NS = 12, + COND_O = 13, + COND_P = 14, + COND_S = 15, + LAST_VALID_COND = COND_S, + + // Artificial condition codes. These are used by AnalyzeBranch + // to indicate a block terminated with two conditional branches to + // the same location. This occurs in code using FCMP_OEQ or FCMP_UNE, + // which can't be represented on x86 with a single condition. These + // are never used in MachineInstrs. + COND_NE_OR_P, + COND_NP_OR_E, + + COND_INVALID + }; + + // Turn condition code into conditional branch opcode. 
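GetOppositeBranchCondition, declared just below, pairs each of these codes with its logical negation; the pairs follow from x86 semantics rather than anything specific to this file. A standalone sketch of the mapping, using an unnumbered stand-in enum rather than the numbered one above.

enum CondCodeModel {
  CC_A, CC_AE, CC_B, CC_BE, CC_E, CC_G, CC_GE, CC_L,
  CC_LE, CC_NE, CC_NO, CC_NP, CC_NS, CC_O, CC_P, CC_S,
  CC_INVALID
};

// Each code maps to its logical negation: A/BE, AE/B, E/NE, G/LE, GE/L,
// O/NO, P/NP, S/NS. Sketch only; the numbered enum above is authoritative.
CondCodeModel oppositeCondition(CondCodeModel CC) {
  switch (CC) {
  case CC_A:  return CC_BE;  case CC_BE: return CC_A;
  case CC_AE: return CC_B;   case CC_B:  return CC_AE;
  case CC_E:  return CC_NE;  case CC_NE: return CC_E;
  case CC_G:  return CC_LE;  case CC_LE: return CC_G;
  case CC_GE: return CC_L;   case CC_L:  return CC_GE;
  case CC_O:  return CC_NO;  case CC_NO: return CC_O;
  case CC_P:  return CC_NP;  case CC_NP: return CC_P;
  case CC_S:  return CC_NS;  case CC_NS: return CC_S;
  default:    return CC_INVALID;
  }
}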
+ unsigned GetCondBranchFromCond(CondCode CC); + + /// \brief Return a set opcode for the given condition and whether it has + /// a memory operand. + unsigned getSETFromCond(CondCode CC, bool HasMemoryOperand = false); + + /// \brief Return a cmov opcode for the given condition, register size in + /// bytes, and operand type. + unsigned getCMovFromCond(CondCode CC, unsigned RegBytes, + bool HasMemoryOperand = false); + + // Turn CMov opcode into condition code. + CondCode getCondFromCMovOpc(unsigned Opc); + + /// GetOppositeBranchCondition - Return the inverse of the specified cond, + /// e.g. turning COND_E to COND_NE. + CondCode GetOppositeBranchCondition(CondCode CC); +} // end namespace X86; + + +/// isGlobalStubReference - Return true if the specified TargetFlag operand is +/// a reference to a stub for a global, not the global itself. +inline static bool isGlobalStubReference(unsigned char TargetFlag) { + switch (TargetFlag) { + case X86II::MO_DLLIMPORT: // dllimport stub. + case X86II::MO_GOTPCREL: // rip-relative GOT reference. + case X86II::MO_GOT: // normal GOT reference. + case X86II::MO_DARWIN_NONLAZY_PIC_BASE: // Normal $non_lazy_ptr ref. + case X86II::MO_DARWIN_NONLAZY: // Normal $non_lazy_ptr ref. + case X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE: // Hidden $non_lazy_ptr ref. + return true; + default: + return false; + } +} + +/// isGlobalRelativeToPICBase - Return true if the specified global value +/// reference is relative to a 32-bit PIC base (X86ISD::GlobalBaseReg). If this +/// is true, the addressing mode has the PIC base register added in (e.g. EBX). +inline static bool isGlobalRelativeToPICBase(unsigned char TargetFlag) { + switch (TargetFlag) { + case X86II::MO_GOTOFF: // isPICStyleGOT: local global. + case X86II::MO_GOT: // isPICStyleGOT: other global. + case X86II::MO_PIC_BASE_OFFSET: // Darwin local global. + case X86II::MO_DARWIN_NONLAZY_PIC_BASE: // Darwin/32 external global. + case X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE: // Darwin/32 hidden global. + case X86II::MO_TLVP: // ??? Pretty sure.. + return true; + default: + return false; + } +} + +inline static bool isScale(const MachineOperand &MO) { + return MO.isImm() && + (MO.getImm() == 1 || MO.getImm() == 2 || + MO.getImm() == 4 || MO.getImm() == 8); +} + +inline static bool isLeaMem(const MachineInstr *MI, unsigned Op) { + if (MI->getOperand(Op).isFI()) return true; + return Op+X86::AddrSegmentReg <= MI->getNumOperands() && + MI->getOperand(Op+X86::AddrBaseReg).isReg() && + isScale(MI->getOperand(Op+X86::AddrScaleAmt)) && + MI->getOperand(Op+X86::AddrIndexReg).isReg() && + (MI->getOperand(Op+X86::AddrDisp).isImm() || + MI->getOperand(Op+X86::AddrDisp).isGlobal() || + MI->getOperand(Op+X86::AddrDisp).isCPI() || + MI->getOperand(Op+X86::AddrDisp).isJTI()); +} + +inline static bool isMem(const MachineInstr *MI, unsigned Op) { + if (MI->getOperand(Op).isFI()) return true; + return Op+X86::AddrNumOperands <= MI->getNumOperands() && + MI->getOperand(Op+X86::AddrSegmentReg).isReg() && + isLeaMem(MI, Op); +} + +class X86InstrInfo final : public X86GenInstrInfo { + X86Subtarget &Subtarget; + const X86RegisterInfo RI; + + /// RegOp2MemOpTable3Addr, RegOp2MemOpTable0, RegOp2MemOpTable1, + /// RegOp2MemOpTable2, RegOp2MemOpTable3 - Load / store folding opcode maps. 
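The isLeaMem and isMem helpers above walk the standard five-slot x86 address (base, scale, index, displacement, segment), and the folding tables declared next operate on instructions whose memory operands have exactly this shape. A standalone model of the layout and the scale restriction, with plain integers standing in for MachineOperands.

#include <cstdint>

// The five operand slots of an x86 memory reference, in the order the helpers
// above index them (AddrBaseReg, AddrScaleAmt, AddrIndexReg, AddrDisp,
// AddrSegmentReg). Register number 0 stands for "no register" in this model.
struct MemRefModel {
  unsigned BaseReg;
  int64_t  ScaleAmt;
  unsigned IndexReg;
  int64_t  Disp;      // the real displacement may instead be a global/CPI/jump-table ref
  unsigned SegmentReg;
};

// The same restriction isScale enforces: SIB encoding supports only these factors.
bool isEncodableScale(int64_t S) {
  return S == 1 || S == 2 || S == 4 || S == 8;
}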
+ /// + typedef DenseMap<unsigned, + std::pair<unsigned, unsigned> > RegOp2MemOpTableType; + RegOp2MemOpTableType RegOp2MemOpTable2Addr; + RegOp2MemOpTableType RegOp2MemOpTable0; + RegOp2MemOpTableType RegOp2MemOpTable1; + RegOp2MemOpTableType RegOp2MemOpTable2; + RegOp2MemOpTableType RegOp2MemOpTable3; + RegOp2MemOpTableType RegOp2MemOpTable4; + + /// MemOp2RegOpTable - Load / store unfolding opcode map. + /// + typedef DenseMap<unsigned, + std::pair<unsigned, unsigned> > MemOp2RegOpTableType; + MemOp2RegOpTableType MemOp2RegOpTable; + + static void AddTableEntry(RegOp2MemOpTableType &R2MTable, + MemOp2RegOpTableType &M2RTable, + unsigned RegOp, unsigned MemOp, unsigned Flags); + + virtual void anchor(); + + bool AnalyzeBranchImpl(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, + MachineBasicBlock *&FBB, + SmallVectorImpl<MachineOperand> &Cond, + SmallVectorImpl<MachineInstr *> &CondBranches, + bool AllowModify) const; + +public: + explicit X86InstrInfo(X86Subtarget &STI); + + /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As + /// such, whenever a client has an instance of instruction info, it should + /// always be able to get register info as well (through this method). + /// + const X86RegisterInfo &getRegisterInfo() const { return RI; } + + /// getSPAdjust - This returns the stack pointer adjustment made by + /// this instruction. For x86, we need to handle more complex call + /// sequences involving PUSHes. + int getSPAdjust(const MachineInstr *MI) const override; + + /// isCoalescableExtInstr - Return true if the instruction is a "coalescable" + /// extension instruction. That is, it's like a copy where it's legal for the + /// source to overlap the destination. e.g. X86::MOVSX64rr32. If this returns + /// true, then it's expected the pre-extension value is available as a subreg + /// of the result register. This also returns the sub-register index in + /// SubIdx. + bool isCoalescableExtInstr(const MachineInstr &MI, + unsigned &SrcReg, unsigned &DstReg, + unsigned &SubIdx) const override; + + unsigned isLoadFromStackSlot(const MachineInstr *MI, + int &FrameIndex) const override; + /// isLoadFromStackSlotPostFE - Check for post-frame ptr elimination + /// stack locations as well. This uses a heuristic so it isn't + /// reliable for correctness. + unsigned isLoadFromStackSlotPostFE(const MachineInstr *MI, + int &FrameIndex) const override; + + unsigned isStoreToStackSlot(const MachineInstr *MI, + int &FrameIndex) const override; + /// isStoreToStackSlotPostFE - Check for post-frame ptr elimination + /// stack locations as well. This uses a heuristic so it isn't + /// reliable for correctness. + unsigned isStoreToStackSlotPostFE(const MachineInstr *MI, + int &FrameIndex) const override; + + bool isReallyTriviallyReMaterializable(const MachineInstr *MI, + AliasAnalysis *AA) const override; + void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, + unsigned DestReg, unsigned SubIdx, + const MachineInstr *Orig, + const TargetRegisterInfo &TRI) const override; + + /// Given an operand within a MachineInstr, insert preceding code to put it + /// into the right format for a particular kind of LEA instruction. This may + /// involve using an appropriate super-register instead (with an implicit use + /// of the original) or creating a new virtual register and inserting COPY + /// instructions to get the data into the right class. + /// + /// Reference parameters are set to indicate how caller should add this + /// operand to the LEA instruction. 
+ bool classifyLEAReg(MachineInstr *MI, const MachineOperand &Src, + unsigned LEAOpcode, bool AllowSP, + unsigned &NewSrc, bool &isKill, + bool &isUndef, MachineOperand &ImplicitOp) const; + + /// convertToThreeAddress - This method must be implemented by targets that + /// set the M_CONVERTIBLE_TO_3_ADDR flag. When this flag is set, the target + /// may be able to convert a two-address instruction into a true + /// three-address instruction on demand. This allows the X86 target (for + /// example) to convert ADD and SHL instructions into LEA instructions if they + /// would require register copies due to two-addressness. + /// + /// This method returns a null pointer if the transformation cannot be + /// performed, otherwise it returns the new instruction. + /// + MachineInstr *convertToThreeAddress(MachineFunction::iterator &MFI, + MachineBasicBlock::iterator &MBBI, + LiveVariables *LV) const override; + + /// Returns true iff the routine could find two commutable operands in the + /// given machine instruction. + /// The 'SrcOpIdx1' and 'SrcOpIdx2' are INPUT and OUTPUT arguments. Their + /// input values can be re-defined in this method only if the input values + /// are not pre-defined, which is designated by the special value + /// 'CommuteAnyOperandIndex' assigned to it. + /// If both of indices are pre-defined and refer to some operands, then the + /// method simply returns true if the corresponding operands are commutable + /// and returns false otherwise. + /// + /// For example, calling this method this way: + /// unsigned Op1 = 1, Op2 = CommuteAnyOperandIndex; + /// findCommutedOpIndices(MI, Op1, Op2); + /// can be interpreted as a query asking to find an operand that would be + /// commutable with the operand#1. + bool findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1, + unsigned &SrcOpIdx2) const override; + + /// Returns true if the routine could find two commutable operands + /// in the given FMA instruction. Otherwise, returns false. + /// + /// \p SrcOpIdx1 and \p SrcOpIdx2 are INPUT and OUTPUT arguments. + /// The output indices of the commuted operands are returned in these + /// arguments. Also, the input values of these arguments may be preset either + /// to indices of operands that must be commuted or be equal to a special + /// value 'CommuteAnyOperandIndex' which means that the corresponding + /// operand index is not set and this method is free to pick any of + /// available commutable operands. + /// + /// For example, calling this method this way: + /// unsigned Idx1 = 1, Idx2 = CommuteAnyOperandIndex; + /// findFMA3CommutedOpIndices(MI, Idx1, Idx2); + /// can be interpreted as a query asking if the operand #1 can be swapped + /// with any other available operand (e.g. operand #2, operand #3, etc.). + /// + /// The returned FMA opcode may differ from the opcode in the given MI. + /// For example, commuting the operands #1 and #3 in the following FMA + /// FMA213 #1, #2, #3 + /// results into instruction with adjusted opcode: + /// FMA231 #3, #2, #1 + bool findFMA3CommutedOpIndices(MachineInstr *MI, + unsigned &SrcOpIdx1, + unsigned &SrcOpIdx2) const; + + /// Returns an adjusted FMA opcode that must be used in FMA instruction that + /// performs the same computations as the given MI but which has the operands + /// \p SrcOpIdx1 and \p SrcOpIdx2 commuted. + /// It may return 0 if it is unsafe to commute the operands. + /// + /// The returned FMA opcode may differ from the opcode in the given \p MI. 
+ /// For example, commuting the operands #1 and #3 in the following FMA + /// FMA213 #1, #2, #3 + /// results into instruction with adjusted opcode: + /// FMA231 #3, #2, #1 + unsigned getFMA3OpcodeToCommuteOperands(MachineInstr *MI, + unsigned SrcOpIdx1, + unsigned SrcOpIdx2) const; + + // Branch analysis. + bool isUnpredicatedTerminator(const MachineInstr* MI) const override; + bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, + MachineBasicBlock *&FBB, + SmallVectorImpl<MachineOperand> &Cond, + bool AllowModify) const override; + + bool getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg, + unsigned &Offset, + const TargetRegisterInfo *TRI) const override; + bool AnalyzeBranchPredicate(MachineBasicBlock &MBB, + TargetInstrInfo::MachineBranchPredicate &MBP, + bool AllowModify = false) const override; + + unsigned RemoveBranch(MachineBasicBlock &MBB) const override; + unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, + MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond, + DebugLoc DL) const override; + bool canInsertSelect(const MachineBasicBlock&, ArrayRef<MachineOperand> Cond, + unsigned, unsigned, int&, int&, int&) const override; + void insertSelect(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, DebugLoc DL, + unsigned DstReg, ArrayRef<MachineOperand> Cond, + unsigned TrueReg, unsigned FalseReg) const override; + void copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const override; + void storeRegToStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned SrcReg, bool isKill, int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const override; + + void storeRegToAddr(MachineFunction &MF, unsigned SrcReg, bool isKill, + SmallVectorImpl<MachineOperand> &Addr, + const TargetRegisterClass *RC, + MachineInstr::mmo_iterator MMOBegin, + MachineInstr::mmo_iterator MMOEnd, + SmallVectorImpl<MachineInstr*> &NewMIs) const; + + void loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned DestReg, int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const override; + + void loadRegFromAddr(MachineFunction &MF, unsigned DestReg, + SmallVectorImpl<MachineOperand> &Addr, + const TargetRegisterClass *RC, + MachineInstr::mmo_iterator MMOBegin, + MachineInstr::mmo_iterator MMOEnd, + SmallVectorImpl<MachineInstr*> &NewMIs) const; + + bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override; + + /// foldMemoryOperand - If this target supports it, fold a load or store of + /// the specified stack slot into the specified machine instruction for the + /// specified operand(s). If this is possible, the target should perform the + /// folding and return true, otherwise it should return false. If it folds + /// the instruction, it is likely that the MachineInstruction the iterator + /// references has been changed. + MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, + ArrayRef<unsigned> Ops, + MachineBasicBlock::iterator InsertPt, + int FrameIndex) const override; + + /// foldMemoryOperand - Same as the previous version except it allows folding + /// of any load and store from / to any address, not just from a specific + /// stack slot. 
+ MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, + ArrayRef<unsigned> Ops, + MachineBasicBlock::iterator InsertPt, + MachineInstr *LoadMI) const override; + + /// unfoldMemoryOperand - Separate a single instruction which folded a load or + /// a store or a load and a store into two or more instruction. If this is + /// possible, returns true as well as the new instructions by reference. + bool unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI, + unsigned Reg, bool UnfoldLoad, bool UnfoldStore, + SmallVectorImpl<MachineInstr*> &NewMIs) const override; + + bool unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N, + SmallVectorImpl<SDNode*> &NewNodes) const override; + + /// getOpcodeAfterMemoryUnfold - Returns the opcode of the would be new + /// instruction after load / store are unfolded from an instruction of the + /// specified opcode. It returns zero if the specified unfolding is not + /// possible. If LoadRegIndex is non-null, it is filled in with the operand + /// index of the operand which will hold the register holding the loaded + /// value. + unsigned getOpcodeAfterMemoryUnfold(unsigned Opc, + bool UnfoldLoad, bool UnfoldStore, + unsigned *LoadRegIndex = nullptr) const override; + + /// areLoadsFromSameBasePtr - This is used by the pre-regalloc scheduler + /// to determine if two loads are loading from the same base address. It + /// should only return true if the base pointers are the same and the + /// only differences between the two addresses are the offset. It also returns + /// the offsets by reference. + bool areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, int64_t &Offset1, + int64_t &Offset2) const override; + + /// shouldScheduleLoadsNear - This is a used by the pre-regalloc scheduler to + /// determine (in conjunction with areLoadsFromSameBasePtr) if two loads should + /// be scheduled togther. On some targets if two loads are loading from + /// addresses in the same cache line, it's better if they are scheduled + /// together. This function takes two integers that represent the load offsets + /// from the common base address. It returns true if it decides it's desirable + /// to schedule the two loads together. "NumLoads" is the number of loads that + /// have already been scheduled after Load1. + bool shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2, + int64_t Offset1, int64_t Offset2, + unsigned NumLoads) const override; + + bool shouldScheduleAdjacent(MachineInstr* First, + MachineInstr *Second) const override; + + void getNoopForMachoTarget(MCInst &NopInst) const override; + + bool + ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override; + + /// isSafeToMoveRegClassDefs - Return true if it's safe to move a machine + /// instruction that defines the specified register class. + bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const override; + + /// isSafeToClobberEFLAGS - Return true if it's safe insert an instruction tha + /// would clobber the EFLAGS condition register. Note the result may be + /// conservative. If it cannot definitely determine the safety after visiting + /// a few instructions in each direction it assumes it's not safe. + bool isSafeToClobberEFLAGS(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const; + + /// True if MI has a condition code def, e.g. EFLAGS, that is + /// not marked dead. + bool hasLiveCondCodeDef(MachineInstr *MI) const; + + /// getGlobalBaseReg - Return a virtual register initialized with the + /// the global base register value. 
Output instructions required to + /// initialize the register in the function entry block, if necessary. + /// + unsigned getGlobalBaseReg(MachineFunction *MF) const; + + std::pair<uint16_t, uint16_t> + getExecutionDomain(const MachineInstr *MI) const override; + + void setExecutionDomain(MachineInstr *MI, unsigned Domain) const override; + + unsigned + getPartialRegUpdateClearance(const MachineInstr *MI, unsigned OpNum, + const TargetRegisterInfo *TRI) const override; + unsigned getUndefRegClearance(const MachineInstr *MI, unsigned &OpNum, + const TargetRegisterInfo *TRI) const override; + void breakPartialRegDependency(MachineBasicBlock::iterator MI, unsigned OpNum, + const TargetRegisterInfo *TRI) const override; + + MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, + unsigned OpNum, + ArrayRef<MachineOperand> MOs, + MachineBasicBlock::iterator InsertPt, + unsigned Size, unsigned Alignment, + bool AllowCommute) const; + + void + getUnconditionalBranch(MCInst &Branch, + const MCSymbolRefExpr *BranchTarget) const override; + + void getTrap(MCInst &MI) const override; + + unsigned getJumpInstrTableEntryBound() const override; + + bool isHighLatencyDef(int opc) const override; + + bool hasHighOperandLatency(const TargetSchedModel &SchedModel, + const MachineRegisterInfo *MRI, + const MachineInstr *DefMI, unsigned DefIdx, + const MachineInstr *UseMI, + unsigned UseIdx) const override; + + bool useMachineCombiner() const override { + return true; + } + + bool isAssociativeAndCommutative(const MachineInstr &Inst) const override; + + bool hasReassociableOperands(const MachineInstr &Inst, + const MachineBasicBlock *MBB) const override; + + void setSpecialOperandAttr(MachineInstr &OldMI1, MachineInstr &OldMI2, + MachineInstr &NewMI1, + MachineInstr &NewMI2) const override; + + /// analyzeCompare - For a comparison instruction, return the source registers + /// in SrcReg and SrcReg2 if having two register operands, and the value it + /// compares against in CmpValue. Return true if the comparison instruction + /// can be analyzed. + bool analyzeCompare(const MachineInstr *MI, unsigned &SrcReg, + unsigned &SrcReg2, int &CmpMask, + int &CmpValue) const override; + + /// optimizeCompareInstr - Check if there exists an earlier instruction that + /// operates on the same source operands and sets flags in the same way as + /// Compare; remove Compare if possible. + bool optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, + unsigned SrcReg2, int CmpMask, int CmpValue, + const MachineRegisterInfo *MRI) const override; + + /// optimizeLoadInstr - Try to remove the load by folding it to a register + /// operand at the use. We fold the load instructions if and only if the + /// def and use are in the same BB. We only look at one load and see + /// whether it can be folded into MI. FoldAsLoadDefReg is the virtual register + /// defined by the load we are trying to fold. DefMI returns the machine + /// instruction that defines FoldAsLoadDefReg, and the function returns + /// the machine instruction generated due to folding. 
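Among the overrides above, getExecutionDomain returns a packed (domain, validDomains) pair that is easy to misread. A small interpretation sketch follows; the bit assignment is inferred from the 0xe and 0x6 masks used in the implementation earlier, so treat it as an assumption rather than a documented contract.

#include <cstdint>
#include <utility>

// .first is the current domain (1 = PackedSingle, 2 = PackedDouble,
// 3 = PackedInt, 0 = not an SSE-domain instruction); .second is assumed to
// have bit d set when the instruction can be rewritten into domain d
// (0xe = all three domains, 0x6 = the two FP domains only).
bool canSwitchToDomain(std::pair<uint16_t, uint16_t> DomInfo, unsigned Domain) {
  return Domain >= 1 && Domain <= 3 && (DomInfo.second & (1u << Domain)) != 0;
}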
+ MachineInstr* optimizeLoadInstr(MachineInstr *MI, + const MachineRegisterInfo *MRI, + unsigned &FoldAsLoadDefReg, + MachineInstr *&DefMI) const override; + + std::pair<unsigned, unsigned> + decomposeMachineOperandsTargetFlags(unsigned TF) const override; + + ArrayRef<std::pair<unsigned, const char *>> + getSerializableDirectMachineOperandTargetFlags() const override; + +protected: + /// Commutes the operands in the given instruction by changing the operands + /// order and/or changing the instruction's opcode and/or the immediate value + /// operand. + /// + /// The arguments 'CommuteOpIdx1' and 'CommuteOpIdx2' specify the operands + /// to be commuted. + /// + /// Do not call this method for a non-commutable instruction or + /// non-commutable operands. + /// Even though the instruction is commutable, the method may still + /// fail to commute the operands, null pointer is returned in such cases. + MachineInstr *commuteInstructionImpl(MachineInstr *MI, bool NewMI, + unsigned CommuteOpIdx1, + unsigned CommuteOpIdx2) const override; + +private: + MachineInstr * convertToThreeAddressWithLEA(unsigned MIOpc, + MachineFunction::iterator &MFI, + MachineBasicBlock::iterator &MBBI, + LiveVariables *LV) const; + + /// Handles memory folding for special case instructions, for instance those + /// requiring custom manipulation of the address. + MachineInstr *foldMemoryOperandCustom(MachineFunction &MF, MachineInstr *MI, + unsigned OpNum, + ArrayRef<MachineOperand> MOs, + MachineBasicBlock::iterator InsertPt, + unsigned Size, unsigned Align) const; + + /// isFrameOperand - Return true and the FrameIndex if the specified + /// operand and follow operands form a reference to the stack frame. + bool isFrameOperand(const MachineInstr *MI, unsigned int Op, + int &FrameIndex) const; +}; + +} // End llvm namespace + +#endif diff --git a/contrib/llvm/lib/Target/X86/X86InstrInfo.td b/contrib/llvm/lib/Target/X86/X86InstrInfo.td new file mode 100644 index 0000000..ea8e562 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86InstrInfo.td @@ -0,0 +1,3085 @@ +//===-- X86InstrInfo.td - Main X86 Instruction Definition --*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the X86 instruction set, defining the instructions, and +// properties of the instructions which are needed for code generation, machine +// code emission, and analysis. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// X86 specific DAG Nodes. +// + +def SDTIntShiftDOp: SDTypeProfile<1, 3, + [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, + SDTCisInt<0>, SDTCisInt<3>]>; + +def SDTX86CmpTest : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisSameAs<1, 2>]>; + +def SDTX86Cmps : SDTypeProfile<1, 3, [SDTCisFP<0>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>]>; +//def SDTX86Cmpss : SDTypeProfile<1, 3, [SDTCisVT<0, f32>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>]>; + +def SDTX86Cmov : SDTypeProfile<1, 4, + [SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>, + SDTCisVT<3, i8>, SDTCisVT<4, i32>]>; + +// Unary and binary operator instructions that set EFLAGS as a side-effect. 
+def SDTUnaryArithWithFlags : SDTypeProfile<2, 1, + [SDTCisSameAs<0, 2>, + SDTCisInt<0>, SDTCisVT<1, i32>]>; + +def SDTBinaryArithWithFlags : SDTypeProfile<2, 2, + [SDTCisSameAs<0, 2>, + SDTCisSameAs<0, 3>, + SDTCisInt<0>, SDTCisVT<1, i32>]>; + +// SDTBinaryArithWithFlagsInOut - RES1, EFLAGS = op LHS, RHS, EFLAGS +def SDTBinaryArithWithFlagsInOut : SDTypeProfile<2, 3, + [SDTCisSameAs<0, 2>, + SDTCisSameAs<0, 3>, + SDTCisInt<0>, + SDTCisVT<1, i32>, + SDTCisVT<4, i32>]>; +// RES1, RES2, FLAGS = op LHS, RHS +def SDT2ResultBinaryArithWithFlags : SDTypeProfile<3, 2, + [SDTCisSameAs<0, 1>, + SDTCisSameAs<0, 2>, + SDTCisSameAs<0, 3>, + SDTCisInt<0>, SDTCisVT<1, i32>]>; +def SDTX86BrCond : SDTypeProfile<0, 3, + [SDTCisVT<0, OtherVT>, + SDTCisVT<1, i8>, SDTCisVT<2, i32>]>; + +def SDTX86SetCC : SDTypeProfile<1, 2, + [SDTCisVT<0, i8>, + SDTCisVT<1, i8>, SDTCisVT<2, i32>]>; +def SDTX86SetCC_C : SDTypeProfile<1, 2, + [SDTCisInt<0>, + SDTCisVT<1, i8>, SDTCisVT<2, i32>]>; + +def SDTX86sahf : SDTypeProfile<1, 1, [SDTCisVT<0, i32>, SDTCisVT<1, i8>]>; + +def SDTX86rdrand : SDTypeProfile<2, 0, [SDTCisInt<0>, SDTCisVT<1, i32>]>; + +def SDTX86cas : SDTypeProfile<0, 3, [SDTCisPtrTy<0>, SDTCisInt<1>, + SDTCisVT<2, i8>]>; +def SDTX86caspair : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>; + +def SDTX86atomicBinary : SDTypeProfile<2, 3, [SDTCisInt<0>, SDTCisInt<1>, + SDTCisPtrTy<2>, SDTCisInt<3>,SDTCisInt<4>]>; +def SDTX86Ret : SDTypeProfile<0, -1, [SDTCisVT<0, i16>]>; + +def SDT_X86CallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>]>; +def SDT_X86CallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i32>, + SDTCisVT<1, i32>]>; + +def SDT_X86Call : SDTypeProfile<0, -1, [SDTCisVT<0, iPTR>]>; + +def SDT_X86VASTART_SAVE_XMM_REGS : SDTypeProfile<0, -1, [SDTCisVT<0, i8>, + SDTCisVT<1, iPTR>, + SDTCisVT<2, iPTR>]>; + +def SDT_X86VAARG_64 : SDTypeProfile<1, -1, [SDTCisPtrTy<0>, + SDTCisPtrTy<1>, + SDTCisVT<2, i32>, + SDTCisVT<3, i8>, + SDTCisVT<4, i32>]>; + +def SDTX86RepStr : SDTypeProfile<0, 1, [SDTCisVT<0, OtherVT>]>; + +def SDTX86Void : SDTypeProfile<0, 0, []>; + +def SDTX86Wrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>; + +def SDT_X86TLSADDR : SDTypeProfile<0, 1, [SDTCisInt<0>]>; + +def SDT_X86TLSBASEADDR : SDTypeProfile<0, 1, [SDTCisInt<0>]>; + +def SDT_X86TLSCALL : SDTypeProfile<0, 1, [SDTCisInt<0>]>; + +def SDT_X86SEG_ALLOCA : SDTypeProfile<1, 1, [SDTCisVT<0, iPTR>, SDTCisVT<1, iPTR>]>; + +def SDT_X86EHRET : SDTypeProfile<0, 1, [SDTCisInt<0>]>; + +def SDT_X86TCRET : SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisVT<1, i32>]>; + +def SDT_X86MEMBARRIER : SDTypeProfile<0, 0, []>; + +def X86MemBarrier : SDNode<"X86ISD::MEMBARRIER", SDT_X86MEMBARRIER, + [SDNPHasChain,SDNPSideEffect]>; +def X86MFence : SDNode<"X86ISD::MFENCE", SDT_X86MEMBARRIER, + [SDNPHasChain]>; +def X86SFence : SDNode<"X86ISD::SFENCE", SDT_X86MEMBARRIER, + [SDNPHasChain]>; +def X86LFence : SDNode<"X86ISD::LFENCE", SDT_X86MEMBARRIER, + [SDNPHasChain]>; + + +def X86bsf : SDNode<"X86ISD::BSF", SDTUnaryArithWithFlags>; +def X86bsr : SDNode<"X86ISD::BSR", SDTUnaryArithWithFlags>; +def X86shld : SDNode<"X86ISD::SHLD", SDTIntShiftDOp>; +def X86shrd : SDNode<"X86ISD::SHRD", SDTIntShiftDOp>; + +def X86cmp : SDNode<"X86ISD::CMP" , SDTX86CmpTest>; +def X86bt : SDNode<"X86ISD::BT", SDTX86CmpTest>; + +def X86cmov : SDNode<"X86ISD::CMOV", SDTX86Cmov>; +def X86brcond : SDNode<"X86ISD::BRCOND", SDTX86BrCond, + [SDNPHasChain]>; +def X86setcc : SDNode<"X86ISD::SETCC", SDTX86SetCC>; +def X86setcc_c : SDNode<"X86ISD::SETCC_CARRY", SDTX86SetCC_C>; + +def X86sahf : 
SDNode<"X86ISD::SAHF", SDTX86sahf>; + +def X86rdrand : SDNode<"X86ISD::RDRAND", SDTX86rdrand, + [SDNPHasChain, SDNPSideEffect]>; + +def X86rdseed : SDNode<"X86ISD::RDSEED", SDTX86rdrand, + [SDNPHasChain, SDNPSideEffect]>; + +def X86cas : SDNode<"X86ISD::LCMPXCHG_DAG", SDTX86cas, + [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore, + SDNPMayLoad, SDNPMemOperand]>; +def X86cas8 : SDNode<"X86ISD::LCMPXCHG8_DAG", SDTX86caspair, + [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore, + SDNPMayLoad, SDNPMemOperand]>; +def X86cas16 : SDNode<"X86ISD::LCMPXCHG16_DAG", SDTX86caspair, + [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore, + SDNPMayLoad, SDNPMemOperand]>; + +def X86retflag : SDNode<"X86ISD::RET_FLAG", SDTX86Ret, + [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; +def X86iret : SDNode<"X86ISD::IRET", SDTX86Ret, + [SDNPHasChain, SDNPOptInGlue]>; + +def X86vastart_save_xmm_regs : + SDNode<"X86ISD::VASTART_SAVE_XMM_REGS", + SDT_X86VASTART_SAVE_XMM_REGS, + [SDNPHasChain, SDNPVariadic]>; +def X86vaarg64 : + SDNode<"X86ISD::VAARG_64", SDT_X86VAARG_64, + [SDNPHasChain, SDNPMayLoad, SDNPMayStore, + SDNPMemOperand]>; +def X86callseq_start : + SDNode<"ISD::CALLSEQ_START", SDT_X86CallSeqStart, + [SDNPHasChain, SDNPOutGlue]>; +def X86callseq_end : + SDNode<"ISD::CALLSEQ_END", SDT_X86CallSeqEnd, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; + +def X86call : SDNode<"X86ISD::CALL", SDT_X86Call, + [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue, + SDNPVariadic]>; + +def X86rep_stos: SDNode<"X86ISD::REP_STOS", SDTX86RepStr, + [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore]>; +def X86rep_movs: SDNode<"X86ISD::REP_MOVS", SDTX86RepStr, + [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore, + SDNPMayLoad]>; + +def X86rdtsc : SDNode<"X86ISD::RDTSC_DAG", SDTX86Void, + [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>; +def X86rdtscp : SDNode<"X86ISD::RDTSCP_DAG", SDTX86Void, + [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>; +def X86rdpmc : SDNode<"X86ISD::RDPMC_DAG", SDTX86Void, + [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>; + +def X86Wrapper : SDNode<"X86ISD::Wrapper", SDTX86Wrapper>; +def X86WrapperRIP : SDNode<"X86ISD::WrapperRIP", SDTX86Wrapper>; + +def X86RecoverFrameAlloc : SDNode<"ISD::LOCAL_RECOVER", + SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, + SDTCisInt<1>]>>; + +def X86tlsaddr : SDNode<"X86ISD::TLSADDR", SDT_X86TLSADDR, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; + +def X86tlsbaseaddr : SDNode<"X86ISD::TLSBASEADDR", SDT_X86TLSBASEADDR, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; + +def X86ehret : SDNode<"X86ISD::EH_RETURN", SDT_X86EHRET, + [SDNPHasChain]>; + +def X86eh_sjlj_setjmp : SDNode<"X86ISD::EH_SJLJ_SETJMP", + SDTypeProfile<1, 1, [SDTCisInt<0>, + SDTCisPtrTy<1>]>, + [SDNPHasChain, SDNPSideEffect]>; +def X86eh_sjlj_longjmp : SDNode<"X86ISD::EH_SJLJ_LONGJMP", + SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>, + [SDNPHasChain, SDNPSideEffect]>; + +def X86tcret : SDNode<"X86ISD::TC_RETURN", SDT_X86TCRET, + [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; + +def X86add_flag : SDNode<"X86ISD::ADD", SDTBinaryArithWithFlags, + [SDNPCommutative]>; +def X86sub_flag : SDNode<"X86ISD::SUB", SDTBinaryArithWithFlags>; +def X86smul_flag : SDNode<"X86ISD::SMUL", SDTBinaryArithWithFlags, + [SDNPCommutative]>; +def X86umul_flag : SDNode<"X86ISD::UMUL", SDT2ResultBinaryArithWithFlags, + [SDNPCommutative]>; +def X86adc_flag : SDNode<"X86ISD::ADC", SDTBinaryArithWithFlagsInOut>; +def X86sbb_flag : SDNode<"X86ISD::SBB", SDTBinaryArithWithFlagsInOut>; + +def X86inc_flag : SDNode<"X86ISD::INC", SDTUnaryArithWithFlags>; +def 
X86dec_flag : SDNode<"X86ISD::DEC", SDTUnaryArithWithFlags>; +def X86or_flag : SDNode<"X86ISD::OR", SDTBinaryArithWithFlags, + [SDNPCommutative]>; +def X86xor_flag : SDNode<"X86ISD::XOR", SDTBinaryArithWithFlags, + [SDNPCommutative]>; +def X86and_flag : SDNode<"X86ISD::AND", SDTBinaryArithWithFlags, + [SDNPCommutative]>; + +def X86bextr : SDNode<"X86ISD::BEXTR", SDTIntBinOp>; + +def X86mul_imm : SDNode<"X86ISD::MUL_IMM", SDTIntBinOp>; + +def X86WinAlloca : SDNode<"X86ISD::WIN_ALLOCA", SDTX86Void, + [SDNPHasChain, SDNPInGlue, SDNPOutGlue]>; + +def X86SegAlloca : SDNode<"X86ISD::SEG_ALLOCA", SDT_X86SEG_ALLOCA, + [SDNPHasChain]>; + +def X86TLSCall : SDNode<"X86ISD::TLSCALL", SDT_X86TLSCALL, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; + +//===----------------------------------------------------------------------===// +// X86 Operand Definitions. +// + +// A version of ptr_rc which excludes SP, ESP, and RSP. This is used for +// the index operand of an address, to conform to x86 encoding restrictions. +def ptr_rc_nosp : PointerLikeRegClass<1>; + +// *mem - Operand definitions for the funky X86 addressing mode operands. +// +def X86MemAsmOperand : AsmOperandClass { + let Name = "Mem"; +} +let RenderMethod = "addMemOperands" in { + def X86Mem8AsmOperand : AsmOperandClass { let Name = "Mem8"; } + def X86Mem16AsmOperand : AsmOperandClass { let Name = "Mem16"; } + def X86Mem32AsmOperand : AsmOperandClass { let Name = "Mem32"; } + def X86Mem64AsmOperand : AsmOperandClass { let Name = "Mem64"; } + def X86Mem80AsmOperand : AsmOperandClass { let Name = "Mem80"; } + def X86Mem128AsmOperand : AsmOperandClass { let Name = "Mem128"; } + def X86Mem256AsmOperand : AsmOperandClass { let Name = "Mem256"; } + def X86Mem512AsmOperand : AsmOperandClass { let Name = "Mem512"; } + // Gather mem operands + def X86MemVX32Operand : AsmOperandClass { let Name = "MemVX32"; } + def X86MemVY32Operand : AsmOperandClass { let Name = "MemVY32"; } + def X86MemVZ32Operand : AsmOperandClass { let Name = "MemVZ32"; } + def X86MemVX64Operand : AsmOperandClass { let Name = "MemVX64"; } + def X86MemVY64Operand : AsmOperandClass { let Name = "MemVY64"; } + def X86MemVZ64Operand : AsmOperandClass { let Name = "MemVZ64"; } + def X86MemVX32XOperand : AsmOperandClass { let Name = "MemVX32X"; } + def X86MemVY32XOperand : AsmOperandClass { let Name = "MemVY32X"; } + def X86MemVX64XOperand : AsmOperandClass { let Name = "MemVX64X"; } + def X86MemVY64XOperand : AsmOperandClass { let Name = "MemVY64X"; } +} + +def X86AbsMemAsmOperand : AsmOperandClass { + let Name = "AbsMem"; + let SuperClasses = [X86MemAsmOperand]; +} + +class X86MemOperand<string printMethod, + AsmOperandClass parserMatchClass = X86MemAsmOperand> : Operand<iPTR> { + let PrintMethod = printMethod; + let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc_nosp, i32imm, i8imm); + let ParserMatchClass = parserMatchClass; + let OperandType = "OPERAND_MEMORY"; +} + +// Gather mem operands +class X86VMemOperand<RegisterClass RC, string printMethod, + AsmOperandClass parserMatchClass> + : X86MemOperand<printMethod, parserMatchClass> { + let MIOperandInfo = (ops ptr_rc, i8imm, RC, i32imm, i8imm); +} + +def anymem : X86MemOperand<"printanymem">; + +def opaque32mem : X86MemOperand<"printopaquemem">; +def opaque48mem : X86MemOperand<"printopaquemem">; +def opaque80mem : X86MemOperand<"printopaquemem">; +def opaque512mem : X86MemOperand<"printopaquemem">; + +def i8mem : X86MemOperand<"printi8mem", X86Mem8AsmOperand>; +def i16mem : X86MemOperand<"printi16mem", X86Mem16AsmOperand>; +def i32mem 
: X86MemOperand<"printi32mem", X86Mem32AsmOperand>; +def i64mem : X86MemOperand<"printi64mem", X86Mem64AsmOperand>; +def i128mem : X86MemOperand<"printi128mem", X86Mem128AsmOperand>; +def i256mem : X86MemOperand<"printi256mem", X86Mem256AsmOperand>; +def i512mem : X86MemOperand<"printi512mem", X86Mem512AsmOperand>; +def f32mem : X86MemOperand<"printf32mem", X86Mem32AsmOperand>; +def f64mem : X86MemOperand<"printf64mem", X86Mem64AsmOperand>; +def f80mem : X86MemOperand<"printf80mem", X86Mem80AsmOperand>; +def f128mem : X86MemOperand<"printf128mem", X86Mem128AsmOperand>; +def f256mem : X86MemOperand<"printf256mem", X86Mem256AsmOperand>; +def f512mem : X86MemOperand<"printf512mem", X86Mem512AsmOperand>; + +def v512mem : X86VMemOperand<VR512, "printf512mem", X86Mem512AsmOperand>; + +// Gather mem operands +def vx32mem : X86VMemOperand<VR128, "printi32mem", X86MemVX32Operand>; +def vy32mem : X86VMemOperand<VR256, "printi32mem", X86MemVY32Operand>; +def vx64mem : X86VMemOperand<VR128, "printi64mem", X86MemVX64Operand>; +def vy64mem : X86VMemOperand<VR256, "printi64mem", X86MemVY64Operand>; + +def vx32xmem : X86VMemOperand<VR128X, "printi32mem", X86MemVX32XOperand>; +def vx64xmem : X86VMemOperand<VR128X, "printi32mem", X86MemVX64XOperand>; +def vy32xmem : X86VMemOperand<VR256X, "printi32mem", X86MemVY32XOperand>; +def vy64xmem : X86VMemOperand<VR256X, "printi64mem", X86MemVY64XOperand>; +def vz32mem : X86VMemOperand<VR512, "printi32mem", X86MemVZ32Operand>; +def vz64mem : X86VMemOperand<VR512, "printi64mem", X86MemVZ64Operand>; + +// A version of i8mem for use on x86-64 and x32 that uses a NOREX GPR instead +// of a plain GPR, so that it doesn't potentially require a REX prefix. +def ptr_rc_norex : PointerLikeRegClass<2>; +def ptr_rc_norex_nosp : PointerLikeRegClass<3>; + +def i8mem_NOREX : Operand<iPTR> { + let PrintMethod = "printi8mem"; + let MIOperandInfo = (ops ptr_rc_norex, i8imm, ptr_rc_norex_nosp, i32imm, i8imm); + let ParserMatchClass = X86Mem8AsmOperand; + let OperandType = "OPERAND_MEMORY"; +} + +// GPRs available for tailcall. +// It represents GR32_TC, GR64_TC or GR64_TCW64. +def ptr_rc_tailcall : PointerLikeRegClass<4>; + +// Special i32mem for addresses of load folding tail calls. These are not +// allowed to use callee-saved registers since they must be scheduled +// after callee-saved register are popped. +def i32mem_TC : Operand<i32> { + let PrintMethod = "printi32mem"; + let MIOperandInfo = (ops ptr_rc_tailcall, i8imm, ptr_rc_tailcall, + i32imm, i8imm); + let ParserMatchClass = X86Mem32AsmOperand; + let OperandType = "OPERAND_MEMORY"; +} + +// Special i64mem for addresses of load folding tail calls. These are not +// allowed to use callee-saved registers since they must be scheduled +// after callee-saved register are popped. +def i64mem_TC : Operand<i64> { + let PrintMethod = "printi64mem"; + let MIOperandInfo = (ops ptr_rc_tailcall, i8imm, + ptr_rc_tailcall, i32imm, i8imm); + let ParserMatchClass = X86Mem64AsmOperand; + let OperandType = "OPERAND_MEMORY"; +} + +let OperandType = "OPERAND_PCREL", + ParserMatchClass = X86AbsMemAsmOperand, + PrintMethod = "printPCRelImm" in { +def i32imm_pcrel : Operand<i32>; +def i16imm_pcrel : Operand<i16>; + +// Branch targets have OtherVT type and print as pc-relative values. +def brtarget : Operand<OtherVT>; +def brtarget8 : Operand<OtherVT>; + +} + +// Special parser to detect 16-bit mode to select 16-bit displacement. 
+def X86AbsMem16AsmOperand : AsmOperandClass { + let Name = "AbsMem16"; + let RenderMethod = "addAbsMemOperands"; + let SuperClasses = [X86AbsMemAsmOperand]; +} + +// Branch targets have OtherVT type and print as pc-relative values. +let OperandType = "OPERAND_PCREL", + PrintMethod = "printPCRelImm" in { +let ParserMatchClass = X86AbsMem16AsmOperand in + def brtarget16 : Operand<OtherVT>; +let ParserMatchClass = X86AbsMemAsmOperand in + def brtarget32 : Operand<OtherVT>; +} + +let RenderMethod = "addSrcIdxOperands" in { + def X86SrcIdx8Operand : AsmOperandClass { + let Name = "SrcIdx8"; + let SuperClasses = [X86Mem8AsmOperand]; + } + def X86SrcIdx16Operand : AsmOperandClass { + let Name = "SrcIdx16"; + let SuperClasses = [X86Mem16AsmOperand]; + } + def X86SrcIdx32Operand : AsmOperandClass { + let Name = "SrcIdx32"; + let SuperClasses = [X86Mem32AsmOperand]; + } + def X86SrcIdx64Operand : AsmOperandClass { + let Name = "SrcIdx64"; + let SuperClasses = [X86Mem64AsmOperand]; + } +} // RenderMethod = "addSrcIdxOperands" + +let RenderMethod = "addDstIdxOperands" in { + def X86DstIdx8Operand : AsmOperandClass { + let Name = "DstIdx8"; + let SuperClasses = [X86Mem8AsmOperand]; + } + def X86DstIdx16Operand : AsmOperandClass { + let Name = "DstIdx16"; + let SuperClasses = [X86Mem16AsmOperand]; + } + def X86DstIdx32Operand : AsmOperandClass { + let Name = "DstIdx32"; + let SuperClasses = [X86Mem32AsmOperand]; + } + def X86DstIdx64Operand : AsmOperandClass { + let Name = "DstIdx64"; + let SuperClasses = [X86Mem64AsmOperand]; + } +} // RenderMethod = "addDstIdxOperands" + +let RenderMethod = "addMemOffsOperands" in { + def X86MemOffs16_8AsmOperand : AsmOperandClass { + let Name = "MemOffs16_8"; + let SuperClasses = [X86Mem8AsmOperand]; + } + def X86MemOffs16_16AsmOperand : AsmOperandClass { + let Name = "MemOffs16_16"; + let SuperClasses = [X86Mem16AsmOperand]; + } + def X86MemOffs16_32AsmOperand : AsmOperandClass { + let Name = "MemOffs16_32"; + let SuperClasses = [X86Mem32AsmOperand]; + } + def X86MemOffs32_8AsmOperand : AsmOperandClass { + let Name = "MemOffs32_8"; + let SuperClasses = [X86Mem8AsmOperand]; + } + def X86MemOffs32_16AsmOperand : AsmOperandClass { + let Name = "MemOffs32_16"; + let SuperClasses = [X86Mem16AsmOperand]; + } + def X86MemOffs32_32AsmOperand : AsmOperandClass { + let Name = "MemOffs32_32"; + let SuperClasses = [X86Mem32AsmOperand]; + } + def X86MemOffs32_64AsmOperand : AsmOperandClass { + let Name = "MemOffs32_64"; + let SuperClasses = [X86Mem64AsmOperand]; + } + def X86MemOffs64_8AsmOperand : AsmOperandClass { + let Name = "MemOffs64_8"; + let SuperClasses = [X86Mem8AsmOperand]; + } + def X86MemOffs64_16AsmOperand : AsmOperandClass { + let Name = "MemOffs64_16"; + let SuperClasses = [X86Mem16AsmOperand]; + } + def X86MemOffs64_32AsmOperand : AsmOperandClass { + let Name = "MemOffs64_32"; + let SuperClasses = [X86Mem32AsmOperand]; + } + def X86MemOffs64_64AsmOperand : AsmOperandClass { + let Name = "MemOffs64_64"; + let SuperClasses = [X86Mem64AsmOperand]; + } +} // RenderMethod = "addMemOffsOperands" + +class X86SrcIdxOperand<string printMethod, AsmOperandClass parserMatchClass> + : X86MemOperand<printMethod, parserMatchClass> { + let MIOperandInfo = (ops ptr_rc, i8imm); +} + +class X86DstIdxOperand<string printMethod, AsmOperandClass parserMatchClass> + : X86MemOperand<printMethod, parserMatchClass> { + let MIOperandInfo = (ops ptr_rc); +} + +def srcidx8 : X86SrcIdxOperand<"printSrcIdx8", X86SrcIdx8Operand>; +def srcidx16 : X86SrcIdxOperand<"printSrcIdx16", 
X86SrcIdx16Operand>; +def srcidx32 : X86SrcIdxOperand<"printSrcIdx32", X86SrcIdx32Operand>; +def srcidx64 : X86SrcIdxOperand<"printSrcIdx64", X86SrcIdx64Operand>; +def dstidx8 : X86DstIdxOperand<"printDstIdx8", X86DstIdx8Operand>; +def dstidx16 : X86DstIdxOperand<"printDstIdx16", X86DstIdx16Operand>; +def dstidx32 : X86DstIdxOperand<"printDstIdx32", X86DstIdx32Operand>; +def dstidx64 : X86DstIdxOperand<"printDstIdx64", X86DstIdx64Operand>; + +class X86MemOffsOperand<Operand immOperand, string printMethod, + AsmOperandClass parserMatchClass> + : X86MemOperand<printMethod, parserMatchClass> { + let MIOperandInfo = (ops immOperand, i8imm); +} + +def offset16_8 : X86MemOffsOperand<i16imm, "printMemOffs8", + X86MemOffs16_8AsmOperand>; +def offset16_16 : X86MemOffsOperand<i16imm, "printMemOffs16", + X86MemOffs16_16AsmOperand>; +def offset16_32 : X86MemOffsOperand<i16imm, "printMemOffs32", + X86MemOffs16_32AsmOperand>; +def offset32_8 : X86MemOffsOperand<i32imm, "printMemOffs8", + X86MemOffs32_8AsmOperand>; +def offset32_16 : X86MemOffsOperand<i32imm, "printMemOffs16", + X86MemOffs32_16AsmOperand>; +def offset32_32 : X86MemOffsOperand<i32imm, "printMemOffs32", + X86MemOffs32_32AsmOperand>; +def offset32_64 : X86MemOffsOperand<i32imm, "printMemOffs64", + X86MemOffs32_64AsmOperand>; +def offset64_8 : X86MemOffsOperand<i64imm, "printMemOffs8", + X86MemOffs64_8AsmOperand>; +def offset64_16 : X86MemOffsOperand<i64imm, "printMemOffs16", + X86MemOffs64_16AsmOperand>; +def offset64_32 : X86MemOffsOperand<i64imm, "printMemOffs32", + X86MemOffs64_32AsmOperand>; +def offset64_64 : X86MemOffsOperand<i64imm, "printMemOffs64", + X86MemOffs64_64AsmOperand>; + +def SSECC : Operand<i8> { + let PrintMethod = "printSSEAVXCC"; + let OperandType = "OPERAND_IMMEDIATE"; +} + +def i8immZExt3 : ImmLeaf<i8, [{ + return Imm >= 0 && Imm < 8; +}]>; + +def AVXCC : Operand<i8> { + let PrintMethod = "printSSEAVXCC"; + let OperandType = "OPERAND_IMMEDIATE"; +} + +def i8immZExt5 : ImmLeaf<i8, [{ + return Imm >= 0 && Imm < 32; +}]>; + +def AVX512ICC : Operand<i8> { + let PrintMethod = "printSSEAVXCC"; + let OperandType = "OPERAND_IMMEDIATE"; +} + +def XOPCC : Operand<i8> { + let PrintMethod = "printXOPCC"; + let OperandType = "OPERAND_IMMEDIATE"; +} + +class ImmSExtAsmOperandClass : AsmOperandClass { + let SuperClasses = [ImmAsmOperand]; + let RenderMethod = "addImmOperands"; +} + +def X86GR32orGR64AsmOperand : AsmOperandClass { + let Name = "GR32orGR64"; +} + +def GR32orGR64 : RegisterOperand<GR32> { + let ParserMatchClass = X86GR32orGR64AsmOperand; +} +def AVX512RCOperand : AsmOperandClass { + let Name = "AVX512RC"; +} +def AVX512RC : Operand<i32> { + let PrintMethod = "printRoundingControl"; + let OperandType = "OPERAND_IMMEDIATE"; + let ParserMatchClass = AVX512RCOperand; +} + +// Sign-extended immediate classes. We don't need to define the full lattice +// here because there is no instruction with an ambiguity between ImmSExti64i32 +// and ImmSExti32i8. +// +// The strange ranges come from the fact that the assembler always works with +// 64-bit immediates, but for a 16-bit target value we want to accept both "-1" +// (which will be a -1ULL), and "0xFF" (-1 in 16-bits). 
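Before the per-class range listings that follow, a standalone restatement of the acceptance rule just described, using the ImmSExti16i8 class as the example. The checker below is a sketch of mine grounded in the ranges listed next, not LLVM's own matcher code.

#include <cstdint>
#include <cassert>

// Accept a 64-bit assembler immediate for a sign-extended-i8-in-i16 slot if
// it is either a sign-extended 8-bit value, or the 16-bit (or full 64-bit)
// two's-complement spelling of such a value.
static bool isImmSExti16i8(uint64_t V) {
  return V <= 0x7Full ||                          // [0, 0x7F]
         (V >= 0xFF80ull && V <= 0xFFFFull) ||    // 16-bit -128..-1
         V >= 0xFFFFFFFFFFFFFF80ull;              // 64-bit -128..-1
}

int main() {
  assert(isImmSExti16i8(0x7F));          // +127
  assert(isImmSExti16i8(0xFFFF));        // "-1" written as a 16-bit pattern
  assert(isImmSExti16i8((uint64_t)-1));  // "-1" parsed as -1ULL
  assert(!isImmSExti16i8(0x80));         // +128 does not fit in a signed byte
  return 0;
}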
+ +// [0, 0x7FFFFFFF] | +// [0xFFFFFFFF80000000, 0xFFFFFFFFFFFFFFFF] +def ImmSExti64i32AsmOperand : ImmSExtAsmOperandClass { + let Name = "ImmSExti64i32"; +} + +// [0, 0x0000007F] | [0x000000000000FF80, 0x000000000000FFFF] | +// [0xFFFFFFFFFFFFFF80, 0xFFFFFFFFFFFFFFFF] +def ImmSExti16i8AsmOperand : ImmSExtAsmOperandClass { + let Name = "ImmSExti16i8"; + let SuperClasses = [ImmSExti64i32AsmOperand]; +} + +// [0, 0x0000007F] | [0x00000000FFFFFF80, 0x00000000FFFFFFFF] | +// [0xFFFFFFFFFFFFFF80, 0xFFFFFFFFFFFFFFFF] +def ImmSExti32i8AsmOperand : ImmSExtAsmOperandClass { + let Name = "ImmSExti32i8"; +} + +// [0, 0x0000007F] | +// [0xFFFFFFFFFFFFFF80, 0xFFFFFFFFFFFFFFFF] +def ImmSExti64i8AsmOperand : ImmSExtAsmOperandClass { + let Name = "ImmSExti64i8"; + let SuperClasses = [ImmSExti16i8AsmOperand, ImmSExti32i8AsmOperand, + ImmSExti64i32AsmOperand]; +} + +// Unsigned immediate used by SSE/AVX instructions +// [0, 0xFF] +// [0xFFFFFFFFFFFFFF80, 0xFFFFFFFFFFFFFFFF] +def ImmUnsignedi8AsmOperand : AsmOperandClass { + let Name = "ImmUnsignedi8"; + let RenderMethod = "addImmOperands"; +} + +// A couple of more descriptive operand definitions. +// 16-bits but only 8 bits are significant. +def i16i8imm : Operand<i16> { + let ParserMatchClass = ImmSExti16i8AsmOperand; + let OperandType = "OPERAND_IMMEDIATE"; +} +// 32-bits but only 8 bits are significant. +def i32i8imm : Operand<i32> { + let ParserMatchClass = ImmSExti32i8AsmOperand; + let OperandType = "OPERAND_IMMEDIATE"; +} + +// 64-bits but only 32 bits are significant. +def i64i32imm : Operand<i64> { + let ParserMatchClass = ImmSExti64i32AsmOperand; + let OperandType = "OPERAND_IMMEDIATE"; +} + +// 64-bits but only 8 bits are significant. +def i64i8imm : Operand<i64> { + let ParserMatchClass = ImmSExti64i8AsmOperand; + let OperandType = "OPERAND_IMMEDIATE"; +} + +// Unsigned 8-bit immediate used by SSE/AVX instructions. +def u8imm : Operand<i8> { + let PrintMethod = "printU8Imm"; + let ParserMatchClass = ImmUnsignedi8AsmOperand; + let OperandType = "OPERAND_IMMEDIATE"; +} + +// 32-bit immediate but only 8-bits are significant and they are unsigned. +// Used by some SSE/AVX instructions that use intrinsics. +def i32u8imm : Operand<i32> { + let PrintMethod = "printU8Imm"; + let ParserMatchClass = ImmUnsignedi8AsmOperand; + let OperandType = "OPERAND_IMMEDIATE"; +} + +// 64-bits but only 32 bits are significant, and those bits are treated as being +// pc relative. +def i64i32imm_pcrel : Operand<i64> { + let PrintMethod = "printPCRelImm"; + let ParserMatchClass = X86AbsMemAsmOperand; + let OperandType = "OPERAND_PCREL"; +} + +def lea64_32mem : Operand<i32> { + let PrintMethod = "printanymem"; + let MIOperandInfo = (ops GR64, i8imm, GR64_NOSP, i32imm, i8imm); + let ParserMatchClass = X86MemAsmOperand; +} + +// Memory operands that use 64-bit pointers in both ILP32 and LP64. +def lea64mem : Operand<i64> { + let PrintMethod = "printanymem"; + let MIOperandInfo = (ops GR64, i8imm, GR64_NOSP, i32imm, i8imm); + let ParserMatchClass = X86MemAsmOperand; +} + + +//===----------------------------------------------------------------------===// +// X86 Complex Pattern Definitions. +// + +// Define X86-specific addressing mode. +def addr : ComplexPattern<iPTR, 5, "selectAddr", [], [SDNPWantParent]>; +def lea32addr : ComplexPattern<i32, 5, "selectLEAAddr", + [add, sub, mul, X86mul_imm, shl, or, frameindex], + []>; +// In 64-bit mode 32-bit LEAs can use RIP-relative addressing. 
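The lea32addr/lea64addr complex patterns above list plain arithmetic nodes (add, sub, X86mul_imm, shl, or, frameindex) as possible roots because an LEA encodes base + index*scale + disp without a memory access, so small shifts and multiplies can be folded into the addressing mode. A minimal standalone illustration of that folding rule, placed here before the RIP-relative lea64_32addr variant defined next; the helper name is mine, not LLVM's.

#include <cassert>

// An LEA scale factor can only be 1, 2, 4 or 8, so a left shift by 0..3
// (or a multiply by one of those powers of two) folds into the scale field.
static bool shiftFoldsIntoLEAScale(unsigned ShAmt, unsigned &Scale) {
  if (ShAmt > 3)
    return false;
  Scale = 1u << ShAmt;
  return true;
}

int main() {
  unsigned Scale = 0;
  assert(shiftFoldsIntoLEAScale(3, Scale) && Scale == 8); // (x << 3) -> scale 8
  assert(shiftFoldsIntoLEAScale(0, Scale) && Scale == 1); // no shift at all
  assert(!shiftFoldsIntoLEAScale(4, Scale));              // needs a real shift
  return 0;
}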
+def lea64_32addr : ComplexPattern<i32, 5, "selectLEA64_32Addr", + [add, sub, mul, X86mul_imm, shl, or, + frameindex, X86WrapperRIP], + []>; + +def tls32addr : ComplexPattern<i32, 5, "selectTLSADDRAddr", + [tglobaltlsaddr], []>; + +def tls32baseaddr : ComplexPattern<i32, 5, "selectTLSADDRAddr", + [tglobaltlsaddr], []>; + +def lea64addr : ComplexPattern<i64, 5, "selectLEAAddr", + [add, sub, mul, X86mul_imm, shl, or, frameindex, + X86WrapperRIP], []>; + +def tls64addr : ComplexPattern<i64, 5, "selectTLSADDRAddr", + [tglobaltlsaddr], []>; + +def tls64baseaddr : ComplexPattern<i64, 5, "selectTLSADDRAddr", + [tglobaltlsaddr], []>; + +def vectoraddr : ComplexPattern<iPTR, 5, "selectVectorAddr", [],[SDNPWantParent]>; + +//===----------------------------------------------------------------------===// +// X86 Instruction Predicate Definitions. +def HasCMov : Predicate<"Subtarget->hasCMov()">; +def NoCMov : Predicate<"!Subtarget->hasCMov()">; + +def HasMMX : Predicate<"Subtarget->hasMMX()">; +def Has3DNow : Predicate<"Subtarget->has3DNow()">; +def Has3DNowA : Predicate<"Subtarget->has3DNowA()">; +def HasSSE1 : Predicate<"Subtarget->hasSSE1()">; +def UseSSE1 : Predicate<"Subtarget->hasSSE1() && !Subtarget->hasAVX()">; +def HasSSE2 : Predicate<"Subtarget->hasSSE2()">; +def UseSSE2 : Predicate<"Subtarget->hasSSE2() && !Subtarget->hasAVX()">; +def HasSSE3 : Predicate<"Subtarget->hasSSE3()">; +def UseSSE3 : Predicate<"Subtarget->hasSSE3() && !Subtarget->hasAVX()">; +def HasSSSE3 : Predicate<"Subtarget->hasSSSE3()">; +def UseSSSE3 : Predicate<"Subtarget->hasSSSE3() && !Subtarget->hasAVX()">; +def HasSSE41 : Predicate<"Subtarget->hasSSE41()">; +def NoSSE41 : Predicate<"!Subtarget->hasSSE41()">; +def UseSSE41 : Predicate<"Subtarget->hasSSE41() && !Subtarget->hasAVX()">; +def HasSSE42 : Predicate<"Subtarget->hasSSE42()">; +def UseSSE42 : Predicate<"Subtarget->hasSSE42() && !Subtarget->hasAVX()">; +def HasSSE4A : Predicate<"Subtarget->hasSSE4A()">; +def HasAVX : Predicate<"Subtarget->hasAVX()">; +def HasAVX2 : Predicate<"Subtarget->hasAVX2()">; +def HasAVX1Only : Predicate<"Subtarget->hasAVX() && !Subtarget->hasAVX2()">; +def HasAVX512 : Predicate<"Subtarget->hasAVX512()">, + AssemblerPredicate<"FeatureAVX512", "AVX-512 ISA">; +def UseAVX : Predicate<"Subtarget->hasAVX() && !Subtarget->hasAVX512()">; +def UseAVX2 : Predicate<"Subtarget->hasAVX2() && !Subtarget->hasAVX512()">; +def NoAVX512 : Predicate<"!Subtarget->hasAVX512()">; +def HasCDI : Predicate<"Subtarget->hasCDI()">, + AssemblerPredicate<"FeatureCDI", "AVX-512 CD ISA">; +def HasPFI : Predicate<"Subtarget->hasPFI()">, + AssemblerPredicate<"FeaturePFI", "AVX-512 PF ISA">; +def HasERI : Predicate<"Subtarget->hasERI()">, + AssemblerPredicate<"FeatureERI", "AVX-512 ER ISA">; +def HasDQI : Predicate<"Subtarget->hasDQI()">, + AssemblerPredicate<"FeatureDQI", "AVX-512 DQ ISA">; +def NoDQI : Predicate<"!Subtarget->hasDQI()">; +def HasBWI : Predicate<"Subtarget->hasBWI()">, + AssemblerPredicate<"FeatureBWI", "AVX-512 BW ISA">; +def NoBWI : Predicate<"!Subtarget->hasBWI()">; +def HasVLX : Predicate<"Subtarget->hasVLX()">, + AssemblerPredicate<"FeatureVLX", "AVX-512 VL ISA">; +def NoVLX : Predicate<"!Subtarget->hasVLX()">; +def NoVLX_Or_NoBWI : Predicate<"!Subtarget->hasVLX() || !Subtarget->hasBWI()">; +def NoVLX_Or_NoDQI : Predicate<"!Subtarget->hasVLX() || !Subtarget->hasDQI()">; +def PKU : Predicate<"!Subtarget->hasPKU()">; + +def HasPOPCNT : Predicate<"Subtarget->hasPOPCNT()">; +def HasAES : Predicate<"Subtarget->hasAES()">; +def HasFXSR : 
Predicate<"Subtarget->hasFXSR()">; +def HasXSAVE : Predicate<"Subtarget->hasXSAVE()">; +def HasXSAVEOPT : Predicate<"Subtarget->hasXSAVEOPT()">; +def HasXSAVEC : Predicate<"Subtarget->hasXSAVEC()">; +def HasXSAVES : Predicate<"Subtarget->hasXSAVES()">; +def HasPCLMUL : Predicate<"Subtarget->hasPCLMUL()">; +def HasFMA : Predicate<"Subtarget->hasFMA()">; +def UseFMAOnAVX : Predicate<"Subtarget->hasFMA() && !Subtarget->hasAVX512()">; +def HasFMA4 : Predicate<"Subtarget->hasFMA4()">; +def HasXOP : Predicate<"Subtarget->hasXOP()">; +def HasTBM : Predicate<"Subtarget->hasTBM()">; +def HasMOVBE : Predicate<"Subtarget->hasMOVBE()">; +def HasRDRAND : Predicate<"Subtarget->hasRDRAND()">; +def HasF16C : Predicate<"Subtarget->hasF16C()">; +def HasFSGSBase : Predicate<"Subtarget->hasFSGSBase()">; +def HasLZCNT : Predicate<"Subtarget->hasLZCNT()">; +def HasBMI : Predicate<"Subtarget->hasBMI()">; +def HasBMI2 : Predicate<"Subtarget->hasBMI2()">; +def HasRTM : Predicate<"Subtarget->hasRTM()">; +def HasHLE : Predicate<"Subtarget->hasHLE()">; +def HasTSX : Predicate<"Subtarget->hasRTM() || Subtarget->hasHLE()">; +def HasADX : Predicate<"Subtarget->hasADX()">; +def HasSHA : Predicate<"Subtarget->hasSHA()">; +def HasPRFCHW : Predicate<"Subtarget->hasPRFCHW()">; +def HasRDSEED : Predicate<"Subtarget->hasRDSEED()">; +def HasPrefetchW : Predicate<"Subtarget->hasPRFCHW()">; +def HasLAHFSAHF : Predicate<"Subtarget->hasLAHFSAHF()">; +def FPStackf32 : Predicate<"!Subtarget->hasSSE1()">; +def FPStackf64 : Predicate<"!Subtarget->hasSSE2()">; +def HasMPX : Predicate<"Subtarget->hasMPX()">; +def HasCmpxchg16b: Predicate<"Subtarget->hasCmpxchg16b()">; +def Not64BitMode : Predicate<"!Subtarget->is64Bit()">, + AssemblerPredicate<"!Mode64Bit", "Not 64-bit mode">; +def In64BitMode : Predicate<"Subtarget->is64Bit()">, + AssemblerPredicate<"Mode64Bit", "64-bit mode">; +def IsLP64 : Predicate<"Subtarget->isTarget64BitLP64()">; +def NotLP64 : Predicate<"!Subtarget->isTarget64BitLP64()">; +def In16BitMode : Predicate<"Subtarget->is16Bit()">, + AssemblerPredicate<"Mode16Bit", "16-bit mode">; +def Not16BitMode : Predicate<"!Subtarget->is16Bit()">, + AssemblerPredicate<"!Mode16Bit", "Not 16-bit mode">; +def In32BitMode : Predicate<"Subtarget->is32Bit()">, + AssemblerPredicate<"Mode32Bit", "32-bit mode">; +def IsWin64 : Predicate<"Subtarget->isTargetWin64()">; +def NotWin64 : Predicate<"!Subtarget->isTargetWin64()">; +def IsPS4 : Predicate<"Subtarget->isTargetPS4()">; +def NotPS4 : Predicate<"!Subtarget->isTargetPS4()">; +def IsNaCl : Predicate<"Subtarget->isTargetNaCl()">; +def NotNaCl : Predicate<"!Subtarget->isTargetNaCl()">; +def SmallCode : Predicate<"TM.getCodeModel() == CodeModel::Small">; +def KernelCode : Predicate<"TM.getCodeModel() == CodeModel::Kernel">; +def FarData : Predicate<"TM.getCodeModel() != CodeModel::Small &&" + "TM.getCodeModel() != CodeModel::Kernel">; +def NearData : Predicate<"TM.getCodeModel() == CodeModel::Small ||" + "TM.getCodeModel() == CodeModel::Kernel">; +def IsStatic : Predicate<"TM.getRelocationModel() == Reloc::Static">; +def IsNotPIC : Predicate<"TM.getRelocationModel() != Reloc::PIC_">; +def OptForSize : Predicate<"OptForSize">; +def OptForSpeed : Predicate<"!OptForSize">; +def FastBTMem : Predicate<"!Subtarget->isBTMemSlow()">; +def CallImmAddr : Predicate<"Subtarget->IsLegalToCallImmediateAddr(TM)">; +def FavorMemIndirectCall : Predicate<"!Subtarget->callRegIndirect()">; +def NotSlowIncDec : Predicate<"!Subtarget->slowIncDec()">; +def HasFastMem32 : 
Predicate<"!Subtarget->isUnalignedMem32Slow()">; + +//===----------------------------------------------------------------------===// +// X86 Instruction Format Definitions. +// + +include "X86InstrFormats.td" + +//===----------------------------------------------------------------------===// +// Pattern fragments. +// + +// X86 specific condition code. These correspond to CondCode in +// X86InstrInfo.h. They must be kept in synch. +def X86_COND_A : PatLeaf<(i8 0)>; // alt. COND_NBE +def X86_COND_AE : PatLeaf<(i8 1)>; // alt. COND_NC +def X86_COND_B : PatLeaf<(i8 2)>; // alt. COND_C +def X86_COND_BE : PatLeaf<(i8 3)>; // alt. COND_NA +def X86_COND_E : PatLeaf<(i8 4)>; // alt. COND_Z +def X86_COND_G : PatLeaf<(i8 5)>; // alt. COND_NLE +def X86_COND_GE : PatLeaf<(i8 6)>; // alt. COND_NL +def X86_COND_L : PatLeaf<(i8 7)>; // alt. COND_NGE +def X86_COND_LE : PatLeaf<(i8 8)>; // alt. COND_NG +def X86_COND_NE : PatLeaf<(i8 9)>; // alt. COND_NZ +def X86_COND_NO : PatLeaf<(i8 10)>; +def X86_COND_NP : PatLeaf<(i8 11)>; // alt. COND_PO +def X86_COND_NS : PatLeaf<(i8 12)>; +def X86_COND_O : PatLeaf<(i8 13)>; +def X86_COND_P : PatLeaf<(i8 14)>; // alt. COND_PE +def X86_COND_S : PatLeaf<(i8 15)>; + +// Predicate used to help when pattern matching LZCNT/TZCNT. +def X86_COND_E_OR_NE : ImmLeaf<i8, [{ + return (Imm == X86::COND_E) || (Imm == X86::COND_NE); +}]>; + + +def i16immSExt8 : ImmLeaf<i16, [{ return isInt<8>(Imm); }]>; +def i32immSExt8 : ImmLeaf<i32, [{ return isInt<8>(Imm); }]>; +def i64immSExt8 : ImmLeaf<i64, [{ return isInt<8>(Imm); }]>; + +// If we have multiple users of an immediate, it's much smaller to reuse +// the register, rather than encode the immediate in every instruction. +// This has the risk of increasing register pressure from stretched live +// ranges, however, the immediates should be trivial to rematerialize by +// the RA in the event of high register pressure. +// TODO : This is currently enabled for stores and binary ops. There are more +// cases for which this can be enabled, though this catches the bulk of the +// issues. +// TODO2 : This should really also be enabled under O2, but there's currently +// an issue with RA where we don't pull the constants into their users +// when we rematerialize them. I'll follow-up on enabling O2 after we fix that +// issue. +// TODO3 : This is currently limited to single basic blocks (DAG creation +// pulls block immediates to the top and merges them if necessary). +// Eventually, it would be nice to allow ConstantHoisting to merge constants +// globally for potentially added savings. +// +def imm8_su : PatLeaf<(i8 imm), [{ + return !shouldAvoidImmediateInstFormsForSize(N); +}]>; +def imm16_su : PatLeaf<(i16 imm), [{ + return !shouldAvoidImmediateInstFormsForSize(N); +}]>; +def imm32_su : PatLeaf<(i32 imm), [{ + return !shouldAvoidImmediateInstFormsForSize(N); +}]>; + +def i16immSExt8_su : PatLeaf<(i16immSExt8), [{ + return !shouldAvoidImmediateInstFormsForSize(N); +}]>; +def i32immSExt8_su : PatLeaf<(i32immSExt8), [{ + return !shouldAvoidImmediateInstFormsForSize(N); +}]>; + + +def i64immSExt32 : ImmLeaf<i64, [{ return isInt<32>(Imm); }]>; + + +// i64immZExt32 predicate - True if the 64-bit immediate fits in a 32-bit +// unsigned field. +def i64immZExt32 : ImmLeaf<i64, [{ return isUInt<32>(Imm); }]>; + +def i64immZExt32SExt8 : ImmLeaf<i64, [{ + return isUInt<32>(Imm) && isInt<8>(static_cast<int32_t>(Imm)); +}]>; + +// Helper fragments for loads. 
+// It's always safe to treat a anyext i16 load as a i32 load if the i16 is +// known to be 32-bit aligned or better. Ditto for i8 to i16. +def loadi16 : PatFrag<(ops node:$ptr), (i16 (unindexedload node:$ptr)), [{ + LoadSDNode *LD = cast<LoadSDNode>(N); + ISD::LoadExtType ExtType = LD->getExtensionType(); + if (ExtType == ISD::NON_EXTLOAD) + return true; + if (ExtType == ISD::EXTLOAD) + return LD->getAlignment() >= 2 && !LD->isVolatile(); + return false; +}]>; + +def loadi16_anyext : PatFrag<(ops node:$ptr), (i32 (unindexedload node:$ptr)),[{ + LoadSDNode *LD = cast<LoadSDNode>(N); + ISD::LoadExtType ExtType = LD->getExtensionType(); + if (ExtType == ISD::EXTLOAD) + return LD->getAlignment() >= 2 && !LD->isVolatile(); + return false; +}]>; + +def loadi32 : PatFrag<(ops node:$ptr), (i32 (unindexedload node:$ptr)), [{ + LoadSDNode *LD = cast<LoadSDNode>(N); + ISD::LoadExtType ExtType = LD->getExtensionType(); + if (ExtType == ISD::NON_EXTLOAD) + return true; + if (ExtType == ISD::EXTLOAD) + return LD->getAlignment() >= 4 && !LD->isVolatile(); + return false; +}]>; + +def loadi8 : PatFrag<(ops node:$ptr), (i8 (load node:$ptr))>; +def loadi64 : PatFrag<(ops node:$ptr), (i64 (load node:$ptr))>; +def loadf32 : PatFrag<(ops node:$ptr), (f32 (load node:$ptr))>; +def loadf64 : PatFrag<(ops node:$ptr), (f64 (load node:$ptr))>; +def loadf80 : PatFrag<(ops node:$ptr), (f80 (load node:$ptr))>; +def loadf128 : PatFrag<(ops node:$ptr), (f128 (load node:$ptr))>; + +def sextloadi16i8 : PatFrag<(ops node:$ptr), (i16 (sextloadi8 node:$ptr))>; +def sextloadi32i8 : PatFrag<(ops node:$ptr), (i32 (sextloadi8 node:$ptr))>; +def sextloadi32i16 : PatFrag<(ops node:$ptr), (i32 (sextloadi16 node:$ptr))>; +def sextloadi64i8 : PatFrag<(ops node:$ptr), (i64 (sextloadi8 node:$ptr))>; +def sextloadi64i16 : PatFrag<(ops node:$ptr), (i64 (sextloadi16 node:$ptr))>; +def sextloadi64i32 : PatFrag<(ops node:$ptr), (i64 (sextloadi32 node:$ptr))>; + +def zextloadi8i1 : PatFrag<(ops node:$ptr), (i8 (zextloadi1 node:$ptr))>; +def zextloadi16i1 : PatFrag<(ops node:$ptr), (i16 (zextloadi1 node:$ptr))>; +def zextloadi32i1 : PatFrag<(ops node:$ptr), (i32 (zextloadi1 node:$ptr))>; +def zextloadi16i8 : PatFrag<(ops node:$ptr), (i16 (zextloadi8 node:$ptr))>; +def zextloadi32i8 : PatFrag<(ops node:$ptr), (i32 (zextloadi8 node:$ptr))>; +def zextloadi32i16 : PatFrag<(ops node:$ptr), (i32 (zextloadi16 node:$ptr))>; +def zextloadi64i1 : PatFrag<(ops node:$ptr), (i64 (zextloadi1 node:$ptr))>; +def zextloadi64i8 : PatFrag<(ops node:$ptr), (i64 (zextloadi8 node:$ptr))>; +def zextloadi64i16 : PatFrag<(ops node:$ptr), (i64 (zextloadi16 node:$ptr))>; +def zextloadi64i32 : PatFrag<(ops node:$ptr), (i64 (zextloadi32 node:$ptr))>; + +def extloadi8i1 : PatFrag<(ops node:$ptr), (i8 (extloadi1 node:$ptr))>; +def extloadi16i1 : PatFrag<(ops node:$ptr), (i16 (extloadi1 node:$ptr))>; +def extloadi32i1 : PatFrag<(ops node:$ptr), (i32 (extloadi1 node:$ptr))>; +def extloadi16i8 : PatFrag<(ops node:$ptr), (i16 (extloadi8 node:$ptr))>; +def extloadi32i8 : PatFrag<(ops node:$ptr), (i32 (extloadi8 node:$ptr))>; +def extloadi32i16 : PatFrag<(ops node:$ptr), (i32 (extloadi16 node:$ptr))>; +def extloadi64i1 : PatFrag<(ops node:$ptr), (i64 (extloadi1 node:$ptr))>; +def extloadi64i8 : PatFrag<(ops node:$ptr), (i64 (extloadi8 node:$ptr))>; +def extloadi64i16 : PatFrag<(ops node:$ptr), (i64 (extloadi16 node:$ptr))>; +def extloadi64i32 : PatFrag<(ops node:$ptr), (i64 (extloadi32 node:$ptr))>; + + +// An 'and' node with a single use. 
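Before the single-use helper fragments below, a standalone restatement of the widening rule the loadi16/loadi32 fragments above encode: an any-extending load may be treated as a load of the wider type only when it is non-volatile and at least as aligned as that wider type, so the extra bytes are guaranteed to be accessible. The function is illustrative, not LLVM's.

#include <cassert>

// Mirrors the loadi16/loadi32 checks above: a non-extending load of the
// exact width always matches; an any-extending load may be widened only if
// it is aligned to the wider width and not volatile; sext/zext loads never.
enum ExtKind { NonExtLoad, AnyExtLoad, SignOrZeroExtLoad };

static bool canWidenLoad(ExtKind Ext, unsigned AlignBytes, bool Volatile,
                         unsigned WideBytes) {
  if (Ext == NonExtLoad)
    return true;
  if (Ext == AnyExtLoad)
    return AlignBytes >= WideBytes && !Volatile;
  return false;
}

int main() {
  assert(canWidenLoad(AnyExtLoad, /*Align=*/4, /*Volatile=*/false, /*Wide=*/4));
  assert(!canWidenLoad(AnyExtLoad, 2, false, 4)); // only 2-byte aligned
  assert(!canWidenLoad(AnyExtLoad, 4, /*Volatile=*/true, 4));
  return 0;
}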
+def and_su : PatFrag<(ops node:$lhs, node:$rhs), (and node:$lhs, node:$rhs), [{ + return N->hasOneUse(); +}]>; +// An 'srl' node with a single use. +def srl_su : PatFrag<(ops node:$lhs, node:$rhs), (srl node:$lhs, node:$rhs), [{ + return N->hasOneUse(); +}]>; +// An 'trunc' node with a single use. +def trunc_su : PatFrag<(ops node:$src), (trunc node:$src), [{ + return N->hasOneUse(); +}]>; + +//===----------------------------------------------------------------------===// +// Instruction list. +// + +// Nop +let hasSideEffects = 0, SchedRW = [WriteZero] in { + def NOOP : I<0x90, RawFrm, (outs), (ins), "nop", [], IIC_NOP>; + def NOOPW : I<0x1f, MRMXm, (outs), (ins i16mem:$zero), + "nop{w}\t$zero", [], IIC_NOP>, TB, OpSize16; + def NOOPL : I<0x1f, MRMXm, (outs), (ins i32mem:$zero), + "nop{l}\t$zero", [], IIC_NOP>, TB, OpSize32; +} + + +// Constructing a stack frame. +def ENTER : Ii16<0xC8, RawFrmImm8, (outs), (ins i16imm:$len, i8imm:$lvl), + "enter\t$len, $lvl", [], IIC_ENTER>, Sched<[WriteMicrocoded]>; + +let SchedRW = [WriteALU] in { +let Defs = [EBP, ESP], Uses = [EBP, ESP], mayLoad = 1, hasSideEffects=0 in +def LEAVE : I<0xC9, RawFrm, + (outs), (ins), "leave", [], IIC_LEAVE>, + Requires<[Not64BitMode]>; + +let Defs = [RBP,RSP], Uses = [RBP,RSP], mayLoad = 1, hasSideEffects = 0 in +def LEAVE64 : I<0xC9, RawFrm, + (outs), (ins), "leave", [], IIC_LEAVE>, + Requires<[In64BitMode]>; +} // SchedRW + +//===----------------------------------------------------------------------===// +// Miscellaneous Instructions. +// + +let Defs = [ESP], Uses = [ESP], hasSideEffects=0 in { +let mayLoad = 1, SchedRW = [WriteLoad] in { +def POP16r : I<0x58, AddRegFrm, (outs GR16:$reg), (ins), "pop{w}\t$reg", [], + IIC_POP_REG16>, OpSize16; +def POP32r : I<0x58, AddRegFrm, (outs GR32:$reg), (ins), "pop{l}\t$reg", [], + IIC_POP_REG>, OpSize32, Requires<[Not64BitMode]>; +def POP16rmr: I<0x8F, MRM0r, (outs GR16:$reg), (ins), "pop{w}\t$reg", [], + IIC_POP_REG>, OpSize16; +def POP16rmm: I<0x8F, MRM0m, (outs), (ins i16mem:$dst), "pop{w}\t$dst", [], + IIC_POP_MEM>, OpSize16; +def POP32rmr: I<0x8F, MRM0r, (outs GR32:$reg), (ins), "pop{l}\t$reg", [], + IIC_POP_REG>, OpSize32, Requires<[Not64BitMode]>; +def POP32rmm: I<0x8F, MRM0m, (outs), (ins i32mem:$dst), "pop{l}\t$dst", [], + IIC_POP_MEM>, OpSize32, Requires<[Not64BitMode]>; +} // mayLoad, SchedRW + +let mayStore = 1, SchedRW = [WriteStore] in { +def PUSH16r : I<0x50, AddRegFrm, (outs), (ins GR16:$reg), "push{w}\t$reg",[], + IIC_PUSH_REG>, OpSize16; +def PUSH32r : I<0x50, AddRegFrm, (outs), (ins GR32:$reg), "push{l}\t$reg",[], + IIC_PUSH_REG>, OpSize32, Requires<[Not64BitMode]>; +def PUSH16rmr: I<0xFF, MRM6r, (outs), (ins GR16:$reg), "push{w}\t$reg",[], + IIC_PUSH_REG>, OpSize16; +def PUSH32rmr: I<0xFF, MRM6r, (outs), (ins GR32:$reg), "push{l}\t$reg",[], + IIC_PUSH_REG>, OpSize32, Requires<[Not64BitMode]>; + +def PUSH16i8 : Ii8<0x6a, RawFrm, (outs), (ins i16i8imm:$imm), + "push{w}\t$imm", [], IIC_PUSH_IMM>, OpSize16; +def PUSHi16 : Ii16<0x68, RawFrm, (outs), (ins i16imm:$imm), + "push{w}\t$imm", [], IIC_PUSH_IMM>, OpSize16; + +def PUSH32i8 : Ii8<0x6a, RawFrm, (outs), (ins i32i8imm:$imm), + "push{l}\t$imm", [], IIC_PUSH_IMM>, OpSize32, + Requires<[Not64BitMode]>; +def PUSHi32 : Ii32<0x68, RawFrm, (outs), (ins i32imm:$imm), + "push{l}\t$imm", [], IIC_PUSH_IMM>, OpSize32, + Requires<[Not64BitMode]>; +} // mayStore, SchedRW + +let mayLoad = 1, mayStore = 1, SchedRW = [WriteRMW] in { +def PUSH16rmm: I<0xFF, MRM6m, (outs), (ins i16mem:$src), "push{w}\t$src",[], + 
IIC_PUSH_MEM>, OpSize16; +def PUSH32rmm: I<0xFF, MRM6m, (outs), (ins i32mem:$src), "push{l}\t$src",[], + IIC_PUSH_MEM>, OpSize32, Requires<[Not64BitMode]>; +} // mayLoad, mayStore, SchedRW + +} + +let mayLoad = 1, mayStore = 1, usesCustomInserter = 1, + SchedRW = [WriteRMW], Defs = [ESP] in { + let Uses = [ESP, EFLAGS] in + def RDFLAGS32 : PseudoI<(outs GR32:$dst), (ins), + [(set GR32:$dst, (int_x86_flags_read_u32))]>, + Requires<[Not64BitMode]>; + + let Uses = [RSP, EFLAGS] in + def RDFLAGS64 : PseudoI<(outs GR64:$dst), (ins), + [(set GR64:$dst, (int_x86_flags_read_u64))]>, + Requires<[In64BitMode]>; +} + +let mayLoad = 1, mayStore = 1, usesCustomInserter = 1, + SchedRW = [WriteRMW] in { + let Defs = [ESP, EFLAGS], Uses = [ESP] in + def WRFLAGS32 : PseudoI<(outs), (ins GR32:$src), + [(int_x86_flags_write_u32 GR32:$src)]>, + Requires<[Not64BitMode]>; + + let Defs = [RSP, EFLAGS], Uses = [RSP] in + def WRFLAGS64 : PseudoI<(outs), (ins GR64:$src), + [(int_x86_flags_write_u64 GR64:$src)]>, + Requires<[In64BitMode]>; +} + +let Defs = [ESP, EFLAGS], Uses = [ESP], mayLoad = 1, hasSideEffects=0, + SchedRW = [WriteLoad] in { +def POPF16 : I<0x9D, RawFrm, (outs), (ins), "popf{w}", [], IIC_POP_F>, + OpSize16; +def POPF32 : I<0x9D, RawFrm, (outs), (ins), "popf{l|d}", [], IIC_POP_FD>, + OpSize32, Requires<[Not64BitMode]>; +} + +let Defs = [ESP], Uses = [ESP, EFLAGS], mayStore = 1, hasSideEffects=0, + SchedRW = [WriteStore] in { +def PUSHF16 : I<0x9C, RawFrm, (outs), (ins), "pushf{w}", [], IIC_PUSH_F>, + OpSize16; +def PUSHF32 : I<0x9C, RawFrm, (outs), (ins), "pushf{l|d}", [], IIC_PUSH_F>, + OpSize32, Requires<[Not64BitMode]>; +} + +let Defs = [RSP], Uses = [RSP], hasSideEffects=0 in { +let mayLoad = 1, SchedRW = [WriteLoad] in { +def POP64r : I<0x58, AddRegFrm, (outs GR64:$reg), (ins), "pop{q}\t$reg", [], + IIC_POP_REG>, OpSize32, Requires<[In64BitMode]>; +def POP64rmr: I<0x8F, MRM0r, (outs GR64:$reg), (ins), "pop{q}\t$reg", [], + IIC_POP_REG>, OpSize32, Requires<[In64BitMode]>; +def POP64rmm: I<0x8F, MRM0m, (outs), (ins i64mem:$dst), "pop{q}\t$dst", [], + IIC_POP_MEM>, OpSize32, Requires<[In64BitMode]>; +} // mayLoad, SchedRW +let mayStore = 1, SchedRW = [WriteStore] in { +def PUSH64r : I<0x50, AddRegFrm, (outs), (ins GR64:$reg), "push{q}\t$reg", [], + IIC_PUSH_REG>, OpSize32, Requires<[In64BitMode]>; +def PUSH64rmr: I<0xFF, MRM6r, (outs), (ins GR64:$reg), "push{q}\t$reg", [], + IIC_PUSH_REG>, OpSize32, Requires<[In64BitMode]>; +} // mayStore, SchedRW +let mayLoad = 1, mayStore = 1, SchedRW = [WriteRMW] in { +def PUSH64rmm: I<0xFF, MRM6m, (outs), (ins i64mem:$src), "push{q}\t$src", [], + IIC_PUSH_MEM>, OpSize32, Requires<[In64BitMode]>; +} // mayLoad, mayStore, SchedRW +} + +let Defs = [RSP], Uses = [RSP], hasSideEffects = 0, mayStore = 1, + SchedRW = [WriteStore] in { +def PUSH64i8 : Ii8<0x6a, RawFrm, (outs), (ins i64i8imm:$imm), + "push{q}\t$imm", [], IIC_PUSH_IMM>, OpSize32, + Requires<[In64BitMode]>; +def PUSH64i32 : Ii32S<0x68, RawFrm, (outs), (ins i64i32imm:$imm), + "push{q}\t$imm", [], IIC_PUSH_IMM>, OpSize32, + Requires<[In64BitMode]>; +} + +let Defs = [RSP, EFLAGS], Uses = [RSP], mayLoad = 1, hasSideEffects=0 in +def POPF64 : I<0x9D, RawFrm, (outs), (ins), "popfq", [], IIC_POP_FD>, + OpSize32, Requires<[In64BitMode]>, Sched<[WriteLoad]>; +let Defs = [RSP], Uses = [RSP, EFLAGS], mayStore = 1, hasSideEffects=0 in +def PUSHF64 : I<0x9C, RawFrm, (outs), (ins), "pushfq", [], IIC_PUSH_F>, + OpSize32, Requires<[In64BitMode]>, Sched<[WriteStore]>; + +let Defs = [EDI, ESI, EBP, EBX, EDX, ECX, EAX, 
ESP], Uses = [ESP], + mayLoad = 1, hasSideEffects = 0, SchedRW = [WriteLoad] in { +def POPA32 : I<0x61, RawFrm, (outs), (ins), "popal", [], IIC_POP_A>, + OpSize32, Requires<[Not64BitMode]>; +def POPA16 : I<0x61, RawFrm, (outs), (ins), "popaw", [], IIC_POP_A>, + OpSize16, Requires<[Not64BitMode]>; +} +let Defs = [ESP], Uses = [EDI, ESI, EBP, EBX, EDX, ECX, EAX, ESP], + mayStore = 1, hasSideEffects = 0, SchedRW = [WriteStore] in { +def PUSHA32 : I<0x60, RawFrm, (outs), (ins), "pushal", [], IIC_PUSH_A>, + OpSize32, Requires<[Not64BitMode]>; +def PUSHA16 : I<0x60, RawFrm, (outs), (ins), "pushaw", [], IIC_PUSH_A>, + OpSize16, Requires<[Not64BitMode]>; +} + +let Constraints = "$src = $dst", SchedRW = [WriteALU] in { +// GR32 = bswap GR32 +def BSWAP32r : I<0xC8, AddRegFrm, + (outs GR32:$dst), (ins GR32:$src), + "bswap{l}\t$dst", + [(set GR32:$dst, (bswap GR32:$src))], IIC_BSWAP>, OpSize32, TB; + +def BSWAP64r : RI<0xC8, AddRegFrm, (outs GR64:$dst), (ins GR64:$src), + "bswap{q}\t$dst", + [(set GR64:$dst, (bswap GR64:$src))], IIC_BSWAP>, TB; +} // Constraints = "$src = $dst", SchedRW + +// Bit scan instructions. +let Defs = [EFLAGS] in { +def BSF16rr : I<0xBC, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), + "bsf{w}\t{$src, $dst|$dst, $src}", + [(set GR16:$dst, EFLAGS, (X86bsf GR16:$src))], + IIC_BIT_SCAN_REG>, PS, OpSize16, Sched<[WriteShift]>; +def BSF16rm : I<0xBC, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), + "bsf{w}\t{$src, $dst|$dst, $src}", + [(set GR16:$dst, EFLAGS, (X86bsf (loadi16 addr:$src)))], + IIC_BIT_SCAN_MEM>, PS, OpSize16, Sched<[WriteShiftLd]>; +def BSF32rr : I<0xBC, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), + "bsf{l}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, EFLAGS, (X86bsf GR32:$src))], + IIC_BIT_SCAN_REG>, PS, OpSize32, Sched<[WriteShift]>; +def BSF32rm : I<0xBC, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), + "bsf{l}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, EFLAGS, (X86bsf (loadi32 addr:$src)))], + IIC_BIT_SCAN_MEM>, PS, OpSize32, Sched<[WriteShiftLd]>; +def BSF64rr : RI<0xBC, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), + "bsf{q}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, EFLAGS, (X86bsf GR64:$src))], + IIC_BIT_SCAN_REG>, PS, Sched<[WriteShift]>; +def BSF64rm : RI<0xBC, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), + "bsf{q}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, EFLAGS, (X86bsf (loadi64 addr:$src)))], + IIC_BIT_SCAN_MEM>, PS, Sched<[WriteShiftLd]>; + +def BSR16rr : I<0xBD, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), + "bsr{w}\t{$src, $dst|$dst, $src}", + [(set GR16:$dst, EFLAGS, (X86bsr GR16:$src))], + IIC_BIT_SCAN_REG>, PS, OpSize16, Sched<[WriteShift]>; +def BSR16rm : I<0xBD, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), + "bsr{w}\t{$src, $dst|$dst, $src}", + [(set GR16:$dst, EFLAGS, (X86bsr (loadi16 addr:$src)))], + IIC_BIT_SCAN_MEM>, PS, OpSize16, Sched<[WriteShiftLd]>; +def BSR32rr : I<0xBD, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), + "bsr{l}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, EFLAGS, (X86bsr GR32:$src))], + IIC_BIT_SCAN_REG>, PS, OpSize32, Sched<[WriteShift]>; +def BSR32rm : I<0xBD, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), + "bsr{l}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, EFLAGS, (X86bsr (loadi32 addr:$src)))], + IIC_BIT_SCAN_MEM>, PS, OpSize32, Sched<[WriteShiftLd]>; +def BSR64rr : RI<0xBD, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), + "bsr{q}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, EFLAGS, (X86bsr GR64:$src))], + IIC_BIT_SCAN_REG>, PS, Sched<[WriteShift]>; +def BSR64rm : RI<0xBD, MRMSrcMem, (outs 
GR64:$dst), (ins i64mem:$src), + "bsr{q}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, EFLAGS, (X86bsr (loadi64 addr:$src)))], + IIC_BIT_SCAN_MEM>, PS, Sched<[WriteShiftLd]>; +} // Defs = [EFLAGS] + +let SchedRW = [WriteMicrocoded] in { +// These uses the DF flag in the EFLAGS register to inc or dec EDI and ESI +let Defs = [EDI,ESI], Uses = [EDI,ESI,EFLAGS] in { +def MOVSB : I<0xA4, RawFrmDstSrc, (outs dstidx8:$dst), (ins srcidx8:$src), + "movsb\t{$src, $dst|$dst, $src}", [], IIC_MOVS>; +def MOVSW : I<0xA5, RawFrmDstSrc, (outs dstidx16:$dst), (ins srcidx16:$src), + "movsw\t{$src, $dst|$dst, $src}", [], IIC_MOVS>, OpSize16; +def MOVSL : I<0xA5, RawFrmDstSrc, (outs dstidx32:$dst), (ins srcidx32:$src), + "movs{l|d}\t{$src, $dst|$dst, $src}", [], IIC_MOVS>, OpSize32; +def MOVSQ : RI<0xA5, RawFrmDstSrc, (outs dstidx64:$dst), (ins srcidx64:$src), + "movsq\t{$src, $dst|$dst, $src}", [], IIC_MOVS>; +} + +// These uses the DF flag in the EFLAGS register to inc or dec EDI and ESI +let Defs = [EDI], Uses = [AL,EDI,EFLAGS] in +def STOSB : I<0xAA, RawFrmDst, (outs dstidx8:$dst), (ins), + "stosb\t{%al, $dst|$dst, al}", [], IIC_STOS>; +let Defs = [EDI], Uses = [AX,EDI,EFLAGS] in +def STOSW : I<0xAB, RawFrmDst, (outs dstidx16:$dst), (ins), + "stosw\t{%ax, $dst|$dst, ax}", [], IIC_STOS>, OpSize16; +let Defs = [EDI], Uses = [EAX,EDI,EFLAGS] in +def STOSL : I<0xAB, RawFrmDst, (outs dstidx32:$dst), (ins), + "stos{l|d}\t{%eax, $dst|$dst, eax}", [], IIC_STOS>, OpSize32; +let Defs = [RCX,RDI], Uses = [RAX,RCX,RDI,EFLAGS] in +def STOSQ : RI<0xAB, RawFrmDst, (outs dstidx64:$dst), (ins), + "stosq\t{%rax, $dst|$dst, rax}", [], IIC_STOS>; + +// These uses the DF flag in the EFLAGS register to inc or dec EDI and ESI +let Defs = [EDI,EFLAGS], Uses = [AL,EDI,EFLAGS] in +def SCASB : I<0xAE, RawFrmDst, (outs), (ins dstidx8:$dst), + "scasb\t{$dst, %al|al, $dst}", [], IIC_SCAS>; +let Defs = [EDI,EFLAGS], Uses = [AX,EDI,EFLAGS] in +def SCASW : I<0xAF, RawFrmDst, (outs), (ins dstidx16:$dst), + "scasw\t{$dst, %ax|ax, $dst}", [], IIC_SCAS>, OpSize16; +let Defs = [EDI,EFLAGS], Uses = [EAX,EDI,EFLAGS] in +def SCASL : I<0xAF, RawFrmDst, (outs), (ins dstidx32:$dst), + "scas{l|d}\t{$dst, %eax|eax, $dst}", [], IIC_SCAS>, OpSize32; +let Defs = [EDI,EFLAGS], Uses = [RAX,EDI,EFLAGS] in +def SCASQ : RI<0xAF, RawFrmDst, (outs), (ins dstidx64:$dst), + "scasq\t{$dst, %rax|rax, $dst}", [], IIC_SCAS>; + +// These uses the DF flag in the EFLAGS register to inc or dec EDI and ESI +let Defs = [EDI,ESI,EFLAGS], Uses = [EDI,ESI,EFLAGS] in { +def CMPSB : I<0xA6, RawFrmDstSrc, (outs), (ins dstidx8:$dst, srcidx8:$src), + "cmpsb\t{$dst, $src|$src, $dst}", [], IIC_CMPS>; +def CMPSW : I<0xA7, RawFrmDstSrc, (outs), (ins dstidx16:$dst, srcidx16:$src), + "cmpsw\t{$dst, $src|$src, $dst}", [], IIC_CMPS>, OpSize16; +def CMPSL : I<0xA7, RawFrmDstSrc, (outs), (ins dstidx32:$dst, srcidx32:$src), + "cmps{l|d}\t{$dst, $src|$src, $dst}", [], IIC_CMPS>, OpSize32; +def CMPSQ : RI<0xA7, RawFrmDstSrc, (outs), (ins dstidx64:$dst, srcidx64:$src), + "cmpsq\t{$dst, $src|$src, $dst}", [], IIC_CMPS>; +} +} // SchedRW + +//===----------------------------------------------------------------------===// +// Move Instructions. 
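As a footnote to the string instructions above (MOVS/STOS/SCAS/CMPS) before the move-instruction section continues below: each one steps RSI/RDI forward or backward by the element size depending on EFLAGS.DF, which is why EFLAGS appears among their implicit uses. A small standalone emulation of that stepping; the names and structure are mine, not the backend's.

#include <cstdint>
#include <cstring>
#include <cassert>

// Emulates one "movs" element: copy Size bytes from *RSI to *RDI, then
// advance (DF == 0) or retreat (DF == 1) both pointers by Size, which is
// exactly the implicit RSI/RDI update the hardware performs.
static void movsStep(uint8_t *&RDI, const uint8_t *&RSI, unsigned Size,
                     bool DF) {
  std::memcpy(RDI, RSI, Size);
  if (DF) { RDI -= Size; RSI -= Size; }
  else    { RDI += Size; RSI += Size; }
}

int main() {
  uint8_t Src[4] = {1, 2, 3, 4}, Dst[4] = {0, 0, 0, 0};
  const uint8_t *RSI = Src;
  uint8_t *RDI = Dst;
  for (int i = 0; i < 4; ++i)
    movsStep(RDI, RSI, 1, /*DF=*/false);   // "rep movsb" with DF clear
  assert(std::memcmp(Src, Dst, 4) == 0);
  return 0;
}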
+// +let SchedRW = [WriteMove] in { +let hasSideEffects = 0 in { +def MOV8rr : I<0x88, MRMDestReg, (outs GR8 :$dst), (ins GR8 :$src), + "mov{b}\t{$src, $dst|$dst, $src}", [], IIC_MOV>; +def MOV16rr : I<0x89, MRMDestReg, (outs GR16:$dst), (ins GR16:$src), + "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize16; +def MOV32rr : I<0x89, MRMDestReg, (outs GR32:$dst), (ins GR32:$src), + "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize32; +def MOV64rr : RI<0x89, MRMDestReg, (outs GR64:$dst), (ins GR64:$src), + "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV>; +} + +let isReMaterializable = 1, isAsCheapAsAMove = 1 in { +def MOV8ri : Ii8 <0xB0, AddRegFrm, (outs GR8 :$dst), (ins i8imm :$src), + "mov{b}\t{$src, $dst|$dst, $src}", + [(set GR8:$dst, imm:$src)], IIC_MOV>; +def MOV16ri : Ii16<0xB8, AddRegFrm, (outs GR16:$dst), (ins i16imm:$src), + "mov{w}\t{$src, $dst|$dst, $src}", + [(set GR16:$dst, imm:$src)], IIC_MOV>, OpSize16; +def MOV32ri : Ii32<0xB8, AddRegFrm, (outs GR32:$dst), (ins i32imm:$src), + "mov{l}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, imm:$src)], IIC_MOV>, OpSize32; +def MOV64ri32 : RIi32S<0xC7, MRM0r, (outs GR64:$dst), (ins i64i32imm:$src), + "mov{q}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, i64immSExt32:$src)], IIC_MOV>; +} +let isReMaterializable = 1 in { +def MOV64ri : RIi64<0xB8, AddRegFrm, (outs GR64:$dst), (ins i64imm:$src), + "movabs{q}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, imm:$src)], IIC_MOV>; +} + +// Longer forms that use a ModR/M byte. Needed for disassembler +let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in { +def MOV8ri_alt : Ii8 <0xC6, MRM0r, (outs GR8 :$dst), (ins i8imm :$src), + "mov{b}\t{$src, $dst|$dst, $src}", [], IIC_MOV>; +def MOV16ri_alt : Ii16<0xC7, MRM0r, (outs GR16:$dst), (ins i16imm:$src), + "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize16; +def MOV32ri_alt : Ii32<0xC7, MRM0r, (outs GR32:$dst), (ins i32imm:$src), + "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize32; +} +} // SchedRW + +let SchedRW = [WriteStore] in { +def MOV8mi : Ii8 <0xC6, MRM0m, (outs), (ins i8mem :$dst, i8imm :$src), + "mov{b}\t{$src, $dst|$dst, $src}", + [(store (i8 imm8_su:$src), addr:$dst)], IIC_MOV_MEM>; +def MOV16mi : Ii16<0xC7, MRM0m, (outs), (ins i16mem:$dst, i16imm:$src), + "mov{w}\t{$src, $dst|$dst, $src}", + [(store (i16 imm16_su:$src), addr:$dst)], IIC_MOV_MEM>, OpSize16; +def MOV32mi : Ii32<0xC7, MRM0m, (outs), (ins i32mem:$dst, i32imm:$src), + "mov{l}\t{$src, $dst|$dst, $src}", + [(store (i32 imm32_su:$src), addr:$dst)], IIC_MOV_MEM>, OpSize32; +def MOV64mi32 : RIi32S<0xC7, MRM0m, (outs), (ins i64mem:$dst, i64i32imm:$src), + "mov{q}\t{$src, $dst|$dst, $src}", + [(store i64immSExt32:$src, addr:$dst)], IIC_MOV_MEM>; +} // SchedRW + +let hasSideEffects = 0 in { + +/// Memory offset versions of moves. The immediate is an address mode sized +/// offset from the segment base. 
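These accumulator forms carry no ModR/M byte at all: the encoded immediate is an absolute offset from the segment base, and the AdSize16/AdSize32/AdSize64 attributes on the definitions that follow fix how wide that offset is. A standalone sketch of the address computation (segment base of zero in the usual flat-memory case); the helper is illustrative, not LLVM code.

#include <cstdint>
#include <cstdio>

// moffs addressing: linear address = segment base + encoded offset, where
// the offset width (2, 4 or 8 bytes) is fixed by the address-size attribute
// of the chosen encoding (AdSize16/AdSize32/AdSize64).
static uint64_t moffsLinearAddress(uint64_t SegBase, uint64_t EncodedOffset,
                                   unsigned OffsetBytes) {
  uint64_t Mask = OffsetBytes >= 8 ? ~0ULL
                                   : (1ULL << (8 * OffsetBytes)) - 1;
  return SegBase + (EncodedOffset & Mask);
}

int main() {
  // "mov 0x1234, %al" with a 32-bit offset and a flat (zero-based) segment.
  std::printf("0x%llx\n",
              (unsigned long long)moffsLinearAddress(0, 0x1234, 4));
  return 0;
}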
+let SchedRW = [WriteALU] in { +let mayLoad = 1 in { +let Defs = [AL] in +def MOV8ao32 : Ii32<0xA0, RawFrmMemOffs, (outs), (ins offset32_8:$src), + "mov{b}\t{$src, %al|al, $src}", [], IIC_MOV_MEM>, + AdSize32; +let Defs = [AX] in +def MOV16ao32 : Ii32<0xA1, RawFrmMemOffs, (outs), (ins offset32_16:$src), + "mov{w}\t{$src, %ax|ax, $src}", [], IIC_MOV_MEM>, + OpSize16, AdSize32; +let Defs = [EAX] in +def MOV32ao32 : Ii32<0xA1, RawFrmMemOffs, (outs), (ins offset32_32:$src), + "mov{l}\t{$src, %eax|eax, $src}", [], IIC_MOV_MEM>, + OpSize32, AdSize32; +let Defs = [RAX] in +def MOV64ao32 : RIi32<0xA1, RawFrmMemOffs, (outs), (ins offset32_64:$src), + "mov{q}\t{$src, %rax|rax, $src}", [], IIC_MOV_MEM>, + AdSize32; + +let Defs = [AL] in +def MOV8ao16 : Ii16<0xA0, RawFrmMemOffs, (outs), (ins offset16_8:$src), + "mov{b}\t{$src, %al|al, $src}", [], IIC_MOV_MEM>, AdSize16; +let Defs = [AX] in +def MOV16ao16 : Ii16<0xA1, RawFrmMemOffs, (outs), (ins offset16_16:$src), + "mov{w}\t{$src, %ax|ax, $src}", [], IIC_MOV_MEM>, + OpSize16, AdSize16; +let Defs = [EAX] in +def MOV32ao16 : Ii16<0xA1, RawFrmMemOffs, (outs), (ins offset16_32:$src), + "mov{l}\t{$src, %eax|eax, $src}", [], IIC_MOV_MEM>, + AdSize16, OpSize32; +} +let mayStore = 1 in { +let Uses = [AL] in +def MOV8o32a : Ii32<0xA2, RawFrmMemOffs, (outs offset32_8:$dst), (ins), + "mov{b}\t{%al, $dst|$dst, al}", [], IIC_MOV_MEM>, AdSize32; +let Uses = [AX] in +def MOV16o32a : Ii32<0xA3, RawFrmMemOffs, (outs offset32_16:$dst), (ins), + "mov{w}\t{%ax, $dst|$dst, ax}", [], IIC_MOV_MEM>, + OpSize16, AdSize32; +let Uses = [EAX] in +def MOV32o32a : Ii32<0xA3, RawFrmMemOffs, (outs offset32_32:$dst), (ins), + "mov{l}\t{%eax, $dst|$dst, eax}", [], IIC_MOV_MEM>, + OpSize32, AdSize32; +let Uses = [RAX] in +def MOV64o32a : RIi32<0xA3, RawFrmMemOffs, (outs offset32_64:$dst), (ins), + "mov{q}\t{%rax, $dst|$dst, rax}", [], IIC_MOV_MEM>, + AdSize32; + +let Uses = [AL] in +def MOV8o16a : Ii16<0xA2, RawFrmMemOffs, (outs offset16_8:$dst), (ins), + "mov{b}\t{%al, $dst|$dst, al}", [], IIC_MOV_MEM>, AdSize16; +let Uses = [AX] in +def MOV16o16a : Ii16<0xA3, RawFrmMemOffs, (outs offset16_16:$dst), (ins), + "mov{w}\t{%ax, $dst|$dst, ax}", [], IIC_MOV_MEM>, + OpSize16, AdSize16; +let Uses = [EAX] in +def MOV32o16a : Ii16<0xA3, RawFrmMemOffs, (outs offset16_32:$dst), (ins), + "mov{l}\t{%eax, $dst|$dst, eax}", [], IIC_MOV_MEM>, + OpSize32, AdSize16; +} +} + +// These forms all have full 64-bit absolute addresses in their instructions +// and use the movabs mnemonic to indicate this specific form. 
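A small companion check to the comment above, before the AdSize64 definitions that follow: the accumulator moves just defined carry only a 32-bit offset, so an absolute address that does not fit in 32 bits has to use one of these full 64-bit movabs offset forms. Standalone sketch; the helper name is mine.

#include <cstdint>
#include <cassert>

// True if an absolute address can be reached by the 32-bit-offset
// accumulator-move encodings above; otherwise the 64-bit "movabs" offset
// forms defined below are the only option.
static bool fitsIn32BitMoffs(uint64_t AbsAddr) {
  return AbsAddr <= 0xFFFFFFFFULL;
}

int main() {
  assert(fitsIn32BitMoffs(0x00000000DEADBEEFULL));
  assert(!fitsIn32BitMoffs(0x0000123400000000ULL));
  return 0;
}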
+let mayLoad = 1 in { +let Defs = [AL] in +def MOV8ao64 : RIi64_NOREX<0xA0, RawFrmMemOffs, (outs), (ins offset64_8:$src), + "movabs{b}\t{$src, %al|al, $src}", []>, AdSize64; +let Defs = [AX] in +def MOV16ao64 : RIi64_NOREX<0xA1, RawFrmMemOffs, (outs), (ins offset64_16:$src), + "movabs{w}\t{$src, %ax|ax, $src}", []>, OpSize16, AdSize64; +let Defs = [EAX] in +def MOV32ao64 : RIi64_NOREX<0xA1, RawFrmMemOffs, (outs), (ins offset64_32:$src), + "movabs{l}\t{$src, %eax|eax, $src}", []>, OpSize32, + AdSize64; +let Defs = [RAX] in +def MOV64ao64 : RIi64<0xA1, RawFrmMemOffs, (outs), (ins offset64_64:$src), + "movabs{q}\t{$src, %rax|rax, $src}", []>, AdSize64; +} + +let mayStore = 1 in { +let Uses = [AL] in +def MOV8o64a : RIi64_NOREX<0xA2, RawFrmMemOffs, (outs offset64_8:$dst), (ins), + "movabs{b}\t{%al, $dst|$dst, al}", []>, AdSize64; +let Uses = [AX] in +def MOV16o64a : RIi64_NOREX<0xA3, RawFrmMemOffs, (outs offset64_16:$dst), (ins), + "movabs{w}\t{%ax, $dst|$dst, ax}", []>, OpSize16, AdSize64; +let Uses = [EAX] in +def MOV32o64a : RIi64_NOREX<0xA3, RawFrmMemOffs, (outs offset64_32:$dst), (ins), + "movabs{l}\t{%eax, $dst|$dst, eax}", []>, OpSize32, + AdSize64; +let Uses = [RAX] in +def MOV64o64a : RIi64<0xA3, RawFrmMemOffs, (outs offset64_64:$dst), (ins), + "movabs{q}\t{%rax, $dst|$dst, rax}", []>, AdSize64; +} +} // hasSideEffects = 0 + +let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, + SchedRW = [WriteMove] in { +def MOV8rr_REV : I<0x8A, MRMSrcReg, (outs GR8:$dst), (ins GR8:$src), + "mov{b}\t{$src, $dst|$dst, $src}", [], IIC_MOV>; +def MOV16rr_REV : I<0x8B, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), + "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize16; +def MOV32rr_REV : I<0x8B, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), + "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize32; +def MOV64rr_REV : RI<0x8B, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), + "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV>; +} + +let canFoldAsLoad = 1, isReMaterializable = 1, SchedRW = [WriteLoad] in { +def MOV8rm : I<0x8A, MRMSrcMem, (outs GR8 :$dst), (ins i8mem :$src), + "mov{b}\t{$src, $dst|$dst, $src}", + [(set GR8:$dst, (loadi8 addr:$src))], IIC_MOV_MEM>; +def MOV16rm : I<0x8B, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), + "mov{w}\t{$src, $dst|$dst, $src}", + [(set GR16:$dst, (loadi16 addr:$src))], IIC_MOV_MEM>, OpSize16; +def MOV32rm : I<0x8B, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), + "mov{l}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (loadi32 addr:$src))], IIC_MOV_MEM>, OpSize32; +def MOV64rm : RI<0x8B, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), + "mov{q}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (load addr:$src))], IIC_MOV_MEM>; +} + +let SchedRW = [WriteStore] in { +def MOV8mr : I<0x88, MRMDestMem, (outs), (ins i8mem :$dst, GR8 :$src), + "mov{b}\t{$src, $dst|$dst, $src}", + [(store GR8:$src, addr:$dst)], IIC_MOV_MEM>; +def MOV16mr : I<0x89, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src), + "mov{w}\t{$src, $dst|$dst, $src}", + [(store GR16:$src, addr:$dst)], IIC_MOV_MEM>, OpSize16; +def MOV32mr : I<0x89, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), + "mov{l}\t{$src, $dst|$dst, $src}", + [(store GR32:$src, addr:$dst)], IIC_MOV_MEM>, OpSize32; +def MOV64mr : RI<0x89, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), + "mov{q}\t{$src, $dst|$dst, $src}", + [(store GR64:$src, addr:$dst)], IIC_MOV_MEM>; +} // SchedRW + +// Versions of MOV8rr, MOV8mr, and MOV8rm that use i8mem_NOREX and GR8_NOREX so +// that they can be used for copying and storing h 
registers, which can't be +// encoded when a REX prefix is present. +let isCodeGenOnly = 1 in { +let hasSideEffects = 0 in +def MOV8rr_NOREX : I<0x88, MRMDestReg, + (outs GR8_NOREX:$dst), (ins GR8_NOREX:$src), + "mov{b}\t{$src, $dst|$dst, $src} # NOREX", [], IIC_MOV>, + Sched<[WriteMove]>; +let mayStore = 1, hasSideEffects = 0 in +def MOV8mr_NOREX : I<0x88, MRMDestMem, + (outs), (ins i8mem_NOREX:$dst, GR8_NOREX:$src), + "mov{b}\t{$src, $dst|$dst, $src} # NOREX", [], + IIC_MOV_MEM>, Sched<[WriteStore]>; +let mayLoad = 1, hasSideEffects = 0, + canFoldAsLoad = 1, isReMaterializable = 1 in +def MOV8rm_NOREX : I<0x8A, MRMSrcMem, + (outs GR8_NOREX:$dst), (ins i8mem_NOREX:$src), + "mov{b}\t{$src, $dst|$dst, $src} # NOREX", [], + IIC_MOV_MEM>, Sched<[WriteLoad]>; +} + + +// Condition code ops, incl. set if equal/not equal/... +let SchedRW = [WriteALU] in { +let Defs = [EFLAGS], Uses = [AH] in +def SAHF : I<0x9E, RawFrm, (outs), (ins), "sahf", + [(set EFLAGS, (X86sahf AH))], IIC_AHF>, + Requires<[HasLAHFSAHF]>; +let Defs = [AH], Uses = [EFLAGS], hasSideEffects = 0 in +def LAHF : I<0x9F, RawFrm, (outs), (ins), "lahf", [], + IIC_AHF>, // AH = flags + Requires<[HasLAHFSAHF]>; +} // SchedRW + +//===----------------------------------------------------------------------===// +// Bit tests instructions: BT, BTS, BTR, BTC. + +let Defs = [EFLAGS] in { +let SchedRW = [WriteALU] in { +def BT16rr : I<0xA3, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2), + "bt{w}\t{$src2, $src1|$src1, $src2}", + [(set EFLAGS, (X86bt GR16:$src1, GR16:$src2))], IIC_BT_RR>, + OpSize16, TB; +def BT32rr : I<0xA3, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2), + "bt{l}\t{$src2, $src1|$src1, $src2}", + [(set EFLAGS, (X86bt GR32:$src1, GR32:$src2))], IIC_BT_RR>, + OpSize32, TB; +def BT64rr : RI<0xA3, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2), + "bt{q}\t{$src2, $src1|$src1, $src2}", + [(set EFLAGS, (X86bt GR64:$src1, GR64:$src2))], IIC_BT_RR>, TB; +} // SchedRW + +// Unlike with the register+register form, the memory+register form of the +// bt instruction does not ignore the high bits of the index. From ISel's +// perspective, this is pretty bizarre. Make these instructions disassembly +// only for now. 
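A standalone sketch of the difference the comment above describes, ahead of the memory forms defined next: with a register destination, bt masks the bit index to the operand width, but with a memory destination the index first selects a byte relative to the operand address, so high index bits change which byte is read. Names are mine; this is an illustration of the architectural semantics, not the selector's code.

#include <cstdint>
#include <cassert>

// Register form: "bt %reg2, %reg1" only looks at Index mod OperandBits.
static bool btRegister(uint64_t Value, unsigned OperandBits, uint64_t Index) {
  return (Value >> (Index % OperandBits)) & 1;
}

// Memory form: "bt %reg, (mem)" treats memory as a bit string, so the index
// first selects the byte at Mem + (Index / 8) and then a bit within it.
static bool btMemory(const uint8_t *Mem, int64_t Index) {
  return (Mem[Index >> 3] >> (Index & 7)) & 1;
}

int main() {
  uint8_t Bytes[16] = {};
  Bytes[8] = 0x01;                     // bit 64 of the bit string is set
  uint64_t First8 = 0;                 // a 64-bit value holding Bytes[0..7]
  assert(!btRegister(First8, 64, 64)); // 64 % 64 == 0 -> tests bit 0
  assert(btMemory(Bytes, 64));         // reaches into the following byte
  return 0;
}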
+ +let mayLoad = 1, hasSideEffects = 0, SchedRW = [WriteALULd] in { + def BT16mr : I<0xA3, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2), + "bt{w}\t{$src2, $src1|$src1, $src2}", + // [(X86bt (loadi16 addr:$src1), GR16:$src2), + // (implicit EFLAGS)] + [], IIC_BT_MR + >, OpSize16, TB, Requires<[FastBTMem]>; + def BT32mr : I<0xA3, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2), + "bt{l}\t{$src2, $src1|$src1, $src2}", + // [(X86bt (loadi32 addr:$src1), GR32:$src2), + // (implicit EFLAGS)] + [], IIC_BT_MR + >, OpSize32, TB, Requires<[FastBTMem]>; + def BT64mr : RI<0xA3, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2), + "bt{q}\t{$src2, $src1|$src1, $src2}", + // [(X86bt (loadi64 addr:$src1), GR64:$src2), + // (implicit EFLAGS)] + [], IIC_BT_MR + >, TB; +} + +let SchedRW = [WriteALU] in { +def BT16ri8 : Ii8<0xBA, MRM4r, (outs), (ins GR16:$src1, i16i8imm:$src2), + "bt{w}\t{$src2, $src1|$src1, $src2}", + [(set EFLAGS, (X86bt GR16:$src1, i16immSExt8:$src2))], + IIC_BT_RI>, OpSize16, TB; +def BT32ri8 : Ii8<0xBA, MRM4r, (outs), (ins GR32:$src1, i32i8imm:$src2), + "bt{l}\t{$src2, $src1|$src1, $src2}", + [(set EFLAGS, (X86bt GR32:$src1, i32immSExt8:$src2))], + IIC_BT_RI>, OpSize32, TB; +def BT64ri8 : RIi8<0xBA, MRM4r, (outs), (ins GR64:$src1, i64i8imm:$src2), + "bt{q}\t{$src2, $src1|$src1, $src2}", + [(set EFLAGS, (X86bt GR64:$src1, i64immSExt8:$src2))], + IIC_BT_RI>, TB; +} // SchedRW + +// Note that these instructions don't need FastBTMem because that +// only applies when the other operand is in a register. When it's +// an immediate, bt is still fast. +let SchedRW = [WriteALU] in { +def BT16mi8 : Ii8<0xBA, MRM4m, (outs), (ins i16mem:$src1, i16i8imm:$src2), + "bt{w}\t{$src2, $src1|$src1, $src2}", + [(set EFLAGS, (X86bt (loadi16 addr:$src1), i16immSExt8:$src2)) + ], IIC_BT_MI>, OpSize16, TB; +def BT32mi8 : Ii8<0xBA, MRM4m, (outs), (ins i32mem:$src1, i32i8imm:$src2), + "bt{l}\t{$src2, $src1|$src1, $src2}", + [(set EFLAGS, (X86bt (loadi32 addr:$src1), i32immSExt8:$src2)) + ], IIC_BT_MI>, OpSize32, TB; +def BT64mi8 : RIi8<0xBA, MRM4m, (outs), (ins i64mem:$src1, i64i8imm:$src2), + "bt{q}\t{$src2, $src1|$src1, $src2}", + [(set EFLAGS, (X86bt (loadi64 addr:$src1), + i64immSExt8:$src2))], IIC_BT_MI>, TB; +} // SchedRW + +let hasSideEffects = 0 in { +let SchedRW = [WriteALU] in { +def BTC16rr : I<0xBB, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2), + "btc{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, + OpSize16, TB; +def BTC32rr : I<0xBB, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2), + "btc{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, + OpSize32, TB; +def BTC64rr : RI<0xBB, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2), + "btc{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, TB; +} // SchedRW + +let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in { +def BTC16mr : I<0xBB, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2), + "btc{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, + OpSize16, TB; +def BTC32mr : I<0xBB, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2), + "btc{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, + OpSize32, TB; +def BTC64mr : RI<0xBB, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2), + "btc{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB; +} + +let SchedRW = [WriteALU] in { +def BTC16ri8 : Ii8<0xBA, MRM7r, (outs), (ins GR16:$src1, i16i8imm:$src2), + "btc{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, + OpSize16, TB; +def BTC32ri8 : Ii8<0xBA, MRM7r, (outs), (ins GR32:$src1, i32i8imm:$src2), + "btc{l}\t{$src2, 
$src1|$src1, $src2}", [], IIC_BTX_RI>, + OpSize32, TB; +def BTC64ri8 : RIi8<0xBA, MRM7r, (outs), (ins GR64:$src1, i64i8imm:$src2), + "btc{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB; +} // SchedRW + +let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in { +def BTC16mi8 : Ii8<0xBA, MRM7m, (outs), (ins i16mem:$src1, i16i8imm:$src2), + "btc{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, + OpSize16, TB; +def BTC32mi8 : Ii8<0xBA, MRM7m, (outs), (ins i32mem:$src1, i32i8imm:$src2), + "btc{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, + OpSize32, TB; +def BTC64mi8 : RIi8<0xBA, MRM7m, (outs), (ins i64mem:$src1, i64i8imm:$src2), + "btc{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB; +} + +let SchedRW = [WriteALU] in { +def BTR16rr : I<0xB3, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2), + "btr{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, + OpSize16, TB; +def BTR32rr : I<0xB3, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2), + "btr{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, + OpSize32, TB; +def BTR64rr : RI<0xB3, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2), + "btr{q}\t{$src2, $src1|$src1, $src2}", []>, TB; +} // SchedRW + +let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in { +def BTR16mr : I<0xB3, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2), + "btr{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, + OpSize16, TB; +def BTR32mr : I<0xB3, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2), + "btr{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, + OpSize32, TB; +def BTR64mr : RI<0xB3, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2), + "btr{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB; +} + +let SchedRW = [WriteALU] in { +def BTR16ri8 : Ii8<0xBA, MRM6r, (outs), (ins GR16:$src1, i16i8imm:$src2), + "btr{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, + OpSize16, TB; +def BTR32ri8 : Ii8<0xBA, MRM6r, (outs), (ins GR32:$src1, i32i8imm:$src2), + "btr{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, + OpSize32, TB; +def BTR64ri8 : RIi8<0xBA, MRM6r, (outs), (ins GR64:$src1, i64i8imm:$src2), + "btr{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB; +} // SchedRW + +let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in { +def BTR16mi8 : Ii8<0xBA, MRM6m, (outs), (ins i16mem:$src1, i16i8imm:$src2), + "btr{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, + OpSize16, TB; +def BTR32mi8 : Ii8<0xBA, MRM6m, (outs), (ins i32mem:$src1, i32i8imm:$src2), + "btr{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, + OpSize32, TB; +def BTR64mi8 : RIi8<0xBA, MRM6m, (outs), (ins i64mem:$src1, i64i8imm:$src2), + "btr{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB; +} + +let SchedRW = [WriteALU] in { +def BTS16rr : I<0xAB, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2), + "bts{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, + OpSize16, TB; +def BTS32rr : I<0xAB, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2), + "bts{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, + OpSize32, TB; +def BTS64rr : RI<0xAB, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2), + "bts{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, TB; +} // SchedRW + +let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in { +def BTS16mr : I<0xAB, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2), + "bts{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, + OpSize16, TB; +def BTS32mr : I<0xAB, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2), + "bts{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, + OpSize32, TB; +def 
BTS64mr : RI<0xAB, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2), + "bts{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB; +} + +let SchedRW = [WriteALU] in { +def BTS16ri8 : Ii8<0xBA, MRM5r, (outs), (ins GR16:$src1, i16i8imm:$src2), + "bts{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, + OpSize16, TB; +def BTS32ri8 : Ii8<0xBA, MRM5r, (outs), (ins GR32:$src1, i32i8imm:$src2), + "bts{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, + OpSize32, TB; +def BTS64ri8 : RIi8<0xBA, MRM5r, (outs), (ins GR64:$src1, i64i8imm:$src2), + "bts{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB; +} // SchedRW + +let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in { +def BTS16mi8 : Ii8<0xBA, MRM5m, (outs), (ins i16mem:$src1, i16i8imm:$src2), + "bts{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, + OpSize16, TB; +def BTS32mi8 : Ii8<0xBA, MRM5m, (outs), (ins i32mem:$src1, i32i8imm:$src2), + "bts{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, + OpSize32, TB; +def BTS64mi8 : RIi8<0xBA, MRM5m, (outs), (ins i64mem:$src1, i64i8imm:$src2), + "bts{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB; +} +} // hasSideEffects = 0 +} // Defs = [EFLAGS] + + +//===----------------------------------------------------------------------===// +// Atomic support +// + +// Atomic swap. These are just normal xchg instructions. But since a memory +// operand is referenced, the atomicity is ensured. +multiclass ATOMIC_SWAP<bits<8> opc8, bits<8> opc, string mnemonic, string frag, + InstrItinClass itin> { + let Constraints = "$val = $dst", SchedRW = [WriteALULd, WriteRMW] in { + def NAME#8rm : I<opc8, MRMSrcMem, (outs GR8:$dst), + (ins GR8:$val, i8mem:$ptr), + !strconcat(mnemonic, "{b}\t{$val, $ptr|$ptr, $val}"), + [(set + GR8:$dst, + (!cast<PatFrag>(frag # "_8") addr:$ptr, GR8:$val))], + itin>; + def NAME#16rm : I<opc, MRMSrcMem, (outs GR16:$dst), + (ins GR16:$val, i16mem:$ptr), + !strconcat(mnemonic, "{w}\t{$val, $ptr|$ptr, $val}"), + [(set + GR16:$dst, + (!cast<PatFrag>(frag # "_16") addr:$ptr, GR16:$val))], + itin>, OpSize16; + def NAME#32rm : I<opc, MRMSrcMem, (outs GR32:$dst), + (ins GR32:$val, i32mem:$ptr), + !strconcat(mnemonic, "{l}\t{$val, $ptr|$ptr, $val}"), + [(set + GR32:$dst, + (!cast<PatFrag>(frag # "_32") addr:$ptr, GR32:$val))], + itin>, OpSize32; + def NAME#64rm : RI<opc, MRMSrcMem, (outs GR64:$dst), + (ins GR64:$val, i64mem:$ptr), + !strconcat(mnemonic, "{q}\t{$val, $ptr|$ptr, $val}"), + [(set + GR64:$dst, + (!cast<PatFrag>(frag # "_64") addr:$ptr, GR64:$val))], + itin>; + } +} + +defm XCHG : ATOMIC_SWAP<0x86, 0x87, "xchg", "atomic_swap", IIC_XCHG_MEM>; + +// Swap between registers. +let SchedRW = [WriteALU] in { +let Constraints = "$val = $dst" in { +def XCHG8rr : I<0x86, MRMSrcReg, (outs GR8:$dst), (ins GR8:$val, GR8:$src), + "xchg{b}\t{$val, $src|$src, $val}", [], IIC_XCHG_REG>; +def XCHG16rr : I<0x87, MRMSrcReg, (outs GR16:$dst), (ins GR16:$val, GR16:$src), + "xchg{w}\t{$val, $src|$src, $val}", [], IIC_XCHG_REG>, + OpSize16; +def XCHG32rr : I<0x87, MRMSrcReg, (outs GR32:$dst), (ins GR32:$val, GR32:$src), + "xchg{l}\t{$val, $src|$src, $val}", [], IIC_XCHG_REG>, + OpSize32; +def XCHG64rr : RI<0x87, MRMSrcReg, (outs GR64:$dst), (ins GR64:$val,GR64:$src), + "xchg{q}\t{$val, $src|$src, $val}", [], IIC_XCHG_REG>; +} + +// Swap between EAX and other registers. 
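+// For example (illustrative): "xchgl %ebx, %eax" can use the one-byte
+// 0x90+rd encoding (0x93), while "xchgl %ebx, %ecx" needs the two-byte
+// 0x87 /r form. Note also that the memory forms above are implicitly locked:
+// "xchgl %ebx, (%rdi)" is atomic even without an explicit "lock" prefix.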
+let Uses = [AX], Defs = [AX] in +def XCHG16ar : I<0x90, AddRegFrm, (outs), (ins GR16:$src), + "xchg{w}\t{$src, %ax|ax, $src}", [], IIC_XCHG_REG>, OpSize16; +let Uses = [EAX], Defs = [EAX] in +def XCHG32ar : I<0x90, AddRegFrm, (outs), (ins GR32:$src), + "xchg{l}\t{$src, %eax|eax, $src}", [], IIC_XCHG_REG>, + OpSize32, Requires<[Not64BitMode]>; +let Uses = [EAX], Defs = [EAX] in +// Uses GR32_NOAX in 64-bit mode to prevent encoding using the 0x90 NOP encoding. +// xchg %eax, %eax needs to clear upper 32-bits of RAX so is not a NOP. +def XCHG32ar64 : I<0x90, AddRegFrm, (outs), (ins GR32_NOAX:$src), + "xchg{l}\t{$src, %eax|eax, $src}", [], IIC_XCHG_REG>, + OpSize32, Requires<[In64BitMode]>; +let Uses = [RAX], Defs = [RAX] in +def XCHG64ar : RI<0x90, AddRegFrm, (outs), (ins GR64:$src), + "xchg{q}\t{$src, %rax|rax, $src}", [], IIC_XCHG_REG>; +} // SchedRW + +let SchedRW = [WriteALU] in { +def XADD8rr : I<0xC0, MRMDestReg, (outs GR8:$dst), (ins GR8:$src), + "xadd{b}\t{$src, $dst|$dst, $src}", [], IIC_XADD_REG>, TB; +def XADD16rr : I<0xC1, MRMDestReg, (outs GR16:$dst), (ins GR16:$src), + "xadd{w}\t{$src, $dst|$dst, $src}", [], IIC_XADD_REG>, TB, + OpSize16; +def XADD32rr : I<0xC1, MRMDestReg, (outs GR32:$dst), (ins GR32:$src), + "xadd{l}\t{$src, $dst|$dst, $src}", [], IIC_XADD_REG>, TB, + OpSize32; +def XADD64rr : RI<0xC1, MRMDestReg, (outs GR64:$dst), (ins GR64:$src), + "xadd{q}\t{$src, $dst|$dst, $src}", [], IIC_XADD_REG>, TB; +} // SchedRW + +let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in { +def XADD8rm : I<0xC0, MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src), + "xadd{b}\t{$src, $dst|$dst, $src}", [], IIC_XADD_MEM>, TB; +def XADD16rm : I<0xC1, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src), + "xadd{w}\t{$src, $dst|$dst, $src}", [], IIC_XADD_MEM>, TB, + OpSize16; +def XADD32rm : I<0xC1, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), + "xadd{l}\t{$src, $dst|$dst, $src}", [], IIC_XADD_MEM>, TB, + OpSize32; +def XADD64rm : RI<0xC1, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), + "xadd{q}\t{$src, $dst|$dst, $src}", [], IIC_XADD_MEM>, TB; + +} + +let SchedRW = [WriteALU] in { +def CMPXCHG8rr : I<0xB0, MRMDestReg, (outs GR8:$dst), (ins GR8:$src), + "cmpxchg{b}\t{$src, $dst|$dst, $src}", [], + IIC_CMPXCHG_REG8>, TB; +def CMPXCHG16rr : I<0xB1, MRMDestReg, (outs GR16:$dst), (ins GR16:$src), + "cmpxchg{w}\t{$src, $dst|$dst, $src}", [], + IIC_CMPXCHG_REG>, TB, OpSize16; +def CMPXCHG32rr : I<0xB1, MRMDestReg, (outs GR32:$dst), (ins GR32:$src), + "cmpxchg{l}\t{$src, $dst|$dst, $src}", [], + IIC_CMPXCHG_REG>, TB, OpSize32; +def CMPXCHG64rr : RI<0xB1, MRMDestReg, (outs GR64:$dst), (ins GR64:$src), + "cmpxchg{q}\t{$src, $dst|$dst, $src}", [], + IIC_CMPXCHG_REG>, TB; +} // SchedRW + +let SchedRW = [WriteALULd, WriteRMW] in { +let mayLoad = 1, mayStore = 1 in { +def CMPXCHG8rm : I<0xB0, MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src), + "cmpxchg{b}\t{$src, $dst|$dst, $src}", [], + IIC_CMPXCHG_MEM8>, TB; +def CMPXCHG16rm : I<0xB1, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src), + "cmpxchg{w}\t{$src, $dst|$dst, $src}", [], + IIC_CMPXCHG_MEM>, TB, OpSize16; +def CMPXCHG32rm : I<0xB1, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), + "cmpxchg{l}\t{$src, $dst|$dst, $src}", [], + IIC_CMPXCHG_MEM>, TB, OpSize32; +def CMPXCHG64rm : RI<0xB1, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), + "cmpxchg{q}\t{$src, $dst|$dst, $src}", [], + IIC_CMPXCHG_MEM>, TB; +} + +let Defs = [EAX, EDX, EFLAGS], Uses = [EAX, EBX, ECX, EDX] in +def CMPXCHG8B : I<0xC7, MRM1m, (outs), (ins i64mem:$dst), + 
"cmpxchg8b\t$dst", [], IIC_CMPXCHG_8B>, TB; + +let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RBX, RCX, RDX] in +def CMPXCHG16B : RI<0xC7, MRM1m, (outs), (ins i128mem:$dst), + "cmpxchg16b\t$dst", [], IIC_CMPXCHG_16B>, + TB, Requires<[HasCmpxchg16b]>; +} // SchedRW + + +// Lock instruction prefix +def LOCK_PREFIX : I<0xF0, RawFrm, (outs), (ins), "lock", []>; + +// Rex64 instruction prefix +def REX64_PREFIX : I<0x48, RawFrm, (outs), (ins), "rex64", []>, + Requires<[In64BitMode]>; + +// Data16 instruction prefix +def DATA16_PREFIX : I<0x66, RawFrm, (outs), (ins), "data16", []>; + +// Repeat string operation instruction prefixes +// These uses the DF flag in the EFLAGS register to inc or dec ECX +let Defs = [ECX], Uses = [ECX,EFLAGS] in { +// Repeat (used with INS, OUTS, MOVS, LODS and STOS) +def REP_PREFIX : I<0xF3, RawFrm, (outs), (ins), "rep", []>; +// Repeat while not equal (used with CMPS and SCAS) +def REPNE_PREFIX : I<0xF2, RawFrm, (outs), (ins), "repne", []>; +} + + +// String manipulation instructions +let SchedRW = [WriteMicrocoded] in { +// These uses the DF flag in the EFLAGS register to inc or dec EDI and ESI +let Defs = [AL,ESI], Uses = [ESI,EFLAGS] in +def LODSB : I<0xAC, RawFrmSrc, (outs), (ins srcidx8:$src), + "lodsb\t{$src, %al|al, $src}", [], IIC_LODS>; +let Defs = [AX,ESI], Uses = [ESI,EFLAGS] in +def LODSW : I<0xAD, RawFrmSrc, (outs), (ins srcidx16:$src), + "lodsw\t{$src, %ax|ax, $src}", [], IIC_LODS>, OpSize16; +let Defs = [EAX,ESI], Uses = [ESI,EFLAGS] in +def LODSL : I<0xAD, RawFrmSrc, (outs), (ins srcidx32:$src), + "lods{l|d}\t{$src, %eax|eax, $src}", [], IIC_LODS>, OpSize32; +let Defs = [RAX,ESI], Uses = [ESI,EFLAGS] in +def LODSQ : RI<0xAD, RawFrmSrc, (outs), (ins srcidx64:$src), + "lodsq\t{$src, %rax|rax, $src}", [], IIC_LODS>; +} + +let SchedRW = [WriteSystem] in { +// These uses the DF flag in the EFLAGS register to inc or dec EDI and ESI +let Defs = [ESI], Uses = [DX,ESI,EFLAGS] in { +def OUTSB : I<0x6E, RawFrmSrc, (outs), (ins srcidx8:$src), + "outsb\t{$src, %dx|dx, $src}", [], IIC_OUTS>; +def OUTSW : I<0x6F, RawFrmSrc, (outs), (ins srcidx16:$src), + "outsw\t{$src, %dx|dx, $src}", [], IIC_OUTS>, OpSize16; +def OUTSL : I<0x6F, RawFrmSrc, (outs), (ins srcidx32:$src), + "outs{l|d}\t{$src, %dx|dx, $src}", [], IIC_OUTS>, OpSize32; +} + +// These uses the DF flag in the EFLAGS register to inc or dec EDI and ESI +let Defs = [EDI], Uses = [DX,EDI,EFLAGS] in { +def INSB : I<0x6C, RawFrmDst, (outs dstidx8:$dst), (ins), + "insb\t{%dx, $dst|$dst, dx}", [], IIC_INS>; +def INSW : I<0x6D, RawFrmDst, (outs dstidx16:$dst), (ins), + "insw\t{%dx, $dst|$dst, dx}", [], IIC_INS>, OpSize16; +def INSL : I<0x6D, RawFrmDst, (outs dstidx32:$dst), (ins), + "ins{l|d}\t{%dx, $dst|$dst, dx}", [], IIC_INS>, OpSize32; +} +} + +// Flag instructions +let SchedRW = [WriteALU] in { +def CLC : I<0xF8, RawFrm, (outs), (ins), "clc", [], IIC_CLC>; +def STC : I<0xF9, RawFrm, (outs), (ins), "stc", [], IIC_STC>; +def CLI : I<0xFA, RawFrm, (outs), (ins), "cli", [], IIC_CLI>; +def STI : I<0xFB, RawFrm, (outs), (ins), "sti", [], IIC_STI>; +def CLD : I<0xFC, RawFrm, (outs), (ins), "cld", [], IIC_CLD>; +def STD : I<0xFD, RawFrm, (outs), (ins), "std", [], IIC_STD>; +def CMC : I<0xF5, RawFrm, (outs), (ins), "cmc", [], IIC_CMC>; + +def CLTS : I<0x06, RawFrm, (outs), (ins), "clts", [], IIC_CLTS>, TB; +} + +// Table lookup instructions +let Uses = [AL,EBX], Defs = [AL], hasSideEffects = 0, mayLoad = 1 in +def XLAT : I<0xD7, RawFrm, (outs), (ins), "xlatb", [], IIC_XLAT>, + Sched<[WriteLoad]>; + +let SchedRW = 
[WriteMicrocoded] in { +// ASCII Adjust After Addition +let Uses = [AL,EFLAGS], Defs = [AX,EFLAGS], hasSideEffects = 0 in +def AAA : I<0x37, RawFrm, (outs), (ins), "aaa", [], IIC_AAA>, + Requires<[Not64BitMode]>; + +// ASCII Adjust AX Before Division +let Uses = [AX], Defs = [AX,EFLAGS], hasSideEffects = 0 in +def AAD8i8 : Ii8<0xD5, RawFrm, (outs), (ins i8imm:$src), + "aad\t$src", [], IIC_AAD>, Requires<[Not64BitMode]>; + +// ASCII Adjust AX After Multiply +let Uses = [AL], Defs = [AX,EFLAGS], hasSideEffects = 0 in +def AAM8i8 : Ii8<0xD4, RawFrm, (outs), (ins i8imm:$src), + "aam\t$src", [], IIC_AAM>, Requires<[Not64BitMode]>; + +// ASCII Adjust AL After Subtraction - sets +let Uses = [AL,EFLAGS], Defs = [AX,EFLAGS], hasSideEffects = 0 in +def AAS : I<0x3F, RawFrm, (outs), (ins), "aas", [], IIC_AAS>, + Requires<[Not64BitMode]>; + +// Decimal Adjust AL after Addition +let Uses = [AL,EFLAGS], Defs = [AL,EFLAGS], hasSideEffects = 0 in +def DAA : I<0x27, RawFrm, (outs), (ins), "daa", [], IIC_DAA>, + Requires<[Not64BitMode]>; + +// Decimal Adjust AL after Subtraction +let Uses = [AL,EFLAGS], Defs = [AL,EFLAGS], hasSideEffects = 0 in +def DAS : I<0x2F, RawFrm, (outs), (ins), "das", [], IIC_DAS>, + Requires<[Not64BitMode]>; +} // SchedRW + +let SchedRW = [WriteSystem] in { +// Check Array Index Against Bounds +def BOUNDS16rm : I<0x62, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), + "bound\t{$src, $dst|$dst, $src}", [], IIC_BOUND>, OpSize16, + Requires<[Not64BitMode]>; +def BOUNDS32rm : I<0x62, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), + "bound\t{$src, $dst|$dst, $src}", [], IIC_BOUND>, OpSize32, + Requires<[Not64BitMode]>; + +// Adjust RPL Field of Segment Selector +def ARPL16rr : I<0x63, MRMDestReg, (outs GR16:$dst), (ins GR16:$src), + "arpl\t{$src, $dst|$dst, $src}", [], IIC_ARPL_REG>, + Requires<[Not64BitMode]>; +def ARPL16mr : I<0x63, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src), + "arpl\t{$src, $dst|$dst, $src}", [], IIC_ARPL_MEM>, + Requires<[Not64BitMode]>; +} // SchedRW + +//===----------------------------------------------------------------------===// +// MOVBE Instructions +// +let Predicates = [HasMOVBE] in { + let SchedRW = [WriteALULd] in { + def MOVBE16rm : I<0xF0, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), + "movbe{w}\t{$src, $dst|$dst, $src}", + [(set GR16:$dst, (bswap (loadi16 addr:$src)))], IIC_MOVBE>, + OpSize16, T8PS; + def MOVBE32rm : I<0xF0, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), + "movbe{l}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (bswap (loadi32 addr:$src)))], IIC_MOVBE>, + OpSize32, T8PS; + def MOVBE64rm : RI<0xF0, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), + "movbe{q}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (bswap (loadi64 addr:$src)))], IIC_MOVBE>, + T8PS; + } + let SchedRW = [WriteStore] in { + def MOVBE16mr : I<0xF1, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src), + "movbe{w}\t{$src, $dst|$dst, $src}", + [(store (bswap GR16:$src), addr:$dst)], IIC_MOVBE>, + OpSize16, T8PS; + def MOVBE32mr : I<0xF1, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), + "movbe{l}\t{$src, $dst|$dst, $src}", + [(store (bswap GR32:$src), addr:$dst)], IIC_MOVBE>, + OpSize32, T8PS; + def MOVBE64mr : RI<0xF1, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), + "movbe{q}\t{$src, $dst|$dst, $src}", + [(store (bswap GR64:$src), addr:$dst)], IIC_MOVBE>, + T8PS; + } +} + +//===----------------------------------------------------------------------===// +// RDRAND Instruction +// +let Predicates = [HasRDRAND], Defs = [EFLAGS] in { + def RDRAND16r : I<0xC7, 
MRM6r, (outs GR16:$dst), (ins), + "rdrand{w}\t$dst", + [(set GR16:$dst, EFLAGS, (X86rdrand))]>, OpSize16, TB; + def RDRAND32r : I<0xC7, MRM6r, (outs GR32:$dst), (ins), + "rdrand{l}\t$dst", + [(set GR32:$dst, EFLAGS, (X86rdrand))]>, OpSize32, TB; + def RDRAND64r : RI<0xC7, MRM6r, (outs GR64:$dst), (ins), + "rdrand{q}\t$dst", + [(set GR64:$dst, EFLAGS, (X86rdrand))]>, TB; +} + +//===----------------------------------------------------------------------===// +// RDSEED Instruction +// +let Predicates = [HasRDSEED], Defs = [EFLAGS] in { + def RDSEED16r : I<0xC7, MRM7r, (outs GR16:$dst), (ins), + "rdseed{w}\t$dst", + [(set GR16:$dst, EFLAGS, (X86rdseed))]>, OpSize16, TB; + def RDSEED32r : I<0xC7, MRM7r, (outs GR32:$dst), (ins), + "rdseed{l}\t$dst", + [(set GR32:$dst, EFLAGS, (X86rdseed))]>, OpSize32, TB; + def RDSEED64r : RI<0xC7, MRM7r, (outs GR64:$dst), (ins), + "rdseed{q}\t$dst", + [(set GR64:$dst, EFLAGS, (X86rdseed))]>, TB; +} + +//===----------------------------------------------------------------------===// +// LZCNT Instruction +// +let Predicates = [HasLZCNT], Defs = [EFLAGS] in { + def LZCNT16rr : I<0xBD, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), + "lzcnt{w}\t{$src, $dst|$dst, $src}", + [(set GR16:$dst, (ctlz GR16:$src)), (implicit EFLAGS)]>, XS, + OpSize16; + def LZCNT16rm : I<0xBD, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), + "lzcnt{w}\t{$src, $dst|$dst, $src}", + [(set GR16:$dst, (ctlz (loadi16 addr:$src))), + (implicit EFLAGS)]>, XS, OpSize16; + + def LZCNT32rr : I<0xBD, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), + "lzcnt{l}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (ctlz GR32:$src)), (implicit EFLAGS)]>, XS, + OpSize32; + def LZCNT32rm : I<0xBD, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), + "lzcnt{l}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (ctlz (loadi32 addr:$src))), + (implicit EFLAGS)]>, XS, OpSize32; + + def LZCNT64rr : RI<0xBD, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), + "lzcnt{q}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (ctlz GR64:$src)), (implicit EFLAGS)]>, + XS; + def LZCNT64rm : RI<0xBD, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), + "lzcnt{q}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (ctlz (loadi64 addr:$src))), + (implicit EFLAGS)]>, XS; +} + +let Predicates = [HasLZCNT] in { + def : Pat<(X86cmov (ctlz GR16:$src), (i16 16), (X86_COND_E_OR_NE), + (X86cmp GR16:$src, (i16 0))), + (LZCNT16rr GR16:$src)>; + def : Pat<(X86cmov (ctlz GR32:$src), (i32 32), (X86_COND_E_OR_NE), + (X86cmp GR32:$src, (i32 0))), + (LZCNT32rr GR32:$src)>; + def : Pat<(X86cmov (ctlz GR64:$src), (i64 64), (X86_COND_E_OR_NE), + (X86cmp GR64:$src, (i64 0))), + (LZCNT64rr GR64:$src)>; + def : Pat<(X86cmov (i16 16), (ctlz GR16:$src), (X86_COND_E_OR_NE), + (X86cmp GR16:$src, (i16 0))), + (LZCNT16rr GR16:$src)>; + def : Pat<(X86cmov (i32 32), (ctlz GR32:$src), (X86_COND_E_OR_NE), + (X86cmp GR32:$src, (i32 0))), + (LZCNT32rr GR32:$src)>; + def : Pat<(X86cmov (i64 64), (ctlz GR64:$src), (X86_COND_E_OR_NE), + (X86cmp GR64:$src, (i64 0))), + (LZCNT64rr GR64:$src)>; + + def : Pat<(X86cmov (ctlz (loadi16 addr:$src)), (i16 16), (X86_COND_E_OR_NE), + (X86cmp (loadi16 addr:$src), (i16 0))), + (LZCNT16rm addr:$src)>; + def : Pat<(X86cmov (ctlz (loadi32 addr:$src)), (i32 32), (X86_COND_E_OR_NE), + (X86cmp (loadi32 addr:$src), (i32 0))), + (LZCNT32rm addr:$src)>; + def : Pat<(X86cmov (ctlz (loadi64 addr:$src)), (i64 64), (X86_COND_E_OR_NE), + (X86cmp (loadi64 addr:$src), (i64 0))), + (LZCNT64rm addr:$src)>; + def : Pat<(X86cmov (i16 16), (ctlz (loadi16 addr:$src)), 
(X86_COND_E_OR_NE), + (X86cmp (loadi16 addr:$src), (i16 0))), + (LZCNT16rm addr:$src)>; + def : Pat<(X86cmov (i32 32), (ctlz (loadi32 addr:$src)), (X86_COND_E_OR_NE), + (X86cmp (loadi32 addr:$src), (i32 0))), + (LZCNT32rm addr:$src)>; + def : Pat<(X86cmov (i64 64), (ctlz (loadi64 addr:$src)), (X86_COND_E_OR_NE), + (X86cmp (loadi64 addr:$src), (i64 0))), + (LZCNT64rm addr:$src)>; +} + +//===----------------------------------------------------------------------===// +// BMI Instructions +// +let Predicates = [HasBMI], Defs = [EFLAGS] in { + def TZCNT16rr : I<0xBC, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), + "tzcnt{w}\t{$src, $dst|$dst, $src}", + [(set GR16:$dst, (cttz GR16:$src)), (implicit EFLAGS)]>, XS, + OpSize16; + def TZCNT16rm : I<0xBC, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), + "tzcnt{w}\t{$src, $dst|$dst, $src}", + [(set GR16:$dst, (cttz (loadi16 addr:$src))), + (implicit EFLAGS)]>, XS, OpSize16; + + def TZCNT32rr : I<0xBC, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), + "tzcnt{l}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (cttz GR32:$src)), (implicit EFLAGS)]>, XS, + OpSize32; + def TZCNT32rm : I<0xBC, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), + "tzcnt{l}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (cttz (loadi32 addr:$src))), + (implicit EFLAGS)]>, XS, OpSize32; + + def TZCNT64rr : RI<0xBC, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), + "tzcnt{q}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (cttz GR64:$src)), (implicit EFLAGS)]>, + XS; + def TZCNT64rm : RI<0xBC, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), + "tzcnt{q}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (cttz (loadi64 addr:$src))), + (implicit EFLAGS)]>, XS; +} + +multiclass bmi_bls<string mnemonic, Format RegMRM, Format MemMRM, + RegisterClass RC, X86MemOperand x86memop> { +let hasSideEffects = 0 in { + def rr : I<0xF3, RegMRM, (outs RC:$dst), (ins RC:$src), + !strconcat(mnemonic, "\t{$src, $dst|$dst, $src}"), + []>, T8PS, VEX_4V; + let mayLoad = 1 in + def rm : I<0xF3, MemMRM, (outs RC:$dst), (ins x86memop:$src), + !strconcat(mnemonic, "\t{$src, $dst|$dst, $src}"), + []>, T8PS, VEX_4V; +} +} + +let Predicates = [HasBMI], Defs = [EFLAGS] in { + defm BLSR32 : bmi_bls<"blsr{l}", MRM1r, MRM1m, GR32, i32mem>; + defm BLSR64 : bmi_bls<"blsr{q}", MRM1r, MRM1m, GR64, i64mem>, VEX_W; + defm BLSMSK32 : bmi_bls<"blsmsk{l}", MRM2r, MRM2m, GR32, i32mem>; + defm BLSMSK64 : bmi_bls<"blsmsk{q}", MRM2r, MRM2m, GR64, i64mem>, VEX_W; + defm BLSI32 : bmi_bls<"blsi{l}", MRM3r, MRM3m, GR32, i32mem>; + defm BLSI64 : bmi_bls<"blsi{q}", MRM3r, MRM3m, GR64, i64mem>, VEX_W; +} + +//===----------------------------------------------------------------------===// +// Pattern fragments to auto generate BMI instructions. 
+//===----------------------------------------------------------------------===// + +let Predicates = [HasBMI] in { + // FIXME: patterns for the load versions are not implemented + def : Pat<(and GR32:$src, (add GR32:$src, -1)), + (BLSR32rr GR32:$src)>; + def : Pat<(and GR64:$src, (add GR64:$src, -1)), + (BLSR64rr GR64:$src)>; + + def : Pat<(xor GR32:$src, (add GR32:$src, -1)), + (BLSMSK32rr GR32:$src)>; + def : Pat<(xor GR64:$src, (add GR64:$src, -1)), + (BLSMSK64rr GR64:$src)>; + + def : Pat<(and GR32:$src, (ineg GR32:$src)), + (BLSI32rr GR32:$src)>; + def : Pat<(and GR64:$src, (ineg GR64:$src)), + (BLSI64rr GR64:$src)>; +} + +let Predicates = [HasBMI] in { + def : Pat<(X86cmov (cttz GR16:$src), (i16 16), (X86_COND_E_OR_NE), + (X86cmp GR16:$src, (i16 0))), + (TZCNT16rr GR16:$src)>; + def : Pat<(X86cmov (cttz GR32:$src), (i32 32), (X86_COND_E_OR_NE), + (X86cmp GR32:$src, (i32 0))), + (TZCNT32rr GR32:$src)>; + def : Pat<(X86cmov (cttz GR64:$src), (i64 64), (X86_COND_E_OR_NE), + (X86cmp GR64:$src, (i64 0))), + (TZCNT64rr GR64:$src)>; + def : Pat<(X86cmov (i16 16), (cttz GR16:$src), (X86_COND_E_OR_NE), + (X86cmp GR16:$src, (i16 0))), + (TZCNT16rr GR16:$src)>; + def : Pat<(X86cmov (i32 32), (cttz GR32:$src), (X86_COND_E_OR_NE), + (X86cmp GR32:$src, (i32 0))), + (TZCNT32rr GR32:$src)>; + def : Pat<(X86cmov (i64 64), (cttz GR64:$src), (X86_COND_E_OR_NE), + (X86cmp GR64:$src, (i64 0))), + (TZCNT64rr GR64:$src)>; + + def : Pat<(X86cmov (cttz (loadi16 addr:$src)), (i16 16), (X86_COND_E_OR_NE), + (X86cmp (loadi16 addr:$src), (i16 0))), + (TZCNT16rm addr:$src)>; + def : Pat<(X86cmov (cttz (loadi32 addr:$src)), (i32 32), (X86_COND_E_OR_NE), + (X86cmp (loadi32 addr:$src), (i32 0))), + (TZCNT32rm addr:$src)>; + def : Pat<(X86cmov (cttz (loadi64 addr:$src)), (i64 64), (X86_COND_E_OR_NE), + (X86cmp (loadi64 addr:$src), (i64 0))), + (TZCNT64rm addr:$src)>; + def : Pat<(X86cmov (i16 16), (cttz (loadi16 addr:$src)), (X86_COND_E_OR_NE), + (X86cmp (loadi16 addr:$src), (i16 0))), + (TZCNT16rm addr:$src)>; + def : Pat<(X86cmov (i32 32), (cttz (loadi32 addr:$src)), (X86_COND_E_OR_NE), + (X86cmp (loadi32 addr:$src), (i32 0))), + (TZCNT32rm addr:$src)>; + def : Pat<(X86cmov (i64 64), (cttz (loadi64 addr:$src)), (X86_COND_E_OR_NE), + (X86cmp (loadi64 addr:$src), (i64 0))), + (TZCNT64rm addr:$src)>; +} + + +multiclass bmi_bextr_bzhi<bits<8> opc, string mnemonic, RegisterClass RC, + X86MemOperand x86memop, Intrinsic Int, + PatFrag ld_frag> { + def rr : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), + !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, (Int RC:$src1, RC:$src2)), (implicit EFLAGS)]>, + T8PS, VEX_4VOp3; + def rm : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src1, RC:$src2), + !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, (Int (ld_frag addr:$src1), RC:$src2)), + (implicit EFLAGS)]>, T8PS, VEX_4VOp3; +} + +let Predicates = [HasBMI], Defs = [EFLAGS] in { + defm BEXTR32 : bmi_bextr_bzhi<0xF7, "bextr{l}", GR32, i32mem, + int_x86_bmi_bextr_32, loadi32>; + defm BEXTR64 : bmi_bextr_bzhi<0xF7, "bextr{q}", GR64, i64mem, + int_x86_bmi_bextr_64, loadi64>, VEX_W; +} + +let Predicates = [HasBMI2], Defs = [EFLAGS] in { + defm BZHI32 : bmi_bextr_bzhi<0xF5, "bzhi{l}", GR32, i32mem, + int_x86_bmi_bzhi_32, loadi32>; + defm BZHI64 : bmi_bextr_bzhi<0xF5, "bzhi{q}", GR64, i64mem, + int_x86_bmi_bzhi_64, loadi64>, VEX_W; +} + + +def CountTrailingOnes : SDNodeXForm<imm, [{ + // Count the trailing ones in the immediate. 
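+  // For example (illustrative): a mask like 0xFFFFFFFFFF has 40 trailing
+  // ones, so this transform produces the i8 immediate 40; BZHI with that
+  // count keeps bits [39:0] and clears the rest.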
+ return getI8Imm(countTrailingOnes(N->getZExtValue()), SDLoc(N)); +}]>; + +def BZHIMask : ImmLeaf<i64, [{ + return isMask_64(Imm) && (countTrailingOnes<uint64_t>(Imm) > 32); +}]>; + +let Predicates = [HasBMI2] in { + def : Pat<(and GR64:$src, BZHIMask:$mask), + (BZHI64rr GR64:$src, + (INSERT_SUBREG (i64 (IMPLICIT_DEF)), + (MOV8ri (CountTrailingOnes imm:$mask)), sub_8bit))>; + + def : Pat<(and GR32:$src, (add (shl 1, GR8:$lz), -1)), + (BZHI32rr GR32:$src, + (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$lz, sub_8bit))>; + + def : Pat<(and (loadi32 addr:$src), (add (shl 1, GR8:$lz), -1)), + (BZHI32rm addr:$src, + (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$lz, sub_8bit))>; + + def : Pat<(and GR64:$src, (add (shl 1, GR8:$lz), -1)), + (BZHI64rr GR64:$src, + (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR8:$lz, sub_8bit))>; + + def : Pat<(and (loadi64 addr:$src), (add (shl 1, GR8:$lz), -1)), + (BZHI64rm addr:$src, + (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR8:$lz, sub_8bit))>; +} // HasBMI2 + +let Predicates = [HasBMI] in { + def : Pat<(X86bextr GR32:$src1, GR32:$src2), + (BEXTR32rr GR32:$src1, GR32:$src2)>; + def : Pat<(X86bextr (loadi32 addr:$src1), GR32:$src2), + (BEXTR32rm addr:$src1, GR32:$src2)>; + def : Pat<(X86bextr GR64:$src1, GR64:$src2), + (BEXTR64rr GR64:$src1, GR64:$src2)>; + def : Pat<(X86bextr (loadi64 addr:$src1), GR64:$src2), + (BEXTR64rm addr:$src1, GR64:$src2)>; +} // HasBMI + +multiclass bmi_pdep_pext<string mnemonic, RegisterClass RC, + X86MemOperand x86memop, Intrinsic Int, + PatFrag ld_frag> { + def rr : I<0xF5, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), + !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, (Int RC:$src1, RC:$src2))]>, + VEX_4V; + def rm : I<0xF5, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), + !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, (Int RC:$src1, (ld_frag addr:$src2)))]>, VEX_4V; +} + +let Predicates = [HasBMI2] in { + defm PDEP32 : bmi_pdep_pext<"pdep{l}", GR32, i32mem, + int_x86_bmi_pdep_32, loadi32>, T8XD; + defm PDEP64 : bmi_pdep_pext<"pdep{q}", GR64, i64mem, + int_x86_bmi_pdep_64, loadi64>, T8XD, VEX_W; + defm PEXT32 : bmi_pdep_pext<"pext{l}", GR32, i32mem, + int_x86_bmi_pext_32, loadi32>, T8XS; + defm PEXT64 : bmi_pdep_pext<"pext{q}", GR64, i64mem, + int_x86_bmi_pext_64, loadi64>, T8XS, VEX_W; +} + +//===----------------------------------------------------------------------===// +// TBM Instructions +// +let Predicates = [HasTBM], Defs = [EFLAGS] in { + +multiclass tbm_ternary_imm_intr<bits<8> opc, RegisterClass RC, string OpcodeStr, + X86MemOperand x86memop, PatFrag ld_frag, + Intrinsic Int, Operand immtype, + SDPatternOperator immoperator> { + def ri : Ii32<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, immtype:$cntl), + !strconcat(OpcodeStr, + "\t{$cntl, $src1, $dst|$dst, $src1, $cntl}"), + [(set RC:$dst, (Int RC:$src1, immoperator:$cntl))]>, + XOP, XOPA; + def mi : Ii32<opc, MRMSrcMem, (outs RC:$dst), + (ins x86memop:$src1, immtype:$cntl), + !strconcat(OpcodeStr, + "\t{$cntl, $src1, $dst|$dst, $src1, $cntl}"), + [(set RC:$dst, (Int (ld_frag addr:$src1), immoperator:$cntl))]>, + XOP, XOPA; +} + +defm BEXTRI32 : tbm_ternary_imm_intr<0x10, GR32, "bextr", i32mem, loadi32, + int_x86_tbm_bextri_u32, i32imm, imm>; +let ImmT = Imm32S in +defm BEXTRI64 : tbm_ternary_imm_intr<0x10, GR64, "bextr", i64mem, loadi64, + int_x86_tbm_bextri_u64, i64i32imm, + i64immSExt32>, VEX_W; + +multiclass tbm_binary_rm<bits<8> opc, Format FormReg, Format FormMem, + RegisterClass RC, string 
OpcodeStr, + X86MemOperand x86memop, PatFrag ld_frag> { +let hasSideEffects = 0 in { + def rr : I<opc, FormReg, (outs RC:$dst), (ins RC:$src), + !strconcat(OpcodeStr,"\t{$src, $dst|$dst, $src}"), + []>, XOP_4V, XOP9; + let mayLoad = 1 in + def rm : I<opc, FormMem, (outs RC:$dst), (ins x86memop:$src), + !strconcat(OpcodeStr,"\t{$src, $dst|$dst, $src}"), + []>, XOP_4V, XOP9; +} +} + +multiclass tbm_binary_intr<bits<8> opc, string OpcodeStr, + Format FormReg, Format FormMem> { + defm NAME#32 : tbm_binary_rm<opc, FormReg, FormMem, GR32, OpcodeStr, i32mem, + loadi32>; + defm NAME#64 : tbm_binary_rm<opc, FormReg, FormMem, GR64, OpcodeStr, i64mem, + loadi64>, VEX_W; +} + +defm BLCFILL : tbm_binary_intr<0x01, "blcfill", MRM1r, MRM1m>; +defm BLCI : tbm_binary_intr<0x02, "blci", MRM6r, MRM6m>; +defm BLCIC : tbm_binary_intr<0x01, "blcic", MRM5r, MRM5m>; +defm BLCMSK : tbm_binary_intr<0x02, "blcmsk", MRM1r, MRM1m>; +defm BLCS : tbm_binary_intr<0x01, "blcs", MRM3r, MRM3m>; +defm BLSFILL : tbm_binary_intr<0x01, "blsfill", MRM2r, MRM2m>; +defm BLSIC : tbm_binary_intr<0x01, "blsic", MRM6r, MRM6m>; +defm T1MSKC : tbm_binary_intr<0x01, "t1mskc", MRM7r, MRM7m>; +defm TZMSK : tbm_binary_intr<0x01, "tzmsk", MRM4r, MRM4m>; +} // HasTBM, EFLAGS + +//===----------------------------------------------------------------------===// +// MONITORX/MWAITX Instructions +// +let SchedRW = [WriteSystem] in { +let Uses = [EAX, ECX, EDX] in +def MONITORXrrr : I<0x01, MRM_FA, (outs), (ins), "monitorx", [], + IIC_SSE_MONITOR>, TB; +let Uses = [ECX, EAX, EBX] in +def MWAITXrr : I<0x01, MRM_FB, (outs), (ins), "mwaitx", [], IIC_SSE_MWAIT>, + TB; +} // SchedRW + +def : InstAlias<"mwaitx\t{%eax, %ecx, %ebx|ebx, ecx, eax}", (MWAITXrr)>, Requires<[Not64BitMode]>; +def : InstAlias<"mwaitx\t{%rax, %rcx, %rbx|rbx, rcx, rax}", (MWAITXrr)>, Requires<[In64BitMode]>; + +def : InstAlias<"monitorx\t{%eax, %ecx, %edx|edx, ecx, eax}", (MONITORXrrr)>, + Requires<[Not64BitMode]>; +def : InstAlias<"monitorx\t{%rax, %rcx, %rdx|rdx, rcx, rax}", (MONITORXrrr)>, + Requires<[In64BitMode]>; + +//===----------------------------------------------------------------------===// +// CLZERO Instruction +// +let Uses = [EAX] in +def CLZEROr : I<0x01, MRM_FC, (outs), (ins), "clzero", []>, TB; + +//===----------------------------------------------------------------------===// +// Pattern fragments to auto generate TBM instructions. +//===----------------------------------------------------------------------===// + +let Predicates = [HasTBM] in { + def : Pat<(X86bextr GR32:$src1, (i32 imm:$src2)), + (BEXTRI32ri GR32:$src1, imm:$src2)>; + def : Pat<(X86bextr (loadi32 addr:$src1), (i32 imm:$src2)), + (BEXTRI32mi addr:$src1, imm:$src2)>; + def : Pat<(X86bextr GR64:$src1, i64immSExt32:$src2), + (BEXTRI64ri GR64:$src1, i64immSExt32:$src2)>; + def : Pat<(X86bextr (loadi64 addr:$src1), i64immSExt32:$src2), + (BEXTRI64mi addr:$src1, i64immSExt32:$src2)>; + + // FIXME: patterns for the load versions are not implemented + def : Pat<(and GR32:$src, (add GR32:$src, 1)), + (BLCFILL32rr GR32:$src)>; + def : Pat<(and GR64:$src, (add GR64:$src, 1)), + (BLCFILL64rr GR64:$src)>; + + def : Pat<(or GR32:$src, (not (add GR32:$src, 1))), + (BLCI32rr GR32:$src)>; + def : Pat<(or GR64:$src, (not (add GR64:$src, 1))), + (BLCI64rr GR64:$src)>; + + // Extra patterns because opt can optimize the above patterns to this. 
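+  // (Illustrative derivation: not(x + 1) == -(x + 1) - 1 == -2 - x, so the
+  // "(sub -2, x)" shape below is just the simplified form of the pattern
+  // above.)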
+ def : Pat<(or GR32:$src, (sub -2, GR32:$src)), + (BLCI32rr GR32:$src)>; + def : Pat<(or GR64:$src, (sub -2, GR64:$src)), + (BLCI64rr GR64:$src)>; + + def : Pat<(and (not GR32:$src), (add GR32:$src, 1)), + (BLCIC32rr GR32:$src)>; + def : Pat<(and (not GR64:$src), (add GR64:$src, 1)), + (BLCIC64rr GR64:$src)>; + + def : Pat<(xor GR32:$src, (add GR32:$src, 1)), + (BLCMSK32rr GR32:$src)>; + def : Pat<(xor GR64:$src, (add GR64:$src, 1)), + (BLCMSK64rr GR64:$src)>; + + def : Pat<(or GR32:$src, (add GR32:$src, 1)), + (BLCS32rr GR32:$src)>; + def : Pat<(or GR64:$src, (add GR64:$src, 1)), + (BLCS64rr GR64:$src)>; + + def : Pat<(or GR32:$src, (add GR32:$src, -1)), + (BLSFILL32rr GR32:$src)>; + def : Pat<(or GR64:$src, (add GR64:$src, -1)), + (BLSFILL64rr GR64:$src)>; + + def : Pat<(or (not GR32:$src), (add GR32:$src, -1)), + (BLSIC32rr GR32:$src)>; + def : Pat<(or (not GR64:$src), (add GR64:$src, -1)), + (BLSIC64rr GR64:$src)>; + + def : Pat<(or (not GR32:$src), (add GR32:$src, 1)), + (T1MSKC32rr GR32:$src)>; + def : Pat<(or (not GR64:$src), (add GR64:$src, 1)), + (T1MSKC64rr GR64:$src)>; + + def : Pat<(and (not GR32:$src), (add GR32:$src, -1)), + (TZMSK32rr GR32:$src)>; + def : Pat<(and (not GR64:$src), (add GR64:$src, -1)), + (TZMSK64rr GR64:$src)>; +} // HasTBM + +//===----------------------------------------------------------------------===// +// Memory Instructions +// + +def CLFLUSHOPT : I<0xAE, MRM7m, (outs), (ins i8mem:$src), + "clflushopt\t$src", []>, PD; +def CLWB : I<0xAE, MRM6m, (outs), (ins i8mem:$src), "clwb\t$src", []>, PD; +def PCOMMIT : I<0xAE, MRM_F8, (outs), (ins), "pcommit", []>, PD; + + +//===----------------------------------------------------------------------===// +// Subsystems. +//===----------------------------------------------------------------------===// + +include "X86InstrArithmetic.td" +include "X86InstrCMovSetCC.td" +include "X86InstrExtension.td" +include "X86InstrControl.td" +include "X86InstrShiftRotate.td" + +// X87 Floating Point Stack. +include "X86InstrFPStack.td" + +// SIMD support (SSE, MMX and AVX) +include "X86InstrFragmentsSIMD.td" + +// FMA - Fused Multiply-Add support (requires FMA) +include "X86InstrFMA.td" + +// XOP +include "X86InstrXOP.td" + +// SSE, MMX and 3DNow! vector support. +include "X86InstrSSE.td" +include "X86InstrAVX512.td" +include "X86InstrMMX.td" +include "X86Instr3DNow.td" + +// MPX instructions +include "X86InstrMPX.td" + +include "X86InstrVMX.td" +include "X86InstrSVM.td" + +include "X86InstrTSX.td" +include "X86InstrSGX.td" + +// System instructions. +include "X86InstrSystem.td" + +// Compiler Pseudo Instructions and Pat Patterns +include "X86InstrCompiler.td" + +//===----------------------------------------------------------------------===// +// Assembler Mnemonic Aliases +//===----------------------------------------------------------------------===// + +def : MnemonicAlias<"call", "callw", "att">, Requires<[In16BitMode]>; +def : MnemonicAlias<"call", "calll", "att">, Requires<[In32BitMode]>; +def : MnemonicAlias<"call", "callq", "att">, Requires<[In64BitMode]>; + +def : MnemonicAlias<"cbw", "cbtw", "att">; +def : MnemonicAlias<"cwde", "cwtl", "att">; +def : MnemonicAlias<"cwd", "cwtd", "att">; +def : MnemonicAlias<"cdq", "cltd", "att">; +def : MnemonicAlias<"cdqe", "cltq", "att">; +def : MnemonicAlias<"cqo", "cqto", "att">; + +// In 64-bit mode lret maps to lretl; it is not ambiguous with lretq. 
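+// For instance (illustrative): a bare "lret" in 64-bit code far-returns by
+// popping a 32-bit offset and selector; the 64-bit pop must be written
+// "lretq" explicitly.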
+def : MnemonicAlias<"lret", "lretw", "att">, Requires<[In16BitMode]>; +def : MnemonicAlias<"lret", "lretl", "att">, Requires<[Not16BitMode]>; + +def : MnemonicAlias<"leavel", "leave", "att">, Requires<[Not64BitMode]>; +def : MnemonicAlias<"leaveq", "leave", "att">, Requires<[In64BitMode]>; + +def : MnemonicAlias<"loopz", "loope">; +def : MnemonicAlias<"loopnz", "loopne">; + +def : MnemonicAlias<"pop", "popw", "att">, Requires<[In16BitMode]>; +def : MnemonicAlias<"pop", "popl", "att">, Requires<[In32BitMode]>; +def : MnemonicAlias<"pop", "popq", "att">, Requires<[In64BitMode]>; +def : MnemonicAlias<"popf", "popfw", "att">, Requires<[In16BitMode]>; +def : MnemonicAlias<"popf", "popfl", "att">, Requires<[In32BitMode]>; +def : MnemonicAlias<"popf", "popfq", "att">, Requires<[In64BitMode]>; +def : MnemonicAlias<"popfd", "popfl", "att">; + +// FIXME: This is wrong for "push reg". "push %bx" should turn into pushw in +// all modes. However: "push (addr)" and "push $42" should default to +// pushl/pushq depending on the current mode. Similar for "pop %bx" +def : MnemonicAlias<"push", "pushw", "att">, Requires<[In16BitMode]>; +def : MnemonicAlias<"push", "pushl", "att">, Requires<[In32BitMode]>; +def : MnemonicAlias<"push", "pushq", "att">, Requires<[In64BitMode]>; +def : MnemonicAlias<"pushf", "pushfw", "att">, Requires<[In16BitMode]>; +def : MnemonicAlias<"pushf", "pushfl", "att">, Requires<[In32BitMode]>; +def : MnemonicAlias<"pushf", "pushfq", "att">, Requires<[In64BitMode]>; +def : MnemonicAlias<"pushfd", "pushfl", "att">; + +def : MnemonicAlias<"popad", "popal", "intel">, Requires<[Not64BitMode]>; +def : MnemonicAlias<"pushad", "pushal", "intel">, Requires<[Not64BitMode]>; +def : MnemonicAlias<"popa", "popaw", "intel">, Requires<[In16BitMode]>; +def : MnemonicAlias<"pusha", "pushaw", "intel">, Requires<[In16BitMode]>; +def : MnemonicAlias<"popa", "popal", "intel">, Requires<[In32BitMode]>; +def : MnemonicAlias<"pusha", "pushal", "intel">, Requires<[In32BitMode]>; + +def : MnemonicAlias<"popa", "popaw", "att">, Requires<[In16BitMode]>; +def : MnemonicAlias<"pusha", "pushaw", "att">, Requires<[In16BitMode]>; +def : MnemonicAlias<"popa", "popal", "att">, Requires<[In32BitMode]>; +def : MnemonicAlias<"pusha", "pushal", "att">, Requires<[In32BitMode]>; + +def : MnemonicAlias<"repe", "rep">; +def : MnemonicAlias<"repz", "rep">; +def : MnemonicAlias<"repnz", "repne">; + +def : MnemonicAlias<"ret", "retw", "att">, Requires<[In16BitMode]>; +def : MnemonicAlias<"ret", "retl", "att">, Requires<[In32BitMode]>; +def : MnemonicAlias<"ret", "retq", "att">, Requires<[In64BitMode]>; + +def : MnemonicAlias<"sal", "shl", "intel">; +def : MnemonicAlias<"salb", "shlb", "att">; +def : MnemonicAlias<"salw", "shlw", "att">; +def : MnemonicAlias<"sall", "shll", "att">; +def : MnemonicAlias<"salq", "shlq", "att">; + +def : MnemonicAlias<"smovb", "movsb", "att">; +def : MnemonicAlias<"smovw", "movsw", "att">; +def : MnemonicAlias<"smovl", "movsl", "att">; +def : MnemonicAlias<"smovq", "movsq", "att">; + +def : MnemonicAlias<"ud2a", "ud2", "att">; +def : MnemonicAlias<"verrw", "verr", "att">; + +// System instruction aliases. 
+def : MnemonicAlias<"iret", "iretw", "att">, Requires<[In16BitMode]>; +def : MnemonicAlias<"iret", "iretl", "att">, Requires<[Not16BitMode]>; +def : MnemonicAlias<"sysret", "sysretl", "att">; +def : MnemonicAlias<"sysexit", "sysexitl", "att">; + +def : MnemonicAlias<"lgdt", "lgdtw", "att">, Requires<[In16BitMode]>; +def : MnemonicAlias<"lgdt", "lgdtl", "att">, Requires<[In32BitMode]>; +def : MnemonicAlias<"lgdt", "lgdtq", "att">, Requires<[In64BitMode]>; +def : MnemonicAlias<"lidt", "lidtw", "att">, Requires<[In16BitMode]>; +def : MnemonicAlias<"lidt", "lidtl", "att">, Requires<[In32BitMode]>; +def : MnemonicAlias<"lidt", "lidtq", "att">, Requires<[In64BitMode]>; +def : MnemonicAlias<"sgdt", "sgdtw", "att">, Requires<[In16BitMode]>; +def : MnemonicAlias<"sgdt", "sgdtl", "att">, Requires<[In32BitMode]>; +def : MnemonicAlias<"sgdt", "sgdtq", "att">, Requires<[In64BitMode]>; +def : MnemonicAlias<"sidt", "sidtw", "att">, Requires<[In16BitMode]>; +def : MnemonicAlias<"sidt", "sidtl", "att">, Requires<[In32BitMode]>; +def : MnemonicAlias<"sidt", "sidtq", "att">, Requires<[In64BitMode]>; + + +// Floating point stack aliases. +def : MnemonicAlias<"fcmovz", "fcmove", "att">; +def : MnemonicAlias<"fcmova", "fcmovnbe", "att">; +def : MnemonicAlias<"fcmovnae", "fcmovb", "att">; +def : MnemonicAlias<"fcmovna", "fcmovbe", "att">; +def : MnemonicAlias<"fcmovae", "fcmovnb", "att">; +def : MnemonicAlias<"fcomip", "fcompi">; +def : MnemonicAlias<"fildq", "fildll", "att">; +def : MnemonicAlias<"fistpq", "fistpll", "att">; +def : MnemonicAlias<"fisttpq", "fisttpll", "att">; +def : MnemonicAlias<"fldcww", "fldcw", "att">; +def : MnemonicAlias<"fnstcww", "fnstcw", "att">; +def : MnemonicAlias<"fnstsww", "fnstsw", "att">; +def : MnemonicAlias<"fucomip", "fucompi">; +def : MnemonicAlias<"fwait", "wait">; + +def : MnemonicAlias<"fxsaveq", "fxsave64", "att">; +def : MnemonicAlias<"fxrstorq", "fxrstor64", "att">; +def : MnemonicAlias<"xsaveq", "xsave64", "att">; +def : MnemonicAlias<"xrstorq", "xrstor64", "att">; +def : MnemonicAlias<"xsaveoptq", "xsaveopt64", "att">; +def : MnemonicAlias<"xrstorsq", "xrstors64", "att">; +def : MnemonicAlias<"xsavecq", "xsavec64", "att">; +def : MnemonicAlias<"xsavesq", "xsaves64", "att">; + +class CondCodeAlias<string Prefix,string Suffix, string OldCond, string NewCond, + string VariantName> + : MnemonicAlias<!strconcat(Prefix, OldCond, Suffix), + !strconcat(Prefix, NewCond, Suffix), VariantName>; + +/// IntegerCondCodeMnemonicAlias - This multiclass defines a bunch of +/// MnemonicAlias's that canonicalize the condition code in a mnemonic, for +/// example "setz" -> "sete". 
+multiclass IntegerCondCodeMnemonicAlias<string Prefix, string Suffix, + string V = ""> { + def C : CondCodeAlias<Prefix, Suffix, "c", "b", V>; // setc -> setb + def Z : CondCodeAlias<Prefix, Suffix, "z" , "e", V>; // setz -> sete + def NA : CondCodeAlias<Prefix, Suffix, "na", "be", V>; // setna -> setbe + def NB : CondCodeAlias<Prefix, Suffix, "nb", "ae", V>; // setnb -> setae + def NC : CondCodeAlias<Prefix, Suffix, "nc", "ae", V>; // setnc -> setae + def NG : CondCodeAlias<Prefix, Suffix, "ng", "le", V>; // setng -> setle + def NL : CondCodeAlias<Prefix, Suffix, "nl", "ge", V>; // setnl -> setge + def NZ : CondCodeAlias<Prefix, Suffix, "nz", "ne", V>; // setnz -> setne + def PE : CondCodeAlias<Prefix, Suffix, "pe", "p", V>; // setpe -> setp + def PO : CondCodeAlias<Prefix, Suffix, "po", "np", V>; // setpo -> setnp + + def NAE : CondCodeAlias<Prefix, Suffix, "nae", "b", V>; // setnae -> setb + def NBE : CondCodeAlias<Prefix, Suffix, "nbe", "a", V>; // setnbe -> seta + def NGE : CondCodeAlias<Prefix, Suffix, "nge", "l", V>; // setnge -> setl + def NLE : CondCodeAlias<Prefix, Suffix, "nle", "g", V>; // setnle -> setg +} + +// Aliases for set<CC> +defm : IntegerCondCodeMnemonicAlias<"set", "">; +// Aliases for j<CC> +defm : IntegerCondCodeMnemonicAlias<"j", "">; +// Aliases for cmov<CC>{w,l,q} +defm : IntegerCondCodeMnemonicAlias<"cmov", "w", "att">; +defm : IntegerCondCodeMnemonicAlias<"cmov", "l", "att">; +defm : IntegerCondCodeMnemonicAlias<"cmov", "q", "att">; +// No size suffix for intel-style asm. +defm : IntegerCondCodeMnemonicAlias<"cmov", "", "intel">; + + +//===----------------------------------------------------------------------===// +// Assembler Instruction Aliases +//===----------------------------------------------------------------------===// + +// aad/aam default to base 10 if no operand is specified. +def : InstAlias<"aad", (AAD8i8 10)>, Requires<[Not64BitMode]>; +def : InstAlias<"aam", (AAM8i8 10)>, Requires<[Not64BitMode]>; + +// Disambiguate the mem/imm form of bt-without-a-suffix as btl. +// Likewise for btc/btr/bts. +def : InstAlias<"bt {$imm, $mem|$mem, $imm}", + (BT32mi8 i32mem:$mem, i32i8imm:$imm), 0>; +def : InstAlias<"btc {$imm, $mem|$mem, $imm}", + (BTC32mi8 i32mem:$mem, i32i8imm:$imm), 0>; +def : InstAlias<"btr {$imm, $mem|$mem, $imm}", + (BTR32mi8 i32mem:$mem, i32i8imm:$imm), 0>; +def : InstAlias<"bts {$imm, $mem|$mem, $imm}", + (BTS32mi8 i32mem:$mem, i32i8imm:$imm), 0>; + +// clr aliases. +def : InstAlias<"clrb $reg", (XOR8rr GR8 :$reg, GR8 :$reg), 0>; +def : InstAlias<"clrw $reg", (XOR16rr GR16:$reg, GR16:$reg), 0>; +def : InstAlias<"clrl $reg", (XOR32rr GR32:$reg, GR32:$reg), 0>; +def : InstAlias<"clrq $reg", (XOR64rr GR64:$reg, GR64:$reg), 0>; + +// lods aliases. Accept the destination being omitted because it's implicit +// in the mnemonic, or the mnemonic suffix being omitted because it's implicit +// in the destination. +def : InstAlias<"lodsb $src", (LODSB srcidx8:$src), 0>; +def : InstAlias<"lodsw $src", (LODSW srcidx16:$src), 0>; +def : InstAlias<"lods{l|d} $src", (LODSL srcidx32:$src), 0>; +def : InstAlias<"lodsq $src", (LODSQ srcidx64:$src), 0>, Requires<[In64BitMode]>; +def : InstAlias<"lods {$src, %al|al, $src}", (LODSB srcidx8:$src), 0>; +def : InstAlias<"lods {$src, %ax|ax, $src}", (LODSW srcidx16:$src), 0>; +def : InstAlias<"lods {$src, %eax|eax, $src}", (LODSL srcidx32:$src), 0>; +def : InstAlias<"lods {$src, %rax|rax, $src}", (LODSQ srcidx64:$src), 0>, Requires<[In64BitMode]>; + +// stos aliases. 
Accept the source being omitted because it's implicit in +// the mnemonic, or the mnemonic suffix being omitted because it's implicit +// in the source. +def : InstAlias<"stosb $dst", (STOSB dstidx8:$dst), 0>; +def : InstAlias<"stosw $dst", (STOSW dstidx16:$dst), 0>; +def : InstAlias<"stos{l|d} $dst", (STOSL dstidx32:$dst), 0>; +def : InstAlias<"stosq $dst", (STOSQ dstidx64:$dst), 0>, Requires<[In64BitMode]>; +def : InstAlias<"stos {%al, $dst|$dst, al}", (STOSB dstidx8:$dst), 0>; +def : InstAlias<"stos {%ax, $dst|$dst, ax}", (STOSW dstidx16:$dst), 0>; +def : InstAlias<"stos {%eax, $dst|$dst, eax}", (STOSL dstidx32:$dst), 0>; +def : InstAlias<"stos {%rax, $dst|$dst, rax}", (STOSQ dstidx64:$dst), 0>, Requires<[In64BitMode]>; + +// scas aliases. Accept the destination being omitted because it's implicit +// in the mnemonic, or the mnemonic suffix being omitted because it's implicit +// in the destination. +def : InstAlias<"scasb $dst", (SCASB dstidx8:$dst), 0>; +def : InstAlias<"scasw $dst", (SCASW dstidx16:$dst), 0>; +def : InstAlias<"scas{l|d} $dst", (SCASL dstidx32:$dst), 0>; +def : InstAlias<"scasq $dst", (SCASQ dstidx64:$dst), 0>, Requires<[In64BitMode]>; +def : InstAlias<"scas {$dst, %al|al, $dst}", (SCASB dstidx8:$dst), 0>; +def : InstAlias<"scas {$dst, %ax|ax, $dst}", (SCASW dstidx16:$dst), 0>; +def : InstAlias<"scas {$dst, %eax|eax, $dst}", (SCASL dstidx32:$dst), 0>; +def : InstAlias<"scas {$dst, %rax|rax, $dst}", (SCASQ dstidx64:$dst), 0>, Requires<[In64BitMode]>; + +// div and idiv aliases for explicit A register. +def : InstAlias<"div{b}\t{$src, %al|al, $src}", (DIV8r GR8 :$src)>; +def : InstAlias<"div{w}\t{$src, %ax|ax, $src}", (DIV16r GR16:$src)>; +def : InstAlias<"div{l}\t{$src, %eax|eax, $src}", (DIV32r GR32:$src)>; +def : InstAlias<"div{q}\t{$src, %rax|rax, $src}", (DIV64r GR64:$src)>; +def : InstAlias<"div{b}\t{$src, %al|al, $src}", (DIV8m i8mem :$src)>; +def : InstAlias<"div{w}\t{$src, %ax|ax, $src}", (DIV16m i16mem:$src)>; +def : InstAlias<"div{l}\t{$src, %eax|eax, $src}", (DIV32m i32mem:$src)>; +def : InstAlias<"div{q}\t{$src, %rax|rax, $src}", (DIV64m i64mem:$src)>; +def : InstAlias<"idiv{b}\t{$src, %al|al, $src}", (IDIV8r GR8 :$src)>; +def : InstAlias<"idiv{w}\t{$src, %ax|ax, $src}", (IDIV16r GR16:$src)>; +def : InstAlias<"idiv{l}\t{$src, %eax|eax, $src}", (IDIV32r GR32:$src)>; +def : InstAlias<"idiv{q}\t{$src, %rax|rax, $src}", (IDIV64r GR64:$src)>; +def : InstAlias<"idiv{b}\t{$src, %al|al, $src}", (IDIV8m i8mem :$src)>; +def : InstAlias<"idiv{w}\t{$src, %ax|ax, $src}", (IDIV16m i16mem:$src)>; +def : InstAlias<"idiv{l}\t{$src, %eax|eax, $src}", (IDIV32m i32mem:$src)>; +def : InstAlias<"idiv{q}\t{$src, %rax|rax, $src}", (IDIV64m i64mem:$src)>; + + + +// Various unary fpstack operations default to operating on on ST1. 
+// For example, "fxch" -> "fxch %st(1)" +def : InstAlias<"faddp", (ADD_FPrST0 ST1), 0>; +def: InstAlias<"fadd", (ADD_FPrST0 ST1), 0>; +def : InstAlias<"fsub{|r}p", (SUBR_FPrST0 ST1), 0>; +def : InstAlias<"fsub{r|}p", (SUB_FPrST0 ST1), 0>; +def : InstAlias<"fmul", (MUL_FPrST0 ST1), 0>; +def : InstAlias<"fmulp", (MUL_FPrST0 ST1), 0>; +def : InstAlias<"fdiv{|r}p", (DIVR_FPrST0 ST1), 0>; +def : InstAlias<"fdiv{r|}p", (DIV_FPrST0 ST1), 0>; +def : InstAlias<"fxch", (XCH_F ST1), 0>; +def : InstAlias<"fcom", (COM_FST0r ST1), 0>; +def : InstAlias<"fcomp", (COMP_FST0r ST1), 0>; +def : InstAlias<"fcomi", (COM_FIr ST1), 0>; +def : InstAlias<"fcompi", (COM_FIPr ST1), 0>; +def : InstAlias<"fucom", (UCOM_Fr ST1), 0>; +def : InstAlias<"fucomp", (UCOM_FPr ST1), 0>; +def : InstAlias<"fucomi", (UCOM_FIr ST1), 0>; +def : InstAlias<"fucompi", (UCOM_FIPr ST1), 0>; + +// Handle fmul/fadd/fsub/fdiv instructions with explicitly written st(0) op. +// For example, "fadd %st(4), %st(0)" -> "fadd %st(4)". We also disambiguate +// instructions like "fadd %st(0), %st(0)" as "fadd %st(0)" for consistency with +// gas. +multiclass FpUnaryAlias<string Mnemonic, Instruction Inst, bit EmitAlias = 1> { + def : InstAlias<!strconcat(Mnemonic, "\t{$op, %st(0)|st(0), $op}"), + (Inst RST:$op), EmitAlias>; + def : InstAlias<!strconcat(Mnemonic, "\t{%st(0), %st(0)|st(0), st(0)}"), + (Inst ST0), EmitAlias>; +} + +defm : FpUnaryAlias<"fadd", ADD_FST0r>; +defm : FpUnaryAlias<"faddp", ADD_FPrST0, 0>; +defm : FpUnaryAlias<"fsub", SUB_FST0r>; +defm : FpUnaryAlias<"fsub{|r}p", SUBR_FPrST0>; +defm : FpUnaryAlias<"fsubr", SUBR_FST0r>; +defm : FpUnaryAlias<"fsub{r|}p", SUB_FPrST0>; +defm : FpUnaryAlias<"fmul", MUL_FST0r>; +defm : FpUnaryAlias<"fmulp", MUL_FPrST0>; +defm : FpUnaryAlias<"fdiv", DIV_FST0r>; +defm : FpUnaryAlias<"fdiv{|r}p", DIVR_FPrST0>; +defm : FpUnaryAlias<"fdivr", DIVR_FST0r>; +defm : FpUnaryAlias<"fdiv{r|}p", DIV_FPrST0>; +defm : FpUnaryAlias<"fcomi", COM_FIr, 0>; +defm : FpUnaryAlias<"fucomi", UCOM_FIr, 0>; +defm : FpUnaryAlias<"fcompi", COM_FIPr>; +defm : FpUnaryAlias<"fucompi", UCOM_FIPr>; + + +// Handle "f{mulp,addp} st(0), $op" the same as "f{mulp,addp} $op", since they +// commute. We also allow fdiv[r]p/fsubrp even though they don't commute, +// solely because gas supports it. +def : InstAlias<"faddp\t{%st(0), $op|$op, st(0)}", (ADD_FPrST0 RST:$op), 0>; +def : InstAlias<"fmulp\t{%st(0), $op|$op, st(0)}", (MUL_FPrST0 RST:$op)>; +def : InstAlias<"fsub{|r}p\t{%st(0), $op|$op, st(0)}", (SUBR_FPrST0 RST:$op)>; +def : InstAlias<"fsub{r|}p\t{%st(0), $op|$op, st(0)}", (SUB_FPrST0 RST:$op)>; +def : InstAlias<"fdiv{|r}p\t{%st(0), $op|$op, st(0)}", (DIVR_FPrST0 RST:$op)>; +def : InstAlias<"fdiv{r|}p\t{%st(0), $op|$op, st(0)}", (DIV_FPrST0 RST:$op)>; + +// We accept "fnstsw %eax" even though it only writes %ax. +def : InstAlias<"fnstsw\t{%eax|eax}", (FNSTSW16r)>; +def : InstAlias<"fnstsw\t{%al|al}" , (FNSTSW16r)>; +def : InstAlias<"fnstsw" , (FNSTSW16r)>; + +// lcall and ljmp aliases. This seems to be an odd mapping in 64-bit mode, but +// this is compatible with what GAS does. 
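+// For instance (illustrative): even in 64-bit code a plain "ljmp *(%rax)" is
+// matched as FARJMP32m below, loading a 16-bit selector plus 32-bit offset
+// (opaque48mem) rather than a 64-bit far pointer.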
+def : InstAlias<"lcall $seg, $off", (FARCALL32i i32imm:$off, i16imm:$seg), 0>, Requires<[Not16BitMode]>; +def : InstAlias<"ljmp $seg, $off", (FARJMP32i i32imm:$off, i16imm:$seg), 0>, Requires<[Not16BitMode]>; +def : InstAlias<"lcall {*}$dst", (FARCALL32m opaque48mem:$dst), 0>, Requires<[Not16BitMode]>; +def : InstAlias<"ljmp {*}$dst", (FARJMP32m opaque48mem:$dst), 0>, Requires<[Not16BitMode]>; +def : InstAlias<"lcall $seg, $off", (FARCALL16i i16imm:$off, i16imm:$seg), 0>, Requires<[In16BitMode]>; +def : InstAlias<"ljmp $seg, $off", (FARJMP16i i16imm:$off, i16imm:$seg), 0>, Requires<[In16BitMode]>; +def : InstAlias<"lcall {*}$dst", (FARCALL16m opaque32mem:$dst), 0>, Requires<[In16BitMode]>; +def : InstAlias<"ljmp {*}$dst", (FARJMP16m opaque32mem:$dst), 0>, Requires<[In16BitMode]>; + +def : InstAlias<"call {*}$dst", (CALL64m i64mem:$dst), 0>, Requires<[In64BitMode]>; +def : InstAlias<"jmp {*}$dst", (JMP64m i64mem:$dst), 0>, Requires<[In64BitMode]>; +def : InstAlias<"call {*}$dst", (CALL32m i32mem:$dst), 0>, Requires<[In32BitMode]>; +def : InstAlias<"jmp {*}$dst", (JMP32m i32mem:$dst), 0>, Requires<[In32BitMode]>; +def : InstAlias<"call {*}$dst", (CALL16m i16mem:$dst), 0>, Requires<[In16BitMode]>; +def : InstAlias<"jmp {*}$dst", (JMP16m i16mem:$dst), 0>, Requires<[In16BitMode]>; + + +// "imul <imm>, B" is an alias for "imul <imm>, B, B". +def : InstAlias<"imul{w} {$imm, $r|$r, $imm}", (IMUL16rri GR16:$r, GR16:$r, i16imm:$imm), 0>; +def : InstAlias<"imul{w} {$imm, $r|$r, $imm}", (IMUL16rri8 GR16:$r, GR16:$r, i16i8imm:$imm), 0>; +def : InstAlias<"imul{l} {$imm, $r|$r, $imm}", (IMUL32rri GR32:$r, GR32:$r, i32imm:$imm), 0>; +def : InstAlias<"imul{l} {$imm, $r|$r, $imm}", (IMUL32rri8 GR32:$r, GR32:$r, i32i8imm:$imm), 0>; +def : InstAlias<"imul{q} {$imm, $r|$r, $imm}", (IMUL64rri32 GR64:$r, GR64:$r, i64i32imm:$imm), 0>; +def : InstAlias<"imul{q} {$imm, $r|$r, $imm}", (IMUL64rri8 GR64:$r, GR64:$r, i64i8imm:$imm), 0>; + +// inb %dx -> inb %al, %dx +def : InstAlias<"inb\t{%dx|dx}", (IN8rr), 0>; +def : InstAlias<"inw\t{%dx|dx}", (IN16rr), 0>; +def : InstAlias<"inl\t{%dx|dx}", (IN32rr), 0>; +def : InstAlias<"inb\t$port", (IN8ri u8imm:$port), 0>; +def : InstAlias<"inw\t$port", (IN16ri u8imm:$port), 0>; +def : InstAlias<"inl\t$port", (IN32ri u8imm:$port), 0>; + + +// jmp and call aliases for lcall and ljmp. jmp $42,$5 -> ljmp +def : InstAlias<"call $seg, $off", (FARCALL16i i16imm:$off, i16imm:$seg)>, Requires<[In16BitMode]>; +def : InstAlias<"jmp $seg, $off", (FARJMP16i i16imm:$off, i16imm:$seg)>, Requires<[In16BitMode]>; +def : InstAlias<"call $seg, $off", (FARCALL32i i32imm:$off, i16imm:$seg)>, Requires<[Not16BitMode]>; +def : InstAlias<"jmp $seg, $off", (FARJMP32i i32imm:$off, i16imm:$seg)>, Requires<[Not16BitMode]>; +def : InstAlias<"callw $seg, $off", (FARCALL16i i16imm:$off, i16imm:$seg)>; +def : InstAlias<"jmpw $seg, $off", (FARJMP16i i16imm:$off, i16imm:$seg)>; +def : InstAlias<"calll $seg, $off", (FARCALL32i i32imm:$off, i16imm:$seg)>; +def : InstAlias<"jmpl $seg, $off", (FARJMP32i i32imm:$off, i16imm:$seg)>; + +// Force mov without a suffix with a segment and mem to prefer the 'l' form of +// the move. All segment/mem forms are equivalent, this has the shortest +// encoding. +def : InstAlias<"mov {$mem, $seg|$seg, $mem}", (MOV32sm SEGMENT_REG:$seg, i32mem:$mem), 0>; +def : InstAlias<"mov {$seg, $mem|$mem, $seg}", (MOV32ms i32mem:$mem, SEGMENT_REG:$seg), 0>; + +// Match 'movq <largeimm>, <reg>' as an alias for movabsq. 
+def : InstAlias<"movq {$imm, $reg|$reg, $imm}", (MOV64ri GR64:$reg, i64imm:$imm), 0>; + +// Match 'movq GR64, MMX' as an alias for movd. +def : InstAlias<"movq {$src, $dst|$dst, $src}", + (MMX_MOVD64to64rr VR64:$dst, GR64:$src), 0>; +def : InstAlias<"movq {$src, $dst|$dst, $src}", + (MMX_MOVD64from64rr GR64:$dst, VR64:$src), 0>; + +// movsx aliases +def : InstAlias<"movsx {$src, $dst|$dst, $src}", (MOVSX16rr8 GR16:$dst, GR8:$src), 0>; +def : InstAlias<"movsx {$src, $dst|$dst, $src}", (MOVSX16rm8 GR16:$dst, i8mem:$src), 0>; +def : InstAlias<"movsx {$src, $dst|$dst, $src}", (MOVSX32rr8 GR32:$dst, GR8:$src), 0>; +def : InstAlias<"movsx {$src, $dst|$dst, $src}", (MOVSX32rr16 GR32:$dst, GR16:$src), 0>; +def : InstAlias<"movsx {$src, $dst|$dst, $src}", (MOVSX64rr8 GR64:$dst, GR8:$src), 0>; +def : InstAlias<"movsx {$src, $dst|$dst, $src}", (MOVSX64rr16 GR64:$dst, GR16:$src), 0>; +def : InstAlias<"movsx {$src, $dst|$dst, $src}", (MOVSX64rr32 GR64:$dst, GR32:$src), 0>; + +// movzx aliases +def : InstAlias<"movzx {$src, $dst|$dst, $src}", (MOVZX16rr8 GR16:$dst, GR8:$src), 0>; +def : InstAlias<"movzx {$src, $dst|$dst, $src}", (MOVZX16rm8 GR16:$dst, i8mem:$src), 0>; +def : InstAlias<"movzx {$src, $dst|$dst, $src}", (MOVZX32rr8 GR32:$dst, GR8:$src), 0>; +def : InstAlias<"movzx {$src, $dst|$dst, $src}", (MOVZX32rr16 GR32:$dst, GR16:$src), 0>; +def : InstAlias<"movzx {$src, $dst|$dst, $src}", (MOVZX64rr8_Q GR64:$dst, GR8:$src), 0>; +def : InstAlias<"movzx {$src, $dst|$dst, $src}", (MOVZX64rr16_Q GR64:$dst, GR16:$src), 0>; +// Note: No GR32->GR64 movzx form. + +// outb %dx -> outb %al, %dx +def : InstAlias<"outb\t{%dx|dx}", (OUT8rr), 0>; +def : InstAlias<"outw\t{%dx|dx}", (OUT16rr), 0>; +def : InstAlias<"outl\t{%dx|dx}", (OUT32rr), 0>; +def : InstAlias<"outb\t$port", (OUT8ir u8imm:$port), 0>; +def : InstAlias<"outw\t$port", (OUT16ir u8imm:$port), 0>; +def : InstAlias<"outl\t$port", (OUT32ir u8imm:$port), 0>; + +// 'sldt <mem>' can be encoded with either sldtw or sldtq with the same +// effect (both store to a 16-bit mem). Force to sldtw to avoid ambiguity +// errors, since its encoding is the most compact. +def : InstAlias<"sldt $mem", (SLDT16m i16mem:$mem), 0>; + +// shld/shrd op,op -> shld op, op, CL +def : InstAlias<"shld{w}\t{$r2, $r1|$r1, $r2}", (SHLD16rrCL GR16:$r1, GR16:$r2), 0>; +def : InstAlias<"shld{l}\t{$r2, $r1|$r1, $r2}", (SHLD32rrCL GR32:$r1, GR32:$r2), 0>; +def : InstAlias<"shld{q}\t{$r2, $r1|$r1, $r2}", (SHLD64rrCL GR64:$r1, GR64:$r2), 0>; +def : InstAlias<"shrd{w}\t{$r2, $r1|$r1, $r2}", (SHRD16rrCL GR16:$r1, GR16:$r2), 0>; +def : InstAlias<"shrd{l}\t{$r2, $r1|$r1, $r2}", (SHRD32rrCL GR32:$r1, GR32:$r2), 0>; +def : InstAlias<"shrd{q}\t{$r2, $r1|$r1, $r2}", (SHRD64rrCL GR64:$r1, GR64:$r2), 0>; + +def : InstAlias<"shld{w}\t{$reg, $mem|$mem, $reg}", (SHLD16mrCL i16mem:$mem, GR16:$reg), 0>; +def : InstAlias<"shld{l}\t{$reg, $mem|$mem, $reg}", (SHLD32mrCL i32mem:$mem, GR32:$reg), 0>; +def : InstAlias<"shld{q}\t{$reg, $mem|$mem, $reg}", (SHLD64mrCL i64mem:$mem, GR64:$reg), 0>; +def : InstAlias<"shrd{w}\t{$reg, $mem|$mem, $reg}", (SHRD16mrCL i16mem:$mem, GR16:$reg), 0>; +def : InstAlias<"shrd{l}\t{$reg, $mem|$mem, $reg}", (SHRD32mrCL i32mem:$mem, GR32:$reg), 0>; +def : InstAlias<"shrd{q}\t{$reg, $mem|$mem, $reg}", (SHRD64mrCL i64mem:$mem, GR64:$reg), 0>; + +/* FIXME: This is disabled because the asm matcher is currently incapable of + * matching a fixed immediate like $1. +// "shl X, $1" is an alias for "shl X". 
+multiclass ShiftRotateByOneAlias<string Mnemonic, string Opc> { + def : InstAlias<!strconcat(Mnemonic, "b $op, $$1"), + (!cast<Instruction>(!strconcat(Opc, "8r1")) GR8:$op)>; + def : InstAlias<!strconcat(Mnemonic, "w $op, $$1"), + (!cast<Instruction>(!strconcat(Opc, "16r1")) GR16:$op)>; + def : InstAlias<!strconcat(Mnemonic, "l $op, $$1"), + (!cast<Instruction>(!strconcat(Opc, "32r1")) GR32:$op)>; + def : InstAlias<!strconcat(Mnemonic, "q $op, $$1"), + (!cast<Instruction>(!strconcat(Opc, "64r1")) GR64:$op)>; + def : InstAlias<!strconcat(Mnemonic, "b $op, $$1"), + (!cast<Instruction>(!strconcat(Opc, "8m1")) i8mem:$op)>; + def : InstAlias<!strconcat(Mnemonic, "w $op, $$1"), + (!cast<Instruction>(!strconcat(Opc, "16m1")) i16mem:$op)>; + def : InstAlias<!strconcat(Mnemonic, "l $op, $$1"), + (!cast<Instruction>(!strconcat(Opc, "32m1")) i32mem:$op)>; + def : InstAlias<!strconcat(Mnemonic, "q $op, $$1"), + (!cast<Instruction>(!strconcat(Opc, "64m1")) i64mem:$op)>; +} + +defm : ShiftRotateByOneAlias<"rcl", "RCL">; +defm : ShiftRotateByOneAlias<"rcr", "RCR">; +defm : ShiftRotateByOneAlias<"rol", "ROL">; +defm : ShiftRotateByOneAlias<"ror", "ROR">; +FIXME */ + +// test: We accept "testX <reg>, <mem>" and "testX <mem>, <reg>" as synonyms. +def : InstAlias<"test{b}\t{$val, $mem|$mem, $val}", + (TEST8rm GR8 :$val, i8mem :$mem), 0>; +def : InstAlias<"test{w}\t{$val, $mem|$mem, $val}", + (TEST16rm GR16:$val, i16mem:$mem), 0>; +def : InstAlias<"test{l}\t{$val, $mem|$mem, $val}", + (TEST32rm GR32:$val, i32mem:$mem), 0>; +def : InstAlias<"test{q}\t{$val, $mem|$mem, $val}", + (TEST64rm GR64:$val, i64mem:$mem), 0>; + +// xchg: We accept "xchgX <reg>, <mem>" and "xchgX <mem>, <reg>" as synonyms. +def : InstAlias<"xchg{b}\t{$mem, $val|$val, $mem}", + (XCHG8rm GR8 :$val, i8mem :$mem), 0>; +def : InstAlias<"xchg{w}\t{$mem, $val|$val, $mem}", + (XCHG16rm GR16:$val, i16mem:$mem), 0>; +def : InstAlias<"xchg{l}\t{$mem, $val|$val, $mem}", + (XCHG32rm GR32:$val, i32mem:$mem), 0>; +def : InstAlias<"xchg{q}\t{$mem, $val|$val, $mem}", + (XCHG64rm GR64:$val, i64mem:$mem), 0>; + +// xchg: We accept "xchgX <reg>, %eax" and "xchgX %eax, <reg>" as synonyms. +def : InstAlias<"xchg{w}\t{%ax, $src|$src, ax}", (XCHG16ar GR16:$src), 0>; +def : InstAlias<"xchg{l}\t{%eax, $src|$src, eax}", + (XCHG32ar GR32:$src), 0>, Requires<[Not64BitMode]>; +def : InstAlias<"xchg{l}\t{%eax, $src|$src, eax}", + (XCHG32ar64 GR32_NOAX:$src), 0>, Requires<[In64BitMode]>; +def : InstAlias<"xchg{q}\t{%rax, $src|$src, rax}", (XCHG64ar GR64:$src), 0>; + +// These aliases exist to get the parser to prioritize matching 8-bit +// immediate encodings over matching the implicit ax/eax/rax encodings. By +// explicitly mentioning the A register here, these entries will be ordered +// first due to the more explicit immediate type. 
+def : InstAlias<"adc{w}\t{$imm, %ax|ax, $imm}", (ADC16ri8 AX, i16i8imm:$imm), 0>; +def : InstAlias<"add{w}\t{$imm, %ax|ax, $imm}", (ADD16ri8 AX, i16i8imm:$imm), 0>; +def : InstAlias<"and{w}\t{$imm, %ax|ax, $imm}", (AND16ri8 AX, i16i8imm:$imm), 0>; +def : InstAlias<"cmp{w}\t{$imm, %ax|ax, $imm}", (CMP16ri8 AX, i16i8imm:$imm), 0>; +def : InstAlias<"or{w}\t{$imm, %ax|ax, $imm}", (OR16ri8 AX, i16i8imm:$imm), 0>; +def : InstAlias<"sbb{w}\t{$imm, %ax|ax, $imm}", (SBB16ri8 AX, i16i8imm:$imm), 0>; +def : InstAlias<"sub{w}\t{$imm, %ax|ax, $imm}", (SUB16ri8 AX, i16i8imm:$imm), 0>; +def : InstAlias<"xor{w}\t{$imm, %ax|ax, $imm}", (XOR16ri8 AX, i16i8imm:$imm), 0>; + +def : InstAlias<"adc{l}\t{$imm, %eax|eax, $imm}", (ADC32ri8 EAX, i32i8imm:$imm), 0>; +def : InstAlias<"add{l}\t{$imm, %eax|eax, $imm}", (ADD32ri8 EAX, i32i8imm:$imm), 0>; +def : InstAlias<"and{l}\t{$imm, %eax|eax, $imm}", (AND32ri8 EAX, i32i8imm:$imm), 0>; +def : InstAlias<"cmp{l}\t{$imm, %eax|eax, $imm}", (CMP32ri8 EAX, i32i8imm:$imm), 0>; +def : InstAlias<"or{l}\t{$imm, %eax|eax, $imm}", (OR32ri8 EAX, i32i8imm:$imm), 0>; +def : InstAlias<"sbb{l}\t{$imm, %eax|eax, $imm}", (SBB32ri8 EAX, i32i8imm:$imm), 0>; +def : InstAlias<"sub{l}\t{$imm, %eax|eax, $imm}", (SUB32ri8 EAX, i32i8imm:$imm), 0>; +def : InstAlias<"xor{l}\t{$imm, %eax|eax, $imm}", (XOR32ri8 EAX, i32i8imm:$imm), 0>; + +def : InstAlias<"adc{q}\t{$imm, %rax|rax, $imm}", (ADC64ri8 RAX, i64i8imm:$imm), 0>; +def : InstAlias<"add{q}\t{$imm, %rax|rax, $imm}", (ADD64ri8 RAX, i64i8imm:$imm), 0>; +def : InstAlias<"and{q}\t{$imm, %rax|rax, $imm}", (AND64ri8 RAX, i64i8imm:$imm), 0>; +def : InstAlias<"cmp{q}\t{$imm, %rax|rax, $imm}", (CMP64ri8 RAX, i64i8imm:$imm), 0>; +def : InstAlias<"or{q}\t{$imm, %rax|rax, $imm}", (OR64ri8 RAX, i64i8imm:$imm), 0>; +def : InstAlias<"sbb{q}\t{$imm, %rax|rax, $imm}", (SBB64ri8 RAX, i64i8imm:$imm), 0>; +def : InstAlias<"sub{q}\t{$imm, %rax|rax, $imm}", (SUB64ri8 RAX, i64i8imm:$imm), 0>; +def : InstAlias<"xor{q}\t{$imm, %rax|rax, $imm}", (XOR64ri8 RAX, i64i8imm:$imm), 0>; diff --git a/contrib/llvm/lib/Target/X86/X86InstrMMX.td b/contrib/llvm/lib/Target/X86/X86InstrMMX.td new file mode 100644 index 0000000..83f9b14 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86InstrMMX.td @@ -0,0 +1,674 @@ +//===-- X86InstrMMX.td - Describe the MMX Instruction Set --*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the X86 MMX instruction set, defining the instructions, +// and properties of the instructions which are needed for code generation, +// machine code emission, and analysis. +// +// All instructions that use MMX should be in this file, even if they also use +// SSE. 
+// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// MMX Multiclasses +//===----------------------------------------------------------------------===// + +let Sched = WriteVecALU in { +def MMX_INTALU_ITINS : OpndItins< + IIC_MMX_ALU_RR, IIC_MMX_ALU_RM +>; + +def MMX_INTALUQ_ITINS : OpndItins< + IIC_MMX_ALUQ_RR, IIC_MMX_ALUQ_RM +>; + +def MMX_PHADDSUBW : OpndItins< + IIC_MMX_PHADDSUBW_RR, IIC_MMX_PHADDSUBW_RM +>; + +def MMX_PHADDSUBD : OpndItins< + IIC_MMX_PHADDSUBD_RR, IIC_MMX_PHADDSUBD_RM +>; +} + +let Sched = WriteVecLogic in +def MMX_INTALU_ITINS_VECLOGICSCHED : OpndItins< + IIC_MMX_ALU_RR, IIC_MMX_ALU_RM +>; + +let Sched = WriteVecIMul in +def MMX_PMUL_ITINS : OpndItins< + IIC_MMX_PMUL, IIC_MMX_PMUL +>; + +let Sched = WriteVecIMul in { +def MMX_PSADBW_ITINS : OpndItins< + IIC_MMX_PSADBW, IIC_MMX_PSADBW +>; + +def MMX_MISC_FUNC_ITINS : OpndItins< + IIC_MMX_MISC_FUNC_MEM, IIC_MMX_MISC_FUNC_REG +>; +} + +def MMX_SHIFT_ITINS : ShiftOpndItins< + IIC_MMX_SHIFT_RR, IIC_MMX_SHIFT_RM, IIC_MMX_SHIFT_RI +>; + +let Sched = WriteShuffle in { +def MMX_UNPCK_H_ITINS : OpndItins< + IIC_MMX_UNPCK_H_RR, IIC_MMX_UNPCK_H_RM +>; + +def MMX_UNPCK_L_ITINS : OpndItins< + IIC_MMX_UNPCK_L, IIC_MMX_UNPCK_L +>; + +def MMX_PCK_ITINS : OpndItins< + IIC_MMX_PCK_RR, IIC_MMX_PCK_RM +>; + +def MMX_PSHUF_ITINS : OpndItins< + IIC_MMX_PSHUF, IIC_MMX_PSHUF +>; +} // Sched + +let Sched = WriteCvtF2I in { +def MMX_CVT_PD_ITINS : OpndItins< + IIC_MMX_CVT_PD_RR, IIC_MMX_CVT_PD_RM +>; + +def MMX_CVT_PS_ITINS : OpndItins< + IIC_MMX_CVT_PS_RR, IIC_MMX_CVT_PS_RM +>; +} + +let Constraints = "$src1 = $dst" in { + // MMXI_binop_rm_int - Simple MMX binary operator based on intrinsic. + // When this is cleaned up, remove the FIXME from X86RecognizableInstr.cpp. + multiclass MMXI_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId, + OpndItins itins, bit Commutable = 0> { + def irr : MMXI<opc, MRMSrcReg, (outs VR64:$dst), + (ins VR64:$src1, VR64:$src2), + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2))], itins.rr>, + Sched<[itins.Sched]> { + let isCommutable = Commutable; + } + def irm : MMXI<opc, MRMSrcMem, (outs VR64:$dst), + (ins VR64:$src1, i64mem:$src2), + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + [(set VR64:$dst, (IntId VR64:$src1, + (bitconvert (load_mmx addr:$src2))))], + itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>; + } + + multiclass MMXI_binop_rmi_int<bits<8> opc, bits<8> opc2, Format ImmForm, + string OpcodeStr, Intrinsic IntId, + Intrinsic IntId2, ShiftOpndItins itins> { + def rr : MMXI<opc, MRMSrcReg, (outs VR64:$dst), + (ins VR64:$src1, VR64:$src2), + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2))], itins.rr>, + Sched<[WriteVecShift]>; + def rm : MMXI<opc, MRMSrcMem, (outs VR64:$dst), + (ins VR64:$src1, i64mem:$src2), + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + [(set VR64:$dst, (IntId VR64:$src1, + (bitconvert (load_mmx addr:$src2))))], + itins.rm>, Sched<[WriteVecShiftLd, ReadAfterLd]>; + def ri : MMXIi8<opc2, ImmForm, (outs VR64:$dst), + (ins VR64:$src1, i32u8imm:$src2), + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + [(set VR64:$dst, (IntId2 VR64:$src1, imm:$src2))], itins.ri>, + Sched<[WriteVecShift]>; + } +} + +/// Unary MMX instructions requiring SSSE3. 
+multiclass SS3I_unop_rm_int_mm<bits<8> opc, string OpcodeStr, + Intrinsic IntId64, OpndItins itins> { + def rr64 : MMXSS38I<opc, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR64:$dst, (IntId64 VR64:$src))], itins.rr>, + Sched<[itins.Sched]>; + + def rm64 : MMXSS38I<opc, MRMSrcMem, (outs VR64:$dst), (ins i64mem:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR64:$dst, + (IntId64 (bitconvert (memopmmx addr:$src))))], + itins.rm>, Sched<[itins.Sched.Folded]>; +} + +/// Binary MMX instructions requiring SSSE3. +let ImmT = NoImm, Constraints = "$src1 = $dst" in { +multiclass SS3I_binop_rm_int_mm<bits<8> opc, string OpcodeStr, + Intrinsic IntId64, OpndItins itins> { + let isCommutable = 0 in + def rr64 : MMXSS38I<opc, MRMSrcReg, (outs VR64:$dst), + (ins VR64:$src1, VR64:$src2), + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + [(set VR64:$dst, (IntId64 VR64:$src1, VR64:$src2))], itins.rr>, + Sched<[itins.Sched]>; + def rm64 : MMXSS38I<opc, MRMSrcMem, (outs VR64:$dst), + (ins VR64:$src1, i64mem:$src2), + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + [(set VR64:$dst, + (IntId64 VR64:$src1, + (bitconvert (memopmmx addr:$src2))))], itins.rm>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; +} +} + +/// PALIGN MMX instructions (require SSSE3). +multiclass ssse3_palign_mm<string asm, Intrinsic IntId> { + def R64irr : MMXSS3AI<0x0F, MRMSrcReg, (outs VR64:$dst), + (ins VR64:$src1, VR64:$src2, u8imm:$src3), + !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2, (i8 imm:$src3)))]>, + Sched<[WriteShuffle]>; + def R64irm : MMXSS3AI<0x0F, MRMSrcMem, (outs VR64:$dst), + (ins VR64:$src1, i64mem:$src2, u8imm:$src3), + !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + [(set VR64:$dst, (IntId VR64:$src1, + (bitconvert (load_mmx addr:$src2)), (i8 imm:$src3)))]>, + Sched<[WriteShuffleLd, ReadAfterLd]>; +} + +multiclass sse12_cvt_pint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, + Intrinsic Int, X86MemOperand x86memop, PatFrag ld_frag, + string asm, OpndItins itins, Domain d> { + def irr : MMXPI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm, + [(set DstRC:$dst, (Int SrcRC:$src))], itins.rr, d>, + Sched<[itins.Sched]>; + def irm : MMXPI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm, + [(set DstRC:$dst, (Int (ld_frag addr:$src)))], itins.rm, d>, + Sched<[itins.Sched.Folded]>; +} + +multiclass sse12_cvt_pint_3addr<bits<8> opc, RegisterClass SrcRC, + RegisterClass DstRC, Intrinsic Int, X86MemOperand x86memop, + PatFrag ld_frag, string asm, Domain d> { + def irr : MMXPI<opc, MRMSrcReg, (outs DstRC:$dst), + (ins DstRC:$src1, SrcRC:$src2), asm, + [(set DstRC:$dst, (Int DstRC:$src1, SrcRC:$src2))], + NoItinerary, d>, Sched<[WriteCvtI2F]>; + def irm : MMXPI<opc, MRMSrcMem, (outs DstRC:$dst), + (ins DstRC:$src1, x86memop:$src2), asm, + [(set DstRC:$dst, (Int DstRC:$src1, (ld_frag addr:$src2)))], + NoItinerary, d>, Sched<[WriteCvtI2FLd]>; +} + +//===----------------------------------------------------------------------===// +// MMX EMMS Instruction +//===----------------------------------------------------------------------===// + +def MMX_EMMS : MMXI<0x77, RawFrm, (outs), (ins), "emms", + [(int_x86_mmx_emms)], IIC_MMX_EMMS>; + +//===----------------------------------------------------------------------===// +// MMX Scalar Instructions +//===----------------------------------------------------------------------===// + +// 
Data Transfer Instructions +def MMX_MOVD64rr : MMXI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR32:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set VR64:$dst, + (x86mmx (scalar_to_vector GR32:$src)))], + IIC_MMX_MOV_MM_RM>, Sched<[WriteMove]>; +def MMX_MOVD64rm : MMXI<0x6E, MRMSrcMem, (outs VR64:$dst), (ins i32mem:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set VR64:$dst, + (x86mmx (scalar_to_vector (loadi32 addr:$src))))], + IIC_MMX_MOV_MM_RM>, Sched<[WriteLoad]>; + +let Predicates = [HasMMX] in { + let AddedComplexity = 15 in + def : Pat<(x86mmx (MMX_X86movw2d GR32:$src)), + (MMX_MOVD64rr GR32:$src)>; + let AddedComplexity = 20 in + def : Pat<(x86mmx (MMX_X86movw2d (loadi32 addr:$src))), + (MMX_MOVD64rm addr:$src)>; +} + +let mayStore = 1 in +def MMX_MOVD64mr : MMXI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR64:$src), + "movd\t{$src, $dst|$dst, $src}", [], IIC_MMX_MOV_MM_RM>, + Sched<[WriteStore]>; + +def MMX_MOVD64grr : MMXI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR64:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, + (MMX_X86movd2w (x86mmx VR64:$src)))], + IIC_MMX_MOV_REG_MM>, Sched<[WriteMove]>; + +let isBitcast = 1 in +def MMX_MOVD64to64rr : MMXRI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR64:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set VR64:$dst, (bitconvert GR64:$src))], + IIC_MMX_MOV_MM_RM>, Sched<[WriteMove]>; + +let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in +def MMX_MOVD64to64rm : MMXRI<0x6E, MRMSrcMem, (outs VR64:$dst), + (ins i64mem:$src), "movd\t{$src, $dst|$dst, $src}", + [], IIC_MMX_MOVQ_RM>, Sched<[WriteLoad]>; + +// These are 64 bit moves, but since the OS X assembler doesn't +// recognize a register-register movq, we write them as +// movd. +let SchedRW = [WriteMove], isBitcast = 1 in { +def MMX_MOVD64from64rr : MMXRI<0x7E, MRMDestReg, + (outs GR64:$dst), (ins VR64:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, + (bitconvert VR64:$src))], IIC_MMX_MOV_REG_MM>; +let hasSideEffects = 0 in +def MMX_MOVQ64rr : MMXI<0x6F, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src), + "movq\t{$src, $dst|$dst, $src}", [], + IIC_MMX_MOVQ_RR>; +let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in { +def MMX_MOVQ64rr_REV : MMXI<0x7F, MRMDestReg, (outs VR64:$dst), (ins VR64:$src), + "movq\t{$src, $dst|$dst, $src}", [], + IIC_MMX_MOVQ_RR>; +} +} // SchedRW + +let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in +def MMX_MOVD64from64rm : MMXRI<0x7E, MRMDestMem, + (outs i64mem:$dst), (ins VR64:$src), + "movd\t{$src, $dst|$dst, $src}", + [], IIC_MMX_MOV_REG_MM>, Sched<[WriteStore]>; + +let SchedRW = [WriteLoad] in { +let canFoldAsLoad = 1 in +def MMX_MOVQ64rm : MMXI<0x6F, MRMSrcMem, (outs VR64:$dst), (ins i64mem:$src), + "movq\t{$src, $dst|$dst, $src}", + [(set VR64:$dst, (load_mmx addr:$src))], + IIC_MMX_MOVQ_RM>; +} // SchedRW +let SchedRW = [WriteStore] in +def MMX_MOVQ64mr : MMXI<0x7F, MRMDestMem, (outs), (ins i64mem:$dst, VR64:$src), + "movq\t{$src, $dst|$dst, $src}", + [(store (x86mmx VR64:$src), addr:$dst)], + IIC_MMX_MOVQ_RM>; + +let SchedRW = [WriteMove] in { +def MMX_MOVDQ2Qrr : MMXSDIi8<0xD6, MRMSrcReg, (outs VR64:$dst), + (ins VR128:$src), "movdq2q\t{$src, $dst|$dst, $src}", + [(set VR64:$dst, + (x86mmx (bitconvert + (i64 (extractelt (v2i64 VR128:$src), + (iPTR 0))))))], + IIC_MMX_MOVQ_RR>; + +def MMX_MOVQ2DQrr : MMXS2SIi8<0xD6, MRMSrcReg, (outs VR128:$dst), + (ins VR64:$src), "movq2dq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v2i64 + (scalar_to_vector + (i64 (bitconvert 
(x86mmx VR64:$src))))))], + IIC_MMX_MOVQ_RR>; + +let isCodeGenOnly = 1, hasSideEffects = 1 in { +def MMX_MOVQ2FR64rr: MMXS2SIi8<0xD6, MRMSrcReg, (outs FR64:$dst), + (ins VR64:$src), "movq2dq\t{$src, $dst|$dst, $src}", + [], IIC_MMX_MOVQ_RR>; + +def MMX_MOVFR642Qrr: MMXSDIi8<0xD6, MRMSrcReg, (outs VR64:$dst), + (ins FR64:$src), "movdq2q\t{$src, $dst|$dst, $src}", + [], IIC_MMX_MOVQ_RR>; +} +} // SchedRW + +let Predicates = [HasSSE1] in +def MMX_MOVNTQmr : MMXI<0xE7, MRMDestMem, (outs), (ins i64mem:$dst, VR64:$src), + "movntq\t{$src, $dst|$dst, $src}", + [(int_x86_mmx_movnt_dq addr:$dst, VR64:$src)], + IIC_MMX_MOVQ_RM>, Sched<[WriteStore]>; + +let Predicates = [HasMMX] in { + let AddedComplexity = 15 in + // movd to MMX register zero-extends + def : Pat<(x86mmx (X86vzmovl (x86mmx (scalar_to_vector GR32:$src)))), + (MMX_MOVD64rr GR32:$src)>; + let AddedComplexity = 20 in + def : Pat<(x86mmx (X86vzmovl (x86mmx (scalar_to_vector (loadi32 addr:$src))))), + (MMX_MOVD64rm addr:$src)>; +} + +// Arithmetic Instructions +defm MMX_PABSB : SS3I_unop_rm_int_mm<0x1C, "pabsb", int_x86_ssse3_pabs_b, + MMX_INTALU_ITINS>; +defm MMX_PABSW : SS3I_unop_rm_int_mm<0x1D, "pabsw", int_x86_ssse3_pabs_w, + MMX_INTALU_ITINS>; +defm MMX_PABSD : SS3I_unop_rm_int_mm<0x1E, "pabsd", int_x86_ssse3_pabs_d, + MMX_INTALU_ITINS>; +// -- Addition +defm MMX_PADDB : MMXI_binop_rm_int<0xFC, "paddb", int_x86_mmx_padd_b, + MMX_INTALU_ITINS, 1>; +defm MMX_PADDW : MMXI_binop_rm_int<0xFD, "paddw", int_x86_mmx_padd_w, + MMX_INTALU_ITINS, 1>; +defm MMX_PADDD : MMXI_binop_rm_int<0xFE, "paddd", int_x86_mmx_padd_d, + MMX_INTALU_ITINS, 1>; +let Predicates = [HasSSE2] in +defm MMX_PADDQ : MMXI_binop_rm_int<0xD4, "paddq", int_x86_mmx_padd_q, + MMX_INTALUQ_ITINS, 1>; +defm MMX_PADDSB : MMXI_binop_rm_int<0xEC, "paddsb" , int_x86_mmx_padds_b, + MMX_INTALU_ITINS, 1>; +defm MMX_PADDSW : MMXI_binop_rm_int<0xED, "paddsw" , int_x86_mmx_padds_w, + MMX_INTALU_ITINS, 1>; + +defm MMX_PADDUSB : MMXI_binop_rm_int<0xDC, "paddusb", int_x86_mmx_paddus_b, + MMX_INTALU_ITINS, 1>; +defm MMX_PADDUSW : MMXI_binop_rm_int<0xDD, "paddusw", int_x86_mmx_paddus_w, + MMX_INTALU_ITINS, 1>; + +defm MMX_PHADDW : SS3I_binop_rm_int_mm<0x01, "phaddw", int_x86_ssse3_phadd_w, + MMX_PHADDSUBW>; +defm MMX_PHADD : SS3I_binop_rm_int_mm<0x02, "phaddd", int_x86_ssse3_phadd_d, + MMX_PHADDSUBD>; +defm MMX_PHADDSW : SS3I_binop_rm_int_mm<0x03, "phaddsw",int_x86_ssse3_phadd_sw, + MMX_PHADDSUBW>; + + +// -- Subtraction +defm MMX_PSUBB : MMXI_binop_rm_int<0xF8, "psubb", int_x86_mmx_psub_b, + MMX_INTALU_ITINS>; +defm MMX_PSUBW : MMXI_binop_rm_int<0xF9, "psubw", int_x86_mmx_psub_w, + MMX_INTALU_ITINS>; +defm MMX_PSUBD : MMXI_binop_rm_int<0xFA, "psubd", int_x86_mmx_psub_d, + MMX_INTALU_ITINS>; +let Predicates = [HasSSE2] in +defm MMX_PSUBQ : MMXI_binop_rm_int<0xFB, "psubq", int_x86_mmx_psub_q, + MMX_INTALUQ_ITINS>; + +defm MMX_PSUBSB : MMXI_binop_rm_int<0xE8, "psubsb" , int_x86_mmx_psubs_b, + MMX_INTALU_ITINS>; +defm MMX_PSUBSW : MMXI_binop_rm_int<0xE9, "psubsw" , int_x86_mmx_psubs_w, + MMX_INTALU_ITINS>; + +defm MMX_PSUBUSB : MMXI_binop_rm_int<0xD8, "psubusb", int_x86_mmx_psubus_b, + MMX_INTALU_ITINS>; +defm MMX_PSUBUSW : MMXI_binop_rm_int<0xD9, "psubusw", int_x86_mmx_psubus_w, + MMX_INTALU_ITINS>; + +defm MMX_PHSUBW : SS3I_binop_rm_int_mm<0x05, "phsubw", int_x86_ssse3_phsub_w, + MMX_PHADDSUBW>; +defm MMX_PHSUBD : SS3I_binop_rm_int_mm<0x06, "phsubd", int_x86_ssse3_phsub_d, + MMX_PHADDSUBD>; +defm MMX_PHSUBSW : SS3I_binop_rm_int_mm<0x07, "phsubsw",int_x86_ssse3_phsub_sw, + MMX_PHADDSUBW>; + +// 
-- Multiplication +defm MMX_PMULLW : MMXI_binop_rm_int<0xD5, "pmullw", int_x86_mmx_pmull_w, + MMX_PMUL_ITINS, 1>; + +defm MMX_PMULHW : MMXI_binop_rm_int<0xE5, "pmulhw", int_x86_mmx_pmulh_w, + MMX_PMUL_ITINS, 1>; +let Predicates = [HasSSE1] in +defm MMX_PMULHUW : MMXI_binop_rm_int<0xE4, "pmulhuw", int_x86_mmx_pmulhu_w, + MMX_PMUL_ITINS, 1>; +let Predicates = [HasSSE2] in +defm MMX_PMULUDQ : MMXI_binop_rm_int<0xF4, "pmuludq", int_x86_mmx_pmulu_dq, + MMX_PMUL_ITINS, 1>; +let isCommutable = 1 in +defm MMX_PMULHRSW : SS3I_binop_rm_int_mm<0x0B, "pmulhrsw", + int_x86_ssse3_pmul_hr_sw, MMX_PMUL_ITINS>; + +// -- Miscellanea +defm MMX_PMADDWD : MMXI_binop_rm_int<0xF5, "pmaddwd", int_x86_mmx_pmadd_wd, + MMX_PMUL_ITINS, 1>; + +defm MMX_PMADDUBSW : SS3I_binop_rm_int_mm<0x04, "pmaddubsw", + int_x86_ssse3_pmadd_ub_sw, MMX_PMUL_ITINS>; +let Predicates = [HasSSE1] in { +defm MMX_PAVGB : MMXI_binop_rm_int<0xE0, "pavgb", int_x86_mmx_pavg_b, + MMX_MISC_FUNC_ITINS, 1>; +defm MMX_PAVGW : MMXI_binop_rm_int<0xE3, "pavgw", int_x86_mmx_pavg_w, + MMX_MISC_FUNC_ITINS, 1>; + +defm MMX_PMINUB : MMXI_binop_rm_int<0xDA, "pminub", int_x86_mmx_pminu_b, + MMX_MISC_FUNC_ITINS, 1>; +defm MMX_PMINSW : MMXI_binop_rm_int<0xEA, "pminsw", int_x86_mmx_pmins_w, + MMX_MISC_FUNC_ITINS, 1>; + +defm MMX_PMAXUB : MMXI_binop_rm_int<0xDE, "pmaxub", int_x86_mmx_pmaxu_b, + MMX_MISC_FUNC_ITINS, 1>; +defm MMX_PMAXSW : MMXI_binop_rm_int<0xEE, "pmaxsw", int_x86_mmx_pmaxs_w, + MMX_MISC_FUNC_ITINS, 1>; + +defm MMX_PSADBW : MMXI_binop_rm_int<0xF6, "psadbw", int_x86_mmx_psad_bw, + MMX_PSADBW_ITINS, 1>; +} + +defm MMX_PSIGNB : SS3I_binop_rm_int_mm<0x08, "psignb", int_x86_ssse3_psign_b, + MMX_MISC_FUNC_ITINS>; +defm MMX_PSIGNW : SS3I_binop_rm_int_mm<0x09, "psignw", int_x86_ssse3_psign_w, + MMX_MISC_FUNC_ITINS>; +defm MMX_PSIGND : SS3I_binop_rm_int_mm<0x0A, "psignd", int_x86_ssse3_psign_d, + MMX_MISC_FUNC_ITINS>; +let Constraints = "$src1 = $dst" in + defm MMX_PALIGN : ssse3_palign_mm<"palignr", int_x86_mmx_palignr_b>; + +// Logical Instructions +defm MMX_PAND : MMXI_binop_rm_int<0xDB, "pand", int_x86_mmx_pand, + MMX_INTALU_ITINS_VECLOGICSCHED, 1>; +defm MMX_POR : MMXI_binop_rm_int<0xEB, "por" , int_x86_mmx_por, + MMX_INTALU_ITINS_VECLOGICSCHED, 1>; +defm MMX_PXOR : MMXI_binop_rm_int<0xEF, "pxor", int_x86_mmx_pxor, + MMX_INTALU_ITINS_VECLOGICSCHED, 1>; +defm MMX_PANDN : MMXI_binop_rm_int<0xDF, "pandn", int_x86_mmx_pandn, + MMX_INTALU_ITINS_VECLOGICSCHED>; + +// Shift Instructions +defm MMX_PSRLW : MMXI_binop_rmi_int<0xD1, 0x71, MRM2r, "psrlw", + int_x86_mmx_psrl_w, int_x86_mmx_psrli_w, + MMX_SHIFT_ITINS>; +defm MMX_PSRLD : MMXI_binop_rmi_int<0xD2, 0x72, MRM2r, "psrld", + int_x86_mmx_psrl_d, int_x86_mmx_psrli_d, + MMX_SHIFT_ITINS>; +defm MMX_PSRLQ : MMXI_binop_rmi_int<0xD3, 0x73, MRM2r, "psrlq", + int_x86_mmx_psrl_q, int_x86_mmx_psrli_q, + MMX_SHIFT_ITINS>; + +def : Pat<(int_x86_mmx_psrl_w VR64:$src1, (load_mvmmx addr:$src2)), + (MMX_PSRLWrm VR64:$src1, addr:$src2)>; +def : Pat<(int_x86_mmx_psrl_d VR64:$src1, (load_mvmmx addr:$src2)), + (MMX_PSRLDrm VR64:$src1, addr:$src2)>; +def : Pat<(int_x86_mmx_psrl_q VR64:$src1, (load_mvmmx addr:$src2)), + (MMX_PSRLQrm VR64:$src1, addr:$src2)>; + +defm MMX_PSLLW : MMXI_binop_rmi_int<0xF1, 0x71, MRM6r, "psllw", + int_x86_mmx_psll_w, int_x86_mmx_pslli_w, + MMX_SHIFT_ITINS>; +defm MMX_PSLLD : MMXI_binop_rmi_int<0xF2, 0x72, MRM6r, "pslld", + int_x86_mmx_psll_d, int_x86_mmx_pslli_d, + MMX_SHIFT_ITINS>; +defm MMX_PSLLQ : MMXI_binop_rmi_int<0xF3, 0x73, MRM6r, "psllq", + int_x86_mmx_psll_q, int_x86_mmx_pslli_q, + 
MMX_SHIFT_ITINS>; + +def : Pat<(int_x86_mmx_psll_w VR64:$src1, (load_mvmmx addr:$src2)), + (MMX_PSLLWrm VR64:$src1, addr:$src2)>; +def : Pat<(int_x86_mmx_psll_d VR64:$src1, (load_mvmmx addr:$src2)), + (MMX_PSLLDrm VR64:$src1, addr:$src2)>; +def : Pat<(int_x86_mmx_psll_q VR64:$src1, (load_mvmmx addr:$src2)), + (MMX_PSLLQrm VR64:$src1, addr:$src2)>; + +defm MMX_PSRAW : MMXI_binop_rmi_int<0xE1, 0x71, MRM4r, "psraw", + int_x86_mmx_psra_w, int_x86_mmx_psrai_w, + MMX_SHIFT_ITINS>; +defm MMX_PSRAD : MMXI_binop_rmi_int<0xE2, 0x72, MRM4r, "psrad", + int_x86_mmx_psra_d, int_x86_mmx_psrai_d, + MMX_SHIFT_ITINS>; + +def : Pat<(int_x86_mmx_psra_w VR64:$src1, (load_mvmmx addr:$src2)), + (MMX_PSRAWrm VR64:$src1, addr:$src2)>; +def : Pat<(int_x86_mmx_psra_d VR64:$src1, (load_mvmmx addr:$src2)), + (MMX_PSRADrm VR64:$src1, addr:$src2)>; + +// Comparison Instructions +defm MMX_PCMPEQB : MMXI_binop_rm_int<0x74, "pcmpeqb", int_x86_mmx_pcmpeq_b, + MMX_INTALU_ITINS>; +defm MMX_PCMPEQW : MMXI_binop_rm_int<0x75, "pcmpeqw", int_x86_mmx_pcmpeq_w, + MMX_INTALU_ITINS>; +defm MMX_PCMPEQD : MMXI_binop_rm_int<0x76, "pcmpeqd", int_x86_mmx_pcmpeq_d, + MMX_INTALU_ITINS>; + +defm MMX_PCMPGTB : MMXI_binop_rm_int<0x64, "pcmpgtb", int_x86_mmx_pcmpgt_b, + MMX_INTALU_ITINS>; +defm MMX_PCMPGTW : MMXI_binop_rm_int<0x65, "pcmpgtw", int_x86_mmx_pcmpgt_w, + MMX_INTALU_ITINS>; +defm MMX_PCMPGTD : MMXI_binop_rm_int<0x66, "pcmpgtd", int_x86_mmx_pcmpgt_d, + MMX_INTALU_ITINS>; + +// -- Unpack Instructions +defm MMX_PUNPCKHBW : MMXI_binop_rm_int<0x68, "punpckhbw", + int_x86_mmx_punpckhbw, + MMX_UNPCK_H_ITINS>; +defm MMX_PUNPCKHWD : MMXI_binop_rm_int<0x69, "punpckhwd", + int_x86_mmx_punpckhwd, + MMX_UNPCK_H_ITINS>; +defm MMX_PUNPCKHDQ : MMXI_binop_rm_int<0x6A, "punpckhdq", + int_x86_mmx_punpckhdq, + MMX_UNPCK_H_ITINS>; +defm MMX_PUNPCKLBW : MMXI_binop_rm_int<0x60, "punpcklbw", + int_x86_mmx_punpcklbw, + MMX_UNPCK_L_ITINS>; +defm MMX_PUNPCKLWD : MMXI_binop_rm_int<0x61, "punpcklwd", + int_x86_mmx_punpcklwd, + MMX_UNPCK_L_ITINS>; +defm MMX_PUNPCKLDQ : MMXI_binop_rm_int<0x62, "punpckldq", + int_x86_mmx_punpckldq, + MMX_UNPCK_L_ITINS>; + +// -- Pack Instructions +defm MMX_PACKSSWB : MMXI_binop_rm_int<0x63, "packsswb", int_x86_mmx_packsswb, + MMX_PCK_ITINS>; +defm MMX_PACKSSDW : MMXI_binop_rm_int<0x6B, "packssdw", int_x86_mmx_packssdw, + MMX_PCK_ITINS>; +defm MMX_PACKUSWB : MMXI_binop_rm_int<0x67, "packuswb", int_x86_mmx_packuswb, + MMX_PCK_ITINS>; + +// -- Shuffle Instructions +defm MMX_PSHUFB : SS3I_binop_rm_int_mm<0x00, "pshufb", int_x86_ssse3_pshuf_b, + MMX_PSHUF_ITINS>; + +def MMX_PSHUFWri : MMXIi8<0x70, MRMSrcReg, + (outs VR64:$dst), (ins VR64:$src1, u8imm:$src2), + "pshufw\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR64:$dst, + (int_x86_sse_pshuf_w VR64:$src1, imm:$src2))], + IIC_MMX_PSHUF>, Sched<[WriteShuffle]>; +def MMX_PSHUFWmi : MMXIi8<0x70, MRMSrcMem, + (outs VR64:$dst), (ins i64mem:$src1, u8imm:$src2), + "pshufw\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR64:$dst, + (int_x86_sse_pshuf_w (load_mmx addr:$src1), + imm:$src2))], + IIC_MMX_PSHUF>, Sched<[WriteShuffleLd]>; + + + + +// -- Conversion Instructions +defm MMX_CVTPS2PI : sse12_cvt_pint<0x2D, VR128, VR64, int_x86_sse_cvtps2pi, + f64mem, load, "cvtps2pi\t{$src, $dst|$dst, $src}", + MMX_CVT_PS_ITINS, SSEPackedSingle>, PS; +defm MMX_CVTPD2PI : sse12_cvt_pint<0x2D, VR128, VR64, int_x86_sse_cvtpd2pi, + f128mem, memop, "cvtpd2pi\t{$src, $dst|$dst, $src}", + MMX_CVT_PD_ITINS, SSEPackedDouble>, PD; +defm MMX_CVTTPS2PI : sse12_cvt_pint<0x2C, VR128, VR64, 
int_x86_sse_cvttps2pi, + f64mem, load, "cvttps2pi\t{$src, $dst|$dst, $src}", + MMX_CVT_PS_ITINS, SSEPackedSingle>, PS; +defm MMX_CVTTPD2PI : sse12_cvt_pint<0x2C, VR128, VR64, int_x86_sse_cvttpd2pi, + f128mem, memop, "cvttpd2pi\t{$src, $dst|$dst, $src}", + MMX_CVT_PD_ITINS, SSEPackedDouble>, PD; +defm MMX_CVTPI2PD : sse12_cvt_pint<0x2A, VR64, VR128, int_x86_sse_cvtpi2pd, + i64mem, load, "cvtpi2pd\t{$src, $dst|$dst, $src}", + MMX_CVT_PD_ITINS, SSEPackedDouble>, PD; +let Constraints = "$src1 = $dst" in { + defm MMX_CVTPI2PS : sse12_cvt_pint_3addr<0x2A, VR64, VR128, + int_x86_sse_cvtpi2ps, + i64mem, load, "cvtpi2ps\t{$src2, $dst|$dst, $src2}", + SSEPackedSingle>, PS; +} + +// Extract / Insert +let Predicates = [HasSSE1] in +def MMX_PEXTRWirri: MMXIi8<0xC5, MRMSrcReg, + (outs GR32orGR64:$dst), (ins VR64:$src1, i32u8imm:$src2), + "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR32orGR64:$dst, (int_x86_mmx_pextr_w VR64:$src1, + imm:$src2))], + IIC_MMX_PEXTR>, Sched<[WriteShuffle]>; +let Constraints = "$src1 = $dst" in { +let Predicates = [HasSSE1] in { + def MMX_PINSRWirri : MMXIi8<0xC4, MRMSrcReg, + (outs VR64:$dst), + (ins VR64:$src1, GR32orGR64:$src2, i32u8imm:$src3), + "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}", + [(set VR64:$dst, (int_x86_mmx_pinsr_w VR64:$src1, + GR32orGR64:$src2, imm:$src3))], + IIC_MMX_PINSRW>, Sched<[WriteShuffle]>; + + def MMX_PINSRWirmi : MMXIi8<0xC4, MRMSrcMem, + (outs VR64:$dst), + (ins VR64:$src1, i16mem:$src2, i32u8imm:$src3), + "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}", + [(set VR64:$dst, (int_x86_mmx_pinsr_w VR64:$src1, + (i32 (anyext (loadi16 addr:$src2))), + imm:$src3))], + IIC_MMX_PINSRW>, Sched<[WriteShuffleLd, ReadAfterLd]>; +} +} + +// Mask creation +let Predicates = [HasSSE1] in +def MMX_PMOVMSKBrr : MMXI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), + (ins VR64:$src), + "pmovmskb\t{$src, $dst|$dst, $src}", + [(set GR32orGR64:$dst, + (int_x86_mmx_pmovmskb VR64:$src))]>; + + +// Low word of XMM to MMX. +def MMX_X86movdq2q : SDNode<"X86ISD::MOVDQ2Q", SDTypeProfile<1, 1, + [SDTCisVT<0, x86mmx>, SDTCisVT<1, v2i64>]>>; + +def : Pat<(x86mmx (MMX_X86movdq2q VR128:$src)), + (x86mmx (MMX_MOVDQ2Qrr VR128:$src))>; + +def : Pat<(x86mmx (MMX_X86movdq2q (loadv2i64 addr:$src))), + (x86mmx (MMX_MOVQ64rm addr:$src))>; + +// Misc. +let SchedRW = [WriteShuffle] in { +let Uses = [EDI], Predicates = [HasSSE1,Not64BitMode] in +def MMX_MASKMOVQ : MMXI32<0xF7, MRMSrcReg, (outs), (ins VR64:$src, VR64:$mask), + "maskmovq\t{$mask, $src|$src, $mask}", + [(int_x86_mmx_maskmovq VR64:$src, VR64:$mask, EDI)], + IIC_MMX_MASKMOV>; +let Uses = [RDI], Predicates = [HasSSE1,In64BitMode] in +def MMX_MASKMOVQ64: MMXI64<0xF7, MRMSrcReg, (outs), (ins VR64:$src, VR64:$mask), + "maskmovq\t{$mask, $src|$src, $mask}", + [(int_x86_mmx_maskmovq VR64:$src, VR64:$mask, RDI)], + IIC_MMX_MASKMOV>; +} + +// 64-bit bit convert. +let Predicates = [HasSSE2] in { +def : Pat<(f64 (bitconvert (x86mmx VR64:$src))), + (MMX_MOVQ2FR64rr VR64:$src)>; +def : Pat<(x86mmx (bitconvert (f64 FR64:$src))), + (MMX_MOVFR642Qrr FR64:$src)>; +} + + diff --git a/contrib/llvm/lib/Target/X86/X86InstrMPX.td b/contrib/llvm/lib/Target/X86/X86InstrMPX.td new file mode 100644 index 0000000..31608cd --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86InstrMPX.td @@ -0,0 +1,70 @@ +//===-- X86InstrMPX.td - MPX Instruction Set ---------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 MPX instruction set, defining the
+// instructions, and properties of the instructions which are needed for code
+// generation, machine code emission, and analysis.
+//
+//===----------------------------------------------------------------------===//
+
+multiclass mpx_bound_make<bits<8> opc, string OpcodeStr> {
+  def 32rm: I<opc, MRMSrcMem, (outs BNDR:$dst), (ins i32mem:$src),
+              OpcodeStr#" \t{$src, $dst|$dst, $src}", []>,
+              Requires<[HasMPX, Not64BitMode]>;
+  def 64rm: RI<opc, MRMSrcMem, (outs BNDR:$dst), (ins i64mem:$src),
+              OpcodeStr#" \t{$src, $dst|$dst, $src}", []>,
+              Requires<[HasMPX, In64BitMode]>;
+}
+
+defm BNDMK : mpx_bound_make<0x1B, "bndmk">, XS;
+
+multiclass mpx_bound_check<bits<8> opc, string OpcodeStr> {
+  def 32rm: I<opc, MRMSrcMem, (outs), (ins BNDR:$src1, i32mem:$src2),
+              OpcodeStr#" \t{$src2, $src1|$src1, $src2}", []>,
+              Requires<[HasMPX, Not64BitMode]>;
+  def 64rm: RI<opc, MRMSrcMem, (outs), (ins BNDR:$src1, i64mem:$src2),
+              OpcodeStr#" \t{$src2, $src1|$src1, $src2}", []>,
+              Requires<[HasMPX, In64BitMode]>;
+  def 32rr: I<opc, MRMSrcReg, (outs), (ins BNDR:$src1, GR32:$src2),
+              OpcodeStr#" \t{$src2, $src1|$src1, $src2}", []>,
+              Requires<[HasMPX, Not64BitMode]>;
+  def 64rr: RI<opc, MRMSrcReg, (outs), (ins BNDR:$src1, GR64:$src2),
+              OpcodeStr#" \t{$src2, $src1|$src1, $src2}", []>,
+              Requires<[HasMPX, In64BitMode]>;
+}
+defm BNDCL : mpx_bound_check<0x1A, "bndcl">, XS;
+defm BNDCU : mpx_bound_check<0x1A, "bndcu">, XD;
+defm BNDCN : mpx_bound_check<0x1B, "bndcn">, XD;
+
+def BNDMOVRMrr : I<0x1A, MRMSrcReg, (outs BNDR:$dst), (ins BNDR:$src),
+                   "bndmov \t{$src, $dst|$dst, $src}", []>, PD,
+                   Requires<[HasMPX]>;
+def BNDMOVRM32rm : I<0x1A, MRMSrcMem, (outs BNDR:$dst), (ins i64mem:$src),
+                     "bndmov \t{$src, $dst|$dst, $src}", []>, PD,
+                     Requires<[HasMPX, Not64BitMode]>;
+def BNDMOVRM64rm : RI<0x1A, MRMSrcMem, (outs BNDR:$dst), (ins i128mem:$src),
+                      "bndmov \t{$src, $dst|$dst, $src}", []>, PD,
+                      Requires<[HasMPX, In64BitMode]>;
+
+def BNDMOVMRrr : I<0x1B, MRMDestReg, (outs BNDR:$dst), (ins BNDR:$src),
+                   "bndmov \t{$src, $dst|$dst, $src}", []>, PD,
+                   Requires<[HasMPX]>;
+def BNDMOVMR32mr : I<0x1B, MRMDestMem, (outs i64mem:$dst), (ins BNDR:$src),
+                     "bndmov \t{$src, $dst|$dst, $src}", []>, PD,
+                     Requires<[HasMPX, Not64BitMode]>;
+def BNDMOVMR64mr : RI<0x1B, MRMDestMem, (outs i128mem:$dst), (ins BNDR:$src),
+                      "bndmov \t{$src, $dst|$dst, $src}", []>, PD,
+                      Requires<[HasMPX, In64BitMode]>;
+
+def BNDSTXmr: I<0x1B, MRMDestMem, (outs), (ins i64mem:$dst, BNDR:$src),
+                "bndstx \t{$src, $dst|$dst, $src}", []>, PS,
+                Requires<[HasMPX]>;
+def BNDLDXrm: I<0x1A, MRMSrcMem, (outs BNDR:$dst), (ins i64mem:$src),
+                "bndldx \t{$src, $dst|$dst, $src}", []>, PS,
+                Requires<[HasMPX]>;
diff --git a/contrib/llvm/lib/Target/X86/X86InstrSGX.td b/contrib/llvm/lib/Target/X86/X86InstrSGX.td
new file mode 100644
index 0000000..84119ad
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrSGX.td
@@ -0,0 +1,24 @@
+//===-- X86InstrSGX.td - SGX Instruction Set Extension -----*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the instructions that make up the Intel SGX instruction
+// set.
+// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// SGX instructions + +// ENCLS - Execute an Enclave System Function of Specified Leaf Number +def ENCLS : I<0x01, MRM_CF, (outs), (ins), + "encls", []>, TB; + +// ENCLU - Execute an Enclave User Function of Specified Leaf Number +def ENCLU : I<0x01, MRM_D7, (outs), (ins), + "enclu", []>, TB; diff --git a/contrib/llvm/lib/Target/X86/X86InstrSSE.td b/contrib/llvm/lib/Target/X86/X86InstrSSE.td new file mode 100644 index 0000000..624b931 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86InstrSSE.td @@ -0,0 +1,8944 @@ +//===-- X86InstrSSE.td - SSE Instruction Set ---------------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the X86 SSE instruction set, defining the instructions, +// and properties of the instructions which are needed for code generation, +// machine code emission, and analysis. +// +//===----------------------------------------------------------------------===// + +class OpndItins<InstrItinClass arg_rr, InstrItinClass arg_rm> { + InstrItinClass rr = arg_rr; + InstrItinClass rm = arg_rm; + // InstrSchedModel info. + X86FoldableSchedWrite Sched = WriteFAdd; +} + +class SizeItins<OpndItins arg_s, OpndItins arg_d> { + OpndItins s = arg_s; + OpndItins d = arg_d; +} + + +class ShiftOpndItins<InstrItinClass arg_rr, InstrItinClass arg_rm, + InstrItinClass arg_ri> { + InstrItinClass rr = arg_rr; + InstrItinClass rm = arg_rm; + InstrItinClass ri = arg_ri; +} + + +// scalar +let Sched = WriteFAdd in { +def SSE_ALU_F32S : OpndItins< + IIC_SSE_ALU_F32S_RR, IIC_SSE_ALU_F32S_RM +>; + +def SSE_ALU_F64S : OpndItins< + IIC_SSE_ALU_F64S_RR, IIC_SSE_ALU_F64S_RM +>; +} + +def SSE_ALU_ITINS_S : SizeItins< + SSE_ALU_F32S, SSE_ALU_F64S +>; + +let Sched = WriteFMul in { +def SSE_MUL_F32S : OpndItins< + IIC_SSE_MUL_F32S_RR, IIC_SSE_MUL_F64S_RM +>; + +def SSE_MUL_F64S : OpndItins< + IIC_SSE_MUL_F64S_RR, IIC_SSE_MUL_F64S_RM +>; +} + +def SSE_MUL_ITINS_S : SizeItins< + SSE_MUL_F32S, SSE_MUL_F64S +>; + +let Sched = WriteFDiv in { +def SSE_DIV_F32S : OpndItins< + IIC_SSE_DIV_F32S_RR, IIC_SSE_DIV_F64S_RM +>; + +def SSE_DIV_F64S : OpndItins< + IIC_SSE_DIV_F64S_RR, IIC_SSE_DIV_F64S_RM +>; +} + +def SSE_DIV_ITINS_S : SizeItins< + SSE_DIV_F32S, SSE_DIV_F64S +>; + +// parallel +let Sched = WriteFAdd in { +def SSE_ALU_F32P : OpndItins< + IIC_SSE_ALU_F32P_RR, IIC_SSE_ALU_F32P_RM +>; + +def SSE_ALU_F64P : OpndItins< + IIC_SSE_ALU_F64P_RR, IIC_SSE_ALU_F64P_RM +>; +} + +def SSE_ALU_ITINS_P : SizeItins< + SSE_ALU_F32P, SSE_ALU_F64P +>; + +let Sched = WriteFMul in { +def SSE_MUL_F32P : OpndItins< + IIC_SSE_MUL_F32P_RR, IIC_SSE_MUL_F64P_RM +>; + +def SSE_MUL_F64P : OpndItins< + IIC_SSE_MUL_F64P_RR, IIC_SSE_MUL_F64P_RM +>; +} + +def SSE_MUL_ITINS_P : SizeItins< + SSE_MUL_F32P, SSE_MUL_F64P +>; + +let Sched = WriteFDiv in { +def SSE_DIV_F32P : OpndItins< + IIC_SSE_DIV_F32P_RR, IIC_SSE_DIV_F64P_RM +>; + +def SSE_DIV_F64P : OpndItins< + IIC_SSE_DIV_F64P_RR, IIC_SSE_DIV_F64P_RM +>; +} + +def SSE_DIV_ITINS_P : SizeItins< + SSE_DIV_F32P, SSE_DIV_F64P +>; + +let Sched = WriteVecLogic in +def SSE_VEC_BIT_ITINS_P : OpndItins< + IIC_SSE_BIT_P_RR, IIC_SSE_BIT_P_RM +>; + +def SSE_BIT_ITINS_P : OpndItins< + 
IIC_SSE_BIT_P_RR, IIC_SSE_BIT_P_RM +>; + +let Sched = WriteVecALU in { +def SSE_INTALU_ITINS_P : OpndItins< + IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM +>; + +def SSE_INTALUQ_ITINS_P : OpndItins< + IIC_SSE_INTALUQ_P_RR, IIC_SSE_INTALUQ_P_RM +>; +} + +let Sched = WriteVecIMul in +def SSE_INTMUL_ITINS_P : OpndItins< + IIC_SSE_INTMUL_P_RR, IIC_SSE_INTMUL_P_RM +>; + +def SSE_INTSHIFT_ITINS_P : ShiftOpndItins< + IIC_SSE_INTSH_P_RR, IIC_SSE_INTSH_P_RM, IIC_SSE_INTSH_P_RI +>; + +def SSE_MOVA_ITINS : OpndItins< + IIC_SSE_MOVA_P_RR, IIC_SSE_MOVA_P_RM +>; + +def SSE_MOVU_ITINS : OpndItins< + IIC_SSE_MOVU_P_RR, IIC_SSE_MOVU_P_RM +>; + +def SSE_DPPD_ITINS : OpndItins< + IIC_SSE_DPPD_RR, IIC_SSE_DPPD_RM +>; + +def SSE_DPPS_ITINS : OpndItins< + IIC_SSE_DPPS_RR, IIC_SSE_DPPD_RM +>; + +def DEFAULT_ITINS : OpndItins< + IIC_ALU_NONMEM, IIC_ALU_MEM +>; + +def SSE_EXTRACT_ITINS : OpndItins< + IIC_SSE_EXTRACTPS_RR, IIC_SSE_EXTRACTPS_RM +>; + +def SSE_INSERT_ITINS : OpndItins< + IIC_SSE_INSERTPS_RR, IIC_SSE_INSERTPS_RM +>; + +let Sched = WriteMPSAD in +def SSE_MPSADBW_ITINS : OpndItins< + IIC_SSE_MPSADBW_RR, IIC_SSE_MPSADBW_RM +>; + +let Sched = WriteVecIMul in +def SSE_PMULLD_ITINS : OpndItins< + IIC_SSE_PMULLD_RR, IIC_SSE_PMULLD_RM +>; + +// Definitions for backward compatibility. +// The instructions mapped on these definitions uses a different itinerary +// than the actual scheduling model. +let Sched = WriteShuffle in +def DEFAULT_ITINS_SHUFFLESCHED : OpndItins< + IIC_ALU_NONMEM, IIC_ALU_MEM +>; + +let Sched = WriteVecIMul in +def DEFAULT_ITINS_VECIMULSCHED : OpndItins< + IIC_ALU_NONMEM, IIC_ALU_MEM +>; + +let Sched = WriteShuffle in +def SSE_INTALU_ITINS_SHUFF_P : OpndItins< + IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM +>; + +let Sched = WriteMPSAD in +def DEFAULT_ITINS_MPSADSCHED : OpndItins< + IIC_ALU_NONMEM, IIC_ALU_MEM +>; + +let Sched = WriteFBlend in +def DEFAULT_ITINS_FBLENDSCHED : OpndItins< + IIC_ALU_NONMEM, IIC_ALU_MEM +>; + +let Sched = WriteBlend in +def DEFAULT_ITINS_BLENDSCHED : OpndItins< + IIC_ALU_NONMEM, IIC_ALU_MEM +>; + +let Sched = WriteVarBlend in +def DEFAULT_ITINS_VARBLENDSCHED : OpndItins< + IIC_ALU_NONMEM, IIC_ALU_MEM +>; + +let Sched = WriteFBlend in +def SSE_INTALU_ITINS_FBLEND_P : OpndItins< + IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM +>; + +let Sched = WriteBlend in +def SSE_INTALU_ITINS_BLEND_P : OpndItins< + IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM +>; + +//===----------------------------------------------------------------------===// +// SSE 1 & 2 Instructions Classes +//===----------------------------------------------------------------------===// + +/// sse12_fp_scalar - SSE 1 & 2 scalar instructions class +multiclass sse12_fp_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode, + RegisterClass RC, X86MemOperand x86memop, + Domain d, OpndItins itins, bit Is2Addr = 1> { + let isCommutable = 1 in { + def rr : SI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (OpNode RC:$src1, RC:$src2))], itins.rr, d>, + Sched<[itins.Sched]>; + } + def rm : SI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (OpNode RC:$src1, (load addr:$src2)))], itins.rm, d>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; +} + +/// sse12_fp_scalar_int - SSE 1 & 2 scalar 
instructions intrinsics class +multiclass sse12_fp_scalar_int<bits<8> opc, string OpcodeStr, RegisterClass RC, + string asm, string SSEVer, string FPSizeStr, + Operand memopr, ComplexPattern mem_cpat, + Domain d, OpndItins itins, bit Is2Addr = 1> { +let isCodeGenOnly = 1 in { + def rr_Int : SI_Int<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), + !if(Is2Addr, + !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (!cast<Intrinsic>( + !strconcat("int_x86_sse", SSEVer, "_", OpcodeStr, FPSizeStr)) + RC:$src1, RC:$src2))], itins.rr, d>, + Sched<[itins.Sched]>; + def rm_Int : SI_Int<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, memopr:$src2), + !if(Is2Addr, + !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (!cast<Intrinsic>(!strconcat("int_x86_sse", + SSEVer, "_", OpcodeStr, FPSizeStr)) + RC:$src1, mem_cpat:$src2))], itins.rm, d>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; +} +} + +/// sse12_fp_packed - SSE 1 & 2 packed instructions class +multiclass sse12_fp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode, + RegisterClass RC, ValueType vt, + X86MemOperand x86memop, PatFrag mem_frag, + Domain d, OpndItins itins, bit Is2Addr = 1> { + let isCommutable = 1 in + def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], itins.rr, d>, + Sched<[itins.Sched]>; + let mayLoad = 1 in + def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (OpNode RC:$src1, (mem_frag addr:$src2)))], + itins.rm, d>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; +} + +/// sse12_fp_packed_logical_rm - SSE 1 & 2 packed instructions class +multiclass sse12_fp_packed_logical_rm<bits<8> opc, RegisterClass RC, Domain d, + string OpcodeStr, X86MemOperand x86memop, + list<dag> pat_rr, list<dag> pat_rm, + bit Is2Addr = 1> { + let isCommutable = 1, hasSideEffects = 0 in + def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + pat_rr, NoItinerary, d>, + Sched<[WriteVecLogic]>; + def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + pat_rm, NoItinerary, d>, + Sched<[WriteVecLogicLd, ReadAfterLd]>; +} + +//===----------------------------------------------------------------------===// +// Non-instruction patterns +//===----------------------------------------------------------------------===// + +// A vector extract of the first f32/f64 position is a subregister copy +def : Pat<(f32 (extractelt (v4f32 VR128:$src), (iPTR 0))), + (COPY_TO_REGCLASS (v4f32 VR128:$src), FR32)>; +def : Pat<(f64 (extractelt (v2f64 VR128:$src), (iPTR 0))), + (COPY_TO_REGCLASS (v2f64 VR128:$src), FR64)>; + +// A 128-bit subvector extract from the first 256-bit vector position +// is a subregister copy that needs no instruction. 
+def : Pat<(v4i32 (extract_subvector (v8i32 VR256:$src), (iPTR 0))), + (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm))>; +def : Pat<(v4f32 (extract_subvector (v8f32 VR256:$src), (iPTR 0))), + (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm))>; + +def : Pat<(v2i64 (extract_subvector (v4i64 VR256:$src), (iPTR 0))), + (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm))>; +def : Pat<(v2f64 (extract_subvector (v4f64 VR256:$src), (iPTR 0))), + (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm))>; + +def : Pat<(v8i16 (extract_subvector (v16i16 VR256:$src), (iPTR 0))), + (v8i16 (EXTRACT_SUBREG (v16i16 VR256:$src), sub_xmm))>; +def : Pat<(v16i8 (extract_subvector (v32i8 VR256:$src), (iPTR 0))), + (v16i8 (EXTRACT_SUBREG (v32i8 VR256:$src), sub_xmm))>; + +// A 128-bit subvector insert to the first 256-bit vector position +// is a subregister copy that needs no instruction. +let AddedComplexity = 25 in { // to give priority over vinsertf128rm +def : Pat<(insert_subvector undef, (v2i64 VR128:$src), (iPTR 0)), + (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>; +def : Pat<(insert_subvector undef, (v2f64 VR128:$src), (iPTR 0)), + (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>; +def : Pat<(insert_subvector undef, (v4i32 VR128:$src), (iPTR 0)), + (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>; +def : Pat<(insert_subvector undef, (v4f32 VR128:$src), (iPTR 0)), + (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>; +def : Pat<(insert_subvector undef, (v8i16 VR128:$src), (iPTR 0)), + (INSERT_SUBREG (v16i16 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>; +def : Pat<(insert_subvector undef, (v16i8 VR128:$src), (iPTR 0)), + (INSERT_SUBREG (v32i8 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>; +} + +// Implicitly promote a 32-bit scalar to a vector. +def : Pat<(v4f32 (scalar_to_vector FR32:$src)), + (COPY_TO_REGCLASS FR32:$src, VR128)>; +def : Pat<(v8f32 (scalar_to_vector FR32:$src)), + (COPY_TO_REGCLASS FR32:$src, VR128)>; +// Implicitly promote a 64-bit scalar to a vector. +def : Pat<(v2f64 (scalar_to_vector FR64:$src)), + (COPY_TO_REGCLASS FR64:$src, VR128)>; +def : Pat<(v4f64 (scalar_to_vector FR64:$src)), + (COPY_TO_REGCLASS FR64:$src, VR128)>; + +// Bitcasts between 128-bit vector types. 
Return the original type since +// no instruction is needed for the conversion +let Predicates = [HasSSE2] in { + def : Pat<(v2i64 (bitconvert (v4i32 VR128:$src))), (v2i64 VR128:$src)>; + def : Pat<(v2i64 (bitconvert (v8i16 VR128:$src))), (v2i64 VR128:$src)>; + def : Pat<(v2i64 (bitconvert (v16i8 VR128:$src))), (v2i64 VR128:$src)>; + def : Pat<(v2i64 (bitconvert (v2f64 VR128:$src))), (v2i64 VR128:$src)>; + def : Pat<(v2i64 (bitconvert (v4f32 VR128:$src))), (v2i64 VR128:$src)>; + def : Pat<(v4i32 (bitconvert (v2i64 VR128:$src))), (v4i32 VR128:$src)>; + def : Pat<(v4i32 (bitconvert (v8i16 VR128:$src))), (v4i32 VR128:$src)>; + def : Pat<(v4i32 (bitconvert (v16i8 VR128:$src))), (v4i32 VR128:$src)>; + def : Pat<(v4i32 (bitconvert (v2f64 VR128:$src))), (v4i32 VR128:$src)>; + def : Pat<(v4i32 (bitconvert (v4f32 VR128:$src))), (v4i32 VR128:$src)>; + def : Pat<(v8i16 (bitconvert (v2i64 VR128:$src))), (v8i16 VR128:$src)>; + def : Pat<(v8i16 (bitconvert (v4i32 VR128:$src))), (v8i16 VR128:$src)>; + def : Pat<(v8i16 (bitconvert (v16i8 VR128:$src))), (v8i16 VR128:$src)>; + def : Pat<(v8i16 (bitconvert (v2f64 VR128:$src))), (v8i16 VR128:$src)>; + def : Pat<(v8i16 (bitconvert (v4f32 VR128:$src))), (v8i16 VR128:$src)>; + def : Pat<(v16i8 (bitconvert (v2i64 VR128:$src))), (v16i8 VR128:$src)>; + def : Pat<(v16i8 (bitconvert (v4i32 VR128:$src))), (v16i8 VR128:$src)>; + def : Pat<(v16i8 (bitconvert (v8i16 VR128:$src))), (v16i8 VR128:$src)>; + def : Pat<(v16i8 (bitconvert (v2f64 VR128:$src))), (v16i8 VR128:$src)>; + def : Pat<(v16i8 (bitconvert (v4f32 VR128:$src))), (v16i8 VR128:$src)>; + def : Pat<(v4f32 (bitconvert (v2i64 VR128:$src))), (v4f32 VR128:$src)>; + def : Pat<(v4f32 (bitconvert (v4i32 VR128:$src))), (v4f32 VR128:$src)>; + def : Pat<(v4f32 (bitconvert (v8i16 VR128:$src))), (v4f32 VR128:$src)>; + def : Pat<(v4f32 (bitconvert (v16i8 VR128:$src))), (v4f32 VR128:$src)>; + def : Pat<(v4f32 (bitconvert (v2f64 VR128:$src))), (v4f32 VR128:$src)>; + def : Pat<(v2f64 (bitconvert (v2i64 VR128:$src))), (v2f64 VR128:$src)>; + def : Pat<(v2f64 (bitconvert (v4i32 VR128:$src))), (v2f64 VR128:$src)>; + def : Pat<(v2f64 (bitconvert (v8i16 VR128:$src))), (v2f64 VR128:$src)>; + def : Pat<(v2f64 (bitconvert (v16i8 VR128:$src))), (v2f64 VR128:$src)>; + def : Pat<(v2f64 (bitconvert (v4f32 VR128:$src))), (v2f64 VR128:$src)>; + def : Pat<(f128 (bitconvert (i128 FR128:$src))), (f128 FR128:$src)>; + def : Pat<(i128 (bitconvert (f128 FR128:$src))), (i128 FR128:$src)>; +} + +// Bitcasts between 256-bit vector types. 
Return the original type since +// no instruction is needed for the conversion +let Predicates = [HasAVX] in { + def : Pat<(v4f64 (bitconvert (v8f32 VR256:$src))), (v4f64 VR256:$src)>; + def : Pat<(v4f64 (bitconvert (v8i32 VR256:$src))), (v4f64 VR256:$src)>; + def : Pat<(v4f64 (bitconvert (v4i64 VR256:$src))), (v4f64 VR256:$src)>; + def : Pat<(v4f64 (bitconvert (v16i16 VR256:$src))), (v4f64 VR256:$src)>; + def : Pat<(v4f64 (bitconvert (v32i8 VR256:$src))), (v4f64 VR256:$src)>; + def : Pat<(v8f32 (bitconvert (v8i32 VR256:$src))), (v8f32 VR256:$src)>; + def : Pat<(v8f32 (bitconvert (v4i64 VR256:$src))), (v8f32 VR256:$src)>; + def : Pat<(v8f32 (bitconvert (v4f64 VR256:$src))), (v8f32 VR256:$src)>; + def : Pat<(v8f32 (bitconvert (v32i8 VR256:$src))), (v8f32 VR256:$src)>; + def : Pat<(v8f32 (bitconvert (v16i16 VR256:$src))), (v8f32 VR256:$src)>; + def : Pat<(v4i64 (bitconvert (v8f32 VR256:$src))), (v4i64 VR256:$src)>; + def : Pat<(v4i64 (bitconvert (v8i32 VR256:$src))), (v4i64 VR256:$src)>; + def : Pat<(v4i64 (bitconvert (v4f64 VR256:$src))), (v4i64 VR256:$src)>; + def : Pat<(v4i64 (bitconvert (v32i8 VR256:$src))), (v4i64 VR256:$src)>; + def : Pat<(v4i64 (bitconvert (v16i16 VR256:$src))), (v4i64 VR256:$src)>; + def : Pat<(v32i8 (bitconvert (v4f64 VR256:$src))), (v32i8 VR256:$src)>; + def : Pat<(v32i8 (bitconvert (v4i64 VR256:$src))), (v32i8 VR256:$src)>; + def : Pat<(v32i8 (bitconvert (v8f32 VR256:$src))), (v32i8 VR256:$src)>; + def : Pat<(v32i8 (bitconvert (v8i32 VR256:$src))), (v32i8 VR256:$src)>; + def : Pat<(v32i8 (bitconvert (v16i16 VR256:$src))), (v32i8 VR256:$src)>; + def : Pat<(v8i32 (bitconvert (v32i8 VR256:$src))), (v8i32 VR256:$src)>; + def : Pat<(v8i32 (bitconvert (v16i16 VR256:$src))), (v8i32 VR256:$src)>; + def : Pat<(v8i32 (bitconvert (v8f32 VR256:$src))), (v8i32 VR256:$src)>; + def : Pat<(v8i32 (bitconvert (v4i64 VR256:$src))), (v8i32 VR256:$src)>; + def : Pat<(v8i32 (bitconvert (v4f64 VR256:$src))), (v8i32 VR256:$src)>; + def : Pat<(v16i16 (bitconvert (v8f32 VR256:$src))), (v16i16 VR256:$src)>; + def : Pat<(v16i16 (bitconvert (v8i32 VR256:$src))), (v16i16 VR256:$src)>; + def : Pat<(v16i16 (bitconvert (v4i64 VR256:$src))), (v16i16 VR256:$src)>; + def : Pat<(v16i16 (bitconvert (v4f64 VR256:$src))), (v16i16 VR256:$src)>; + def : Pat<(v16i16 (bitconvert (v32i8 VR256:$src))), (v16i16 VR256:$src)>; +} + +// Alias instructions that map fld0 to xorps for sse or vxorps for avx. +// This is expanded by ExpandPostRAPseudos. +let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, + isPseudo = 1, SchedRW = [WriteZero] in { + def FsFLD0SS : I<0, Pseudo, (outs FR32:$dst), (ins), "", + [(set FR32:$dst, fp32imm0)]>, Requires<[HasSSE1]>; + def FsFLD0SD : I<0, Pseudo, (outs FR64:$dst), (ins), "", + [(set FR64:$dst, fpimm0)]>, Requires<[HasSSE2]>; +} + +//===----------------------------------------------------------------------===// +// AVX & SSE - Zero/One Vectors +//===----------------------------------------------------------------------===// + +// Alias instruction that maps zero vector to pxor / xorp* for sse. +// This is expanded by ExpandPostRAPseudos to an xorps / vxorps, and then +// swizzled by ExecutionDepsFix to pxor. +// We set canFoldAsLoad because this can be converted to a constant-pool +// load of an all-zeros value if folding it would be beneficial. 
+let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, + isPseudo = 1, SchedRW = [WriteZero] in { +def V_SET0 : I<0, Pseudo, (outs VR128:$dst), (ins), "", + [(set VR128:$dst, (v4f32 immAllZerosV))]>; +} + +def : Pat<(v2f64 immAllZerosV), (V_SET0)>; +def : Pat<(v4i32 immAllZerosV), (V_SET0)>; +def : Pat<(v2i64 immAllZerosV), (V_SET0)>; +def : Pat<(v8i16 immAllZerosV), (V_SET0)>; +def : Pat<(v16i8 immAllZerosV), (V_SET0)>; + + +// The same as done above but for AVX. The 256-bit AVX1 ISA doesn't support PI, +// and doesn't need it because on sandy bridge the register is set to zero +// at the rename stage without using any execution unit, so SET0PSY +// and SET0PDY can be used for vector int instructions without penalty +let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, + isPseudo = 1, Predicates = [HasAVX], SchedRW = [WriteZero] in { +def AVX_SET0 : I<0, Pseudo, (outs VR256:$dst), (ins), "", + [(set VR256:$dst, (v8f32 immAllZerosV))]>; +} + +let Predicates = [HasAVX] in + def : Pat<(v4f64 immAllZerosV), (AVX_SET0)>; + +let Predicates = [HasAVX2] in { + def : Pat<(v4i64 immAllZerosV), (AVX_SET0)>; + def : Pat<(v8i32 immAllZerosV), (AVX_SET0)>; + def : Pat<(v16i16 immAllZerosV), (AVX_SET0)>; + def : Pat<(v32i8 immAllZerosV), (AVX_SET0)>; +} + +// AVX1 has no support for 256-bit integer instructions, but since the 128-bit +// VPXOR instruction writes zero to its upper part, it's safe build zeros. +let Predicates = [HasAVX1Only] in { +def : Pat<(v32i8 immAllZerosV), (SUBREG_TO_REG (i8 0), (V_SET0), sub_xmm)>; +def : Pat<(bc_v32i8 (v8f32 immAllZerosV)), + (SUBREG_TO_REG (i8 0), (V_SET0), sub_xmm)>; + +def : Pat<(v16i16 immAllZerosV), (SUBREG_TO_REG (i16 0), (V_SET0), sub_xmm)>; +def : Pat<(bc_v16i16 (v8f32 immAllZerosV)), + (SUBREG_TO_REG (i16 0), (V_SET0), sub_xmm)>; + +def : Pat<(v8i32 immAllZerosV), (SUBREG_TO_REG (i32 0), (V_SET0), sub_xmm)>; +def : Pat<(bc_v8i32 (v8f32 immAllZerosV)), + (SUBREG_TO_REG (i32 0), (V_SET0), sub_xmm)>; + +def : Pat<(v4i64 immAllZerosV), (SUBREG_TO_REG (i64 0), (V_SET0), sub_xmm)>; +def : Pat<(bc_v4i64 (v8f32 immAllZerosV)), + (SUBREG_TO_REG (i64 0), (V_SET0), sub_xmm)>; +} + +// We set canFoldAsLoad because this can be converted to a constant-pool +// load of an all-ones value if folding it would be beneficial. +let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, + isPseudo = 1, SchedRW = [WriteZero] in { + def V_SETALLONES : I<0, Pseudo, (outs VR128:$dst), (ins), "", + [(set VR128:$dst, (v4i32 immAllOnesV))]>; + let Predicates = [HasAVX2] in + def AVX2_SETALLONES : I<0, Pseudo, (outs VR256:$dst), (ins), "", + [(set VR256:$dst, (v8i32 immAllOnesV))]>; +} + + +//===----------------------------------------------------------------------===// +// SSE 1 & 2 - Move FP Scalar Instructions +// +// Move Instructions. Register-to-register movss/movsd is not used for FR32/64 +// register copies because it's a partial register update; Register-to-register +// movss/movsd is not modeled as an INSERT_SUBREG because INSERT_SUBREG requires +// that the insert be implementable in terms of a copy, and just mentioned, we +// don't use movss/movsd for copies. 
+//===----------------------------------------------------------------------===// + +multiclass sse12_move_rr<RegisterClass RC, SDNode OpNode, ValueType vt, + X86MemOperand x86memop, string base_opc, + string asm_opr, Domain d = GenericDomain> { + def rr : SI<0x10, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, RC:$src2), + !strconcat(base_opc, asm_opr), + [(set VR128:$dst, (vt (OpNode VR128:$src1, + (scalar_to_vector RC:$src2))))], + IIC_SSE_MOV_S_RR, d>, Sched<[WriteFShuffle]>; + + // For the disassembler + let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in + def rr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst), + (ins VR128:$src1, RC:$src2), + !strconcat(base_opc, asm_opr), + [], IIC_SSE_MOV_S_RR>, Sched<[WriteFShuffle]>; +} + +multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt, + X86MemOperand x86memop, string OpcodeStr, + Domain d = GenericDomain> { + // AVX + defm V#NAME : sse12_move_rr<RC, OpNode, vt, x86memop, OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}", d>, + VEX_4V, VEX_LIG; + + def V#NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(store RC:$src, addr:$dst)], IIC_SSE_MOV_S_MR, d>, + VEX, VEX_LIG, Sched<[WriteStore]>; + // SSE1 & 2 + let Constraints = "$src1 = $dst" in { + defm NAME : sse12_move_rr<RC, OpNode, vt, x86memop, OpcodeStr, + "\t{$src2, $dst|$dst, $src2}", d>; + } + + def NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(store RC:$src, addr:$dst)], IIC_SSE_MOV_S_MR, d>, + Sched<[WriteStore]>; +} + +// Loading from memory automatically zeroing upper bits. +multiclass sse12_move_rm<RegisterClass RC, X86MemOperand x86memop, + PatFrag mem_pat, string OpcodeStr, + Domain d = GenericDomain> { + def V#NAME#rm : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set RC:$dst, (mem_pat addr:$src))], + IIC_SSE_MOV_S_RM, d>, VEX, VEX_LIG, Sched<[WriteLoad]>; + def NAME#rm : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set RC:$dst, (mem_pat addr:$src))], + IIC_SSE_MOV_S_RM, d>, Sched<[WriteLoad]>; +} + +defm MOVSS : sse12_move<FR32, X86Movss, v4f32, f32mem, "movss", + SSEPackedSingle>, XS; +defm MOVSD : sse12_move<FR64, X86Movsd, v2f64, f64mem, "movsd", + SSEPackedDouble>, XD; + +let canFoldAsLoad = 1, isReMaterializable = 1 in { + defm MOVSS : sse12_move_rm<FR32, f32mem, loadf32, "movss", + SSEPackedSingle>, XS; + + let AddedComplexity = 20 in + defm MOVSD : sse12_move_rm<FR64, f64mem, loadf64, "movsd", + SSEPackedDouble>, XD; +} + +// Patterns +let Predicates = [UseAVX] in { + let AddedComplexity = 20 in { + // MOVSSrm zeros the high parts of the register; represent this + // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0 + def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))), + (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>; + def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))), + (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>; + def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))), + (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>; + + // MOVSDrm zeros the high parts of the register; represent this + // with SUBREG_TO_REG. 
The AVX versions also write: DST[255:128] <- 0 + def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))), + (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>; + def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))), + (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>; + def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))), + (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>; + def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))), + (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>; + def : Pat<(v2f64 (X86vzload addr:$src)), + (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>; + + // Represent the same patterns above but in the form they appear for + // 256-bit types + def : Pat<(v8f32 (X86vzmovl (insert_subvector undef, + (v4f32 (scalar_to_vector (loadf32 addr:$src))), (iPTR 0)))), + (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>; + def : Pat<(v4f64 (X86vzmovl (insert_subvector undef, + (v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))), + (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>; + } + + // Extract and store. + def : Pat<(store (f32 (extractelt (v4f32 VR128:$src), (iPTR 0))), + addr:$dst), + (VMOVSSmr addr:$dst, (COPY_TO_REGCLASS (v4f32 VR128:$src), FR32))>; + def : Pat<(store (f64 (extractelt (v2f64 VR128:$src), (iPTR 0))), + addr:$dst), + (VMOVSDmr addr:$dst, (COPY_TO_REGCLASS (v2f64 VR128:$src), FR64))>; + + // Shuffle with VMOVSS + def : Pat<(v4i32 (X86Movss VR128:$src1, VR128:$src2)), + (VMOVSSrr (v4i32 VR128:$src1), + (COPY_TO_REGCLASS (v4i32 VR128:$src2), FR32))>; + def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)), + (VMOVSSrr (v4f32 VR128:$src1), + (COPY_TO_REGCLASS (v4f32 VR128:$src2), FR32))>; + + // 256-bit variants + def : Pat<(v8i32 (X86Movss VR256:$src1, VR256:$src2)), + (SUBREG_TO_REG (i32 0), + (VMOVSSrr (EXTRACT_SUBREG (v8i32 VR256:$src1), sub_xmm), + (EXTRACT_SUBREG (v8i32 VR256:$src2), sub_xmm)), + sub_xmm)>; + def : Pat<(v8f32 (X86Movss VR256:$src1, VR256:$src2)), + (SUBREG_TO_REG (i32 0), + (VMOVSSrr (EXTRACT_SUBREG (v8f32 VR256:$src1), sub_xmm), + (EXTRACT_SUBREG (v8f32 VR256:$src2), sub_xmm)), + sub_xmm)>; + + // Shuffle with VMOVSD + def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)), + (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>; + def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)), + (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>; + def : Pat<(v4f32 (X86Movsd VR128:$src1, VR128:$src2)), + (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>; + def : Pat<(v4i32 (X86Movsd VR128:$src1, VR128:$src2)), + (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>; + + // 256-bit variants + def : Pat<(v4i64 (X86Movsd VR256:$src1, VR256:$src2)), + (SUBREG_TO_REG (i32 0), + (VMOVSDrr (EXTRACT_SUBREG (v4i64 VR256:$src1), sub_xmm), + (EXTRACT_SUBREG (v4i64 VR256:$src2), sub_xmm)), + sub_xmm)>; + def : Pat<(v4f64 (X86Movsd VR256:$src1, VR256:$src2)), + (SUBREG_TO_REG (i32 0), + (VMOVSDrr (EXTRACT_SUBREG (v4f64 VR256:$src1), sub_xmm), + (EXTRACT_SUBREG (v4f64 VR256:$src2), sub_xmm)), + sub_xmm)>; + + // FIXME: Instead of a X86Movlps there should be a X86Movsd here, the problem + // is during lowering, where it's not possible to recognize the fold cause + // it has two uses through a bitcast. One use disappears at isel time and the + // fold opportunity reappears. 
+ def : Pat<(v2f64 (X86Movlpd VR128:$src1, VR128:$src2)), + (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>; + def : Pat<(v2i64 (X86Movlpd VR128:$src1, VR128:$src2)), + (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>; + def : Pat<(v4f32 (X86Movlps VR128:$src1, VR128:$src2)), + (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>; + def : Pat<(v4i32 (X86Movlps VR128:$src1, VR128:$src2)), + (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>; +} + +let Predicates = [UseSSE1] in { + let Predicates = [NoSSE41], AddedComplexity = 15 in { + // Move scalar to XMM zero-extended, zeroing a VR128 then do a + // MOVSS to the lower bits. + def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))), + (MOVSSrr (v4f32 (V_SET0)), FR32:$src)>; + def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))), + (MOVSSrr (v4f32 (V_SET0)), (COPY_TO_REGCLASS VR128:$src, FR32))>; + def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))), + (MOVSSrr (v4i32 (V_SET0)), (COPY_TO_REGCLASS VR128:$src, FR32))>; + } + + let AddedComplexity = 20 in { + // MOVSSrm already zeros the high parts of the register. + def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))), + (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>; + def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))), + (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>; + def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))), + (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>; + } + + // Extract and store. + def : Pat<(store (f32 (extractelt (v4f32 VR128:$src), (iPTR 0))), + addr:$dst), + (MOVSSmr addr:$dst, (COPY_TO_REGCLASS VR128:$src, FR32))>; + + // Shuffle with MOVSS + def : Pat<(v4i32 (X86Movss VR128:$src1, VR128:$src2)), + (MOVSSrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR32))>; + def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)), + (MOVSSrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR32))>; +} + +let Predicates = [UseSSE2] in { + let Predicates = [NoSSE41], AddedComplexity = 15 in { + // Move scalar to XMM zero-extended, zeroing a VR128 then do a + // MOVSD to the lower bits. + def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))), + (MOVSDrr (v2f64 (V_SET0)), FR64:$src)>; + } + + let AddedComplexity = 20 in { + // MOVSDrm already zeros the high parts of the register. + def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))), + (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>; + def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))), + (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>; + def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))), + (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>; + def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))), + (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>; + def : Pat<(v2f64 (X86vzload addr:$src)), + (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>; + } + + // Extract and store. 
+ def : Pat<(store (f64 (extractelt (v2f64 VR128:$src), (iPTR 0))), + addr:$dst), + (MOVSDmr addr:$dst, (COPY_TO_REGCLASS VR128:$src, FR64))>; + + // Shuffle with MOVSD + def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)), + (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>; + def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)), + (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>; + def : Pat<(v4f32 (X86Movsd VR128:$src1, VR128:$src2)), + (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>; + def : Pat<(v4i32 (X86Movsd VR128:$src1, VR128:$src2)), + (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>; + + // FIXME: Instead of a X86Movlps there should be a X86Movsd here, the problem + // is during lowering, where it's not possible to recognize the fold because + // it has two uses through a bitcast. One use disappears at isel time and the + // fold opportunity reappears. + def : Pat<(v2f64 (X86Movlpd VR128:$src1, VR128:$src2)), + (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>; + def : Pat<(v2i64 (X86Movlpd VR128:$src1, VR128:$src2)), + (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>; + def : Pat<(v4f32 (X86Movlps VR128:$src1, VR128:$src2)), + (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>; + def : Pat<(v4i32 (X86Movlps VR128:$src1, VR128:$src2)), + (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>; +} + +//===----------------------------------------------------------------------===// +// SSE 1 & 2 - Move Aligned/Unaligned FP Instructions +//===----------------------------------------------------------------------===// + +multiclass sse12_mov_packed<bits<8> opc, RegisterClass RC, + X86MemOperand x86memop, PatFrag ld_frag, + string asm, Domain d, + OpndItins itins, + bit IsReMaterializable = 1> { +let hasSideEffects = 0 in + def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src), + !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [], itins.rr, d>, + Sched<[WriteFShuffle]>; +let canFoldAsLoad = 1, isReMaterializable = IsReMaterializable in + def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), + !strconcat(asm, "\t{$src, $dst|$dst, $src}"), + [(set RC:$dst, (ld_frag addr:$src))], itins.rm, d>, + Sched<[WriteLoad]>; +} + +let Predicates = [HasAVX, NoVLX] in { +defm VMOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, + "movaps", SSEPackedSingle, SSE_MOVA_ITINS>, + PS, VEX; +defm VMOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, + "movapd", SSEPackedDouble, SSE_MOVA_ITINS>, + PD, VEX; +defm VMOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32, + "movups", SSEPackedSingle, SSE_MOVU_ITINS>, + PS, VEX; +defm VMOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64, + "movupd", SSEPackedDouble, SSE_MOVU_ITINS, 0>, + PD, VEX; + +defm VMOVAPSY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv8f32, + "movaps", SSEPackedSingle, SSE_MOVA_ITINS>, + PS, VEX, VEX_L; +defm VMOVAPDY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv4f64, + "movapd", SSEPackedDouble, SSE_MOVA_ITINS>, + PD, VEX, VEX_L; +defm VMOVUPSY : sse12_mov_packed<0x10, VR256, f256mem, loadv8f32, + "movups", SSEPackedSingle, SSE_MOVU_ITINS>, + PS, VEX, VEX_L; +defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64, + "movupd", SSEPackedDouble, SSE_MOVU_ITINS, 0>, + PD, VEX, VEX_L; +} + +let Predicates = [UseSSE1] in { +defm MOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, + "movaps", SSEPackedSingle, SSE_MOVA_ITINS>, + PS; +defm MOVUPS : sse12_mov_packed<0x10, 
VR128, f128mem, loadv4f32, + "movups", SSEPackedSingle, SSE_MOVU_ITINS>, + PS; +} +let Predicates = [UseSSE2] in { +defm MOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, + "movapd", SSEPackedDouble, SSE_MOVA_ITINS>, + PD; +defm MOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64, + "movupd", SSEPackedDouble, SSE_MOVU_ITINS, 0>, + PD; +} + +let SchedRW = [WriteStore], Predicates = [HasAVX, NoVLX] in { +def VMOVAPSmr : VPSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), + "movaps\t{$src, $dst|$dst, $src}", + [(alignedstore (v4f32 VR128:$src), addr:$dst)], + IIC_SSE_MOVA_P_MR>, VEX; +def VMOVAPDmr : VPDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), + "movapd\t{$src, $dst|$dst, $src}", + [(alignedstore (v2f64 VR128:$src), addr:$dst)], + IIC_SSE_MOVA_P_MR>, VEX; +def VMOVUPSmr : VPSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), + "movups\t{$src, $dst|$dst, $src}", + [(store (v4f32 VR128:$src), addr:$dst)], + IIC_SSE_MOVU_P_MR>, VEX; +def VMOVUPDmr : VPDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), + "movupd\t{$src, $dst|$dst, $src}", + [(store (v2f64 VR128:$src), addr:$dst)], + IIC_SSE_MOVU_P_MR>, VEX; +def VMOVAPSYmr : VPSI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src), + "movaps\t{$src, $dst|$dst, $src}", + [(alignedstore256 (v8f32 VR256:$src), addr:$dst)], + IIC_SSE_MOVA_P_MR>, VEX, VEX_L; +def VMOVAPDYmr : VPDI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src), + "movapd\t{$src, $dst|$dst, $src}", + [(alignedstore256 (v4f64 VR256:$src), addr:$dst)], + IIC_SSE_MOVA_P_MR>, VEX, VEX_L; +def VMOVUPSYmr : VPSI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src), + "movups\t{$src, $dst|$dst, $src}", + [(store (v8f32 VR256:$src), addr:$dst)], + IIC_SSE_MOVU_P_MR>, VEX, VEX_L; +def VMOVUPDYmr : VPDI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src), + "movupd\t{$src, $dst|$dst, $src}", + [(store (v4f64 VR256:$src), addr:$dst)], + IIC_SSE_MOVU_P_MR>, VEX, VEX_L; +} // SchedRW + +// For disassembler +let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, + SchedRW = [WriteFShuffle] in { + def VMOVAPSrr_REV : VPSI<0x29, MRMDestReg, (outs VR128:$dst), + (ins VR128:$src), + "movaps\t{$src, $dst|$dst, $src}", [], + IIC_SSE_MOVA_P_RR>, VEX; + def VMOVAPDrr_REV : VPDI<0x29, MRMDestReg, (outs VR128:$dst), + (ins VR128:$src), + "movapd\t{$src, $dst|$dst, $src}", [], + IIC_SSE_MOVA_P_RR>, VEX; + def VMOVUPSrr_REV : VPSI<0x11, MRMDestReg, (outs VR128:$dst), + (ins VR128:$src), + "movups\t{$src, $dst|$dst, $src}", [], + IIC_SSE_MOVU_P_RR>, VEX; + def VMOVUPDrr_REV : VPDI<0x11, MRMDestReg, (outs VR128:$dst), + (ins VR128:$src), + "movupd\t{$src, $dst|$dst, $src}", [], + IIC_SSE_MOVU_P_RR>, VEX; + def VMOVAPSYrr_REV : VPSI<0x29, MRMDestReg, (outs VR256:$dst), + (ins VR256:$src), + "movaps\t{$src, $dst|$dst, $src}", [], + IIC_SSE_MOVA_P_RR>, VEX, VEX_L; + def VMOVAPDYrr_REV : VPDI<0x29, MRMDestReg, (outs VR256:$dst), + (ins VR256:$src), + "movapd\t{$src, $dst|$dst, $src}", [], + IIC_SSE_MOVA_P_RR>, VEX, VEX_L; + def VMOVUPSYrr_REV : VPSI<0x11, MRMDestReg, (outs VR256:$dst), + (ins VR256:$src), + "movups\t{$src, $dst|$dst, $src}", [], + IIC_SSE_MOVU_P_RR>, VEX, VEX_L; + def VMOVUPDYrr_REV : VPDI<0x11, MRMDestReg, (outs VR256:$dst), + (ins VR256:$src), + "movupd\t{$src, $dst|$dst, $src}", [], + IIC_SSE_MOVU_P_RR>, VEX, VEX_L; +} + +def : Pat<(int_x86_avx_storeu_ps_256 addr:$dst, VR256:$src), + (VMOVUPSYmr addr:$dst, VR256:$src)>; +def : Pat<(int_x86_avx_storeu_pd_256 addr:$dst, VR256:$src), + (VMOVUPDYmr 
addr:$dst, VR256:$src)>; + +let SchedRW = [WriteStore] in { +def MOVAPSmr : PSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), + "movaps\t{$src, $dst|$dst, $src}", + [(alignedstore (v4f32 VR128:$src), addr:$dst)], + IIC_SSE_MOVA_P_MR>; +def MOVAPDmr : PDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), + "movapd\t{$src, $dst|$dst, $src}", + [(alignedstore (v2f64 VR128:$src), addr:$dst)], + IIC_SSE_MOVA_P_MR>; +def MOVUPSmr : PSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), + "movups\t{$src, $dst|$dst, $src}", + [(store (v4f32 VR128:$src), addr:$dst)], + IIC_SSE_MOVU_P_MR>; +def MOVUPDmr : PDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), + "movupd\t{$src, $dst|$dst, $src}", + [(store (v2f64 VR128:$src), addr:$dst)], + IIC_SSE_MOVU_P_MR>; +} // SchedRW + +// For disassembler +let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, + SchedRW = [WriteFShuffle] in { + def MOVAPSrr_REV : PSI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), + "movaps\t{$src, $dst|$dst, $src}", [], + IIC_SSE_MOVA_P_RR>; + def MOVAPDrr_REV : PDI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), + "movapd\t{$src, $dst|$dst, $src}", [], + IIC_SSE_MOVA_P_RR>; + def MOVUPSrr_REV : PSI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), + "movups\t{$src, $dst|$dst, $src}", [], + IIC_SSE_MOVU_P_RR>; + def MOVUPDrr_REV : PDI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), + "movupd\t{$src, $dst|$dst, $src}", [], + IIC_SSE_MOVU_P_RR>; +} + +let Predicates = [HasAVX] in { + def : Pat<(int_x86_sse_storeu_ps addr:$dst, VR128:$src), + (VMOVUPSmr addr:$dst, VR128:$src)>; + def : Pat<(int_x86_sse2_storeu_pd addr:$dst, VR128:$src), + (VMOVUPDmr addr:$dst, VR128:$src)>; +} + +let Predicates = [UseSSE1] in + def : Pat<(int_x86_sse_storeu_ps addr:$dst, VR128:$src), + (MOVUPSmr addr:$dst, VR128:$src)>; +let Predicates = [UseSSE2] in + def : Pat<(int_x86_sse2_storeu_pd addr:$dst, VR128:$src), + (MOVUPDmr addr:$dst, VR128:$src)>; + +// Use vmovaps/vmovups for AVX integer load/store. 
+let Predicates = [HasAVX, NoVLX] in { + // 128-bit load/store + def : Pat<(alignedloadv2i64 addr:$src), + (VMOVAPSrm addr:$src)>; + def : Pat<(loadv2i64 addr:$src), + (VMOVUPSrm addr:$src)>; + + def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst), + (VMOVAPSmr addr:$dst, VR128:$src)>; + def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst), + (VMOVAPSmr addr:$dst, VR128:$src)>; + def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst), + (VMOVAPSmr addr:$dst, VR128:$src)>; + def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst), + (VMOVAPSmr addr:$dst, VR128:$src)>; + def : Pat<(store (v2i64 VR128:$src), addr:$dst), + (VMOVUPSmr addr:$dst, VR128:$src)>; + def : Pat<(store (v4i32 VR128:$src), addr:$dst), + (VMOVUPSmr addr:$dst, VR128:$src)>; + def : Pat<(store (v8i16 VR128:$src), addr:$dst), + (VMOVUPSmr addr:$dst, VR128:$src)>; + def : Pat<(store (v16i8 VR128:$src), addr:$dst), + (VMOVUPSmr addr:$dst, VR128:$src)>; + + // 256-bit load/store + def : Pat<(alignedloadv4i64 addr:$src), + (VMOVAPSYrm addr:$src)>; + def : Pat<(loadv4i64 addr:$src), + (VMOVUPSYrm addr:$src)>; + def : Pat<(alignedstore256 (v4i64 VR256:$src), addr:$dst), + (VMOVAPSYmr addr:$dst, VR256:$src)>; + def : Pat<(alignedstore256 (v8i32 VR256:$src), addr:$dst), + (VMOVAPSYmr addr:$dst, VR256:$src)>; + def : Pat<(alignedstore256 (v16i16 VR256:$src), addr:$dst), + (VMOVAPSYmr addr:$dst, VR256:$src)>; + def : Pat<(alignedstore256 (v32i8 VR256:$src), addr:$dst), + (VMOVAPSYmr addr:$dst, VR256:$src)>; + def : Pat<(store (v4i64 VR256:$src), addr:$dst), + (VMOVUPSYmr addr:$dst, VR256:$src)>; + def : Pat<(store (v8i32 VR256:$src), addr:$dst), + (VMOVUPSYmr addr:$dst, VR256:$src)>; + def : Pat<(store (v16i16 VR256:$src), addr:$dst), + (VMOVUPSYmr addr:$dst, VR256:$src)>; + def : Pat<(store (v32i8 VR256:$src), addr:$dst), + (VMOVUPSYmr addr:$dst, VR256:$src)>; + + // Special patterns for storing subvector extracts of lower 128-bits + // Its cheaper to just use VMOVAPS/VMOVUPS instead of VEXTRACTF128mr + def : Pat<(alignedstore (v2f64 (extract_subvector + (v4f64 VR256:$src), (iPTR 0))), addr:$dst), + (VMOVAPDmr addr:$dst, (v2f64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; + def : Pat<(alignedstore (v4f32 (extract_subvector + (v8f32 VR256:$src), (iPTR 0))), addr:$dst), + (VMOVAPSmr addr:$dst, (v4f32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; + def : Pat<(alignedstore (v2i64 (extract_subvector + (v4i64 VR256:$src), (iPTR 0))), addr:$dst), + (VMOVAPDmr addr:$dst, (v2i64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; + def : Pat<(alignedstore (v4i32 (extract_subvector + (v8i32 VR256:$src), (iPTR 0))), addr:$dst), + (VMOVAPSmr addr:$dst, (v4i32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; + def : Pat<(alignedstore (v8i16 (extract_subvector + (v16i16 VR256:$src), (iPTR 0))), addr:$dst), + (VMOVAPSmr addr:$dst, (v8i16 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; + def : Pat<(alignedstore (v16i8 (extract_subvector + (v32i8 VR256:$src), (iPTR 0))), addr:$dst), + (VMOVAPSmr addr:$dst, (v16i8 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; + + def : Pat<(store (v2f64 (extract_subvector + (v4f64 VR256:$src), (iPTR 0))), addr:$dst), + (VMOVUPDmr addr:$dst, (v2f64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; + def : Pat<(store (v4f32 (extract_subvector + (v8f32 VR256:$src), (iPTR 0))), addr:$dst), + (VMOVUPSmr addr:$dst, (v4f32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; + def : Pat<(store (v2i64 (extract_subvector + (v4i64 VR256:$src), (iPTR 0))), addr:$dst), + (VMOVUPDmr addr:$dst, (v2i64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; + def : Pat<(store (v4i32 (extract_subvector + (v8i32 
VR256:$src), (iPTR 0))), addr:$dst), + (VMOVUPSmr addr:$dst, (v4i32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; + def : Pat<(store (v8i16 (extract_subvector + (v16i16 VR256:$src), (iPTR 0))), addr:$dst), + (VMOVUPSmr addr:$dst, (v8i16 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; + def : Pat<(store (v16i8 (extract_subvector + (v32i8 VR256:$src), (iPTR 0))), addr:$dst), + (VMOVUPSmr addr:$dst, (v16i8 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; +} + +// Use movaps / movups for SSE integer load / store (one byte shorter). +// The instructions selected below are then converted to MOVDQA/MOVDQU +// during the SSE domain pass. +let Predicates = [UseSSE1] in { + def : Pat<(alignedloadv2i64 addr:$src), + (MOVAPSrm addr:$src)>; + def : Pat<(loadv2i64 addr:$src), + (MOVUPSrm addr:$src)>; + + def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst), + (MOVAPSmr addr:$dst, VR128:$src)>; + def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst), + (MOVAPSmr addr:$dst, VR128:$src)>; + def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst), + (MOVAPSmr addr:$dst, VR128:$src)>; + def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst), + (MOVAPSmr addr:$dst, VR128:$src)>; + def : Pat<(store (v2i64 VR128:$src), addr:$dst), + (MOVUPSmr addr:$dst, VR128:$src)>; + def : Pat<(store (v4i32 VR128:$src), addr:$dst), + (MOVUPSmr addr:$dst, VR128:$src)>; + def : Pat<(store (v8i16 VR128:$src), addr:$dst), + (MOVUPSmr addr:$dst, VR128:$src)>; + def : Pat<(store (v16i8 VR128:$src), addr:$dst), + (MOVUPSmr addr:$dst, VR128:$src)>; +} + +// Alias instruction to load FR32 or FR64 from f128mem using movaps. Upper +// bits are disregarded. FIXME: Set encoding to pseudo! +let canFoldAsLoad = 1, isReMaterializable = 1, SchedRW = [WriteLoad] in { +let isCodeGenOnly = 1 in { + def FsVMOVAPSrm : VPSI<0x28, MRMSrcMem, (outs FR32:$dst), (ins f128mem:$src), + "movaps\t{$src, $dst|$dst, $src}", + [(set FR32:$dst, (alignedloadfsf32 addr:$src))], + IIC_SSE_MOVA_P_RM>, VEX; + def FsVMOVAPDrm : VPDI<0x28, MRMSrcMem, (outs FR64:$dst), (ins f128mem:$src), + "movapd\t{$src, $dst|$dst, $src}", + [(set FR64:$dst, (alignedloadfsf64 addr:$src))], + IIC_SSE_MOVA_P_RM>, VEX; + def FsMOVAPSrm : PSI<0x28, MRMSrcMem, (outs FR32:$dst), (ins f128mem:$src), + "movaps\t{$src, $dst|$dst, $src}", + [(set FR32:$dst, (alignedloadfsf32 addr:$src))], + IIC_SSE_MOVA_P_RM>; + def FsMOVAPDrm : PDI<0x28, MRMSrcMem, (outs FR64:$dst), (ins f128mem:$src), + "movapd\t{$src, $dst|$dst, $src}", + [(set FR64:$dst, (alignedloadfsf64 addr:$src))], + IIC_SSE_MOVA_P_RM>; +} +} + +//===----------------------------------------------------------------------===// +// SSE 1 & 2 - Move Low packed FP Instructions +//===----------------------------------------------------------------------===// + +multiclass sse12_mov_hilo_packed_base<bits<8>opc, SDNode psnode, SDNode pdnode, + string base_opc, string asm_opr, + InstrItinClass itin> { + def PSrm : PI<opc, MRMSrcMem, + (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2), + !strconcat(base_opc, "s", asm_opr), + [(set VR128:$dst, + (psnode VR128:$src1, + (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2))))))], + itin, SSEPackedSingle>, PS, + Sched<[WriteFShuffleLd, ReadAfterLd]>; + + def PDrm : PI<opc, MRMSrcMem, + (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2), + !strconcat(base_opc, "d", asm_opr), + [(set VR128:$dst, (v2f64 (pdnode VR128:$src1, + (scalar_to_vector (loadf64 addr:$src2)))))], + itin, SSEPackedDouble>, PD, + Sched<[WriteFShuffleLd, ReadAfterLd]>; + +} + +multiclass sse12_mov_hilo_packed<bits<8>opc, SDNode psnode, SDNode 
pdnode, + string base_opc, InstrItinClass itin> { + let Predicates = [UseAVX] in + defm V#NAME : sse12_mov_hilo_packed_base<opc, psnode, pdnode, base_opc, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}", + itin>, VEX_4V; + + let Constraints = "$src1 = $dst" in + defm NAME : sse12_mov_hilo_packed_base<opc, psnode, pdnode, base_opc, + "\t{$src2, $dst|$dst, $src2}", + itin>; +} + +let AddedComplexity = 20 in { + defm MOVL : sse12_mov_hilo_packed<0x12, X86Movlps, X86Movlpd, "movlp", + IIC_SSE_MOV_LH>; +} + +let SchedRW = [WriteStore] in { +let Predicates = [UseAVX] in { +def VMOVLPSmr : VPSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), + "movlps\t{$src, $dst|$dst, $src}", + [(store (f64 (extractelt (bc_v2f64 (v4f32 VR128:$src)), + (iPTR 0))), addr:$dst)], + IIC_SSE_MOV_LH>, VEX; +def VMOVLPDmr : VPDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), + "movlpd\t{$src, $dst|$dst, $src}", + [(store (f64 (extractelt (v2f64 VR128:$src), + (iPTR 0))), addr:$dst)], + IIC_SSE_MOV_LH>, VEX; +}// UseAVX +def MOVLPSmr : PSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), + "movlps\t{$src, $dst|$dst, $src}", + [(store (f64 (extractelt (bc_v2f64 (v4f32 VR128:$src)), + (iPTR 0))), addr:$dst)], + IIC_SSE_MOV_LH>; +def MOVLPDmr : PDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), + "movlpd\t{$src, $dst|$dst, $src}", + [(store (f64 (extractelt (v2f64 VR128:$src), + (iPTR 0))), addr:$dst)], + IIC_SSE_MOV_LH>; +} // SchedRW + +let Predicates = [UseAVX] in { + // Shuffle with VMOVLPS + def : Pat<(v4f32 (X86Movlps VR128:$src1, (load addr:$src2))), + (VMOVLPSrm VR128:$src1, addr:$src2)>; + def : Pat<(v4i32 (X86Movlps VR128:$src1, (load addr:$src2))), + (VMOVLPSrm VR128:$src1, addr:$src2)>; + + // Shuffle with VMOVLPD + def : Pat<(v2f64 (X86Movlpd VR128:$src1, (load addr:$src2))), + (VMOVLPDrm VR128:$src1, addr:$src2)>; + def : Pat<(v2i64 (X86Movlpd VR128:$src1, (load addr:$src2))), + (VMOVLPDrm VR128:$src1, addr:$src2)>; + def : Pat<(v2f64 (X86Movsd VR128:$src1, + (v2f64 (scalar_to_vector (loadf64 addr:$src2))))), + (VMOVLPDrm VR128:$src1, addr:$src2)>; + + // Store patterns + def : Pat<(store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)), + addr:$src1), + (VMOVLPSmr addr:$src1, VR128:$src2)>; + def : Pat<(store (v4i32 (X86Movlps + (bc_v4i32 (loadv2i64 addr:$src1)), VR128:$src2)), addr:$src1), + (VMOVLPSmr addr:$src1, VR128:$src2)>; + def : Pat<(store (v2f64 (X86Movlpd (load addr:$src1), VR128:$src2)), + addr:$src1), + (VMOVLPDmr addr:$src1, VR128:$src2)>; + def : Pat<(store (v2i64 (X86Movlpd (load addr:$src1), VR128:$src2)), + addr:$src1), + (VMOVLPDmr addr:$src1, VR128:$src2)>; +} + +let Predicates = [UseSSE1] in { + // (store (vector_shuffle (load addr), v2, <4, 5, 2, 3>), addr) using MOVLPS + def : Pat<(store (i64 (extractelt (bc_v2i64 (v4f32 VR128:$src2)), + (iPTR 0))), addr:$src1), + (MOVLPSmr addr:$src1, VR128:$src2)>; + + // Shuffle with MOVLPS + def : Pat<(v4f32 (X86Movlps VR128:$src1, (load addr:$src2))), + (MOVLPSrm VR128:$src1, addr:$src2)>; + def : Pat<(v4i32 (X86Movlps VR128:$src1, (load addr:$src2))), + (MOVLPSrm VR128:$src1, addr:$src2)>; + def : Pat<(X86Movlps VR128:$src1, + (bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))), + (MOVLPSrm VR128:$src1, addr:$src2)>; + + // Store patterns + def : Pat<(store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)), + addr:$src1), + (MOVLPSmr addr:$src1, VR128:$src2)>; + def : Pat<(store (v4i32 (X86Movlps + (bc_v4i32 (loadv2i64 addr:$src1)), VR128:$src2)), + addr:$src1), + (MOVLPSmr addr:$src1, VR128:$src2)>; 
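(Aside, not part of the patch.) The MOVLP load patterns above merge a 64-bit memory operand into the low half of an XMM register while preserving the high half, and the MOVLP store patterns write only the low 64 bits back to memory. A hedged intrinsics-level illustration of that behavior (wrapper names are invented for the example):

    #include <emmintrin.h>

    // movlps: low two floats come from memory, high two floats of v are kept.
    __m128 merge_low_ps(__m128 v, const void *p) {
      return _mm_loadl_pi(v, (const __m64 *)p);
    }

    // movlpd: low double from memory, high double of v preserved.
    __m128d merge_low_pd(__m128d v, const double *p) {
      return _mm_loadl_pd(v, p);
    }

    // movlps (store form): writes only the low 8 bytes of v.
    void store_low_ps(void *p, __m128 v) {
      _mm_storel_pi((__m64 *)p, v);
    }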
+} + +let Predicates = [UseSSE2] in { + // Shuffle with MOVLPD + def : Pat<(v2f64 (X86Movlpd VR128:$src1, (load addr:$src2))), + (MOVLPDrm VR128:$src1, addr:$src2)>; + def : Pat<(v2i64 (X86Movlpd VR128:$src1, (load addr:$src2))), + (MOVLPDrm VR128:$src1, addr:$src2)>; + def : Pat<(v2f64 (X86Movsd VR128:$src1, + (v2f64 (scalar_to_vector (loadf64 addr:$src2))))), + (MOVLPDrm VR128:$src1, addr:$src2)>; + + // Store patterns + def : Pat<(store (v2f64 (X86Movlpd (load addr:$src1), VR128:$src2)), + addr:$src1), + (MOVLPDmr addr:$src1, VR128:$src2)>; + def : Pat<(store (v2i64 (X86Movlpd (load addr:$src1), VR128:$src2)), + addr:$src1), + (MOVLPDmr addr:$src1, VR128:$src2)>; +} + +//===----------------------------------------------------------------------===// +// SSE 1 & 2 - Move Hi packed FP Instructions +//===----------------------------------------------------------------------===// + +let AddedComplexity = 20 in { + defm MOVH : sse12_mov_hilo_packed<0x16, X86Movlhps, X86Movlhpd, "movhp", + IIC_SSE_MOV_LH>; +} + +let SchedRW = [WriteStore] in { +// v2f64 extract element 1 is always custom lowered to unpack high to low +// and extract element 0 so the non-store version isn't too horrible. +let Predicates = [UseAVX] in { +def VMOVHPSmr : VPSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), + "movhps\t{$src, $dst|$dst, $src}", + [(store (f64 (extractelt + (X86Unpckh (bc_v2f64 (v4f32 VR128:$src)), + (bc_v2f64 (v4f32 VR128:$src))), + (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>, VEX; +def VMOVHPDmr : VPDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), + "movhpd\t{$src, $dst|$dst, $src}", + [(store (f64 (extractelt + (v2f64 (X86Unpckh VR128:$src, VR128:$src)), + (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>, VEX; +} // UseAVX +def MOVHPSmr : PSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), + "movhps\t{$src, $dst|$dst, $src}", + [(store (f64 (extractelt + (X86Unpckh (bc_v2f64 (v4f32 VR128:$src)), + (bc_v2f64 (v4f32 VR128:$src))), + (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>; +def MOVHPDmr : PDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), + "movhpd\t{$src, $dst|$dst, $src}", + [(store (f64 (extractelt + (v2f64 (X86Unpckh VR128:$src, VR128:$src)), + (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>; +} // SchedRW + +let Predicates = [UseAVX] in { + // VMOVHPS patterns + def : Pat<(X86Movlhps VR128:$src1, + (bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))), + (VMOVHPSrm VR128:$src1, addr:$src2)>; + def : Pat<(X86Movlhps VR128:$src1, + (bc_v4i32 (v2i64 (X86vzload addr:$src2)))), + (VMOVHPSrm VR128:$src1, addr:$src2)>; + + // VMOVHPD patterns + + // FIXME: Instead of X86Unpckl, there should be a X86Movlhpd here, the problem + // is during lowering, where it's not possible to recognize the load fold + // cause it has two uses through a bitcast. One use disappears at isel time + // and the fold opportunity reappears. + def : Pat<(v2f64 (X86Unpckl VR128:$src1, + (scalar_to_vector (loadf64 addr:$src2)))), + (VMOVHPDrm VR128:$src1, addr:$src2)>; + // Also handle an i64 load because that may get selected as a faster way to + // load the data. 
+ def : Pat<(v2f64 (X86Unpckl VR128:$src1, + (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))), + (VMOVHPDrm VR128:$src1, addr:$src2)>; + + def : Pat<(store (f64 (extractelt + (v2f64 (X86VPermilpi VR128:$src, (i8 1))), + (iPTR 0))), addr:$dst), + (VMOVHPDmr addr:$dst, VR128:$src)>; +} + +let Predicates = [UseSSE1] in { + // MOVHPS patterns + def : Pat<(X86Movlhps VR128:$src1, + (bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))), + (MOVHPSrm VR128:$src1, addr:$src2)>; + def : Pat<(X86Movlhps VR128:$src1, + (bc_v4f32 (v2i64 (X86vzload addr:$src2)))), + (MOVHPSrm VR128:$src1, addr:$src2)>; +} + +let Predicates = [UseSSE2] in { + // MOVHPD patterns + + // FIXME: Instead of X86Unpckl, there should be a X86Movlhpd here, the problem + // is during lowering, where it's not possible to recognize the load fold + // cause it has two uses through a bitcast. One use disappears at isel time + // and the fold opportunity reappears. + def : Pat<(v2f64 (X86Unpckl VR128:$src1, + (scalar_to_vector (loadf64 addr:$src2)))), + (MOVHPDrm VR128:$src1, addr:$src2)>; + // Also handle an i64 load because that may get selected as a faster way to + // load the data. + def : Pat<(v2f64 (X86Unpckl VR128:$src1, + (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))), + (MOVHPDrm VR128:$src1, addr:$src2)>; + + def : Pat<(store (f64 (extractelt + (v2f64 (X86Shufp VR128:$src, VR128:$src, (i8 1))), + (iPTR 0))), addr:$dst), + (MOVHPDmr addr:$dst, VR128:$src)>; +} + +//===----------------------------------------------------------------------===// +// SSE 1 & 2 - Move Low to High and High to Low packed FP Instructions +//===----------------------------------------------------------------------===// + +let AddedComplexity = 20, Predicates = [UseAVX] in { + def VMOVLHPSrr : VPSI<0x16, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2), + "movlhps\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR128:$dst, + (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))], + IIC_SSE_MOV_LH>, + VEX_4V, Sched<[WriteFShuffle]>; + def VMOVHLPSrr : VPSI<0x12, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2), + "movhlps\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR128:$dst, + (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))], + IIC_SSE_MOV_LH>, + VEX_4V, Sched<[WriteFShuffle]>; +} +let Constraints = "$src1 = $dst", AddedComplexity = 20 in { + def MOVLHPSrr : PSI<0x16, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2), + "movlhps\t{$src2, $dst|$dst, $src2}", + [(set VR128:$dst, + (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))], + IIC_SSE_MOV_LH>, Sched<[WriteFShuffle]>; + def MOVHLPSrr : PSI<0x12, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2), + "movhlps\t{$src2, $dst|$dst, $src2}", + [(set VR128:$dst, + (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))], + IIC_SSE_MOV_LH>, Sched<[WriteFShuffle]>; +} + +let Predicates = [UseAVX] in { + // MOVLHPS patterns + def : Pat<(v4i32 (X86Movlhps VR128:$src1, VR128:$src2)), + (VMOVLHPSrr VR128:$src1, VR128:$src2)>; + def : Pat<(v2i64 (X86Movlhps VR128:$src1, VR128:$src2)), + (VMOVLHPSrr (v2i64 VR128:$src1), VR128:$src2)>; + + // MOVHLPS patterns + def : Pat<(v4i32 (X86Movhlps VR128:$src1, VR128:$src2)), + (VMOVHLPSrr VR128:$src1, VR128:$src2)>; +} + +let Predicates = [UseSSE1] in { + // MOVLHPS patterns + def : Pat<(v4i32 (X86Movlhps VR128:$src1, VR128:$src2)), + (MOVLHPSrr VR128:$src1, VR128:$src2)>; + def : Pat<(v2i64 (X86Movlhps VR128:$src1, VR128:$src2)), + (MOVLHPSrr (v2i64 VR128:$src1), VR128:$src2)>; + + // MOVHLPS 
patterns + def : Pat<(v4i32 (X86Movhlps VR128:$src1, VR128:$src2)), + (MOVHLPSrr VR128:$src1, VR128:$src2)>; +} + +//===----------------------------------------------------------------------===// +// SSE 1 & 2 - Conversion Instructions +//===----------------------------------------------------------------------===// + +def SSE_CVT_PD : OpndItins< + IIC_SSE_CVT_PD_RR, IIC_SSE_CVT_PD_RM +>; + +let Sched = WriteCvtI2F in +def SSE_CVT_PS : OpndItins< + IIC_SSE_CVT_PS_RR, IIC_SSE_CVT_PS_RM +>; + +let Sched = WriteCvtI2F in +def SSE_CVT_Scalar : OpndItins< + IIC_SSE_CVT_Scalar_RR, IIC_SSE_CVT_Scalar_RM +>; + +let Sched = WriteCvtF2I in +def SSE_CVT_SS2SI_32 : OpndItins< + IIC_SSE_CVT_SS2SI32_RR, IIC_SSE_CVT_SS2SI32_RM +>; + +let Sched = WriteCvtF2I in +def SSE_CVT_SS2SI_64 : OpndItins< + IIC_SSE_CVT_SS2SI64_RR, IIC_SSE_CVT_SS2SI64_RM +>; + +let Sched = WriteCvtF2I in +def SSE_CVT_SD2SI : OpndItins< + IIC_SSE_CVT_SD2SI_RR, IIC_SSE_CVT_SD2SI_RM +>; + +// FIXME: We probably want to match the rm form only when optimizing for +// size, to avoid false depenendecies (see sse_fp_unop_s for details) +multiclass sse12_cvt_s<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, + SDNode OpNode, X86MemOperand x86memop, PatFrag ld_frag, + string asm, OpndItins itins> { + def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm, + [(set DstRC:$dst, (OpNode SrcRC:$src))], + itins.rr>, Sched<[itins.Sched]>; + def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm, + [(set DstRC:$dst, (OpNode (ld_frag addr:$src)))], + itins.rm>, Sched<[itins.Sched.Folded]>; +} + +multiclass sse12_cvt_p<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, + X86MemOperand x86memop, string asm, Domain d, + OpndItins itins> { +let hasSideEffects = 0 in { + def rr : I<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm, + [], itins.rr, d>, Sched<[itins.Sched]>; + let mayLoad = 1 in + def rm : I<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm, + [], itins.rm, d>, Sched<[itins.Sched.Folded]>; +} +} + +// FIXME: We probably want to match the rm form only when optimizing for +// size, to avoid false depenendecies (see sse_fp_unop_s for details) +multiclass sse12_vcvt_avx<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, + X86MemOperand x86memop, string asm> { +let hasSideEffects = 0, Predicates = [UseAVX] in { + def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src), + !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>, + Sched<[WriteCvtI2F]>; + let mayLoad = 1 in + def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), + (ins DstRC:$src1, x86memop:$src), + !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>, + Sched<[WriteCvtI2FLd, ReadAfterLd]>; +} // hasSideEffects = 0 +} + +let Predicates = [UseAVX] in { +defm VCVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32, + "cvttss2si\t{$src, $dst|$dst, $src}", + SSE_CVT_SS2SI_32>, + XS, VEX, VEX_LIG; +defm VCVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32, + "cvttss2si\t{$src, $dst|$dst, $src}", + SSE_CVT_SS2SI_64>, + XS, VEX, VEX_W, VEX_LIG; +defm VCVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64, + "cvttsd2si\t{$src, $dst|$dst, $src}", + SSE_CVT_SD2SI>, + XD, VEX, VEX_LIG; +defm VCVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64, + "cvttsd2si\t{$src, $dst|$dst, $src}", + SSE_CVT_SD2SI>, + XD, VEX, VEX_W, VEX_LIG; + +def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}", + (VCVTTSS2SIrr GR32:$dst, FR32:$src), 0>; 
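(Aside, not part of the patch.) The CVTT* forms above perform the truncate-toward-zero conversion that a plain C/C++ cast requires, while the non-truncating CVTSS2SI/CVTSD2SI forms defined later in this section round according to the current MXCSR rounding mode. A short illustrative sketch, assuming an x86-64 target (function names are invented):

    #include <emmintrin.h>

    // C++ casts truncate toward zero, so they normally select the cvtt* forms.
    int       f32_to_i32(float f)  { return (int)f; }        // cvttss2si
    long long f64_to_i64(double d) { return (long long)d; }  // cvttsd2si (REX.W/VEX.W form)

    // The intrinsic versions read the low element of an XMM register and
    // round according to MXCSR instead of truncating.
    int rounded_from_ss(__m128 v)  { return _mm_cvtss_si32(v); }  // cvtss2si
    int rounded_from_sd(__m128d v) { return _mm_cvtsd_si32(v); }  // cvtsd2si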
+def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}", + (VCVTTSS2SIrm GR32:$dst, f32mem:$src), 0>; +def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}", + (VCVTTSD2SIrr GR32:$dst, FR64:$src), 0>; +def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}", + (VCVTTSD2SIrm GR32:$dst, f64mem:$src), 0>; +def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}", + (VCVTTSS2SI64rr GR64:$dst, FR32:$src), 0>; +def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}", + (VCVTTSS2SI64rm GR64:$dst, f32mem:$src), 0>; +def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}", + (VCVTTSD2SI64rr GR64:$dst, FR64:$src), 0>; +def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}", + (VCVTTSD2SI64rm GR64:$dst, f64mem:$src), 0>; +} +// The assembler can recognize rr 64-bit instructions by seeing a rxx +// register, but the same isn't true when only using memory operands, +// provide other assembly "l" and "q" forms to address this explicitly +// where appropriate to do so. +defm VCVTSI2SS : sse12_vcvt_avx<0x2A, GR32, FR32, i32mem, "cvtsi2ss{l}">, + XS, VEX_4V, VEX_LIG; +defm VCVTSI2SS64 : sse12_vcvt_avx<0x2A, GR64, FR32, i64mem, "cvtsi2ss{q}">, + XS, VEX_4V, VEX_W, VEX_LIG; +defm VCVTSI2SD : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd{l}">, + XD, VEX_4V, VEX_LIG; +defm VCVTSI2SD64 : sse12_vcvt_avx<0x2A, GR64, FR64, i64mem, "cvtsi2sd{q}">, + XD, VEX_4V, VEX_W, VEX_LIG; + +let Predicates = [UseAVX] in { + def : InstAlias<"vcvtsi2ss\t{$src, $src1, $dst|$dst, $src1, $src}", + (VCVTSI2SSrm FR64:$dst, FR64:$src1, i32mem:$src), 0>; + def : InstAlias<"vcvtsi2sd\t{$src, $src1, $dst|$dst, $src1, $src}", + (VCVTSI2SDrm FR64:$dst, FR64:$src1, i32mem:$src), 0>; + + def : Pat<(f32 (sint_to_fp (loadi32 addr:$src))), + (VCVTSI2SSrm (f32 (IMPLICIT_DEF)), addr:$src)>; + def : Pat<(f32 (sint_to_fp (loadi64 addr:$src))), + (VCVTSI2SS64rm (f32 (IMPLICIT_DEF)), addr:$src)>; + def : Pat<(f64 (sint_to_fp (loadi32 addr:$src))), + (VCVTSI2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>; + def : Pat<(f64 (sint_to_fp (loadi64 addr:$src))), + (VCVTSI2SD64rm (f64 (IMPLICIT_DEF)), addr:$src)>; + + def : Pat<(f32 (sint_to_fp GR32:$src)), + (VCVTSI2SSrr (f32 (IMPLICIT_DEF)), GR32:$src)>; + def : Pat<(f32 (sint_to_fp GR64:$src)), + (VCVTSI2SS64rr (f32 (IMPLICIT_DEF)), GR64:$src)>; + def : Pat<(f64 (sint_to_fp GR32:$src)), + (VCVTSI2SDrr (f64 (IMPLICIT_DEF)), GR32:$src)>; + def : Pat<(f64 (sint_to_fp GR64:$src)), + (VCVTSI2SD64rr (f64 (IMPLICIT_DEF)), GR64:$src)>; +} + +defm CVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32, + "cvttss2si\t{$src, $dst|$dst, $src}", + SSE_CVT_SS2SI_32>, XS; +defm CVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32, + "cvttss2si\t{$src, $dst|$dst, $src}", + SSE_CVT_SS2SI_64>, XS, REX_W; +defm CVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64, + "cvttsd2si\t{$src, $dst|$dst, $src}", + SSE_CVT_SD2SI>, XD; +defm CVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64, + "cvttsd2si\t{$src, $dst|$dst, $src}", + SSE_CVT_SD2SI>, XD, REX_W; +defm CVTSI2SS : sse12_cvt_s<0x2A, GR32, FR32, sint_to_fp, i32mem, loadi32, + "cvtsi2ss{l}\t{$src, $dst|$dst, $src}", + SSE_CVT_Scalar>, XS; +defm CVTSI2SS64 : sse12_cvt_s<0x2A, GR64, FR32, sint_to_fp, i64mem, loadi64, + "cvtsi2ss{q}\t{$src, $dst|$dst, $src}", + SSE_CVT_Scalar>, XS, REX_W; +defm CVTSI2SD : sse12_cvt_s<0x2A, GR32, FR64, sint_to_fp, i32mem, loadi32, + "cvtsi2sd{l}\t{$src, $dst|$dst, $src}", + SSE_CVT_Scalar>, XD; +defm CVTSI2SD64 : sse12_cvt_s<0x2A, GR64, FR64, sint_to_fp, i64mem, 
loadi64, + "cvtsi2sd{q}\t{$src, $dst|$dst, $src}", + SSE_CVT_Scalar>, XD, REX_W; + +def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}", + (CVTTSS2SIrr GR32:$dst, FR32:$src), 0>; +def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}", + (CVTTSS2SIrm GR32:$dst, f32mem:$src), 0>; +def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}", + (CVTTSD2SIrr GR32:$dst, FR64:$src), 0>; +def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}", + (CVTTSD2SIrm GR32:$dst, f64mem:$src), 0>; +def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}", + (CVTTSS2SI64rr GR64:$dst, FR32:$src), 0>; +def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}", + (CVTTSS2SI64rm GR64:$dst, f32mem:$src), 0>; +def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}", + (CVTTSD2SI64rr GR64:$dst, FR64:$src), 0>; +def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}", + (CVTTSD2SI64rm GR64:$dst, f64mem:$src), 0>; + +def : InstAlias<"cvtsi2ss\t{$src, $dst|$dst, $src}", + (CVTSI2SSrm FR64:$dst, i32mem:$src), 0>; +def : InstAlias<"cvtsi2sd\t{$src, $dst|$dst, $src}", + (CVTSI2SDrm FR64:$dst, i32mem:$src), 0>; + +// Conversion Instructions Intrinsics - Match intrinsics which expect MM +// and/or XMM operand(s). + +// FIXME: We probably want to match the rm form only when optimizing for +// size, to avoid false depenendecies (see sse_fp_unop_s for details) +multiclass sse12_cvt_sint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, + Intrinsic Int, Operand memop, ComplexPattern mem_cpat, + string asm, OpndItins itins> { + def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), + !strconcat(asm, "\t{$src, $dst|$dst, $src}"), + [(set DstRC:$dst, (Int SrcRC:$src))], itins.rr>, + Sched<[itins.Sched]>; + def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins memop:$src), + !strconcat(asm, "\t{$src, $dst|$dst, $src}"), + [(set DstRC:$dst, (Int mem_cpat:$src))], itins.rm>, + Sched<[itins.Sched.Folded]>; +} + +multiclass sse12_cvt_sint_3addr<bits<8> opc, RegisterClass SrcRC, + RegisterClass DstRC, Intrinsic Int, X86MemOperand x86memop, + PatFrag ld_frag, string asm, OpndItins itins, + bit Is2Addr = 1> { + def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src2), + !if(Is2Addr, + !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set DstRC:$dst, (Int DstRC:$src1, SrcRC:$src2))], + itins.rr>, Sched<[itins.Sched]>; + def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), + (ins DstRC:$src1, x86memop:$src2), + !if(Is2Addr, + !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set DstRC:$dst, (Int DstRC:$src1, (ld_frag addr:$src2)))], + itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>; +} + +let Predicates = [UseAVX] in { +defm VCVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, + int_x86_sse2_cvtsd2si, sdmem, sse_load_f64, "cvtsd2si", + SSE_CVT_SD2SI>, XD, VEX, VEX_LIG; +defm VCVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, + int_x86_sse2_cvtsd2si64, sdmem, sse_load_f64, "cvtsd2si", + SSE_CVT_SD2SI>, XD, VEX, VEX_W, VEX_LIG; +} +defm CVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse2_cvtsd2si, + sdmem, sse_load_f64, "cvtsd2si", SSE_CVT_SD2SI>, XD; +defm CVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse2_cvtsd2si64, + sdmem, sse_load_f64, "cvtsd2si", SSE_CVT_SD2SI>, XD, REX_W; + + +let isCodeGenOnly = 1 in { + let Predicates = [UseAVX] in { + defm Int_VCVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128, + int_x86_sse_cvtsi2ss, i32mem, loadi32, "cvtsi2ss{l}", + 
SSE_CVT_Scalar, 0>, XS, VEX_4V; + defm Int_VCVTSI2SS64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128, + int_x86_sse_cvtsi642ss, i64mem, loadi64, "cvtsi2ss{q}", + SSE_CVT_Scalar, 0>, XS, VEX_4V, + VEX_W; + defm Int_VCVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128, + int_x86_sse2_cvtsi2sd, i32mem, loadi32, "cvtsi2sd{l}", + SSE_CVT_Scalar, 0>, XD, VEX_4V; + defm Int_VCVTSI2SD64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128, + int_x86_sse2_cvtsi642sd, i64mem, loadi64, "cvtsi2sd{q}", + SSE_CVT_Scalar, 0>, XD, + VEX_4V, VEX_W; + } + let Constraints = "$src1 = $dst" in { + defm Int_CVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128, + int_x86_sse_cvtsi2ss, i32mem, loadi32, + "cvtsi2ss{l}", SSE_CVT_Scalar>, XS; + defm Int_CVTSI2SS64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128, + int_x86_sse_cvtsi642ss, i64mem, loadi64, + "cvtsi2ss{q}", SSE_CVT_Scalar>, XS, REX_W; + defm Int_CVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128, + int_x86_sse2_cvtsi2sd, i32mem, loadi32, + "cvtsi2sd{l}", SSE_CVT_Scalar>, XD; + defm Int_CVTSI2SD64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128, + int_x86_sse2_cvtsi642sd, i64mem, loadi64, + "cvtsi2sd{q}", SSE_CVT_Scalar>, XD, REX_W; + } +} // isCodeGenOnly = 1 + +/// SSE 1 Only + +// Aliases for intrinsics +let isCodeGenOnly = 1 in { +let Predicates = [UseAVX] in { +defm Int_VCVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si, + ssmem, sse_load_f32, "cvttss2si", + SSE_CVT_SS2SI_32>, XS, VEX; +defm Int_VCVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, + int_x86_sse_cvttss2si64, ssmem, sse_load_f32, + "cvttss2si", SSE_CVT_SS2SI_64>, + XS, VEX, VEX_W; +defm Int_VCVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse2_cvttsd2si, + sdmem, sse_load_f64, "cvttsd2si", + SSE_CVT_SD2SI>, XD, VEX; +defm Int_VCVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, + int_x86_sse2_cvttsd2si64, sdmem, sse_load_f64, + "cvttsd2si", SSE_CVT_SD2SI>, + XD, VEX, VEX_W; +} +defm Int_CVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si, + ssmem, sse_load_f32, "cvttss2si", + SSE_CVT_SS2SI_32>, XS; +defm Int_CVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, + int_x86_sse_cvttss2si64, ssmem, sse_load_f32, + "cvttss2si", SSE_CVT_SS2SI_64>, XS, REX_W; +defm Int_CVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse2_cvttsd2si, + sdmem, sse_load_f64, "cvttsd2si", + SSE_CVT_SD2SI>, XD; +defm Int_CVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, + int_x86_sse2_cvttsd2si64, sdmem, sse_load_f64, + "cvttsd2si", SSE_CVT_SD2SI>, XD, REX_W; +} // isCodeGenOnly = 1 + +let Predicates = [UseAVX] in { +defm VCVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse_cvtss2si, + ssmem, sse_load_f32, "cvtss2si", + SSE_CVT_SS2SI_32>, XS, VEX, VEX_LIG; +defm VCVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse_cvtss2si64, + ssmem, sse_load_f32, "cvtss2si", + SSE_CVT_SS2SI_64>, XS, VEX, VEX_W, VEX_LIG; +} +defm CVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse_cvtss2si, + ssmem, sse_load_f32, "cvtss2si", + SSE_CVT_SS2SI_32>, XS; +defm CVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse_cvtss2si64, + ssmem, sse_load_f32, "cvtss2si", + SSE_CVT_SS2SI_64>, XS, REX_W; + +defm VCVTDQ2PS : sse12_cvt_p<0x5B, VR128, VR128, i128mem, + "vcvtdq2ps\t{$src, $dst|$dst, $src}", + SSEPackedSingle, SSE_CVT_PS>, + PS, VEX, Requires<[HasAVX]>; +defm VCVTDQ2PSY : sse12_cvt_p<0x5B, VR256, VR256, i256mem, + "vcvtdq2ps\t{$src, $dst|$dst, $src}", + SSEPackedSingle, SSE_CVT_PS>, + PS, VEX, VEX_L, Requires<[HasAVX]>; + +defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, VR128, i128mem, + "cvtdq2ps\t{$src, $dst|$dst, $src}", + 
SSEPackedSingle, SSE_CVT_PS>, + PS, Requires<[UseSSE2]>; + +let Predicates = [UseAVX] in { +def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}", + (VCVTSS2SIrr GR32:$dst, VR128:$src), 0>; +def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}", + (VCVTSS2SIrm GR32:$dst, ssmem:$src), 0>; +def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}", + (VCVTSD2SIrr GR32:$dst, VR128:$src), 0>; +def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}", + (VCVTSD2SIrm GR32:$dst, sdmem:$src), 0>; +def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}", + (VCVTSS2SI64rr GR64:$dst, VR128:$src), 0>; +def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}", + (VCVTSS2SI64rm GR64:$dst, ssmem:$src), 0>; +def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}", + (VCVTSD2SI64rr GR64:$dst, VR128:$src), 0>; +def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}", + (VCVTSD2SI64rm GR64:$dst, sdmem:$src), 0>; +} + +def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}", + (CVTSS2SIrr GR32:$dst, VR128:$src), 0>; +def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}", + (CVTSS2SIrm GR32:$dst, ssmem:$src), 0>; +def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}", + (CVTSD2SIrr GR32:$dst, VR128:$src), 0>; +def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}", + (CVTSD2SIrm GR32:$dst, sdmem:$src), 0>; +def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}", + (CVTSS2SI64rr GR64:$dst, VR128:$src), 0>; +def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}", + (CVTSS2SI64rm GR64:$dst, ssmem:$src), 0>; +def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}", + (CVTSD2SI64rr GR64:$dst, VR128:$src), 0>; +def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}", + (CVTSD2SI64rm GR64:$dst, sdmem:$src)>; + +/// SSE 2 Only + +// Convert scalar double to scalar single +let hasSideEffects = 0, Predicates = [UseAVX] in { +def VCVTSD2SSrr : VSDI<0x5A, MRMSrcReg, (outs FR32:$dst), + (ins FR64:$src1, FR64:$src2), + "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", [], + IIC_SSE_CVT_Scalar_RR>, VEX_4V, VEX_LIG, + Sched<[WriteCvtF2F]>; +let mayLoad = 1 in +def VCVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst), + (ins FR64:$src1, f64mem:$src2), + "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [], IIC_SSE_CVT_Scalar_RM>, + XD, Requires<[HasAVX, OptForSize]>, VEX_4V, VEX_LIG, + Sched<[WriteCvtF2FLd, ReadAfterLd]>; +} + +def : Pat<(f32 (fround FR64:$src)), (VCVTSD2SSrr FR64:$src, FR64:$src)>, + Requires<[UseAVX]>; + +def CVTSD2SSrr : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src), + "cvtsd2ss\t{$src, $dst|$dst, $src}", + [(set FR32:$dst, (fround FR64:$src))], + IIC_SSE_CVT_Scalar_RR>, Sched<[WriteCvtF2F]>; +def CVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src), + "cvtsd2ss\t{$src, $dst|$dst, $src}", + [(set FR32:$dst, (fround (loadf64 addr:$src)))], + IIC_SSE_CVT_Scalar_RM>, + XD, + Requires<[UseSSE2, OptForSize]>, Sched<[WriteCvtF2FLd]>; + +let isCodeGenOnly = 1 in { +def Int_VCVTSD2SSrr: I<0x5A, MRMSrcReg, + (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), + "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR128:$dst, + (int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))], + IIC_SSE_CVT_Scalar_RR>, XD, VEX_4V, Requires<[HasAVX]>, + Sched<[WriteCvtF2F]>; +def Int_VCVTSD2SSrm: I<0x5A, MRMSrcReg, + (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2), + "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR128:$dst, (int_x86_sse2_cvtsd2ss + VR128:$src1, sse_load_f64:$src2))], + IIC_SSE_CVT_Scalar_RM>, XD, VEX_4V, Requires<[HasAVX]>, + Sched<[WriteCvtF2FLd, ReadAfterLd]>; 
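(Aside, not part of the patch.) The scalar double-to-single and single-to-double conversions in this block are what ordinary float/double casts compile to; the register-register intrinsic forms also carry the upper XMM elements through from their first operand, which is why the VEX-encoded variants above take an extra source register. A hedged sketch with invented wrapper names:

    #include <emmintrin.h>

    float  narrow_f64(double d) { return (float)d; }   // cvtsd2ss / vcvtsd2ss
    double widen_f32(float f)   { return (double)f; }  // cvtss2sd / vcvtss2sd

    // Low element is converted; the upper elements are copied from 'hi'.
    __m128  sd_to_ss(__m128 hi, __m128d lo) { return _mm_cvtsd_ss(hi, lo); }
    __m128d ss_to_sd(__m128d hi, __m128 lo) { return _mm_cvtss_sd(hi, lo); }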
+ +let Constraints = "$src1 = $dst" in { +def Int_CVTSD2SSrr: I<0x5A, MRMSrcReg, + (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), + "cvtsd2ss\t{$src2, $dst|$dst, $src2}", + [(set VR128:$dst, + (int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))], + IIC_SSE_CVT_Scalar_RR>, XD, Requires<[UseSSE2]>, + Sched<[WriteCvtF2F]>; +def Int_CVTSD2SSrm: I<0x5A, MRMSrcReg, + (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2), + "cvtsd2ss\t{$src2, $dst|$dst, $src2}", + [(set VR128:$dst, (int_x86_sse2_cvtsd2ss + VR128:$src1, sse_load_f64:$src2))], + IIC_SSE_CVT_Scalar_RM>, XD, Requires<[UseSSE2]>, + Sched<[WriteCvtF2FLd, ReadAfterLd]>; +} +} // isCodeGenOnly = 1 + +// Convert scalar single to scalar double +// SSE2 instructions with XS prefix +let hasSideEffects = 0, Predicates = [UseAVX] in { +def VCVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), + (ins FR32:$src1, FR32:$src2), + "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [], IIC_SSE_CVT_Scalar_RR>, + XS, Requires<[HasAVX]>, VEX_4V, VEX_LIG, + Sched<[WriteCvtF2F]>; +let mayLoad = 1 in +def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), + (ins FR32:$src1, f32mem:$src2), + "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [], IIC_SSE_CVT_Scalar_RM>, + XS, VEX_4V, VEX_LIG, Requires<[HasAVX, OptForSize]>, + Sched<[WriteCvtF2FLd, ReadAfterLd]>; +} + +def : Pat<(f64 (fextend FR32:$src)), + (VCVTSS2SDrr FR32:$src, FR32:$src)>, Requires<[UseAVX]>; +def : Pat<(fextend (loadf32 addr:$src)), + (VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>, Requires<[UseAVX]>; + +def : Pat<(extloadf32 addr:$src), + (VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>, + Requires<[UseAVX, OptForSize]>; +def : Pat<(extloadf32 addr:$src), + (VCVTSS2SDrr (f32 (IMPLICIT_DEF)), (VMOVSSrm addr:$src))>, + Requires<[UseAVX, OptForSpeed]>; + +def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src), + "cvtss2sd\t{$src, $dst|$dst, $src}", + [(set FR64:$dst, (fextend FR32:$src))], + IIC_SSE_CVT_Scalar_RR>, XS, + Requires<[UseSSE2]>, Sched<[WriteCvtF2F]>; +def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src), + "cvtss2sd\t{$src, $dst|$dst, $src}", + [(set FR64:$dst, (extloadf32 addr:$src))], + IIC_SSE_CVT_Scalar_RM>, XS, + Requires<[UseSSE2, OptForSize]>, Sched<[WriteCvtF2FLd]>; + +// extload f32 -> f64. This matches load+fextend because we have a hack in +// the isel (PreprocessForFPConvert) that can introduce loads after dag +// combine. +// Since these loads aren't folded into the fextend, we have to match it +// explicitly here. 
+def : Pat<(fextend (loadf32 addr:$src)), + (CVTSS2SDrm addr:$src)>, Requires<[UseSSE2]>; +def : Pat<(extloadf32 addr:$src), + (CVTSS2SDrr (MOVSSrm addr:$src))>, Requires<[UseSSE2, OptForSpeed]>; + +let isCodeGenOnly = 1 in { +def Int_VCVTSS2SDrr: I<0x5A, MRMSrcReg, + (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), + "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR128:$dst, + (int_x86_sse2_cvtss2sd VR128:$src1, VR128:$src2))], + IIC_SSE_CVT_Scalar_RR>, XS, VEX_4V, Requires<[HasAVX]>, + Sched<[WriteCvtF2F]>; +def Int_VCVTSS2SDrm: I<0x5A, MRMSrcMem, + (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2), + "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR128:$dst, + (int_x86_sse2_cvtss2sd VR128:$src1, sse_load_f32:$src2))], + IIC_SSE_CVT_Scalar_RM>, XS, VEX_4V, Requires<[HasAVX]>, + Sched<[WriteCvtF2FLd, ReadAfterLd]>; +let Constraints = "$src1 = $dst" in { // SSE2 instructions with XS prefix +def Int_CVTSS2SDrr: I<0x5A, MRMSrcReg, + (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), + "cvtss2sd\t{$src2, $dst|$dst, $src2}", + [(set VR128:$dst, + (int_x86_sse2_cvtss2sd VR128:$src1, VR128:$src2))], + IIC_SSE_CVT_Scalar_RR>, XS, Requires<[UseSSE2]>, + Sched<[WriteCvtF2F]>; +def Int_CVTSS2SDrm: I<0x5A, MRMSrcMem, + (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2), + "cvtss2sd\t{$src2, $dst|$dst, $src2}", + [(set VR128:$dst, + (int_x86_sse2_cvtss2sd VR128:$src1, sse_load_f32:$src2))], + IIC_SSE_CVT_Scalar_RM>, XS, Requires<[UseSSE2]>, + Sched<[WriteCvtF2FLd, ReadAfterLd]>; +} +} // isCodeGenOnly = 1 + +// Convert packed single/double fp to doubleword +def VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "cvtps2dq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))], + IIC_SSE_CVT_PS_RR>, VEX, Sched<[WriteCvtF2I]>; +def VCVTPS2DQrm : VPDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + "cvtps2dq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (int_x86_sse2_cvtps2dq (loadv4f32 addr:$src)))], + IIC_SSE_CVT_PS_RM>, VEX, Sched<[WriteCvtF2ILd]>; +def VCVTPS2DQYrr : VPDI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), + "cvtps2dq\t{$src, $dst|$dst, $src}", + [(set VR256:$dst, + (int_x86_avx_cvt_ps2dq_256 VR256:$src))], + IIC_SSE_CVT_PS_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>; +def VCVTPS2DQYrm : VPDI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), + "cvtps2dq\t{$src, $dst|$dst, $src}", + [(set VR256:$dst, + (int_x86_avx_cvt_ps2dq_256 (loadv8f32 addr:$src)))], + IIC_SSE_CVT_PS_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>; +def CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "cvtps2dq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))], + IIC_SSE_CVT_PS_RR>, Sched<[WriteCvtF2I]>; +def CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + "cvtps2dq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (int_x86_sse2_cvtps2dq (memopv4f32 addr:$src)))], + IIC_SSE_CVT_PS_RM>, Sched<[WriteCvtF2ILd]>; + + +// Convert Packed Double FP to Packed DW Integers +let Predicates = [HasAVX] in { +// The assembler can recognize rr 256-bit instructions by seeing a ymm +// register, but the same isn't true when using memory operands instead. +// Provide other assembly rr and rm forms to address this explicitly. 
+def VCVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "vcvtpd2dq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_cvtpd2dq VR128:$src))]>, + VEX, Sched<[WriteCvtF2I]>; + +// XMM only +def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}", + (VCVTPD2DQrr VR128:$dst, VR128:$src), 0>; +def VCVTPD2DQXrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + "vcvtpd2dqx\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (int_x86_sse2_cvtpd2dq (loadv2f64 addr:$src)))]>, VEX, + Sched<[WriteCvtF2ILd]>; + +// YMM only +def VCVTPD2DQYrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src), + "vcvtpd2dq{y}\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (int_x86_avx_cvt_pd2dq_256 VR256:$src))]>, VEX, VEX_L, + Sched<[WriteCvtF2I]>; +def VCVTPD2DQYrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src), + "vcvtpd2dq{y}\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (int_x86_avx_cvt_pd2dq_256 (loadv4f64 addr:$src)))]>, + VEX, VEX_L, Sched<[WriteCvtF2ILd]>; +def : InstAlias<"vcvtpd2dq\t{$src, $dst|$dst, $src}", + (VCVTPD2DQYrr VR128:$dst, VR256:$src), 0>; +} + +def CVTPD2DQrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + "cvtpd2dq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (int_x86_sse2_cvtpd2dq (memopv2f64 addr:$src)))], + IIC_SSE_CVT_PD_RM>, Sched<[WriteCvtF2ILd]>; +def CVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "cvtpd2dq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_cvtpd2dq VR128:$src))], + IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtF2I]>; + +// Convert with truncation packed single/double fp to doubleword +// SSE2 packed instructions with XS prefix +def VCVTTPS2DQrr : VS2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "cvttps2dq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (int_x86_sse2_cvttps2dq VR128:$src))], + IIC_SSE_CVT_PS_RR>, VEX, Sched<[WriteCvtF2I]>; +def VCVTTPS2DQrm : VS2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + "cvttps2dq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_cvttps2dq + (loadv4f32 addr:$src)))], + IIC_SSE_CVT_PS_RM>, VEX, Sched<[WriteCvtF2ILd]>; +def VCVTTPS2DQYrr : VS2SI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), + "cvttps2dq\t{$src, $dst|$dst, $src}", + [(set VR256:$dst, + (int_x86_avx_cvtt_ps2dq_256 VR256:$src))], + IIC_SSE_CVT_PS_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>; +def VCVTTPS2DQYrm : VS2SI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), + "cvttps2dq\t{$src, $dst|$dst, $src}", + [(set VR256:$dst, (int_x86_avx_cvtt_ps2dq_256 + (loadv8f32 addr:$src)))], + IIC_SSE_CVT_PS_RM>, VEX, VEX_L, + Sched<[WriteCvtF2ILd]>; + +def CVTTPS2DQrr : S2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "cvttps2dq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_cvttps2dq VR128:$src))], + IIC_SSE_CVT_PS_RR>, Sched<[WriteCvtF2I]>; +def CVTTPS2DQrm : S2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + "cvttps2dq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (int_x86_sse2_cvttps2dq (memopv4f32 addr:$src)))], + IIC_SSE_CVT_PS_RM>, Sched<[WriteCvtF2ILd]>; + +let Predicates = [HasAVX] in { + def : Pat<(int_x86_sse2_cvtdq2ps VR128:$src), + (VCVTDQ2PSrr VR128:$src)>; + def : Pat<(int_x86_sse2_cvtdq2ps (bc_v4i32 (loadv2i64 addr:$src))), + (VCVTDQ2PSrm addr:$src)>; +} + +let Predicates = [HasAVX, NoVLX] in { + def : Pat<(v4f32 (sint_to_fp (v4i32 VR128:$src))), + (VCVTDQ2PSrr VR128:$src)>; + def : Pat<(v4f32 (sint_to_fp (bc_v4i32 (loadv2i64 addr:$src)))), + (VCVTDQ2PSrm addr:$src)>; 
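(Aside, not part of the patch.) The patterns in this block map the generic sint_to_fp and fp_to_sint nodes on packed i32/f32 vectors onto CVTDQ2PS and the truncating CVTTPS2DQ, with 256-bit AVX forms alongside. An illustrative intrinsics-level sketch (wrapper names are invented):

    #include <immintrin.h>

    __m128  i32x4_to_f32x4(__m128i v) { return _mm_cvtepi32_ps(v); }   // cvtdq2ps
    __m128i f32x4_to_i32x4(__m128 v)  { return _mm_cvttps_epi32(v); }  // cvttps2dq (truncates)

    // 256-bit AVX counterparts (vcvtdq2ps / vcvttps2dq on YMM registers).
    __m256  i32x8_to_f32x8(__m256i v) { return _mm256_cvtepi32_ps(v); }
    __m256i f32x8_to_i32x8(__m256 v)  { return _mm256_cvttps_epi32(v); }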
+ + def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))), + (VCVTTPS2DQrr VR128:$src)>; + def : Pat<(v4i32 (fp_to_sint (loadv4f32 addr:$src))), + (VCVTTPS2DQrm addr:$src)>; + + def : Pat<(v8f32 (sint_to_fp (v8i32 VR256:$src))), + (VCVTDQ2PSYrr VR256:$src)>; + def : Pat<(v8f32 (sint_to_fp (bc_v8i32 (loadv4i64 addr:$src)))), + (VCVTDQ2PSYrm addr:$src)>; + + def : Pat<(v8i32 (fp_to_sint (v8f32 VR256:$src))), + (VCVTTPS2DQYrr VR256:$src)>; + def : Pat<(v8i32 (fp_to_sint (loadv8f32 addr:$src))), + (VCVTTPS2DQYrm addr:$src)>; +} + +let Predicates = [UseSSE2] in { + def : Pat<(v4f32 (sint_to_fp (v4i32 VR128:$src))), + (CVTDQ2PSrr VR128:$src)>; + def : Pat<(v4f32 (sint_to_fp (bc_v4i32 (memopv2i64 addr:$src)))), + (CVTDQ2PSrm addr:$src)>; + + def : Pat<(int_x86_sse2_cvtdq2ps VR128:$src), + (CVTDQ2PSrr VR128:$src)>; + def : Pat<(int_x86_sse2_cvtdq2ps (bc_v4i32 (memopv2i64 addr:$src))), + (CVTDQ2PSrm addr:$src)>; + + def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))), + (CVTTPS2DQrr VR128:$src)>; + def : Pat<(v4i32 (fp_to_sint (memopv4f32 addr:$src))), + (CVTTPS2DQrm addr:$src)>; +} + +def VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "cvttpd2dq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (int_x86_sse2_cvttpd2dq VR128:$src))], + IIC_SSE_CVT_PD_RR>, VEX, Sched<[WriteCvtF2I]>; + +// The assembler can recognize rr 256-bit instructions by seeing a ymm +// register, but the same isn't true when using memory operands instead. +// Provide other assembly rr and rm forms to address this explicitly. + +// XMM only +def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}", + (VCVTTPD2DQrr VR128:$dst, VR128:$src), 0>; +def VCVTTPD2DQXrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + "cvttpd2dqx\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_cvttpd2dq + (loadv2f64 addr:$src)))], + IIC_SSE_CVT_PD_RM>, VEX, Sched<[WriteCvtF2ILd]>; + +// YMM only +def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src), + "cvttpd2dq{y}\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (int_x86_avx_cvtt_pd2dq_256 VR256:$src))], + IIC_SSE_CVT_PD_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>; +def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src), + "cvttpd2dq{y}\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (int_x86_avx_cvtt_pd2dq_256 (loadv4f64 addr:$src)))], + IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>; +def : InstAlias<"vcvttpd2dq\t{$src, $dst|$dst, $src}", + (VCVTTPD2DQYrr VR128:$dst, VR256:$src), 0>; + +let Predicates = [HasAVX, NoVLX] in { + def : Pat<(v4i32 (fp_to_sint (v4f64 VR256:$src))), + (VCVTTPD2DQYrr VR256:$src)>; + def : Pat<(v4i32 (fp_to_sint (loadv4f64 addr:$src))), + (VCVTTPD2DQYrm addr:$src)>; +} // Predicates = [HasAVX] + +def CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "cvttpd2dq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_cvttpd2dq VR128:$src))], + IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtF2I]>; +def CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (outs VR128:$dst),(ins f128mem:$src), + "cvttpd2dq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_cvttpd2dq + (memopv2f64 addr:$src)))], + IIC_SSE_CVT_PD_RM>, + Sched<[WriteCvtF2ILd]>; + +// Convert packed single to packed double +let Predicates = [HasAVX] in { + // SSE2 instructions without OpSize prefix +def VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "vcvtps2pd\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))], + IIC_SSE_CVT_PD_RR>, PS, VEX, 
Sched<[WriteCvtF2F]>; +def VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src), + "vcvtps2pd\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))], + IIC_SSE_CVT_PD_RM>, PS, VEX, Sched<[WriteCvtF2FLd]>; +def VCVTPS2PDYrr : I<0x5A, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src), + "vcvtps2pd\t{$src, $dst|$dst, $src}", + [(set VR256:$dst, + (int_x86_avx_cvt_ps2_pd_256 VR128:$src))], + IIC_SSE_CVT_PD_RR>, PS, VEX, VEX_L, Sched<[WriteCvtF2F]>; +def VCVTPS2PDYrm : I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src), + "vcvtps2pd\t{$src, $dst|$dst, $src}", + [(set VR256:$dst, + (int_x86_avx_cvt_ps2_pd_256 (loadv4f32 addr:$src)))], + IIC_SSE_CVT_PD_RM>, PS, VEX, VEX_L, Sched<[WriteCvtF2FLd]>; +} + +let Predicates = [UseSSE2] in { +def CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "cvtps2pd\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))], + IIC_SSE_CVT_PD_RR>, PS, Sched<[WriteCvtF2F]>; +def CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src), + "cvtps2pd\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))], + IIC_SSE_CVT_PD_RM>, PS, Sched<[WriteCvtF2FLd]>; +} + +// Convert Packed DW Integers to Packed Double FP +let Predicates = [HasAVX] in { +let hasSideEffects = 0, mayLoad = 1 in +def VCVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), + "vcvtdq2pd\t{$src, $dst|$dst, $src}", + []>, VEX, Sched<[WriteCvtI2FLd]>; +def VCVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "vcvtdq2pd\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (int_x86_sse2_cvtdq2pd VR128:$src))]>, VEX, + Sched<[WriteCvtI2F]>; +def VCVTDQ2PDYrm : S2SI<0xE6, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src), + "vcvtdq2pd\t{$src, $dst|$dst, $src}", + [(set VR256:$dst, + (int_x86_avx_cvtdq2_pd_256 + (bitconvert (loadv2i64 addr:$src))))]>, VEX, VEX_L, + Sched<[WriteCvtI2FLd]>; +def VCVTDQ2PDYrr : S2SI<0xE6, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src), + "vcvtdq2pd\t{$src, $dst|$dst, $src}", + [(set VR256:$dst, + (int_x86_avx_cvtdq2_pd_256 VR128:$src))]>, VEX, VEX_L, + Sched<[WriteCvtI2F]>; +} + +let hasSideEffects = 0, mayLoad = 1 in +def CVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), + "cvtdq2pd\t{$src, $dst|$dst, $src}", [], + IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtI2FLd]>; +def CVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "cvtdq2pd\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_cvtdq2pd VR128:$src))], + IIC_SSE_CVT_PD_RM>, Sched<[WriteCvtI2F]>; + +// AVX register conversion intrinsics +let Predicates = [HasAVX] in { + def : Pat<(v2f64 (X86cvtdq2pd (v4i32 VR128:$src))), + (VCVTDQ2PDrr VR128:$src)>; + def : Pat<(v2f64 (X86cvtdq2pd (bc_v4i32 (loadv2i64 addr:$src)))), + (VCVTDQ2PDrm addr:$src)>; + + def : Pat<(v4f64 (sint_to_fp (v4i32 VR128:$src))), + (VCVTDQ2PDYrr VR128:$src)>; + def : Pat<(v4f64 (sint_to_fp (bc_v4i32 (loadv2i64 addr:$src)))), + (VCVTDQ2PDYrm addr:$src)>; +} // Predicates = [HasAVX] + +// SSE2 register conversion intrinsics +let Predicates = [HasSSE2] in { + def : Pat<(v2f64 (X86cvtdq2pd (v4i32 VR128:$src))), + (CVTDQ2PDrr VR128:$src)>; + def : Pat<(v2f64 (X86cvtdq2pd (bc_v4i32 (loadv2i64 addr:$src)))), + (CVTDQ2PDrm addr:$src)>; +} // Predicates = [HasSSE2] + +// Convert packed double to packed single +// The assembler can recognize rr 256-bit instructions by seeing a ymm +// register, but the same isn't true when using memory operands instead. 
+// Provide other assembly rr and rm forms to address this explicitly. +def VCVTPD2PSrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "cvtpd2ps\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_cvtpd2ps VR128:$src))], + IIC_SSE_CVT_PD_RR>, VEX, Sched<[WriteCvtF2F]>; + +// XMM only +def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}", + (VCVTPD2PSrr VR128:$dst, VR128:$src), 0>; +def VCVTPD2PSXrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + "cvtpd2psx\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (int_x86_sse2_cvtpd2ps (loadv2f64 addr:$src)))], + IIC_SSE_CVT_PD_RM>, VEX, Sched<[WriteCvtF2FLd]>; + +// YMM only +def VCVTPD2PSYrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src), + "cvtpd2ps{y}\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (int_x86_avx_cvt_pd2_ps_256 VR256:$src))], + IIC_SSE_CVT_PD_RR>, VEX, VEX_L, Sched<[WriteCvtF2F]>; +def VCVTPD2PSYrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src), + "cvtpd2ps{y}\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (int_x86_avx_cvt_pd2_ps_256 (loadv4f64 addr:$src)))], + IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2FLd]>; +def : InstAlias<"vcvtpd2ps\t{$src, $dst|$dst, $src}", + (VCVTPD2PSYrr VR128:$dst, VR256:$src), 0>; + +def CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "cvtpd2ps\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_cvtpd2ps VR128:$src))], + IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtF2F]>; +def CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + "cvtpd2ps\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (int_x86_sse2_cvtpd2ps (memopv2f64 addr:$src)))], + IIC_SSE_CVT_PD_RM>, Sched<[WriteCvtF2FLd]>; + + +// AVX 256-bit register conversion intrinsics +// FIXME: Migrate SSE conversion intrinsics matching to use patterns as below +// whenever possible to avoid declaring two versions of each one. 
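+// The intrinsic patterns below map int_x86_avx_cvtdq2_ps_256 directly onto
+// VCVTDQ2PSY; the generic X86vfpround/X86vfpext (fround/fextend) patterns
+// that follow cover the non-intrinsic conversions.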
+let Predicates = [HasAVX] in { + def : Pat<(int_x86_avx_cvtdq2_ps_256 VR256:$src), + (VCVTDQ2PSYrr VR256:$src)>; + def : Pat<(int_x86_avx_cvtdq2_ps_256 (bitconvert (loadv4i64 addr:$src))), + (VCVTDQ2PSYrm addr:$src)>; +} + +let Predicates = [HasAVX, NoVLX] in { + // Match fround and fextend for 128/256-bit conversions + def : Pat<(v4f32 (X86vfpround (v2f64 VR128:$src))), + (VCVTPD2PSrr VR128:$src)>; + def : Pat<(v4f32 (X86vfpround (loadv2f64 addr:$src))), + (VCVTPD2PSXrm addr:$src)>; + def : Pat<(v4f32 (fround (v4f64 VR256:$src))), + (VCVTPD2PSYrr VR256:$src)>; + def : Pat<(v4f32 (fround (loadv4f64 addr:$src))), + (VCVTPD2PSYrm addr:$src)>; + + def : Pat<(v2f64 (X86vfpext (v4f32 VR128:$src))), + (VCVTPS2PDrr VR128:$src)>; + def : Pat<(v4f64 (fextend (v4f32 VR128:$src))), + (VCVTPS2PDYrr VR128:$src)>; + def : Pat<(v4f64 (extloadv4f32 addr:$src)), + (VCVTPS2PDYrm addr:$src)>; +} + +let Predicates = [UseSSE2] in { + // Match fround and fextend for 128 conversions + def : Pat<(v4f32 (X86vfpround (v2f64 VR128:$src))), + (CVTPD2PSrr VR128:$src)>; + def : Pat<(v4f32 (X86vfpround (memopv2f64 addr:$src))), + (CVTPD2PSrm addr:$src)>; + + def : Pat<(v2f64 (X86vfpext (v4f32 VR128:$src))), + (CVTPS2PDrr VR128:$src)>; +} + +//===----------------------------------------------------------------------===// +// SSE 1 & 2 - Compare Instructions +//===----------------------------------------------------------------------===// + +// sse12_cmp_scalar - sse 1 & 2 compare scalar instructions +multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop, + Operand CC, SDNode OpNode, ValueType VT, + PatFrag ld_frag, string asm, string asm_alt, + OpndItins itins, ImmLeaf immLeaf> { + def rr : SIi8<0xC2, MRMSrcReg, + (outs RC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm, + [(set RC:$dst, (OpNode (VT RC:$src1), RC:$src2, immLeaf:$cc))], + itins.rr>, Sched<[itins.Sched]>; + def rm : SIi8<0xC2, MRMSrcMem, + (outs RC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm, + [(set RC:$dst, (OpNode (VT RC:$src1), + (ld_frag addr:$src2), immLeaf:$cc))], + itins.rm>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; + + // Accept explicit immediate argument form instead of comparison code. 
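+  // These *_alt forms are assembler-only and carry no patterns; they let the
+  // user write an immediate predicate operand instead of a predicate
+  // mnemonic suffix.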
+ let isAsmParserOnly = 1, hasSideEffects = 0 in { + def rr_alt : SIi8<0xC2, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2, u8imm:$cc), asm_alt, [], + IIC_SSE_ALU_F32S_RR>, Sched<[itins.Sched]>; + let mayLoad = 1 in + def rm_alt : SIi8<0xC2, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm_alt, [], + IIC_SSE_ALU_F32S_RM>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; + } +} + +defm VCMPSS : sse12_cmp_scalar<FR32, f32mem, AVXCC, X86cmps, f32, loadf32, + "cmp${cc}ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", + "cmpss\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", + SSE_ALU_F32S, i8immZExt5>, XS, VEX_4V, VEX_LIG; +defm VCMPSD : sse12_cmp_scalar<FR64, f64mem, AVXCC, X86cmps, f64, loadf64, + "cmp${cc}sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", + "cmpsd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", + SSE_ALU_F32S, i8immZExt5>, // same latency as 32 bit compare + XD, VEX_4V, VEX_LIG; + +let Constraints = "$src1 = $dst" in { + defm CMPSS : sse12_cmp_scalar<FR32, f32mem, SSECC, X86cmps, f32, loadf32, + "cmp${cc}ss\t{$src2, $dst|$dst, $src2}", + "cmpss\t{$cc, $src2, $dst|$dst, $src2, $cc}", SSE_ALU_F32S, + i8immZExt3>, XS; + defm CMPSD : sse12_cmp_scalar<FR64, f64mem, SSECC, X86cmps, f64, loadf64, + "cmp${cc}sd\t{$src2, $dst|$dst, $src2}", + "cmpsd\t{$cc, $src2, $dst|$dst, $src2, $cc}", + SSE_ALU_F64S, i8immZExt3>, XD; +} + +multiclass sse12_cmp_scalar_int<X86MemOperand x86memop, Operand CC, + Intrinsic Int, string asm, OpndItins itins, + ImmLeaf immLeaf> { + def rr : SIi8<0xC2, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src, CC:$cc), asm, + [(set VR128:$dst, (Int VR128:$src1, + VR128:$src, immLeaf:$cc))], + itins.rr>, + Sched<[itins.Sched]>; + def rm : SIi8<0xC2, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, x86memop:$src, CC:$cc), asm, + [(set VR128:$dst, (Int VR128:$src1, + (load addr:$src), immLeaf:$cc))], + itins.rm>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; +} + +let isCodeGenOnly = 1 in { + // Aliases to match intrinsics which expect XMM operand(s). 
+ defm Int_VCMPSS : sse12_cmp_scalar_int<f32mem, AVXCC, int_x86_sse_cmp_ss, + "cmp${cc}ss\t{$src, $src1, $dst|$dst, $src1, $src}", + SSE_ALU_F32S, i8immZExt5>, + XS, VEX_4V; + defm Int_VCMPSD : sse12_cmp_scalar_int<f64mem, AVXCC, int_x86_sse2_cmp_sd, + "cmp${cc}sd\t{$src, $src1, $dst|$dst, $src1, $src}", + SSE_ALU_F32S, i8immZExt5>, // same latency as f32 + XD, VEX_4V; + let Constraints = "$src1 = $dst" in { + defm Int_CMPSS : sse12_cmp_scalar_int<f32mem, SSECC, int_x86_sse_cmp_ss, + "cmp${cc}ss\t{$src, $dst|$dst, $src}", + SSE_ALU_F32S, i8immZExt3>, XS; + defm Int_CMPSD : sse12_cmp_scalar_int<f64mem, SSECC, int_x86_sse2_cmp_sd, + "cmp${cc}sd\t{$src, $dst|$dst, $src}", + SSE_ALU_F64S, i8immZExt3>, + XD; +} +} + + +// sse12_ord_cmp - Unordered/Ordered scalar fp compare and set EFLAGS +multiclass sse12_ord_cmp<bits<8> opc, RegisterClass RC, SDNode OpNode, + ValueType vt, X86MemOperand x86memop, + PatFrag ld_frag, string OpcodeStr> { + def rr: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"), + [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))], + IIC_SSE_COMIS_RR>, + Sched<[WriteFAdd]>; + def rm: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"), + [(set EFLAGS, (OpNode (vt RC:$src1), + (ld_frag addr:$src2)))], + IIC_SSE_COMIS_RM>, + Sched<[WriteFAddLd, ReadAfterLd]>; +} + +let Defs = [EFLAGS] in { + defm VUCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32, + "ucomiss">, PS, VEX, VEX_LIG; + defm VUCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64, + "ucomisd">, PD, VEX, VEX_LIG; + let Pattern = []<dag> in { + defm VCOMISS : sse12_ord_cmp<0x2F, FR32, undef, f32, f32mem, loadf32, + "comiss">, PS, VEX, VEX_LIG; + defm VCOMISD : sse12_ord_cmp<0x2F, FR64, undef, f64, f64mem, loadf64, + "comisd">, PD, VEX, VEX_LIG; + } + + let isCodeGenOnly = 1 in { + defm Int_VUCOMISS : sse12_ord_cmp<0x2E, VR128, X86ucomi, v4f32, f128mem, + load, "ucomiss">, PS, VEX; + defm Int_VUCOMISD : sse12_ord_cmp<0x2E, VR128, X86ucomi, v2f64, f128mem, + load, "ucomisd">, PD, VEX; + + defm Int_VCOMISS : sse12_ord_cmp<0x2F, VR128, X86comi, v4f32, f128mem, + load, "comiss">, PS, VEX; + defm Int_VCOMISD : sse12_ord_cmp<0x2F, VR128, X86comi, v2f64, f128mem, + load, "comisd">, PD, VEX; + } + defm UCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32, + "ucomiss">, PS; + defm UCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64, + "ucomisd">, PD; + + let Pattern = []<dag> in { + defm COMISS : sse12_ord_cmp<0x2F, FR32, undef, f32, f32mem, loadf32, + "comiss">, PS; + defm COMISD : sse12_ord_cmp<0x2F, FR64, undef, f64, f64mem, loadf64, + "comisd">, PD; + } + + let isCodeGenOnly = 1 in { + defm Int_UCOMISS : sse12_ord_cmp<0x2E, VR128, X86ucomi, v4f32, f128mem, + load, "ucomiss">, PS; + defm Int_UCOMISD : sse12_ord_cmp<0x2E, VR128, X86ucomi, v2f64, f128mem, + load, "ucomisd">, PD; + + defm Int_COMISS : sse12_ord_cmp<0x2F, VR128, X86comi, v4f32, f128mem, load, + "comiss">, PS; + defm Int_COMISD : sse12_ord_cmp<0x2F, VR128, X86comi, v2f64, f128mem, load, + "comisd">, PD; + } +} // Defs = [EFLAGS] + +// sse12_cmp_packed - sse 1 & 2 compare packed instructions +multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop, + Operand CC, Intrinsic Int, string asm, + string asm_alt, Domain d, ImmLeaf immLeaf, + PatFrag ld_frag, OpndItins itins = SSE_ALU_F32P> { + let isCommutable = 1 in + def rri : PIi8<0xC2, MRMSrcReg, + (outs RC:$dst), (ins RC:$src1, RC:$src2, 
CC:$cc), asm, + [(set RC:$dst, (Int RC:$src1, RC:$src2, immLeaf:$cc))], + itins.rr, d>, + Sched<[WriteFAdd]>; + def rmi : PIi8<0xC2, MRMSrcMem, + (outs RC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm, + [(set RC:$dst, (Int RC:$src1, (ld_frag addr:$src2), immLeaf:$cc))], + itins.rm, d>, + Sched<[WriteFAddLd, ReadAfterLd]>; + + // Accept explicit immediate argument form instead of comparison code. + let isAsmParserOnly = 1, hasSideEffects = 0 in { + def rri_alt : PIi8<0xC2, MRMSrcReg, + (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc), + asm_alt, [], itins.rr, d>, Sched<[WriteFAdd]>; + let mayLoad = 1 in + def rmi_alt : PIi8<0xC2, MRMSrcMem, + (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc), + asm_alt, [], itins.rm, d>, + Sched<[WriteFAddLd, ReadAfterLd]>; + } +} + +defm VCMPPS : sse12_cmp_packed<VR128, f128mem, AVXCC, int_x86_sse_cmp_ps, + "cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}", + "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", + SSEPackedSingle, i8immZExt5, loadv4f32>, PS, VEX_4V; +defm VCMPPD : sse12_cmp_packed<VR128, f128mem, AVXCC, int_x86_sse2_cmp_pd, + "cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}", + "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", + SSEPackedDouble, i8immZExt5, loadv2f64>, PD, VEX_4V; +defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, AVXCC, int_x86_avx_cmp_ps_256, + "cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}", + "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", + SSEPackedSingle, i8immZExt5, loadv8f32>, PS, VEX_4V, VEX_L; +defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, AVXCC, int_x86_avx_cmp_pd_256, + "cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}", + "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", + SSEPackedDouble, i8immZExt5, loadv4f64>, PD, VEX_4V, VEX_L; +let Constraints = "$src1 = $dst" in { + defm CMPPS : sse12_cmp_packed<VR128, f128mem, SSECC, int_x86_sse_cmp_ps, + "cmp${cc}ps\t{$src2, $dst|$dst, $src2}", + "cmpps\t{$cc, $src2, $dst|$dst, $src2, $cc}", + SSEPackedSingle, i8immZExt5, memopv4f32, SSE_ALU_F32P>, PS; + defm CMPPD : sse12_cmp_packed<VR128, f128mem, SSECC, int_x86_sse2_cmp_pd, + "cmp${cc}pd\t{$src2, $dst|$dst, $src2}", + "cmppd\t{$cc, $src2, $dst|$dst, $src2, $cc}", + SSEPackedDouble, i8immZExt5, memopv2f64, SSE_ALU_F64P>, PD; +} + +let Predicates = [HasAVX] in { +def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), VR128:$src2, imm:$cc)), + (VCMPPSrri (v4f32 VR128:$src1), (v4f32 VR128:$src2), imm:$cc)>; +def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), (loadv4f32 addr:$src2), imm:$cc)), + (VCMPPSrmi (v4f32 VR128:$src1), addr:$src2, imm:$cc)>; +def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), VR128:$src2, imm:$cc)), + (VCMPPDrri VR128:$src1, VR128:$src2, imm:$cc)>; +def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), (loadv2f64 addr:$src2), imm:$cc)), + (VCMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>; + +def : Pat<(v8i32 (X86cmpp (v8f32 VR256:$src1), VR256:$src2, imm:$cc)), + (VCMPPSYrri (v8f32 VR256:$src1), (v8f32 VR256:$src2), imm:$cc)>; +def : Pat<(v8i32 (X86cmpp (v8f32 VR256:$src1), (loadv8f32 addr:$src2), imm:$cc)), + (VCMPPSYrmi (v8f32 VR256:$src1), addr:$src2, imm:$cc)>; +def : Pat<(v4i64 (X86cmpp (v4f64 VR256:$src1), VR256:$src2, imm:$cc)), + (VCMPPDYrri VR256:$src1, VR256:$src2, imm:$cc)>; +def : Pat<(v4i64 (X86cmpp (v4f64 VR256:$src1), (loadv4f64 addr:$src2), imm:$cc)), + (VCMPPDYrmi VR256:$src1, addr:$src2, imm:$cc)>; +} + +let Predicates = [UseSSE1] in { +def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), VR128:$src2, imm:$cc)), + (CMPPSrri (v4f32 
VR128:$src1), (v4f32 VR128:$src2), imm:$cc)>; +def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), (memopv4f32 addr:$src2), imm:$cc)), + (CMPPSrmi (v4f32 VR128:$src1), addr:$src2, imm:$cc)>; +} + +let Predicates = [UseSSE2] in { +def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), VR128:$src2, imm:$cc)), + (CMPPDrri VR128:$src1, VR128:$src2, imm:$cc)>; +def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), (memopv2f64 addr:$src2), imm:$cc)), + (CMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>; +} + +//===----------------------------------------------------------------------===// +// SSE 1 & 2 - Shuffle Instructions +//===----------------------------------------------------------------------===// + +/// sse12_shuffle - sse 1 & 2 fp shuffle instructions +multiclass sse12_shuffle<RegisterClass RC, X86MemOperand x86memop, + ValueType vt, string asm, PatFrag mem_frag, + Domain d> { + def rmi : PIi8<0xC6, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, x86memop:$src2, u8imm:$src3), asm, + [(set RC:$dst, (vt (X86Shufp RC:$src1, (mem_frag addr:$src2), + (i8 imm:$src3))))], IIC_SSE_SHUFP, d>, + Sched<[WriteFShuffleLd, ReadAfterLd]>; + def rri : PIi8<0xC6, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2, u8imm:$src3), asm, + [(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2, + (i8 imm:$src3))))], IIC_SSE_SHUFP, d>, + Sched<[WriteFShuffle]>; +} + +let Predicates = [HasAVX, NoVLX] in { + defm VSHUFPS : sse12_shuffle<VR128, f128mem, v4f32, + "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + loadv4f32, SSEPackedSingle>, PS, VEX_4V; + defm VSHUFPSY : sse12_shuffle<VR256, f256mem, v8f32, + "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + loadv8f32, SSEPackedSingle>, PS, VEX_4V, VEX_L; + defm VSHUFPD : sse12_shuffle<VR128, f128mem, v2f64, + "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + loadv2f64, SSEPackedDouble>, PD, VEX_4V; + defm VSHUFPDY : sse12_shuffle<VR256, f256mem, v4f64, + "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + loadv4f64, SSEPackedDouble>, PD, VEX_4V, VEX_L; +} +let Constraints = "$src1 = $dst" in { + defm SHUFPS : sse12_shuffle<VR128, f128mem, v4f32, + "shufps\t{$src3, $src2, $dst|$dst, $src2, $src3}", + memopv4f32, SSEPackedSingle>, PS; + defm SHUFPD : sse12_shuffle<VR128, f128mem, v2f64, + "shufpd\t{$src3, $src2, $dst|$dst, $src2, $src3}", + memopv2f64, SSEPackedDouble>, PD; +} + +let Predicates = [HasAVX, NoVLX] in { + def : Pat<(v4i32 (X86Shufp VR128:$src1, + (bc_v4i32 (loadv2i64 addr:$src2)), (i8 imm:$imm))), + (VSHUFPSrmi VR128:$src1, addr:$src2, imm:$imm)>; + def : Pat<(v4i32 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))), + (VSHUFPSrri VR128:$src1, VR128:$src2, imm:$imm)>; + + def : Pat<(v2i64 (X86Shufp VR128:$src1, + (loadv2i64 addr:$src2), (i8 imm:$imm))), + (VSHUFPDrmi VR128:$src1, addr:$src2, imm:$imm)>; + def : Pat<(v2i64 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))), + (VSHUFPDrri VR128:$src1, VR128:$src2, imm:$imm)>; + + // 256-bit patterns + def : Pat<(v8i32 (X86Shufp VR256:$src1, VR256:$src2, (i8 imm:$imm))), + (VSHUFPSYrri VR256:$src1, VR256:$src2, imm:$imm)>; + def : Pat<(v8i32 (X86Shufp VR256:$src1, + (bc_v8i32 (loadv4i64 addr:$src2)), (i8 imm:$imm))), + (VSHUFPSYrmi VR256:$src1, addr:$src2, imm:$imm)>; + + def : Pat<(v4i64 (X86Shufp VR256:$src1, VR256:$src2, (i8 imm:$imm))), + (VSHUFPDYrri VR256:$src1, VR256:$src2, imm:$imm)>; + def : Pat<(v4i64 (X86Shufp VR256:$src1, + (loadv4i64 addr:$src2), (i8 imm:$imm))), + (VSHUFPDYrmi VR256:$src1, addr:$src2, imm:$imm)>; +} + +let Predicates = [UseSSE1] in { + def 
: Pat<(v4i32 (X86Shufp VR128:$src1, + (bc_v4i32 (memopv2i64 addr:$src2)), (i8 imm:$imm))), + (SHUFPSrmi VR128:$src1, addr:$src2, imm:$imm)>; + def : Pat<(v4i32 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))), + (SHUFPSrri VR128:$src1, VR128:$src2, imm:$imm)>; +} + +let Predicates = [UseSSE2] in { + // Generic SHUFPD patterns + def : Pat<(v2i64 (X86Shufp VR128:$src1, + (memopv2i64 addr:$src2), (i8 imm:$imm))), + (SHUFPDrmi VR128:$src1, addr:$src2, imm:$imm)>; + def : Pat<(v2i64 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))), + (SHUFPDrri VR128:$src1, VR128:$src2, imm:$imm)>; +} + +//===----------------------------------------------------------------------===// +// SSE 1 & 2 - Unpack FP Instructions +//===----------------------------------------------------------------------===// + +/// sse12_unpack_interleave - sse 1 & 2 fp unpack and interleave +multiclass sse12_unpack_interleave<bits<8> opc, SDNode OpNode, ValueType vt, + PatFrag mem_frag, RegisterClass RC, + X86MemOperand x86memop, string asm, + Domain d> { + def rr : PI<opc, MRMSrcReg, + (outs RC:$dst), (ins RC:$src1, RC:$src2), + asm, [(set RC:$dst, + (vt (OpNode RC:$src1, RC:$src2)))], + IIC_SSE_UNPCK, d>, Sched<[WriteFShuffle]>; + def rm : PI<opc, MRMSrcMem, + (outs RC:$dst), (ins RC:$src1, x86memop:$src2), + asm, [(set RC:$dst, + (vt (OpNode RC:$src1, + (mem_frag addr:$src2))))], + IIC_SSE_UNPCK, d>, + Sched<[WriteFShuffleLd, ReadAfterLd]>; +} + +let Predicates = [HasAVX, NoVLX] in { +defm VUNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, loadv4f32, + VR128, f128mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}", + SSEPackedSingle>, PS, VEX_4V; +defm VUNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, loadv2f64, + VR128, f128mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", + SSEPackedDouble>, PD, VEX_4V; +defm VUNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, loadv4f32, + VR128, f128mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}", + SSEPackedSingle>, PS, VEX_4V; +defm VUNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, loadv2f64, + VR128, f128mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", + SSEPackedDouble>, PD, VEX_4V; + +defm VUNPCKHPSY: sse12_unpack_interleave<0x15, X86Unpckh, v8f32, loadv8f32, + VR256, f256mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}", + SSEPackedSingle>, PS, VEX_4V, VEX_L; +defm VUNPCKHPDY: sse12_unpack_interleave<0x15, X86Unpckh, v4f64, loadv4f64, + VR256, f256mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", + SSEPackedDouble>, PD, VEX_4V, VEX_L; +defm VUNPCKLPSY: sse12_unpack_interleave<0x14, X86Unpckl, v8f32, loadv8f32, + VR256, f256mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}", + SSEPackedSingle>, PS, VEX_4V, VEX_L; +defm VUNPCKLPDY: sse12_unpack_interleave<0x14, X86Unpckl, v4f64, loadv4f64, + VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", + SSEPackedDouble>, PD, VEX_4V, VEX_L; +}// Predicates = [HasAVX, NoVLX] +let Constraints = "$src1 = $dst" in { + defm UNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, memopv4f32, + VR128, f128mem, "unpckhps\t{$src2, $dst|$dst, $src2}", + SSEPackedSingle>, PS; + defm UNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, memopv2f64, + VR128, f128mem, "unpckhpd\t{$src2, $dst|$dst, $src2}", + SSEPackedDouble>, PD; + defm UNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, memopv4f32, + VR128, f128mem, "unpcklps\t{$src2, $dst|$dst, $src2}", + SSEPackedSingle>, PS; + defm UNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, 
memopv2f64, + VR128, f128mem, "unpcklpd\t{$src2, $dst|$dst, $src2}", + SSEPackedDouble>, PD; +} // Constraints = "$src1 = $dst" + +let Predicates = [HasAVX1Only] in { + def : Pat<(v8i32 (X86Unpckl VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)))), + (VUNPCKLPSYrm VR256:$src1, addr:$src2)>; + def : Pat<(v8i32 (X86Unpckl VR256:$src1, VR256:$src2)), + (VUNPCKLPSYrr VR256:$src1, VR256:$src2)>; + def : Pat<(v8i32 (X86Unpckh VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)))), + (VUNPCKHPSYrm VR256:$src1, addr:$src2)>; + def : Pat<(v8i32 (X86Unpckh VR256:$src1, VR256:$src2)), + (VUNPCKHPSYrr VR256:$src1, VR256:$src2)>; + + def : Pat<(v4i64 (X86Unpckl VR256:$src1, (loadv4i64 addr:$src2))), + (VUNPCKLPDYrm VR256:$src1, addr:$src2)>; + def : Pat<(v4i64 (X86Unpckl VR256:$src1, VR256:$src2)), + (VUNPCKLPDYrr VR256:$src1, VR256:$src2)>; + def : Pat<(v4i64 (X86Unpckh VR256:$src1, (loadv4i64 addr:$src2))), + (VUNPCKHPDYrm VR256:$src1, addr:$src2)>; + def : Pat<(v4i64 (X86Unpckh VR256:$src1, VR256:$src2)), + (VUNPCKHPDYrr VR256:$src1, VR256:$src2)>; +} + +//===----------------------------------------------------------------------===// +// SSE 1 & 2 - Extract Floating-Point Sign mask +//===----------------------------------------------------------------------===// + +/// sse12_extr_sign_mask - sse 1 & 2 unpack and interleave +multiclass sse12_extr_sign_mask<RegisterClass RC, Intrinsic Int, string asm, + Domain d> { + def rr : PI<0x50, MRMSrcReg, (outs GR32orGR64:$dst), (ins RC:$src), + !strconcat(asm, "\t{$src, $dst|$dst, $src}"), + [(set GR32orGR64:$dst, (Int RC:$src))], IIC_SSE_MOVMSK, d>, + Sched<[WriteVecLogic]>; +} + +let Predicates = [HasAVX] in { + defm VMOVMSKPS : sse12_extr_sign_mask<VR128, int_x86_sse_movmsk_ps, + "movmskps", SSEPackedSingle>, PS, VEX; + defm VMOVMSKPD : sse12_extr_sign_mask<VR128, int_x86_sse2_movmsk_pd, + "movmskpd", SSEPackedDouble>, PD, VEX; + defm VMOVMSKPSY : sse12_extr_sign_mask<VR256, int_x86_avx_movmsk_ps_256, + "movmskps", SSEPackedSingle>, PS, + VEX, VEX_L; + defm VMOVMSKPDY : sse12_extr_sign_mask<VR256, int_x86_avx_movmsk_pd_256, + "movmskpd", SSEPackedDouble>, PD, + VEX, VEX_L; + + def : Pat<(i32 (X86fgetsign FR32:$src)), + (VMOVMSKPSrr (COPY_TO_REGCLASS FR32:$src, VR128))>; + def : Pat<(i64 (X86fgetsign FR32:$src)), + (SUBREG_TO_REG (i64 0), + (VMOVMSKPSrr (COPY_TO_REGCLASS FR32:$src, VR128)), sub_32bit)>; + def : Pat<(i32 (X86fgetsign FR64:$src)), + (VMOVMSKPDrr (COPY_TO_REGCLASS FR64:$src, VR128))>; + def : Pat<(i64 (X86fgetsign FR64:$src)), + (SUBREG_TO_REG (i64 0), + (VMOVMSKPDrr (COPY_TO_REGCLASS FR64:$src, VR128)), sub_32bit)>; +} + +defm MOVMSKPS : sse12_extr_sign_mask<VR128, int_x86_sse_movmsk_ps, "movmskps", + SSEPackedSingle>, PS; +defm MOVMSKPD : sse12_extr_sign_mask<VR128, int_x86_sse2_movmsk_pd, "movmskpd", + SSEPackedDouble>, PD; + +def : Pat<(i32 (X86fgetsign FR32:$src)), + (MOVMSKPSrr (COPY_TO_REGCLASS FR32:$src, VR128))>, + Requires<[UseSSE1]>; +def : Pat<(i64 (X86fgetsign FR32:$src)), + (SUBREG_TO_REG (i64 0), + (MOVMSKPSrr (COPY_TO_REGCLASS FR32:$src, VR128)), sub_32bit)>, + Requires<[UseSSE1]>; +def : Pat<(i32 (X86fgetsign FR64:$src)), + (MOVMSKPDrr (COPY_TO_REGCLASS FR64:$src, VR128))>, + Requires<[UseSSE2]>; +def : Pat<(i64 (X86fgetsign FR64:$src)), + (SUBREG_TO_REG (i64 0), + (MOVMSKPDrr (COPY_TO_REGCLASS FR64:$src, VR128)), sub_32bit)>, + Requires<[UseSSE2]>; + +//===---------------------------------------------------------------------===// +// SSE2 - Packed Integer Logical Instructions 
+//===---------------------------------------------------------------------===// + +let ExeDomain = SSEPackedInt in { // SSE integer instructions + +/// PDI_binop_rm - Simple SSE2 binary operator. +multiclass PDI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, + ValueType OpVT, RegisterClass RC, PatFrag memop_frag, + X86MemOperand x86memop, OpndItins itins, + bit IsCommutable, bit Is2Addr> { + let isCommutable = IsCommutable in + def rr : PDI<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))], itins.rr>, + Sched<[itins.Sched]>; + def rm : PDI<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, x86memop:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (OpVT (OpNode RC:$src1, + (bitconvert (memop_frag addr:$src2)))))], + itins.rm>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; +} +} // ExeDomain = SSEPackedInt + +multiclass PDI_binop_all<bits<8> opc, string OpcodeStr, SDNode Opcode, + ValueType OpVT128, ValueType OpVT256, + OpndItins itins, bit IsCommutable = 0, Predicate prd> { +let Predicates = [HasAVX, prd] in + defm V#NAME : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, OpVT128, + VR128, loadv2i64, i128mem, itins, IsCommutable, 0>, VEX_4V; + +let Constraints = "$src1 = $dst" in + defm NAME : PDI_binop_rm<opc, OpcodeStr, Opcode, OpVT128, VR128, + memopv2i64, i128mem, itins, IsCommutable, 1>; + +let Predicates = [HasAVX2, prd] in + defm V#NAME#Y : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, + OpVT256, VR256, loadv4i64, i256mem, itins, + IsCommutable, 0>, VEX_4V, VEX_L; +} + +// These are ordered here for pattern ordering requirements with the fp versions + +defm PAND : PDI_binop_all<0xDB, "pand", and, v2i64, v4i64, + SSE_VEC_BIT_ITINS_P, 1, NoVLX>; +defm POR : PDI_binop_all<0xEB, "por", or, v2i64, v4i64, + SSE_VEC_BIT_ITINS_P, 1, NoVLX>; +defm PXOR : PDI_binop_all<0xEF, "pxor", xor, v2i64, v4i64, + SSE_VEC_BIT_ITINS_P, 1, NoVLX>; +defm PANDN : PDI_binop_all<0xDF, "pandn", X86andnp, v2i64, v4i64, + SSE_VEC_BIT_ITINS_P, 0, NoVLX>; + +//===----------------------------------------------------------------------===// +// SSE 1 & 2 - Logical Instructions +//===----------------------------------------------------------------------===// + +// Multiclass for scalars using the X86 logical operation aliases for FP. 
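+// FsAND/FsOR/FsXOR/FsANDN below are codegen-only: they reuse the packed
+// and/or/xor/andn encodings to implement X86fand & co. on FR32/FR64 scalars.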
+multiclass sse12_fp_packed_scalar_logical_alias< + bits<8> opc, string OpcodeStr, SDNode OpNode, OpndItins itins> { + defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, + FR32, f32, f128mem, loadf32_128, SSEPackedSingle, itins, 0>, + PS, VEX_4V; + + defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, + FR64, f64, f128mem, loadf64_128, SSEPackedDouble, itins, 0>, + PD, VEX_4V; + + let Constraints = "$src1 = $dst" in { + defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, FR32, + f32, f128mem, memopfsf32_128, SSEPackedSingle, itins>, PS; + + defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, FR64, + f64, f128mem, memopfsf64_128, SSEPackedDouble, itins>, PD; + } +} + +let isCodeGenOnly = 1 in { + defm FsAND : sse12_fp_packed_scalar_logical_alias<0x54, "and", X86fand, + SSE_BIT_ITINS_P>; + defm FsOR : sse12_fp_packed_scalar_logical_alias<0x56, "or", X86for, + SSE_BIT_ITINS_P>; + defm FsXOR : sse12_fp_packed_scalar_logical_alias<0x57, "xor", X86fxor, + SSE_BIT_ITINS_P>; + + let isCommutable = 0 in + defm FsANDN : sse12_fp_packed_scalar_logical_alias<0x55, "andn", X86fandn, + SSE_BIT_ITINS_P>; +} + +// Multiclass for vectors using the X86 logical operation aliases for FP. +multiclass sse12_fp_packed_vector_logical_alias< + bits<8> opc, string OpcodeStr, SDNode OpNode, OpndItins itins> { + let Predicates = [HasAVX, NoVLX_Or_NoDQI] in { + defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, + VR128, v4f32, f128mem, loadv4f32, SSEPackedSingle, itins, 0>, + PS, VEX_4V; + + defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, + VR128, v2f64, f128mem, loadv2f64, SSEPackedDouble, itins, 0>, + PD, VEX_4V; + + defm V#NAME#PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, + VR256, v8f32, f256mem, loadv8f32, SSEPackedSingle, itins, 0>, + PS, VEX_4V, VEX_L; + + defm V#NAME#PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, + VR256, v4f64, f256mem, loadv4f64, SSEPackedDouble, itins, 0>, + PD, VEX_4V, VEX_L; + } + + let Constraints = "$src1 = $dst" in { + defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128, + v4f32, f128mem, memopv4f32, SSEPackedSingle, itins>, + PS; + + defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR128, + v2f64, f128mem, memopv2f64, SSEPackedDouble, itins>, + PD; + } +} + +let isCodeGenOnly = 1 in { + defm FvAND : sse12_fp_packed_vector_logical_alias<0x54, "and", X86fand, + SSE_BIT_ITINS_P>; + defm FvOR : sse12_fp_packed_vector_logical_alias<0x56, "or", X86for, + SSE_BIT_ITINS_P>; + defm FvXOR : sse12_fp_packed_vector_logical_alias<0x57, "xor", X86fxor, + SSE_BIT_ITINS_P>; + + let isCommutable = 0 in + defm FvANDN : sse12_fp_packed_vector_logical_alias<0x55, "andn", X86fandn, + SSE_BIT_ITINS_P>; +} + +/// sse12_fp_packed_logical - SSE 1 & 2 packed FP logical ops +/// +multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr, + SDNode OpNode> { + let Predicates = [HasAVX, NoVLX] in { + defm V#NAME#PSY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedSingle, + !strconcat(OpcodeStr, "ps"), f256mem, + [(set VR256:$dst, (v4i64 (OpNode VR256:$src1, VR256:$src2)))], + [(set VR256:$dst, (OpNode (bc_v4i64 (v8f32 VR256:$src1)), + (loadv4i64 addr:$src2)))], 0>, PS, VEX_4V, VEX_L; + + defm V#NAME#PDY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedDouble, + !strconcat(OpcodeStr, "pd"), f256mem, + [(set VR256:$dst, (OpNode (bc_v4i64 (v4f64 VR256:$src1)), + (bc_v4i64 (v4f64 VR256:$src2))))], + [(set 
VR256:$dst, (OpNode (bc_v4i64 (v4f64 VR256:$src1)), + (loadv4i64 addr:$src2)))], 0>, + PD, VEX_4V, VEX_L; + + // In AVX no need to add a pattern for 128-bit logical rr ps, because they + // are all promoted to v2i64, and the patterns are covered by the int + // version. This is needed in SSE only, because v2i64 isn't supported on + // SSE1, but only on SSE2. + defm V#NAME#PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle, + !strconcat(OpcodeStr, "ps"), f128mem, [], + [(set VR128:$dst, (OpNode (bc_v2i64 (v4f32 VR128:$src1)), + (loadv2i64 addr:$src2)))], 0>, PS, VEX_4V; + + defm V#NAME#PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble, + !strconcat(OpcodeStr, "pd"), f128mem, + [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)), + (bc_v2i64 (v2f64 VR128:$src2))))], + [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)), + (loadv2i64 addr:$src2)))], 0>, + PD, VEX_4V; + } + + let Constraints = "$src1 = $dst" in { + defm PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle, + !strconcat(OpcodeStr, "ps"), f128mem, + [(set VR128:$dst, (v2i64 (OpNode VR128:$src1, VR128:$src2)))], + [(set VR128:$dst, (OpNode (bc_v2i64 (v4f32 VR128:$src1)), + (memopv2i64 addr:$src2)))]>, PS; + + defm PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble, + !strconcat(OpcodeStr, "pd"), f128mem, + [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)), + (bc_v2i64 (v2f64 VR128:$src2))))], + [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)), + (memopv2i64 addr:$src2)))]>, PD; + } +} + +defm AND : sse12_fp_packed_logical<0x54, "and", and>; +defm OR : sse12_fp_packed_logical<0x56, "or", or>; +defm XOR : sse12_fp_packed_logical<0x57, "xor", xor>; +let isCommutable = 0 in + defm ANDN : sse12_fp_packed_logical<0x55, "andn", X86andnp>; + +// AVX1 requires type coercions in order to fold loads directly into logical +// operations. +let Predicates = [HasAVX1Only] in { + def : Pat<(bc_v8f32 (and VR256:$src1, (loadv4i64 addr:$src2))), + (VANDPSYrm VR256:$src1, addr:$src2)>; + def : Pat<(bc_v8f32 (or VR256:$src1, (loadv4i64 addr:$src2))), + (VORPSYrm VR256:$src1, addr:$src2)>; + def : Pat<(bc_v8f32 (xor VR256:$src1, (loadv4i64 addr:$src2))), + (VXORPSYrm VR256:$src1, addr:$src2)>; + def : Pat<(bc_v8f32 (X86andnp VR256:$src1, (loadv4i64 addr:$src2))), + (VANDNPSYrm VR256:$src1, addr:$src2)>; +} + +//===----------------------------------------------------------------------===// +// SSE 1 & 2 - Arithmetic Instructions +//===----------------------------------------------------------------------===// + +/// basic_sse12_fp_binop_xxx - SSE 1 & 2 binops come in both scalar and +/// vector forms. +/// +/// In addition, we also have a special variant of the scalar form here to +/// represent the associated intrinsic operation. This form is unlike the +/// plain scalar form, in that it takes an entire vector (instead of a scalar) +/// and leaves the top elements unmodified (therefore these cannot be commuted). +/// +/// These three forms can each be reg+reg or reg+mem. 
+/// + +/// FIXME: once all 256-bit intrinsics are matched, cleanup and refactor those +/// classes below +multiclass basic_sse12_fp_binop_p<bits<8> opc, string OpcodeStr, + SDNode OpNode, SizeItins itins> { + let Predicates = [HasAVX, NoVLX] in { + defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, + VR128, v4f32, f128mem, loadv4f32, + SSEPackedSingle, itins.s, 0>, PS, VEX_4V; + defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, + VR128, v2f64, f128mem, loadv2f64, + SSEPackedDouble, itins.d, 0>, PD, VEX_4V; + + defm V#NAME#PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), + OpNode, VR256, v8f32, f256mem, loadv8f32, + SSEPackedSingle, itins.s, 0>, PS, VEX_4V, VEX_L; + defm V#NAME#PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), + OpNode, VR256, v4f64, f256mem, loadv4f64, + SSEPackedDouble, itins.d, 0>, PD, VEX_4V, VEX_L; + } + + let Constraints = "$src1 = $dst" in { + defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128, + v4f32, f128mem, memopv4f32, SSEPackedSingle, + itins.s>, PS; + defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR128, + v2f64, f128mem, memopv2f64, SSEPackedDouble, + itins.d>, PD; + } +} + +multiclass basic_sse12_fp_binop_s<bits<8> opc, string OpcodeStr, SDNode OpNode, + SizeItins itins> { + defm V#NAME#SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"), + OpNode, FR32, f32mem, SSEPackedSingle, itins.s, 0>, + XS, VEX_4V, VEX_LIG; + defm V#NAME#SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"), + OpNode, FR64, f64mem, SSEPackedDouble, itins.d, 0>, + XD, VEX_4V, VEX_LIG; + + let Constraints = "$src1 = $dst" in { + defm SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"), + OpNode, FR32, f32mem, SSEPackedSingle, + itins.s>, XS; + defm SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"), + OpNode, FR64, f64mem, SSEPackedDouble, + itins.d>, XD; + } +} + +multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr, + SizeItins itins> { + defm V#NAME#SS : sse12_fp_scalar_int<opc, OpcodeStr, VR128, + !strconcat(OpcodeStr, "ss"), "", "_ss", ssmem, sse_load_f32, + SSEPackedSingle, itins.s, 0>, XS, VEX_4V, VEX_LIG; + defm V#NAME#SD : sse12_fp_scalar_int<opc, OpcodeStr, VR128, + !strconcat(OpcodeStr, "sd"), "2", "_sd", sdmem, sse_load_f64, + SSEPackedDouble, itins.d, 0>, XD, VEX_4V, VEX_LIG; + + let Constraints = "$src1 = $dst" in { + defm SS : sse12_fp_scalar_int<opc, OpcodeStr, VR128, + !strconcat(OpcodeStr, "ss"), "", "_ss", ssmem, sse_load_f32, + SSEPackedSingle, itins.s>, XS; + defm SD : sse12_fp_scalar_int<opc, OpcodeStr, VR128, + !strconcat(OpcodeStr, "sd"), "2", "_sd", sdmem, sse_load_f64, + SSEPackedDouble, itins.d>, XD; + } +} + +// Binary Arithmetic instructions +defm ADD : basic_sse12_fp_binop_p<0x58, "add", fadd, SSE_ALU_ITINS_P>, + basic_sse12_fp_binop_s<0x58, "add", fadd, SSE_ALU_ITINS_S>, + basic_sse12_fp_binop_s_int<0x58, "add", SSE_ALU_ITINS_S>; +defm MUL : basic_sse12_fp_binop_p<0x59, "mul", fmul, SSE_MUL_ITINS_P>, + basic_sse12_fp_binop_s<0x59, "mul", fmul, SSE_MUL_ITINS_S>, + basic_sse12_fp_binop_s_int<0x59, "mul", SSE_MUL_ITINS_S>; +let isCommutable = 0 in { + defm SUB : basic_sse12_fp_binop_p<0x5C, "sub", fsub, SSE_ALU_ITINS_P>, + basic_sse12_fp_binop_s<0x5C, "sub", fsub, SSE_ALU_ITINS_S>, + basic_sse12_fp_binop_s_int<0x5C, "sub", SSE_ALU_ITINS_S>; + defm DIV : basic_sse12_fp_binop_p<0x5E, "div", fdiv, SSE_DIV_ITINS_P>, + basic_sse12_fp_binop_s<0x5E, "div", fdiv, SSE_DIV_ITINS_S>, + basic_sse12_fp_binop_s_int<0x5E, "div", SSE_DIV_ITINS_S>; 
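+  // max/min stay non-commutable: the x86 max*/min* instructions return the
+  // second source operand when the inputs are unordered or both zero, so
+  // operand order is significant (MAXC/MINC below model the commutable case).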
+ defm MAX : basic_sse12_fp_binop_p<0x5F, "max", X86fmax, SSE_ALU_ITINS_P>, + basic_sse12_fp_binop_s<0x5F, "max", X86fmax, SSE_ALU_ITINS_S>, + basic_sse12_fp_binop_s_int<0x5F, "max", SSE_ALU_ITINS_S>; + defm MIN : basic_sse12_fp_binop_p<0x5D, "min", X86fmin, SSE_ALU_ITINS_P>, + basic_sse12_fp_binop_s<0x5D, "min", X86fmin, SSE_ALU_ITINS_S>, + basic_sse12_fp_binop_s_int<0x5D, "min", SSE_ALU_ITINS_S>; +} + +let isCodeGenOnly = 1 in { + defm MAXC: basic_sse12_fp_binop_p<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_P>, + basic_sse12_fp_binop_s<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_S>; + defm MINC: basic_sse12_fp_binop_p<0x5D, "min", X86fminc, SSE_ALU_ITINS_P>, + basic_sse12_fp_binop_s<0x5D, "min", X86fminc, SSE_ALU_ITINS_S>; +} + +// Patterns used to select SSE scalar fp arithmetic instructions from +// either: +// +// (1) a scalar fp operation followed by a blend +// +// The effect is that the backend no longer emits unnecessary vector +// insert instructions immediately after SSE scalar fp instructions +// like addss or mulss. +// +// For example, given the following code: +// __m128 foo(__m128 A, __m128 B) { +// A[0] += B[0]; +// return A; +// } +// +// Previously we generated: +// addss %xmm0, %xmm1 +// movss %xmm1, %xmm0 +// +// We now generate: +// addss %xmm1, %xmm0 +// +// (2) a vector packed single/double fp operation followed by a vector insert +// +// The effect is that the backend converts the packed fp instruction +// followed by a vector insert into a single SSE scalar fp instruction. +// +// For example, given the following code: +// __m128 foo(__m128 A, __m128 B) { +// __m128 C = A + B; +// return (__m128) {c[0], a[1], a[2], a[3]}; +// } +// +// Previously we generated: +// addps %xmm0, %xmm1 +// movss %xmm1, %xmm0 +// +// We now generate: +// addss %xmm1, %xmm0 + +// TODO: Some canonicalization in lowering would simplify the number of +// patterns we have to try to match. +multiclass scalar_math_f32_patterns<SDNode Op, string OpcPrefix> { + let Predicates = [UseSSE1] in { + // extracted scalar math op with insert via movss + def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), (v4f32 (scalar_to_vector + (Op (f32 (extractelt (v4f32 VR128:$dst), (iPTR 0))), + FR32:$src))))), + (!cast<I>(OpcPrefix#SSrr_Int) v4f32:$dst, + (COPY_TO_REGCLASS FR32:$src, VR128))>; + + // vector math op with insert via movss + def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), + (Op (v4f32 VR128:$dst), (v4f32 VR128:$src)))), + (!cast<I>(OpcPrefix#SSrr_Int) v4f32:$dst, v4f32:$src)>; + } + + // With SSE 4.1, blendi is preferred to movsd, so match that too. + let Predicates = [UseSSE41] in { + // extracted scalar math op with insert via blend + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector + (Op (f32 (extractelt (v4f32 VR128:$dst), (iPTR 0))), + FR32:$src))), (i8 1))), + (!cast<I>(OpcPrefix#SSrr_Int) v4f32:$dst, + (COPY_TO_REGCLASS FR32:$src, VR128))>; + + // vector math op with insert via blend + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), + (Op (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))), + (!cast<I>(OpcPrefix#SSrr_Int)v4f32:$dst, v4f32:$src)>; + + } + + // Repeat everything for AVX, except for the movss + scalar combo... + // because that one shouldn't occur with AVX codegen? 
+ let Predicates = [HasAVX] in { + // extracted scalar math op with insert via blend + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector + (Op (f32 (extractelt (v4f32 VR128:$dst), (iPTR 0))), + FR32:$src))), (i8 1))), + (!cast<I>("V"#OpcPrefix#SSrr_Int) v4f32:$dst, + (COPY_TO_REGCLASS FR32:$src, VR128))>; + + // vector math op with insert via movss + def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), + (Op (v4f32 VR128:$dst), (v4f32 VR128:$src)))), + (!cast<I>("V"#OpcPrefix#SSrr_Int) v4f32:$dst, v4f32:$src)>; + + // vector math op with insert via blend + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), + (Op (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))), + (!cast<I>("V"#OpcPrefix#SSrr_Int) v4f32:$dst, v4f32:$src)>; + } +} + +defm : scalar_math_f32_patterns<fadd, "ADD">; +defm : scalar_math_f32_patterns<fsub, "SUB">; +defm : scalar_math_f32_patterns<fmul, "MUL">; +defm : scalar_math_f32_patterns<fdiv, "DIV">; + +multiclass scalar_math_f64_patterns<SDNode Op, string OpcPrefix> { + let Predicates = [UseSSE2] in { + // extracted scalar math op with insert via movsd + def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector + (Op (f64 (extractelt (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))))), + (!cast<I>(OpcPrefix#SDrr_Int) v2f64:$dst, + (COPY_TO_REGCLASS FR64:$src, VR128))>; + + // vector math op with insert via movsd + def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), + (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)))), + (!cast<I>(OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>; + } + + // With SSE 4.1, blendi is preferred to movsd, so match those too. + let Predicates = [UseSSE41] in { + // extracted scalar math op with insert via blend + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector + (Op (f64 (extractelt (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (i8 1))), + (!cast<I>(OpcPrefix#SDrr_Int) v2f64:$dst, + (COPY_TO_REGCLASS FR64:$src, VR128))>; + + // vector math op with insert via blend + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), + (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))), + (!cast<I>(OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>; + } + + // Repeat everything for AVX. 
+ let Predicates = [HasAVX] in { + // extracted scalar math op with insert via movsd + def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector + (Op (f64 (extractelt (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))))), + (!cast<I>("V"#OpcPrefix#SDrr_Int) v2f64:$dst, + (COPY_TO_REGCLASS FR64:$src, VR128))>; + + // extracted scalar math op with insert via blend + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector + (Op (f64 (extractelt (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (i8 1))), + (!cast<I>("V"#OpcPrefix#SDrr_Int) v2f64:$dst, + (COPY_TO_REGCLASS FR64:$src, VR128))>; + + // vector math op with insert via movsd + def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), + (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)))), + (!cast<I>("V"#OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>; + + // vector math op with insert via blend + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), + (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))), + (!cast<I>("V"#OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>; + } +} + +defm : scalar_math_f64_patterns<fadd, "ADD">; +defm : scalar_math_f64_patterns<fsub, "SUB">; +defm : scalar_math_f64_patterns<fmul, "MUL">; +defm : scalar_math_f64_patterns<fdiv, "DIV">; + + +/// Unop Arithmetic +/// In addition, we also have a special variant of the scalar form here to +/// represent the associated intrinsic operation. This form is unlike the +/// plain scalar form, in that it takes an entire vector (instead of a +/// scalar) and leaves the top elements undefined. +/// +/// And, we have a special variant form for a full-vector intrinsic form. + +let Sched = WriteFSqrt in { +def SSE_SQRTPS : OpndItins< + IIC_SSE_SQRTPS_RR, IIC_SSE_SQRTPS_RM +>; + +def SSE_SQRTSS : OpndItins< + IIC_SSE_SQRTSS_RR, IIC_SSE_SQRTSS_RM +>; + +def SSE_SQRTPD : OpndItins< + IIC_SSE_SQRTPD_RR, IIC_SSE_SQRTPD_RM +>; + +def SSE_SQRTSD : OpndItins< + IIC_SSE_SQRTSD_RR, IIC_SSE_SQRTSD_RM +>; +} + +let Sched = WriteFRsqrt in { +def SSE_RSQRTPS : OpndItins< + IIC_SSE_RSQRTPS_RR, IIC_SSE_RSQRTPS_RM +>; + +def SSE_RSQRTSS : OpndItins< + IIC_SSE_RSQRTSS_RR, IIC_SSE_RSQRTSS_RM +>; +} + +let Sched = WriteFRcp in { +def SSE_RCPP : OpndItins< + IIC_SSE_RCPP_RR, IIC_SSE_RCPP_RM +>; + +def SSE_RCPS : OpndItins< + IIC_SSE_RCPS_RR, IIC_SSE_RCPS_RM +>; +} + +/// sse_fp_unop_s - SSE1 unops in scalar form +/// For the non-AVX defs, we need $src1 to be tied to $dst because +/// the HW instructions are 2 operand / destructive. 
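+/// The 3-operand, non-destructive VEX forms are handled separately by the
+/// avx_fp_unop_s multiclass that follows.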
+multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC, + ValueType vt, ValueType ScalarVT, + X86MemOperand x86memop, Operand vec_memop, + ComplexPattern mem_cpat, Intrinsic Intr, + SDNode OpNode, Domain d, OpndItins itins, + Predicate target, string Suffix> { + let hasSideEffects = 0 in { + def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1), + !strconcat(OpcodeStr, "\t{$src1, $dst|$dst, $src1}"), + [(set RC:$dst, (OpNode RC:$src1))], itins.rr, d>, Sched<[itins.Sched]>, + Requires<[target]>; + let mayLoad = 1 in + def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src1), + !strconcat(OpcodeStr, "\t{$src1, $dst|$dst, $src1}"), + [(set RC:$dst, (OpNode (load addr:$src1)))], itins.rm, d>, + Sched<[itins.Sched.Folded, ReadAfterLd]>, + Requires<[target, OptForSize]>; + + let isCodeGenOnly = 1, Constraints = "$src1 = $dst" in { + def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + []>, Sched<[itins.Sched.Folded, ReadAfterLd]>; + let mayLoad = 1 in + def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, vec_memop:$src2), + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + []>, Sched<[itins.Sched.Folded, ReadAfterLd]>; + } + } + + let Predicates = [target] in { + def : Pat<(vt (OpNode mem_cpat:$src)), + (vt (COPY_TO_REGCLASS (vt (!cast<Instruction>(NAME#Suffix##m_Int) + (vt (IMPLICIT_DEF)), mem_cpat:$src)), RC))>; + // These are unary operations, but they are modeled as having 2 source operands + // because the high elements of the destination are unchanged in SSE. + def : Pat<(Intr VR128:$src), + (!cast<Instruction>(NAME#Suffix##r_Int) VR128:$src, VR128:$src)>; + def : Pat<(Intr (load addr:$src)), + (vt (COPY_TO_REGCLASS(!cast<Instruction>(NAME#Suffix##m) + addr:$src), VR128))>; + } + // We don't want to fold scalar loads into these instructions unless + // optimizing for size. This is because the folded instruction will have a + // partial register update, while the unfolded sequence will not, e.g. + // movss mem, %xmm0 + // rcpss %xmm0, %xmm0 + // which has a clobber before the rcp, vs. 
+ // rcpss mem, %xmm0 + let Predicates = [target, OptForSize] in { + def : Pat<(Intr mem_cpat:$src), + (!cast<Instruction>(NAME#Suffix##m_Int) + (vt (IMPLICIT_DEF)), mem_cpat:$src)>; + } +} + +multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC, + ValueType vt, ValueType ScalarVT, + X86MemOperand x86memop, Operand vec_memop, + ComplexPattern mem_cpat, + Intrinsic Intr, SDNode OpNode, Domain d, + OpndItins itins, string Suffix> { + let hasSideEffects = 0 in { + def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [], itins.rr, d>, Sched<[itins.Sched]>; + let mayLoad = 1 in + def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [], itins.rm, d>, Sched<[itins.Sched.Folded, ReadAfterLd]>; + let isCodeGenOnly = 1 in { + def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>, Sched<[itins.Sched.Folded]>; + let mayLoad = 1 in + def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, vec_memop:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>, Sched<[itins.Sched.Folded, ReadAfterLd]>; + } + } + + // We don't want to fold scalar loads into these instructions unless + // optimizing for size. This is because the folded instruction will have a + // partial register update, while the unfolded sequence will not, e.g. + // vmovss mem, %xmm0 + // vrcpss %xmm0, %xmm0, %xmm0 + // which has a clobber before the rcp, vs. + // vrcpss mem, %xmm0, %xmm0 + // TODO: In theory, we could fold the load, and avoid the stall caused by + // the partial register store, either in ExeDepFix or with smarter RA. + let Predicates = [UseAVX] in { + def : Pat<(OpNode RC:$src), (!cast<Instruction>("V"#NAME#Suffix##r) + (ScalarVT (IMPLICIT_DEF)), RC:$src)>; + } + let Predicates = [HasAVX] in { + def : Pat<(Intr VR128:$src), + (!cast<Instruction>("V"#NAME#Suffix##r_Int) (vt (IMPLICIT_DEF)), + VR128:$src)>; + } + let Predicates = [HasAVX, OptForSize] in { + def : Pat<(Intr mem_cpat:$src), + (!cast<Instruction>("V"#NAME#Suffix##m_Int) + (vt (IMPLICIT_DEF)), mem_cpat:$src)>; + } + let Predicates = [UseAVX, OptForSize] in { + def : Pat<(ScalarVT (OpNode (load addr:$src))), + (!cast<Instruction>("V"#NAME#Suffix##m) (ScalarVT (IMPLICIT_DEF)), + addr:$src)>; + def : Pat<(vt (OpNode mem_cpat:$src)), + (!cast<Instruction>("V"#NAME#Suffix##m_Int) (vt (IMPLICIT_DEF)), + mem_cpat:$src)>; + } +} + +/// sse1_fp_unop_p - SSE1 unops in packed form. 
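+/// The AVX defs are guarded by the 'prds' predicate list so callers can
+/// restrict them, e.g. to [HasAVX, NoVLX] for rsqrt/rcp.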
+multiclass sse1_fp_unop_p<bits<8> opc, string OpcodeStr, SDNode OpNode, + OpndItins itins, list<Predicate> prds> { +let Predicates = prds in { + def V#NAME#PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + !strconcat("v", OpcodeStr, + "ps\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))], + itins.rr>, VEX, Sched<[itins.Sched]>; + def V#NAME#PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + !strconcat("v", OpcodeStr, + "ps\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (OpNode (loadv4f32 addr:$src)))], + itins.rm>, VEX, Sched<[itins.Sched.Folded]>; + def V#NAME#PSYr : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), + !strconcat("v", OpcodeStr, + "ps\t{$src, $dst|$dst, $src}"), + [(set VR256:$dst, (v8f32 (OpNode VR256:$src)))], + itins.rr>, VEX, VEX_L, Sched<[itins.Sched]>; + def V#NAME#PSYm : PSI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), + !strconcat("v", OpcodeStr, + "ps\t{$src, $dst|$dst, $src}"), + [(set VR256:$dst, (OpNode (loadv8f32 addr:$src)))], + itins.rm>, VEX, VEX_L, Sched<[itins.Sched.Folded]>; +} + + def PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))], itins.rr>, + Sched<[itins.Sched]>; + def PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (OpNode (memopv4f32 addr:$src)))], itins.rm>, + Sched<[itins.Sched.Folded]>; +} + +/// sse2_fp_unop_p - SSE2 unops in vector forms. +multiclass sse2_fp_unop_p<bits<8> opc, string OpcodeStr, + SDNode OpNode, OpndItins itins> { +let Predicates = [HasAVX] in { + def V#NAME#PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + !strconcat("v", OpcodeStr, + "pd\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))], + itins.rr>, VEX, Sched<[itins.Sched]>; + def V#NAME#PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + !strconcat("v", OpcodeStr, + "pd\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (OpNode (loadv2f64 addr:$src)))], + itins.rm>, VEX, Sched<[itins.Sched.Folded]>; + def V#NAME#PDYr : PDI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), + !strconcat("v", OpcodeStr, + "pd\t{$src, $dst|$dst, $src}"), + [(set VR256:$dst, (v4f64 (OpNode VR256:$src)))], + itins.rr>, VEX, VEX_L, Sched<[itins.Sched]>; + def V#NAME#PDYm : PDI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), + !strconcat("v", OpcodeStr, + "pd\t{$src, $dst|$dst, $src}"), + [(set VR256:$dst, (OpNode (loadv4f64 addr:$src)))], + itins.rm>, VEX, VEX_L, Sched<[itins.Sched.Folded]>; +} + + def PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))], itins.rr>, + Sched<[itins.Sched]>; + def PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (OpNode (memopv2f64 addr:$src)))], itins.rm>, + Sched<[itins.Sched.Folded]>; +} + +multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode, + OpndItins itins> { + defm SS : sse_fp_unop_s<opc, OpcodeStr##ss, FR32, v4f32, f32, f32mem, + ssmem, sse_load_f32, + !cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss), OpNode, + SSEPackedSingle, itins, UseSSE1, "SS">, XS; + defm V#NAME#SS : avx_fp_unop_s<opc, "v"#OpcodeStr##ss, FR32, v4f32, f32, + f32mem, ssmem, sse_load_f32, + 
!cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss), OpNode, + SSEPackedSingle, itins, "SS">, XS, VEX_4V, VEX_LIG; +} + +multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode, + OpndItins itins> { + defm SD : sse_fp_unop_s<opc, OpcodeStr##sd, FR64, v2f64, f64, f64mem, + sdmem, sse_load_f64, + !cast<Intrinsic>("int_x86_sse2_"##OpcodeStr##_sd), + OpNode, SSEPackedDouble, itins, UseSSE2, "SD">, XD; + defm V#NAME#SD : avx_fp_unop_s<opc, "v"#OpcodeStr##sd, FR64, v2f64, f64, + f64mem, sdmem, sse_load_f64, + !cast<Intrinsic>("int_x86_sse2_"##OpcodeStr##_sd), + OpNode, SSEPackedDouble, itins, "SD">, + XD, VEX_4V, VEX_LIG; +} + +// Square root. +defm SQRT : sse1_fp_unop_s<0x51, "sqrt", fsqrt, SSE_SQRTSS>, + sse1_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPS, [HasAVX]>, + sse2_fp_unop_s<0x51, "sqrt", fsqrt, SSE_SQRTSD>, + sse2_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPD>; + +// Reciprocal approximations. Note that these typically require refinement +// in order to obtain suitable precision. +defm RSQRT : sse1_fp_unop_s<0x52, "rsqrt", X86frsqrt, SSE_RSQRTSS>, + sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_RSQRTPS, [HasAVX, NoVLX] >; +defm RCP : sse1_fp_unop_s<0x53, "rcp", X86frcp, SSE_RCPS>, + sse1_fp_unop_p<0x53, "rcp", X86frcp, SSE_RCPP, [HasAVX, NoVLX]>; + +// There is no f64 version of the reciprocal approximation instructions. + +// TODO: We should add *scalar* op patterns for these just like we have for +// the binops above. If the binop and unop patterns could all be unified +// that would be even better. + +multiclass scalar_unary_math_patterns<Intrinsic Intr, string OpcPrefix, + SDNode Move, ValueType VT, + Predicate BasePredicate> { + let Predicates = [BasePredicate] in { + def : Pat<(VT (Move VT:$dst, (Intr VT:$src))), + (!cast<I>(OpcPrefix#r_Int) VT:$dst, VT:$src)>; + } + + // With SSE 4.1, blendi is preferred to movs*, so match that too. + let Predicates = [UseSSE41] in { + def : Pat<(VT (X86Blendi VT:$dst, (Intr VT:$src), (i8 1))), + (!cast<I>(OpcPrefix#r_Int) VT:$dst, VT:$src)>; + } + + // Repeat for AVX versions of the instructions. 
+ let Predicates = [HasAVX] in { + def : Pat<(VT (Move VT:$dst, (Intr VT:$src))), + (!cast<I>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src)>; + + def : Pat<(VT (X86Blendi VT:$dst, (Intr VT:$src), (i8 1))), + (!cast<I>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src)>; + } +} + +defm : scalar_unary_math_patterns<int_x86_sse_rcp_ss, "RCPSS", X86Movss, + v4f32, UseSSE1>; +defm : scalar_unary_math_patterns<int_x86_sse_rsqrt_ss, "RSQRTSS", X86Movss, + v4f32, UseSSE1>; +defm : scalar_unary_math_patterns<int_x86_sse_sqrt_ss, "SQRTSS", X86Movss, + v4f32, UseSSE1>; +defm : scalar_unary_math_patterns<int_x86_sse2_sqrt_sd, "SQRTSD", X86Movsd, + v2f64, UseSSE2>; + + +//===----------------------------------------------------------------------===// +// SSE 1 & 2 - Non-temporal stores +//===----------------------------------------------------------------------===// + +let AddedComplexity = 400 in { // Prefer non-temporal versions +let SchedRW = [WriteStore] in { +let Predicates = [HasAVX, NoVLX] in { +def VMOVNTPSmr : VPSI<0x2B, MRMDestMem, (outs), + (ins f128mem:$dst, VR128:$src), + "movntps\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore (v4f32 VR128:$src), + addr:$dst)], + IIC_SSE_MOVNT>, VEX; +def VMOVNTPDmr : VPDI<0x2B, MRMDestMem, (outs), + (ins f128mem:$dst, VR128:$src), + "movntpd\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore (v2f64 VR128:$src), + addr:$dst)], + IIC_SSE_MOVNT>, VEX; + +let ExeDomain = SSEPackedInt in +def VMOVNTDQmr : VPDI<0xE7, MRMDestMem, (outs), + (ins f128mem:$dst, VR128:$src), + "movntdq\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore (v2i64 VR128:$src), + addr:$dst)], + IIC_SSE_MOVNT>, VEX; + +def VMOVNTPSYmr : VPSI<0x2B, MRMDestMem, (outs), + (ins f256mem:$dst, VR256:$src), + "movntps\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore (v8f32 VR256:$src), + addr:$dst)], + IIC_SSE_MOVNT>, VEX, VEX_L; +def VMOVNTPDYmr : VPDI<0x2B, MRMDestMem, (outs), + (ins f256mem:$dst, VR256:$src), + "movntpd\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore (v4f64 VR256:$src), + addr:$dst)], + IIC_SSE_MOVNT>, VEX, VEX_L; +let ExeDomain = SSEPackedInt in +def VMOVNTDQYmr : VPDI<0xE7, MRMDestMem, (outs), + (ins f256mem:$dst, VR256:$src), + "movntdq\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore (v4i64 VR256:$src), + addr:$dst)], + IIC_SSE_MOVNT>, VEX, VEX_L; +} + +def MOVNTPSmr : PSI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), + "movntps\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore (v4f32 VR128:$src), addr:$dst)], + IIC_SSE_MOVNT>; +def MOVNTPDmr : PDI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), + "movntpd\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore(v2f64 VR128:$src), addr:$dst)], + IIC_SSE_MOVNT>; + +let ExeDomain = SSEPackedInt in +def MOVNTDQmr : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), + "movntdq\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore (v2i64 VR128:$src), addr:$dst)], + IIC_SSE_MOVNT>; + +// There is no AVX form for instructions below this point +def MOVNTImr : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), + "movnti{l}\t{$src, $dst|$dst, $src}", + [(nontemporalstore (i32 GR32:$src), addr:$dst)], + IIC_SSE_MOVNT>, + PS, Requires<[HasSSE2]>; +def MOVNTI_64mr : RI<0xC3, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), + "movnti{q}\t{$src, $dst|$dst, $src}", + [(nontemporalstore (i64 GR64:$src), addr:$dst)], + IIC_SSE_MOVNT>, + PS, Requires<[HasSSE2]>; +} // SchedRW = [WriteStore] + +let Predicates = [HasAVX2, NoVLX] in { + def : Pat<(alignednontemporalstore (v8i32 
VR256:$src), addr:$dst), + (VMOVNTDQYmr addr:$dst, VR256:$src)>; + def : Pat<(alignednontemporalstore (v16i16 VR256:$src), addr:$dst), + (VMOVNTDQYmr addr:$dst, VR256:$src)>; + def : Pat<(alignednontemporalstore (v32i8 VR256:$src), addr:$dst), + (VMOVNTDQYmr addr:$dst, VR256:$src)>; +} + +let Predicates = [HasAVX, NoVLX] in { + def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst), + (VMOVNTDQmr addr:$dst, VR128:$src)>; + def : Pat<(alignednontemporalstore (v8i16 VR128:$src), addr:$dst), + (VMOVNTDQmr addr:$dst, VR128:$src)>; + def : Pat<(alignednontemporalstore (v16i8 VR128:$src), addr:$dst), + (VMOVNTDQmr addr:$dst, VR128:$src)>; +} + +def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst), + (MOVNTDQmr addr:$dst, VR128:$src)>; +def : Pat<(alignednontemporalstore (v8i16 VR128:$src), addr:$dst), + (MOVNTDQmr addr:$dst, VR128:$src)>; +def : Pat<(alignednontemporalstore (v16i8 VR128:$src), addr:$dst), + (MOVNTDQmr addr:$dst, VR128:$src)>; + +} // AddedComplexity + +//===----------------------------------------------------------------------===// +// SSE 1 & 2 - Prefetch and memory fence +//===----------------------------------------------------------------------===// + +// Prefetch intrinsic. +let Predicates = [HasSSE1], SchedRW = [WriteLoad] in { +def PREFETCHT0 : I<0x18, MRM1m, (outs), (ins i8mem:$src), + "prefetcht0\t$src", [(prefetch addr:$src, imm, (i32 3), (i32 1))], + IIC_SSE_PREFETCH>, TB; +def PREFETCHT1 : I<0x18, MRM2m, (outs), (ins i8mem:$src), + "prefetcht1\t$src", [(prefetch addr:$src, imm, (i32 2), (i32 1))], + IIC_SSE_PREFETCH>, TB; +def PREFETCHT2 : I<0x18, MRM3m, (outs), (ins i8mem:$src), + "prefetcht2\t$src", [(prefetch addr:$src, imm, (i32 1), (i32 1))], + IIC_SSE_PREFETCH>, TB; +def PREFETCHNTA : I<0x18, MRM0m, (outs), (ins i8mem:$src), + "prefetchnta\t$src", [(prefetch addr:$src, imm, (i32 0), (i32 1))], + IIC_SSE_PREFETCH>, TB; +} + +// FIXME: How should flush instruction be modeled? +let SchedRW = [WriteLoad] in { +// Flush cache +def CLFLUSH : I<0xAE, MRM7m, (outs), (ins i8mem:$src), + "clflush\t$src", [(int_x86_sse2_clflush addr:$src)], + IIC_SSE_PREFETCH>, PS, Requires<[HasSSE2]>; +} + +let SchedRW = [WriteNop] in { +// Pause. This "instruction" is encoded as "rep; nop", so even though it +// was introduced with SSE2, it's backward compatible. 
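+// As an informal sketch of why that works: "pause" assembles to F3 90, an
+// F3 (REP) prefix in front of the one-byte NOP (90). CPUs without SSE2
+// ignore the prefix and simply execute NOP, so a spin-wait loop such as
+//   .Lspin: pause                # rep; nop -- spin-loop hint, plain NOP on old CPUs
+//           cmpl   $0, (%rdi)    # illustrative lock word
+//           jne    .Lspin
+// still runs correctly (just without the hint) on pre-SSE2 hardware.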
+def PAUSE : I<0x90, RawFrm, (outs), (ins), + "pause", [(int_x86_sse2_pause)], IIC_SSE_PAUSE>, + OBXS, Requires<[HasSSE2]>; +} + +let SchedRW = [WriteFence] in { +// Load, store, and memory fence +def SFENCE : I<0xAE, MRM_F8, (outs), (ins), + "sfence", [(int_x86_sse_sfence)], IIC_SSE_SFENCE>, + PS, Requires<[HasSSE1]>; +def LFENCE : I<0xAE, MRM_E8, (outs), (ins), + "lfence", [(int_x86_sse2_lfence)], IIC_SSE_LFENCE>, + TB, Requires<[HasSSE2]>; +def MFENCE : I<0xAE, MRM_F0, (outs), (ins), + "mfence", [(int_x86_sse2_mfence)], IIC_SSE_MFENCE>, + TB, Requires<[HasSSE2]>; +} // SchedRW + +def : Pat<(X86SFence), (SFENCE)>; +def : Pat<(X86LFence), (LFENCE)>; +def : Pat<(X86MFence), (MFENCE)>; + +//===----------------------------------------------------------------------===// +// SSE 1 & 2 - Load/Store XCSR register +//===----------------------------------------------------------------------===// + +def VLDMXCSR : VPSI<0xAE, MRM2m, (outs), (ins i32mem:$src), + "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)], + IIC_SSE_LDMXCSR>, VEX, Sched<[WriteLoad]>; +def VSTMXCSR : VPSI<0xAE, MRM3m, (outs), (ins i32mem:$dst), + "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)], + IIC_SSE_STMXCSR>, VEX, Sched<[WriteStore]>; + +let Predicates = [UseSSE1] in { +def LDMXCSR : I<0xAE, MRM2m, (outs), (ins i32mem:$src), + "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)], + IIC_SSE_LDMXCSR>, TB, Sched<[WriteLoad]>; +def STMXCSR : I<0xAE, MRM3m, (outs), (ins i32mem:$dst), + "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)], + IIC_SSE_STMXCSR>, TB, Sched<[WriteStore]>; +} + +//===---------------------------------------------------------------------===// +// SSE2 - Move Aligned/Unaligned Packed Integer Instructions +//===---------------------------------------------------------------------===// + +let ExeDomain = SSEPackedInt in { // SSE integer instructions + +let hasSideEffects = 0, SchedRW = [WriteMove] in { +def VMOVDQArr : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>, + VEX; +def VMOVDQAYrr : VPDI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), + "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>, + VEX, VEX_L; +def VMOVDQUrr : VSSI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "movdqu\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVU_P_RR>, + VEX; +def VMOVDQUYrr : VSSI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), + "movdqu\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVU_P_RR>, + VEX, VEX_L; +} + +// For Disassembler +let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, + SchedRW = [WriteMove] in { +def VMOVDQArr_REV : VPDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), + "movdqa\t{$src, $dst|$dst, $src}", [], + IIC_SSE_MOVA_P_RR>, + VEX; +def VMOVDQAYrr_REV : VPDI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src), + "movdqa\t{$src, $dst|$dst, $src}", [], + IIC_SSE_MOVA_P_RR>, VEX, VEX_L; +def VMOVDQUrr_REV : VSSI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), + "movdqu\t{$src, $dst|$dst, $src}", [], + IIC_SSE_MOVU_P_RR>, + VEX; +def VMOVDQUYrr_REV : VSSI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src), + "movdqu\t{$src, $dst|$dst, $src}", [], + IIC_SSE_MOVU_P_RR>, VEX, VEX_L; +} + +let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1, + hasSideEffects = 0, SchedRW = [WriteLoad] in { +def VMOVDQArm : VPDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), + "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RM>, + VEX; +def VMOVDQAYrm : VPDI<0x6F, MRMSrcMem, (outs 
VR256:$dst), (ins i256mem:$src), + "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RM>, + VEX, VEX_L; +let Predicates = [HasAVX] in { + def VMOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), + "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_RM>, + XS, VEX; + def VMOVDQUYrm : I<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src), + "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_RM>, + XS, VEX, VEX_L; +} +} + +let mayStore = 1, hasSideEffects = 0, SchedRW = [WriteStore] in { +def VMOVDQAmr : VPDI<0x7F, MRMDestMem, (outs), + (ins i128mem:$dst, VR128:$src), + "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_MR>, + VEX; +def VMOVDQAYmr : VPDI<0x7F, MRMDestMem, (outs), + (ins i256mem:$dst, VR256:$src), + "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_MR>, + VEX, VEX_L; +let Predicates = [HasAVX] in { +def VMOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), + "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_MR>, + XS, VEX; +def VMOVDQUYmr : I<0x7F, MRMDestMem, (outs), (ins i256mem:$dst, VR256:$src), + "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_MR>, + XS, VEX, VEX_L; +} +} + +let SchedRW = [WriteMove] in { +let hasSideEffects = 0 in +def MOVDQArr : PDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>; + +def MOVDQUrr : I<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "movdqu\t{$src, $dst|$dst, $src}", + [], IIC_SSE_MOVU_P_RR>, XS, Requires<[UseSSE2]>; + +// For Disassembler +let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in { +def MOVDQArr_REV : PDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), + "movdqa\t{$src, $dst|$dst, $src}", [], + IIC_SSE_MOVA_P_RR>; + +def MOVDQUrr_REV : I<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), + "movdqu\t{$src, $dst|$dst, $src}", + [], IIC_SSE_MOVU_P_RR>, XS, Requires<[UseSSE2]>; +} +} // SchedRW + +let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1, + hasSideEffects = 0, SchedRW = [WriteLoad] in { +def MOVDQArm : PDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), + "movdqa\t{$src, $dst|$dst, $src}", + [/*(set VR128:$dst, (alignedloadv2i64 addr:$src))*/], + IIC_SSE_MOVA_P_RM>; +def MOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), + "movdqu\t{$src, $dst|$dst, $src}", + [/*(set VR128:$dst, (loadv2i64 addr:$src))*/], + IIC_SSE_MOVU_P_RM>, + XS, Requires<[UseSSE2]>; +} + +let mayStore = 1, hasSideEffects = 0, SchedRW = [WriteStore] in { +def MOVDQAmr : PDI<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), + "movdqa\t{$src, $dst|$dst, $src}", + [/*(alignedstore (v2i64 VR128:$src), addr:$dst)*/], + IIC_SSE_MOVA_P_MR>; +def MOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), + "movdqu\t{$src, $dst|$dst, $src}", + [/*(store (v2i64 VR128:$src), addr:$dst)*/], + IIC_SSE_MOVU_P_MR>, + XS, Requires<[UseSSE2]>; +} + +} // ExeDomain = SSEPackedInt + +let Predicates = [HasAVX] in { + def : Pat<(int_x86_sse2_storeu_dq addr:$dst, VR128:$src), + (VMOVDQUmr addr:$dst, VR128:$src)>; + def : Pat<(int_x86_avx_storeu_dq_256 addr:$dst, VR256:$src), + (VMOVDQUYmr addr:$dst, VR256:$src)>; +} +let Predicates = [UseSSE2] in +def : Pat<(int_x86_sse2_storeu_dq addr:$dst, VR128:$src), + (MOVDQUmr addr:$dst, VR128:$src)>; + +//===---------------------------------------------------------------------===// +// SSE2 - Packed Integer Arithmetic Instructions +//===---------------------------------------------------------------------===// + +let 
Sched = WriteVecIMul in +def SSE_PMADD : OpndItins< + IIC_SSE_PMADD, IIC_SSE_PMADD +>; + +let ExeDomain = SSEPackedInt in { // SSE integer instructions + +multiclass PDI_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId, + RegisterClass RC, PatFrag memop_frag, + X86MemOperand x86memop, + OpndItins itins, + bit IsCommutable = 0, + bit Is2Addr = 1> { + let isCommutable = IsCommutable in + def rr : PDI<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (IntId RC:$src1, RC:$src2))], itins.rr>, + Sched<[itins.Sched]>; + def rm : PDI<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, x86memop:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (IntId RC:$src1, (bitconvert (memop_frag addr:$src2))))], + itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>; +} + +multiclass PDI_binop_all_int<bits<8> opc, string OpcodeStr, Intrinsic IntId128, + Intrinsic IntId256, OpndItins itins, + bit IsCommutable = 0> { +let Predicates = [HasAVX] in + defm V#NAME : PDI_binop_rm_int<opc, !strconcat("v", OpcodeStr), IntId128, + VR128, loadv2i64, i128mem, itins, + IsCommutable, 0>, VEX_4V; + +let Constraints = "$src1 = $dst" in + defm NAME : PDI_binop_rm_int<opc, OpcodeStr, IntId128, VR128, memopv2i64, + i128mem, itins, IsCommutable, 1>; + +let Predicates = [HasAVX2] in + defm V#NAME#Y : PDI_binop_rm_int<opc, !strconcat("v", OpcodeStr), IntId256, + VR256, loadv4i64, i256mem, itins, + IsCommutable, 0>, VEX_4V, VEX_L; +} + +multiclass PDI_binop_rmi<bits<8> opc, bits<8> opc2, Format ImmForm, + string OpcodeStr, SDNode OpNode, + SDNode OpNode2, RegisterClass RC, + ValueType DstVT, ValueType SrcVT, PatFrag bc_frag, + PatFrag ld_frag, ShiftOpndItins itins, + bit Is2Addr = 1> { + // src2 is always 128-bit + def rr : PDI<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, VR128:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (DstVT (OpNode RC:$src1, (SrcVT VR128:$src2))))], + itins.rr>, Sched<[WriteVecShift]>; + def rm : PDI<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, i128mem:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (DstVT (OpNode RC:$src1, + (bc_frag (ld_frag addr:$src2)))))], itins.rm>, + Sched<[WriteVecShiftLd, ReadAfterLd]>; + def ri : PDIi8<opc2, ImmForm, (outs RC:$dst), + (ins RC:$src1, u8imm:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (DstVT (OpNode2 RC:$src1, (i8 imm:$src2))))], itins.ri>, + Sched<[WriteVecShift]>; +} + +/// PDI_binop_rm2 - Simple SSE2 binary operator with different src and dst types +multiclass PDI_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode, + ValueType DstVT, ValueType SrcVT, RegisterClass RC, + PatFrag memop_frag, X86MemOperand x86memop, + OpndItins itins, + bit IsCommutable = 0, bit Is2Addr = 1> { + let isCommutable = IsCommutable in + def rr : PDI<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, 
$src2}")), + [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), RC:$src2)))]>, + Sched<[itins.Sched]>; + def rm : PDI<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, x86memop:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), + (bitconvert (memop_frag addr:$src2)))))]>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; +} +} // ExeDomain = SSEPackedInt + +defm PADDB : PDI_binop_all<0xFC, "paddb", add, v16i8, v32i8, + SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>; +defm PADDW : PDI_binop_all<0xFD, "paddw", add, v8i16, v16i16, + SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>; +defm PADDD : PDI_binop_all<0xFE, "paddd", add, v4i32, v8i32, + SSE_INTALU_ITINS_P, 1, NoVLX>; +defm PADDQ : PDI_binop_all<0xD4, "paddq", add, v2i64, v4i64, + SSE_INTALUQ_ITINS_P, 1, NoVLX>; +defm PMULLW : PDI_binop_all<0xD5, "pmullw", mul, v8i16, v16i16, + SSE_INTMUL_ITINS_P, 1, NoVLX_Or_NoBWI>; +defm PMULHUW : PDI_binop_all<0xE4, "pmulhuw", mulhu, v8i16, v16i16, + SSE_INTMUL_ITINS_P, 1, NoVLX_Or_NoBWI>; +defm PMULHW : PDI_binop_all<0xE5, "pmulhw", mulhs, v8i16, v16i16, + SSE_INTMUL_ITINS_P, 1, NoVLX_Or_NoBWI>; +defm PSUBB : PDI_binop_all<0xF8, "psubb", sub, v16i8, v32i8, + SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>; +defm PSUBW : PDI_binop_all<0xF9, "psubw", sub, v8i16, v16i16, + SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>; +defm PSUBD : PDI_binop_all<0xFA, "psubd", sub, v4i32, v8i32, + SSE_INTALU_ITINS_P, 0, NoVLX>; +defm PSUBQ : PDI_binop_all<0xFB, "psubq", sub, v2i64, v4i64, + SSE_INTALUQ_ITINS_P, 0, NoVLX>; +defm PSUBUSB : PDI_binop_all<0xD8, "psubusb", X86subus, v16i8, v32i8, + SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>; +defm PSUBUSW : PDI_binop_all<0xD9, "psubusw", X86subus, v8i16, v16i16, + SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>; +defm PMINUB : PDI_binop_all<0xDA, "pminub", umin, v16i8, v32i8, + SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>; +defm PMINSW : PDI_binop_all<0xEA, "pminsw", smin, v8i16, v16i16, + SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>; +defm PMAXUB : PDI_binop_all<0xDE, "pmaxub", umax, v16i8, v32i8, + SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>; +defm PMAXSW : PDI_binop_all<0xEE, "pmaxsw", smax, v8i16, v16i16, + SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>; +defm PAVGB : PDI_binop_all<0xE0, "pavgb", X86avg, v16i8, v32i8, + SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>; +defm PAVGW : PDI_binop_all<0xE3, "pavgw", X86avg, v8i16, v16i16, + SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>; + +// Intrinsic forms +defm PSUBSB : PDI_binop_all_int<0xE8, "psubsb", int_x86_sse2_psubs_b, + int_x86_avx2_psubs_b, SSE_INTALU_ITINS_P, 0>; +defm PSUBSW : PDI_binop_all_int<0xE9, "psubsw" , int_x86_sse2_psubs_w, + int_x86_avx2_psubs_w, SSE_INTALU_ITINS_P, 0>; +defm PADDSB : PDI_binop_all_int<0xEC, "paddsb" , int_x86_sse2_padds_b, + int_x86_avx2_padds_b, SSE_INTALU_ITINS_P, 1>; +defm PADDSW : PDI_binop_all_int<0xED, "paddsw" , int_x86_sse2_padds_w, + int_x86_avx2_padds_w, SSE_INTALU_ITINS_P, 1>; +defm PADDUSB : PDI_binop_all_int<0xDC, "paddusb", int_x86_sse2_paddus_b, + int_x86_avx2_paddus_b, SSE_INTALU_ITINS_P, 1>; +defm PADDUSW : PDI_binop_all_int<0xDD, "paddusw", int_x86_sse2_paddus_w, + int_x86_avx2_paddus_w, SSE_INTALU_ITINS_P, 1>; +defm PMADDWD : PDI_binop_all_int<0xF5, "pmaddwd", int_x86_sse2_pmadd_wd, + int_x86_avx2_pmadd_wd, SSE_PMADD, 1>; + +let Predicates = [HasAVX] in +defm VPSADBW : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v2i64, v16i8, VR128, + loadv2i64, i128mem, SSE_INTMUL_ITINS_P, 1, 0>, + VEX_4V; +let Predicates = 
[HasAVX2] in +defm VPSADBWY : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v4i64, v32i8, VR256, + loadv4i64, i256mem, SSE_INTMUL_ITINS_P, 1, 0>, + VEX_4V, VEX_L; +let Constraints = "$src1 = $dst" in +defm PSADBW : PDI_binop_rm2<0xF6, "psadbw", X86psadbw, v2i64, v16i8, VR128, + memopv2i64, i128mem, SSE_INTALU_ITINS_P, 1>; + +let Predicates = [HasAVX] in +defm VPMULUDQ : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v2i64, v4i32, VR128, + loadv2i64, i128mem, SSE_INTMUL_ITINS_P, 1, 0>, + VEX_4V; +let Predicates = [HasAVX2] in +defm VPMULUDQY : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v4i64, v8i32, + VR256, loadv4i64, i256mem, + SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V, VEX_L; +let Constraints = "$src1 = $dst" in +defm PMULUDQ : PDI_binop_rm2<0xF4, "pmuludq", X86pmuludq, v2i64, v4i32, VR128, + memopv2i64, i128mem, SSE_INTMUL_ITINS_P, 1>; + +//===---------------------------------------------------------------------===// +// SSE2 - Packed Integer Logical Instructions +//===---------------------------------------------------------------------===// + +let Predicates = [HasAVX, NoVLX] in { +defm VPSLLD : PDI_binop_rmi<0xF2, 0x72, MRM6r, "vpslld", X86vshl, X86vshli, + VR128, v4i32, v4i32, bc_v4i32, loadv2i64, + SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; +defm VPSLLQ : PDI_binop_rmi<0xF3, 0x73, MRM6r, "vpsllq", X86vshl, X86vshli, + VR128, v2i64, v2i64, bc_v2i64, loadv2i64, + SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; + +defm VPSRLD : PDI_binop_rmi<0xD2, 0x72, MRM2r, "vpsrld", X86vsrl, X86vsrli, + VR128, v4i32, v4i32, bc_v4i32, loadv2i64, + SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; +defm VPSRLQ : PDI_binop_rmi<0xD3, 0x73, MRM2r, "vpsrlq", X86vsrl, X86vsrli, + VR128, v2i64, v2i64, bc_v2i64, loadv2i64, + SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; + +defm VPSRAD : PDI_binop_rmi<0xE2, 0x72, MRM4r, "vpsrad", X86vsra, X86vsrai, + VR128, v4i32, v4i32, bc_v4i32, loadv2i64, + SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; +} // Predicates = [HasAVX, NoVLX] + +let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { +defm VPSLLW : PDI_binop_rmi<0xF1, 0x71, MRM6r, "vpsllw", X86vshl, X86vshli, + VR128, v8i16, v8i16, bc_v8i16, loadv2i64, + SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; +defm VPSRLW : PDI_binop_rmi<0xD1, 0x71, MRM2r, "vpsrlw", X86vsrl, X86vsrli, + VR128, v8i16, v8i16, bc_v8i16, loadv2i64, + SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; +defm VPSRAW : PDI_binop_rmi<0xE1, 0x71, MRM4r, "vpsraw", X86vsra, X86vsrai, + VR128, v8i16, v8i16, bc_v8i16, loadv2i64, + SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; +} // Predicates = [HasAVX, NoVLX_Or_NoBWI] + + +let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift] , + Predicates = [HasAVX, NoVLX_Or_NoBWI]in { + // 128-bit logical shifts. + def VPSLLDQri : PDIi8<0x73, MRM7r, + (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2), + "vpslldq\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR128:$dst, + (v2i64 (X86vshldq VR128:$src1, (i8 imm:$src2))))]>, + VEX_4V; + def VPSRLDQri : PDIi8<0x73, MRM3r, + (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2), + "vpsrldq\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR128:$dst, + (v2i64 (X86vshrdq VR128:$src1, (i8 imm:$src2))))]>, + VEX_4V; + // PSRADQri doesn't exist in SSE[1-3]. 
+} // Predicates = [HasAVX, NoVLX_Or_NoBWI] + +let Predicates = [HasAVX2, NoVLX] in { +defm VPSLLDY : PDI_binop_rmi<0xF2, 0x72, MRM6r, "vpslld", X86vshl, X86vshli, + VR256, v8i32, v4i32, bc_v4i32, loadv2i64, + SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; +defm VPSLLQY : PDI_binop_rmi<0xF3, 0x73, MRM6r, "vpsllq", X86vshl, X86vshli, + VR256, v4i64, v2i64, bc_v2i64, loadv2i64, + SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; + +defm VPSRLDY : PDI_binop_rmi<0xD2, 0x72, MRM2r, "vpsrld", X86vsrl, X86vsrli, + VR256, v8i32, v4i32, bc_v4i32, loadv2i64, + SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; +defm VPSRLQY : PDI_binop_rmi<0xD3, 0x73, MRM2r, "vpsrlq", X86vsrl, X86vsrli, + VR256, v4i64, v2i64, bc_v2i64, loadv2i64, + SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; + +defm VPSRADY : PDI_binop_rmi<0xE2, 0x72, MRM4r, "vpsrad", X86vsra, X86vsrai, + VR256, v8i32, v4i32, bc_v4i32, loadv2i64, + SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; +}// Predicates = [HasAVX2, NoVLX] + +let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { +defm VPSLLWY : PDI_binop_rmi<0xF1, 0x71, MRM6r, "vpsllw", X86vshl, X86vshli, + VR256, v16i16, v8i16, bc_v8i16, loadv2i64, + SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; +defm VPSRLWY : PDI_binop_rmi<0xD1, 0x71, MRM2r, "vpsrlw", X86vsrl, X86vsrli, + VR256, v16i16, v8i16, bc_v8i16, loadv2i64, + SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; +defm VPSRAWY : PDI_binop_rmi<0xE1, 0x71, MRM4r, "vpsraw", X86vsra, X86vsrai, + VR256, v16i16, v8i16, bc_v8i16, loadv2i64, + SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; +}// Predicates = [HasAVX2, NoVLX_Or_NoBWI] + +let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift], hasSideEffects = 0 , + Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { + // 256-bit logical shifts. + def VPSLLDQYri : PDIi8<0x73, MRM7r, + (outs VR256:$dst), (ins VR256:$src1, u8imm:$src2), + "vpslldq\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR256:$dst, + (v4i64 (X86vshldq VR256:$src1, (i8 imm:$src2))))]>, + VEX_4V, VEX_L; + def VPSRLDQYri : PDIi8<0x73, MRM3r, + (outs VR256:$dst), (ins VR256:$src1, u8imm:$src2), + "vpsrldq\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR256:$dst, + (v4i64 (X86vshrdq VR256:$src1, (i8 imm:$src2))))]>, + VEX_4V, VEX_L; + // PSRADQYri doesn't exist in SSE[1-3]. 
+} // Predicates = [HasAVX2, NoVLX_Or_NoBWI] + +let Constraints = "$src1 = $dst" in { +defm PSLLW : PDI_binop_rmi<0xF1, 0x71, MRM6r, "psllw", X86vshl, X86vshli, + VR128, v8i16, v8i16, bc_v8i16, memopv2i64, + SSE_INTSHIFT_ITINS_P>; +defm PSLLD : PDI_binop_rmi<0xF2, 0x72, MRM6r, "pslld", X86vshl, X86vshli, + VR128, v4i32, v4i32, bc_v4i32, memopv2i64, + SSE_INTSHIFT_ITINS_P>; +defm PSLLQ : PDI_binop_rmi<0xF3, 0x73, MRM6r, "psllq", X86vshl, X86vshli, + VR128, v2i64, v2i64, bc_v2i64, memopv2i64, + SSE_INTSHIFT_ITINS_P>; + +defm PSRLW : PDI_binop_rmi<0xD1, 0x71, MRM2r, "psrlw", X86vsrl, X86vsrli, + VR128, v8i16, v8i16, bc_v8i16, memopv2i64, + SSE_INTSHIFT_ITINS_P>; +defm PSRLD : PDI_binop_rmi<0xD2, 0x72, MRM2r, "psrld", X86vsrl, X86vsrli, + VR128, v4i32, v4i32, bc_v4i32, memopv2i64, + SSE_INTSHIFT_ITINS_P>; +defm PSRLQ : PDI_binop_rmi<0xD3, 0x73, MRM2r, "psrlq", X86vsrl, X86vsrli, + VR128, v2i64, v2i64, bc_v2i64, memopv2i64, + SSE_INTSHIFT_ITINS_P>; + +defm PSRAW : PDI_binop_rmi<0xE1, 0x71, MRM4r, "psraw", X86vsra, X86vsrai, + VR128, v8i16, v8i16, bc_v8i16, memopv2i64, + SSE_INTSHIFT_ITINS_P>; +defm PSRAD : PDI_binop_rmi<0xE2, 0x72, MRM4r, "psrad", X86vsra, X86vsrai, + VR128, v4i32, v4i32, bc_v4i32, memopv2i64, + SSE_INTSHIFT_ITINS_P>; + +let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift], hasSideEffects = 0 in { + // 128-bit logical shifts. + def PSLLDQri : PDIi8<0x73, MRM7r, + (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2), + "pslldq\t{$src2, $dst|$dst, $src2}", + [(set VR128:$dst, + (v2i64 (X86vshldq VR128:$src1, (i8 imm:$src2))))], + IIC_SSE_INTSHDQ_P_RI>; + def PSRLDQri : PDIi8<0x73, MRM3r, + (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2), + "psrldq\t{$src2, $dst|$dst, $src2}", + [(set VR128:$dst, + (v2i64 (X86vshrdq VR128:$src1, (i8 imm:$src2))))], + IIC_SSE_INTSHDQ_P_RI>; + // PSRADQri doesn't exist in SSE[1-3]. 
+} +} // Constraints = "$src1 = $dst" + +//===---------------------------------------------------------------------===// +// SSE2 - Packed Integer Comparison Instructions +//===---------------------------------------------------------------------===// + +defm PCMPEQB : PDI_binop_all<0x74, "pcmpeqb", X86pcmpeq, v16i8, v32i8, + SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>; +defm PCMPEQW : PDI_binop_all<0x75, "pcmpeqw", X86pcmpeq, v8i16, v16i16, + SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>; +defm PCMPEQD : PDI_binop_all<0x76, "pcmpeqd", X86pcmpeq, v4i32, v8i32, + SSE_INTALU_ITINS_P, 1, NoVLX>; +defm PCMPGTB : PDI_binop_all<0x64, "pcmpgtb", X86pcmpgt, v16i8, v32i8, + SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>; +defm PCMPGTW : PDI_binop_all<0x65, "pcmpgtw", X86pcmpgt, v8i16, v16i16, + SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>; +defm PCMPGTD : PDI_binop_all<0x66, "pcmpgtd", X86pcmpgt, v4i32, v8i32, + SSE_INTALU_ITINS_P, 0, NoVLX>; + +//===---------------------------------------------------------------------===// +// SSE2 - Packed Integer Shuffle Instructions +//===---------------------------------------------------------------------===// + +let ExeDomain = SSEPackedInt in { +multiclass sse2_pshuffle<string OpcodeStr, ValueType vt128, ValueType vt256, + SDNode OpNode> { +let Predicates = [HasAVX] in { + def V#NAME#ri : Ii8<0x70, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, u8imm:$src2), + !strconcat("v", OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR128:$dst, + (vt128 (OpNode VR128:$src1, (i8 imm:$src2))))], + IIC_SSE_PSHUF_RI>, VEX, Sched<[WriteShuffle]>; + def V#NAME#mi : Ii8<0x70, MRMSrcMem, (outs VR128:$dst), + (ins i128mem:$src1, u8imm:$src2), + !strconcat("v", OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR128:$dst, + (vt128 (OpNode (bitconvert (loadv2i64 addr:$src1)), + (i8 imm:$src2))))], IIC_SSE_PSHUF_MI>, VEX, + Sched<[WriteShuffleLd]>; +} + +let Predicates = [HasAVX2] in { + def V#NAME#Yri : Ii8<0x70, MRMSrcReg, (outs VR256:$dst), + (ins VR256:$src1, u8imm:$src2), + !strconcat("v", OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR256:$dst, + (vt256 (OpNode VR256:$src1, (i8 imm:$src2))))], + IIC_SSE_PSHUF_RI>, VEX, VEX_L, Sched<[WriteShuffle]>; + def V#NAME#Ymi : Ii8<0x70, MRMSrcMem, (outs VR256:$dst), + (ins i256mem:$src1, u8imm:$src2), + !strconcat("v", OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR256:$dst, + (vt256 (OpNode (bitconvert (loadv4i64 addr:$src1)), + (i8 imm:$src2))))], IIC_SSE_PSHUF_MI>, VEX, VEX_L, + Sched<[WriteShuffleLd]>; +} + +let Predicates = [UseSSE2] in { + def ri : Ii8<0x70, MRMSrcReg, + (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR128:$dst, + (vt128 (OpNode VR128:$src1, (i8 imm:$src2))))], + IIC_SSE_PSHUF_RI>, Sched<[WriteShuffle]>; + def mi : Ii8<0x70, MRMSrcMem, + (outs VR128:$dst), (ins i128mem:$src1, u8imm:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR128:$dst, + (vt128 (OpNode (bitconvert (memopv2i64 addr:$src1)), + (i8 imm:$src2))))], IIC_SSE_PSHUF_MI>, + Sched<[WriteShuffleLd, ReadAfterLd]>; +} +} +} // ExeDomain = SSEPackedInt + +defm PSHUFD : sse2_pshuffle<"pshufd", v4i32, v8i32, X86PShufd>, PD; +defm PSHUFHW : sse2_pshuffle<"pshufhw", v8i16, v16i16, X86PShufhw>, XS; +defm PSHUFLW : sse2_pshuffle<"pshuflw", v8i16, v16i16, X86PShuflw>, XD; + +let Predicates = [HasAVX] in { + def : Pat<(v4f32 (X86PShufd (loadv4f32 addr:$src1), (i8 imm:$imm))), + 
(VPSHUFDmi addr:$src1, imm:$imm)>; + def : Pat<(v4f32 (X86PShufd VR128:$src1, (i8 imm:$imm))), + (VPSHUFDri VR128:$src1, imm:$imm)>; +} + +let Predicates = [UseSSE2] in { + def : Pat<(v4f32 (X86PShufd (memopv4f32 addr:$src1), (i8 imm:$imm))), + (PSHUFDmi addr:$src1, imm:$imm)>; + def : Pat<(v4f32 (X86PShufd VR128:$src1, (i8 imm:$imm))), + (PSHUFDri VR128:$src1, imm:$imm)>; +} + +//===---------------------------------------------------------------------===// +// Packed Integer Pack Instructions (SSE & AVX) +//===---------------------------------------------------------------------===// + +let ExeDomain = SSEPackedInt in { +multiclass sse2_pack<bits<8> opc, string OpcodeStr, ValueType OutVT, + ValueType ArgVT, SDNode OpNode, PatFrag bc_frag, + PatFrag ld_frag, bit Is2Addr = 1> { + def rr : PDI<opc, MRMSrcReg, + (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set VR128:$dst, + (OutVT (OpNode (ArgVT VR128:$src1), VR128:$src2)))]>, + Sched<[WriteShuffle]>; + def rm : PDI<opc, MRMSrcMem, + (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set VR128:$dst, + (OutVT (OpNode VR128:$src1, + (bc_frag (ld_frag addr:$src2)))))]>, + Sched<[WriteShuffleLd, ReadAfterLd]>; +} + +multiclass sse2_pack_y<bits<8> opc, string OpcodeStr, ValueType OutVT, + ValueType ArgVT, SDNode OpNode, PatFrag bc_frag> { + def Yrr : PDI<opc, MRMSrcReg, + (outs VR256:$dst), (ins VR256:$src1, VR256:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR256:$dst, + (OutVT (OpNode (ArgVT VR256:$src1), VR256:$src2)))]>, + Sched<[WriteShuffle]>; + def Yrm : PDI<opc, MRMSrcMem, + (outs VR256:$dst), (ins VR256:$src1, i256mem:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR256:$dst, + (OutVT (OpNode VR256:$src1, + (bc_frag (loadv4i64 addr:$src2)))))]>, + Sched<[WriteShuffleLd, ReadAfterLd]>; +} + +multiclass sse4_pack<bits<8> opc, string OpcodeStr, ValueType OutVT, + ValueType ArgVT, SDNode OpNode, PatFrag bc_frag, + PatFrag ld_frag, bit Is2Addr = 1> { + def rr : SS48I<opc, MRMSrcReg, + (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set VR128:$dst, + (OutVT (OpNode (ArgVT VR128:$src1), VR128:$src2)))]>, + Sched<[WriteShuffle]>; + def rm : SS48I<opc, MRMSrcMem, + (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set VR128:$dst, + (OutVT (OpNode VR128:$src1, + (bc_frag (ld_frag addr:$src2)))))]>, + Sched<[WriteShuffleLd, ReadAfterLd]>; +} + +multiclass sse4_pack_y<bits<8> opc, string OpcodeStr, ValueType OutVT, + ValueType ArgVT, SDNode OpNode, PatFrag bc_frag> { + def Yrr : SS48I<opc, MRMSrcReg, + (outs VR256:$dst), (ins VR256:$src1, VR256:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR256:$dst, + (OutVT (OpNode (ArgVT VR256:$src1), VR256:$src2)))]>, + Sched<[WriteShuffle]>; + def Yrm : SS48I<opc, MRMSrcMem, + (outs VR256:$dst), (ins VR256:$src1, i256mem:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, 
$src2}"), + [(set VR256:$dst, + (OutVT (OpNode VR256:$src1, + (bc_frag (loadv4i64 addr:$src2)))))]>, + Sched<[WriteShuffleLd, ReadAfterLd]>; +} + +let Predicates = [HasAVX] in { + defm VPACKSSWB : sse2_pack<0x63, "vpacksswb", v16i8, v8i16, X86Packss, + bc_v8i16, loadv2i64, 0>, VEX_4V; + defm VPACKSSDW : sse2_pack<0x6B, "vpackssdw", v8i16, v4i32, X86Packss, + bc_v4i32, loadv2i64, 0>, VEX_4V; + + defm VPACKUSWB : sse2_pack<0x67, "vpackuswb", v16i8, v8i16, X86Packus, + bc_v8i16, loadv2i64, 0>, VEX_4V; + defm VPACKUSDW : sse4_pack<0x2B, "vpackusdw", v8i16, v4i32, X86Packus, + bc_v4i32, loadv2i64, 0>, VEX_4V; +} + +let Predicates = [HasAVX2] in { + defm VPACKSSWB : sse2_pack_y<0x63, "vpacksswb", v32i8, v16i16, X86Packss, + bc_v16i16>, VEX_4V, VEX_L; + defm VPACKSSDW : sse2_pack_y<0x6B, "vpackssdw", v16i16, v8i32, X86Packss, + bc_v8i32>, VEX_4V, VEX_L; + + defm VPACKUSWB : sse2_pack_y<0x67, "vpackuswb", v32i8, v16i16, X86Packus, + bc_v16i16>, VEX_4V, VEX_L; + defm VPACKUSDW : sse4_pack_y<0x2B, "vpackusdw", v16i16, v8i32, X86Packus, + bc_v8i32>, VEX_4V, VEX_L; +} + +let Constraints = "$src1 = $dst" in { + defm PACKSSWB : sse2_pack<0x63, "packsswb", v16i8, v8i16, X86Packss, + bc_v8i16, memopv2i64>; + defm PACKSSDW : sse2_pack<0x6B, "packssdw", v8i16, v4i32, X86Packss, + bc_v4i32, memopv2i64>; + + defm PACKUSWB : sse2_pack<0x67, "packuswb", v16i8, v8i16, X86Packus, + bc_v8i16, memopv2i64>; + + let Predicates = [HasSSE41] in + defm PACKUSDW : sse4_pack<0x2B, "packusdw", v8i16, v4i32, X86Packus, + bc_v4i32, memopv2i64>; +} +} // ExeDomain = SSEPackedInt + +//===---------------------------------------------------------------------===// +// SSE2 - Packed Integer Unpack Instructions +//===---------------------------------------------------------------------===// + +let ExeDomain = SSEPackedInt in { +multiclass sse2_unpack<bits<8> opc, string OpcodeStr, ValueType vt, + SDNode OpNode, PatFrag bc_frag, PatFrag ld_frag, + bit Is2Addr = 1> { + def rr : PDI<opc, MRMSrcReg, + (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set VR128:$dst, (vt (OpNode VR128:$src1, VR128:$src2)))], + IIC_SSE_UNPCK>, Sched<[WriteShuffle]>; + def rm : PDI<opc, MRMSrcMem, + (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set VR128:$dst, (OpNode VR128:$src1, + (bc_frag (ld_frag addr:$src2))))], + IIC_SSE_UNPCK>, + Sched<[WriteShuffleLd, ReadAfterLd]>; +} + +multiclass sse2_unpack_y<bits<8> opc, string OpcodeStr, ValueType vt, + SDNode OpNode, PatFrag bc_frag> { + def Yrr : PDI<opc, MRMSrcReg, + (outs VR256:$dst), (ins VR256:$src1, VR256:$src2), + !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR256:$dst, (vt (OpNode VR256:$src1, VR256:$src2)))]>, + Sched<[WriteShuffle]>; + def Yrm : PDI<opc, MRMSrcMem, + (outs VR256:$dst), (ins VR256:$src1, i256mem:$src2), + !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR256:$dst, (OpNode VR256:$src1, + (bc_frag (loadv4i64 addr:$src2))))]>, + Sched<[WriteShuffleLd, ReadAfterLd]>; +} + + +let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { + defm VPUNPCKLBW : sse2_unpack<0x60, "vpunpcklbw", v16i8, X86Unpckl, + bc_v16i8, loadv2i64, 0>, VEX_4V; + defm VPUNPCKLWD : sse2_unpack<0x61, "vpunpcklwd", v8i16, X86Unpckl, + bc_v8i16, loadv2i64, 0>, VEX_4V; + defm 
VPUNPCKHBW : sse2_unpack<0x68, "vpunpckhbw", v16i8, X86Unpckh, + bc_v16i8, loadv2i64, 0>, VEX_4V; + defm VPUNPCKHWD : sse2_unpack<0x69, "vpunpckhwd", v8i16, X86Unpckh, + bc_v8i16, loadv2i64, 0>, VEX_4V; +} +let Predicates = [HasAVX, NoVLX] in { + defm VPUNPCKLDQ : sse2_unpack<0x62, "vpunpckldq", v4i32, X86Unpckl, + bc_v4i32, loadv2i64, 0>, VEX_4V; + defm VPUNPCKLQDQ : sse2_unpack<0x6C, "vpunpcklqdq", v2i64, X86Unpckl, + bc_v2i64, loadv2i64, 0>, VEX_4V; + defm VPUNPCKHDQ : sse2_unpack<0x6A, "vpunpckhdq", v4i32, X86Unpckh, + bc_v4i32, loadv2i64, 0>, VEX_4V; + defm VPUNPCKHQDQ : sse2_unpack<0x6D, "vpunpckhqdq", v2i64, X86Unpckh, + bc_v2i64, loadv2i64, 0>, VEX_4V; +} + +let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { + defm VPUNPCKLBW : sse2_unpack_y<0x60, "vpunpcklbw", v32i8, X86Unpckl, + bc_v32i8>, VEX_4V, VEX_L; + defm VPUNPCKLWD : sse2_unpack_y<0x61, "vpunpcklwd", v16i16, X86Unpckl, + bc_v16i16>, VEX_4V, VEX_L; + defm VPUNPCKHBW : sse2_unpack_y<0x68, "vpunpckhbw", v32i8, X86Unpckh, + bc_v32i8>, VEX_4V, VEX_L; + defm VPUNPCKHWD : sse2_unpack_y<0x69, "vpunpckhwd", v16i16, X86Unpckh, + bc_v16i16>, VEX_4V, VEX_L; +} +let Predicates = [HasAVX2, NoVLX] in { + defm VPUNPCKLDQ : sse2_unpack_y<0x62, "vpunpckldq", v8i32, X86Unpckl, + bc_v8i32>, VEX_4V, VEX_L; + defm VPUNPCKLQDQ : sse2_unpack_y<0x6C, "vpunpcklqdq", v4i64, X86Unpckl, + bc_v4i64>, VEX_4V, VEX_L; + defm VPUNPCKHDQ : sse2_unpack_y<0x6A, "vpunpckhdq", v8i32, X86Unpckh, + bc_v8i32>, VEX_4V, VEX_L; + defm VPUNPCKHQDQ : sse2_unpack_y<0x6D, "vpunpckhqdq", v4i64, X86Unpckh, + bc_v4i64>, VEX_4V, VEX_L; +} + +let Constraints = "$src1 = $dst" in { + defm PUNPCKLBW : sse2_unpack<0x60, "punpcklbw", v16i8, X86Unpckl, + bc_v16i8, memopv2i64>; + defm PUNPCKLWD : sse2_unpack<0x61, "punpcklwd", v8i16, X86Unpckl, + bc_v8i16, memopv2i64>; + defm PUNPCKLDQ : sse2_unpack<0x62, "punpckldq", v4i32, X86Unpckl, + bc_v4i32, memopv2i64>; + defm PUNPCKLQDQ : sse2_unpack<0x6C, "punpcklqdq", v2i64, X86Unpckl, + bc_v2i64, memopv2i64>; + + defm PUNPCKHBW : sse2_unpack<0x68, "punpckhbw", v16i8, X86Unpckh, + bc_v16i8, memopv2i64>; + defm PUNPCKHWD : sse2_unpack<0x69, "punpckhwd", v8i16, X86Unpckh, + bc_v8i16, memopv2i64>; + defm PUNPCKHDQ : sse2_unpack<0x6A, "punpckhdq", v4i32, X86Unpckh, + bc_v4i32, memopv2i64>; + defm PUNPCKHQDQ : sse2_unpack<0x6D, "punpckhqdq", v2i64, X86Unpckh, + bc_v2i64, memopv2i64>; +} +} // ExeDomain = SSEPackedInt + +//===---------------------------------------------------------------------===// +// SSE2 - Packed Integer Extract and Insert +//===---------------------------------------------------------------------===// + +let ExeDomain = SSEPackedInt in { +multiclass sse2_pinsrw<bit Is2Addr = 1> { + def rri : Ii8<0xC4, MRMSrcReg, + (outs VR128:$dst), (ins VR128:$src1, + GR32orGR64:$src2, u8imm:$src3), + !if(Is2Addr, + "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}", + "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR128:$dst, + (X86pinsrw VR128:$src1, GR32orGR64:$src2, imm:$src3))], + IIC_SSE_PINSRW>, Sched<[WriteShuffle]>; + def rmi : Ii8<0xC4, MRMSrcMem, + (outs VR128:$dst), (ins VR128:$src1, + i16mem:$src2, u8imm:$src3), + !if(Is2Addr, + "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}", + "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR128:$dst, + (X86pinsrw VR128:$src1, (extloadi16 addr:$src2), + imm:$src3))], IIC_SSE_PINSRW>, + Sched<[WriteShuffleLd, ReadAfterLd]>; +} + +// Extract +let Predicates = [HasAVX, NoBWI] in +def VPEXTRWri : Ii8<0xC5, MRMSrcReg, + (outs GR32orGR64:$dst), 
(ins VR128:$src1, u8imm:$src2), + "vpextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1), + imm:$src2))]>, PD, VEX, + Sched<[WriteShuffle]>; +def PEXTRWri : PDIi8<0xC5, MRMSrcReg, + (outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2), + "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1), + imm:$src2))], IIC_SSE_PEXTRW>, + Sched<[WriteShuffleLd, ReadAfterLd]>; + +// Insert +let Predicates = [HasAVX, NoBWI] in +defm VPINSRW : sse2_pinsrw<0>, PD, VEX_4V; + +let Predicates = [UseSSE2], Constraints = "$src1 = $dst" in +defm PINSRW : sse2_pinsrw, PD; + +} // ExeDomain = SSEPackedInt + +//===---------------------------------------------------------------------===// +// SSE2 - Packed Mask Creation +//===---------------------------------------------------------------------===// + +let ExeDomain = SSEPackedInt, SchedRW = [WriteVecLogic] in { + +def VPMOVMSKBrr : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), + (ins VR128:$src), + "pmovmskb\t{$src, $dst|$dst, $src}", + [(set GR32orGR64:$dst, (int_x86_sse2_pmovmskb_128 VR128:$src))], + IIC_SSE_MOVMSK>, VEX; + +let Predicates = [HasAVX2] in { +def VPMOVMSKBYrr : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), + (ins VR256:$src), + "pmovmskb\t{$src, $dst|$dst, $src}", + [(set GR32orGR64:$dst, (int_x86_avx2_pmovmskb VR256:$src))]>, + VEX, VEX_L; +} + +def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), (ins VR128:$src), + "pmovmskb\t{$src, $dst|$dst, $src}", + [(set GR32orGR64:$dst, (int_x86_sse2_pmovmskb_128 VR128:$src))], + IIC_SSE_MOVMSK>; + +} // ExeDomain = SSEPackedInt + +//===---------------------------------------------------------------------===// +// SSE2 - Conditional Store +//===---------------------------------------------------------------------===// + +let ExeDomain = SSEPackedInt, SchedRW = [WriteStore] in { + +let Uses = [EDI], Predicates = [HasAVX,Not64BitMode] in +def VMASKMOVDQU : VPDI<0xF7, MRMSrcReg, (outs), + (ins VR128:$src, VR128:$mask), + "maskmovdqu\t{$mask, $src|$src, $mask}", + [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)], + IIC_SSE_MASKMOV>, VEX; +let Uses = [RDI], Predicates = [HasAVX,In64BitMode] in +def VMASKMOVDQU64 : VPDI<0xF7, MRMSrcReg, (outs), + (ins VR128:$src, VR128:$mask), + "maskmovdqu\t{$mask, $src|$src, $mask}", + [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)], + IIC_SSE_MASKMOV>, VEX; + +let Uses = [EDI], Predicates = [UseSSE2,Not64BitMode] in +def MASKMOVDQU : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask), + "maskmovdqu\t{$mask, $src|$src, $mask}", + [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)], + IIC_SSE_MASKMOV>; +let Uses = [RDI], Predicates = [UseSSE2,In64BitMode] in +def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask), + "maskmovdqu\t{$mask, $src|$src, $mask}", + [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)], + IIC_SSE_MASKMOV>; + +} // ExeDomain = SSEPackedInt + +//===---------------------------------------------------------------------===// +// SSE2 - Move Doubleword/Quadword +//===---------------------------------------------------------------------===// + +//===---------------------------------------------------------------------===// +// Move Int Doubleword to Packed Double Int +// +def VMOVDI2PDIrr : VS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v4i32 (scalar_to_vector GR32:$src)))], IIC_SSE_MOVDQ>, + VEX, Sched<[WriteMove]>; 
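+// For example, the register form above and the memory form below correspond
+// to (AT&T syntax)
+//   vmovd %eax, %xmm0        # GR32 -> low dword, upper bits of %xmm0 zeroed
+//   vmovd (%rdi), %xmm0      # same, but loading the dword from memory
+// which is roughly what the _mm_cvtsi32_si128 intrinsic lowers to.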
+def VMOVDI2PDIrm : VS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v4i32 (scalar_to_vector (loadi32 addr:$src))))], + IIC_SSE_MOVDQ>, + VEX, Sched<[WriteLoad]>; +def VMOV64toPQIrr : VRS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src), + "movq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v2i64 (scalar_to_vector GR64:$src)))], + IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>; +let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in +def VMOV64toPQIrm : VRS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), + "movq\t{$src, $dst|$dst, $src}", + [], IIC_SSE_MOVDQ>, VEX, Sched<[WriteLoad]>; +let isCodeGenOnly = 1 in +def VMOV64toSDrr : VRS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src), + "movq\t{$src, $dst|$dst, $src}", + [(set FR64:$dst, (bitconvert GR64:$src))], + IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>; + +def MOVDI2PDIrr : S2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v4i32 (scalar_to_vector GR32:$src)))], IIC_SSE_MOVDQ>, + Sched<[WriteMove]>; +def MOVDI2PDIrm : S2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v4i32 (scalar_to_vector (loadi32 addr:$src))))], + IIC_SSE_MOVDQ>, Sched<[WriteLoad]>; +def MOV64toPQIrr : RS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src), + "mov{d|q}\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v2i64 (scalar_to_vector GR64:$src)))], + IIC_SSE_MOVDQ>, Sched<[WriteMove]>; +let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in +def MOV64toPQIrm : RS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), + "mov{d|q}\t{$src, $dst|$dst, $src}", + [], IIC_SSE_MOVDQ>, Sched<[WriteLoad]>; +let isCodeGenOnly = 1 in +def MOV64toSDrr : RS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src), + "mov{d|q}\t{$src, $dst|$dst, $src}", + [(set FR64:$dst, (bitconvert GR64:$src))], + IIC_SSE_MOVDQ>, Sched<[WriteMove]>; + +//===---------------------------------------------------------------------===// +// Move Int Doubleword to Single Scalar +// +let isCodeGenOnly = 1 in { + def VMOVDI2SSrr : VS2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set FR32:$dst, (bitconvert GR32:$src))], + IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>; + + def VMOVDI2SSrm : VS2I<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))], + IIC_SSE_MOVDQ>, + VEX, Sched<[WriteLoad]>; + def MOVDI2SSrr : S2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set FR32:$dst, (bitconvert GR32:$src))], + IIC_SSE_MOVDQ>, Sched<[WriteMove]>; + + def MOVDI2SSrm : S2I<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))], + IIC_SSE_MOVDQ>, Sched<[WriteLoad]>; +} + +//===---------------------------------------------------------------------===// +// Move Packed Doubleword Int to Packed Double Int +// +def VMOVPDI2DIrr : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (extractelt (v4i32 VR128:$src), + (iPTR 0)))], IIC_SSE_MOVD_ToGP>, VEX, + Sched<[WriteMove]>; +def VMOVPDI2DImr : VS2I<0x7E, MRMDestMem, (outs), + (ins i32mem:$dst, VR128:$src), + "movd\t{$src, $dst|$dst, $src}", + [(store (i32 (extractelt (v4i32 VR128:$src), + 
(iPTR 0))), addr:$dst)], IIC_SSE_MOVDQ>, + VEX, Sched<[WriteStore]>; +def MOVPDI2DIrr : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (extractelt (v4i32 VR128:$src), + (iPTR 0)))], IIC_SSE_MOVD_ToGP>, + Sched<[WriteMove]>; +def MOVPDI2DImr : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src), + "movd\t{$src, $dst|$dst, $src}", + [(store (i32 (extractelt (v4i32 VR128:$src), + (iPTR 0))), addr:$dst)], + IIC_SSE_MOVDQ>, Sched<[WriteStore]>; + +def : Pat<(v8i32 (X86Vinsert (v8i32 immAllZerosV), GR32:$src2, (iPTR 0))), + (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrr GR32:$src2), sub_xmm)>; + +def : Pat<(v4i64 (X86Vinsert (bc_v4i64 (v8i32 immAllZerosV)), GR64:$src2, (iPTR 0))), + (SUBREG_TO_REG (i32 0), (VMOV64toPQIrr GR64:$src2), sub_xmm)>; + +def : Pat<(v8i32 (X86Vinsert undef, GR32:$src2, (iPTR 0))), + (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrr GR32:$src2), sub_xmm)>; + +def : Pat<(v4i64 (X86Vinsert undef, GR64:$src2, (iPTR 0))), + (SUBREG_TO_REG (i32 0), (VMOV64toPQIrr GR64:$src2), sub_xmm)>; + +//===---------------------------------------------------------------------===// +// Move Packed Doubleword Int first element to Doubleword Int +// +let SchedRW = [WriteMove] in { +def VMOVPQIto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src), + "movq\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (extractelt (v2i64 VR128:$src), + (iPTR 0)))], + IIC_SSE_MOVD_ToGP>, + VEX; + +def MOVPQIto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src), + "mov{d|q}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (extractelt (v2i64 VR128:$src), + (iPTR 0)))], + IIC_SSE_MOVD_ToGP>; +} //SchedRW + +let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in +def VMOVPQIto64rm : VRS2I<0x7E, MRMDestMem, (outs), + (ins i64mem:$dst, VR128:$src), + "movq\t{$src, $dst|$dst, $src}", + [], IIC_SSE_MOVDQ>, VEX, Sched<[WriteStore]>; +let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in +def MOVPQIto64rm : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), + "mov{d|q}\t{$src, $dst|$dst, $src}", + [], IIC_SSE_MOVDQ>, Sched<[WriteStore]>; + +//===---------------------------------------------------------------------===// +// Bitcast FR64 <-> GR64 +// +let isCodeGenOnly = 1 in { + let Predicates = [UseAVX] in + def VMOV64toSDrm : VS2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src), + "movq\t{$src, $dst|$dst, $src}", + [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))]>, + VEX, Sched<[WriteLoad]>; + def VMOVSDto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src), + "movq\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (bitconvert FR64:$src))], + IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>; + def VMOVSDto64mr : VRS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src), + "movq\t{$src, $dst|$dst, $src}", + [(store (i64 (bitconvert FR64:$src)), addr:$dst)], + IIC_SSE_MOVDQ>, VEX, Sched<[WriteStore]>; + + def MOV64toSDrm : S2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src), + "movq\t{$src, $dst|$dst, $src}", + [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))], + IIC_SSE_MOVDQ>, Sched<[WriteLoad]>; + def MOVSDto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src), + "mov{d|q}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (bitconvert FR64:$src))], + IIC_SSE_MOVD_ToGP>, Sched<[WriteMove]>; + def MOVSDto64mr : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src), + "movq\t{$src, $dst|$dst, $src}", + [(store (i64 (bitconvert FR64:$src)), addr:$dst)], + 
IIC_SSE_MOVDQ>, Sched<[WriteStore]>; +} + +//===---------------------------------------------------------------------===// +// Move Scalar Single to Double Int +// +let isCodeGenOnly = 1 in { + def VMOVSS2DIrr : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (bitconvert FR32:$src))], + IIC_SSE_MOVD_ToGP>, VEX, Sched<[WriteMove]>; + def VMOVSS2DImr : VS2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src), + "movd\t{$src, $dst|$dst, $src}", + [(store (i32 (bitconvert FR32:$src)), addr:$dst)], + IIC_SSE_MOVDQ>, VEX, Sched<[WriteStore]>; + def MOVSS2DIrr : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (bitconvert FR32:$src))], + IIC_SSE_MOVD_ToGP>, Sched<[WriteMove]>; + def MOVSS2DImr : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src), + "movd\t{$src, $dst|$dst, $src}", + [(store (i32 (bitconvert FR32:$src)), addr:$dst)], + IIC_SSE_MOVDQ>, Sched<[WriteStore]>; +} + +let Predicates = [UseAVX] in { + let AddedComplexity = 15 in { + def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))), + (VMOVDI2PDIrr GR32:$src)>; + + def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))), + (VMOV64toPQIrr GR64:$src)>; + + def : Pat<(v4i64 (X86vzmovl (insert_subvector undef, + (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))), + (SUBREG_TO_REG (i64 0), (VMOV64toPQIrr GR64:$src), sub_xmm)>; + } + // AVX 128-bit movd/movq instructions write zeros in the high 128-bit part. + // These instructions also write zeros in the high part of a 256-bit register. + let AddedComplexity = 20 in { + def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))), + (VMOVDI2PDIrm addr:$src)>; + def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))), + (VMOVDI2PDIrm addr:$src)>; + def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))), + (VMOVDI2PDIrm addr:$src)>; + def : Pat<(v8i32 (X86vzmovl (insert_subvector undef, + (v4i32 (scalar_to_vector (loadi32 addr:$src))), (iPTR 0)))), + (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrm addr:$src), sub_xmm)>; + } + // Use regular 128-bit instructions to match 256-bit scalar_to_vec+zext. + def : Pat<(v8i32 (X86vzmovl (insert_subvector undef, + (v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))), + (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrr GR32:$src), sub_xmm)>; +} + +let Predicates = [UseSSE2] in { + let AddedComplexity = 15 in { + def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))), + (MOVDI2PDIrr GR32:$src)>; + + def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))), + (MOV64toPQIrr GR64:$src)>; + } + let AddedComplexity = 20 in { + def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))), + (MOVDI2PDIrm addr:$src)>; + def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))), + (MOVDI2PDIrm addr:$src)>; + def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))), + (MOVDI2PDIrm addr:$src)>; + } +} + +// These are the correct encodings of the instructions so that we know how to +// read correct assembly, even though we continue to emit the wrong ones for +// compatibility with Darwin's buggy assembler. +def : InstAlias<"movq\t{$src, $dst|$dst, $src}", + (MOV64toPQIrr VR128:$dst, GR64:$src), 0>; +def : InstAlias<"movq\t{$src, $dst|$dst, $src}", + (MOVPQIto64rr GR64:$dst, VR128:$src), 0>; +// Allow "vmovd" but print "vmovq" since we don't need compatibility for AVX. 
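+// Roughly, the effect of these aliases is that the assembler accepts both
+// spellings of the 64-bit GPR<->XMM move,
+//   movq  %rax, %xmm0       # matched by the "movq" aliases above
+//   vmovd %rax, %xmm0       # matched by the "vmovd" aliases below
+// while printing is unchanged: the trailing 0 marks each alias as
+// parse-only, so the printer keeps using the instructions' own asm strings.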
+def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}", + (VMOV64toPQIrr VR128:$dst, GR64:$src), 0>; +def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}", + (VMOVPQIto64rr GR64:$dst, VR128:$src), 0>; + +//===---------------------------------------------------------------------===// +// SSE2 - Move Quadword +//===---------------------------------------------------------------------===// + +//===---------------------------------------------------------------------===// +// Move Quadword Int to Packed Quadword Int +// + +let ExeDomain = SSEPackedInt, SchedRW = [WriteLoad] in { +def VMOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), + "vmovq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, XS, + VEX, Requires<[UseAVX]>; +def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), + "movq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v2i64 (scalar_to_vector (loadi64 addr:$src))))], + IIC_SSE_MOVDQ>, XS, + Requires<[UseSSE2]>; // SSE2 instruction with XS Prefix +} // ExeDomain, SchedRW + +//===---------------------------------------------------------------------===// +// Move Packed Quadword Int to Quadword Int +// +let ExeDomain = SSEPackedInt, SchedRW = [WriteStore] in { +def VMOVPQI2QImr : VS2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), + "movq\t{$src, $dst|$dst, $src}", + [(store (i64 (extractelt (v2i64 VR128:$src), + (iPTR 0))), addr:$dst)], + IIC_SSE_MOVDQ>, VEX; +def MOVPQI2QImr : S2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), + "movq\t{$src, $dst|$dst, $src}", + [(store (i64 (extractelt (v2i64 VR128:$src), + (iPTR 0))), addr:$dst)], + IIC_SSE_MOVDQ>; +} // ExeDomain, SchedRW + +// For disassembler only +let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, + SchedRW = [WriteVecLogic] in { +def VMOVPQI2QIrr : VS2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), + "movq\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVQ_RR>, VEX; +def MOVPQI2QIrr : S2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), + "movq\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVQ_RR>; +} + +//===---------------------------------------------------------------------===// +// Store / copy lower 64-bits of a XMM register. 
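The MOVQ load above zero-extends a 64-bit memory operand into an XMM register, and MOVPQI2QImr stores only the low quadword; the storel_dq patterns that follow select the same store. These map onto the SSE2 intrinsics _mm_loadl_epi64 and _mm_storel_epi64. A minimal C++ sketch for reference (editorial illustration, not part of the diff):

#include <immintrin.h>
#include <cassert>
#include <cstdint>

int main() {
  int64_t src = 42;   // neither intrinsic requires 16-byte alignment
  // movq (%rdi), %xmm0: load 64 bits, zero bits [127:64].
  __m128i v = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&src));
  int64_t dst = -1;
  // movq %xmm0, (%rsi): store only the low 64 bits.
  _mm_storel_epi64(reinterpret_cast<__m128i *>(&dst), v);
  assert(dst == 42);
  return 0;
}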
+// +let Predicates = [HasAVX] in +def : Pat<(int_x86_sse2_storel_dq addr:$dst, VR128:$src), + (VMOVPQI2QImr addr:$dst, VR128:$src)>; +let Predicates = [UseSSE2] in +def : Pat<(int_x86_sse2_storel_dq addr:$dst, VR128:$src), + (MOVPQI2QImr addr:$dst, VR128:$src)>; + +let ExeDomain = SSEPackedInt, isCodeGenOnly = 1, AddedComplexity = 20 in { +def VMOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), + "vmovq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v2i64 (X86vzmovl (v2i64 (scalar_to_vector + (loadi64 addr:$src))))))], + IIC_SSE_MOVDQ>, + XS, VEX, Requires<[UseAVX]>, Sched<[WriteLoad]>; + +def MOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), + "movq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v2i64 (X86vzmovl (v2i64 (scalar_to_vector + (loadi64 addr:$src))))))], + IIC_SSE_MOVDQ>, + XS, Requires<[UseSSE2]>, Sched<[WriteLoad]>; +} // ExeDomain, isCodeGenOnly, AddedComplexity + +let Predicates = [UseAVX], AddedComplexity = 20 in { + def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4f32 addr:$src)))), + (VMOVZQI2PQIrm addr:$src)>; + def : Pat<(v2i64 (X86vzload addr:$src)), + (VMOVZQI2PQIrm addr:$src)>; + def : Pat<(v4i64 (X86vzmovl (insert_subvector undef, + (v2i64 (scalar_to_vector (loadi64 addr:$src))), (iPTR 0)))), + (SUBREG_TO_REG (i64 0), (VMOVZQI2PQIrm addr:$src), sub_xmm)>; +} + +let Predicates = [UseSSE2], AddedComplexity = 20 in { + def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4f32 addr:$src)))), + (MOVZQI2PQIrm addr:$src)>; + def : Pat<(v2i64 (X86vzload addr:$src)), (MOVZQI2PQIrm addr:$src)>; +} + +let Predicates = [HasAVX] in { +def : Pat<(v4i64 (alignedX86vzload addr:$src)), + (SUBREG_TO_REG (i32 0), (VMOVAPSrm addr:$src), sub_xmm)>; +def : Pat<(v4i64 (X86vzload addr:$src)), + (SUBREG_TO_REG (i32 0), (VMOVUPSrm addr:$src), sub_xmm)>; +} + +//===---------------------------------------------------------------------===// +// Moving from XMM to XMM and clear upper 64 bits. Note, there is a bug in +// IA32 document. movq xmm1, xmm2 does clear the high bits. 
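As the comment says, register-to-register movq copies the low quadword and clears bits [127:64] of the destination, despite the wording in the manual it references; that zeroing behaviour is what the X86vzmovl defs below select. The SSE2 intrinsic _mm_move_epi64 exposes it directly; a short illustrative C++ example (not part of the patch):

#include <immintrin.h>
#include <cassert>
#include <cstdint>

int main() {
  // Lanes are {low = 7, high = -1}.
  __m128i v = _mm_set_epi64x(/*high*/ -1, /*low*/ 7);
  // movq %xmm0, %xmm1: keep the low quadword, zero the high one.
  __m128i z = _mm_move_epi64(v);
  alignas(16) int64_t lanes[2];
  _mm_store_si128(reinterpret_cast<__m128i *>(lanes), z);
  assert(lanes[0] == 7 && lanes[1] == 0);
  return 0;
}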
+// +let ExeDomain = SSEPackedInt, SchedRW = [WriteVecLogic] in { +let AddedComplexity = 15 in +def VMOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "vmovq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))], + IIC_SSE_MOVQ_RR>, + XS, VEX, Requires<[UseAVX]>; +let AddedComplexity = 15 in +def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "movq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))], + IIC_SSE_MOVQ_RR>, + XS, Requires<[UseSSE2]>; +} // ExeDomain, SchedRW + +let ExeDomain = SSEPackedInt, isCodeGenOnly = 1, SchedRW = [WriteVecLogicLd] in { +let AddedComplexity = 20 in +def VMOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), + "vmovq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (v2i64 (X86vzmovl + (loadv2i64 addr:$src))))], + IIC_SSE_MOVDQ>, + XS, VEX, Requires<[UseAVX]>; +let AddedComplexity = 20 in { +def MOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), + "movq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (v2i64 (X86vzmovl + (loadv2i64 addr:$src))))], + IIC_SSE_MOVDQ>, + XS, Requires<[UseSSE2]>; +} +} // ExeDomain, isCodeGenOnly, SchedRW + +let AddedComplexity = 20 in { + let Predicates = [UseAVX] in { + def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))), + (VMOVZPQILo2PQIrr VR128:$src)>; + } + let Predicates = [UseSSE2] in { + def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))), + (MOVZPQILo2PQIrr VR128:$src)>; + } +} + +//===---------------------------------------------------------------------===// +// SSE3 - Replicate Single FP - MOVSHDUP and MOVSLDUP +//===---------------------------------------------------------------------===// +multiclass sse3_replicate_sfp<bits<8> op, SDNode OpNode, string OpcodeStr, + ValueType vt, RegisterClass RC, PatFrag mem_frag, + X86MemOperand x86memop> { +def rr : S3SI<op, MRMSrcReg, (outs RC:$dst), (ins RC:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set RC:$dst, (vt (OpNode RC:$src)))], + IIC_SSE_MOV_LH>, Sched<[WriteFShuffle]>; +def rm : S3SI<op, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set RC:$dst, (OpNode (mem_frag addr:$src)))], + IIC_SSE_MOV_LH>, Sched<[WriteLoad]>; +} + +let Predicates = [HasAVX, NoVLX] in { + defm VMOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup", + v4f32, VR128, loadv4f32, f128mem>, VEX; + defm VMOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup", + v4f32, VR128, loadv4f32, f128mem>, VEX; + defm VMOVSHDUPY : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup", + v8f32, VR256, loadv8f32, f256mem>, VEX, VEX_L; + defm VMOVSLDUPY : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup", + v8f32, VR256, loadv8f32, f256mem>, VEX, VEX_L; +} +defm MOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "movshdup", v4f32, VR128, + memopv4f32, f128mem>; +defm MOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "movsldup", v4f32, VR128, + memopv4f32, f128mem>; + +let Predicates = [HasAVX, NoVLX] in { + def : Pat<(v4i32 (X86Movshdup VR128:$src)), + (VMOVSHDUPrr VR128:$src)>; + def : Pat<(v4i32 (X86Movshdup (bc_v4i32 (loadv2i64 addr:$src)))), + (VMOVSHDUPrm addr:$src)>; + def : Pat<(v4i32 (X86Movsldup VR128:$src)), + (VMOVSLDUPrr VR128:$src)>; + def : Pat<(v4i32 (X86Movsldup (bc_v4i32 (loadv2i64 addr:$src)))), + (VMOVSLDUPrm addr:$src)>; + def : Pat<(v8i32 (X86Movshdup VR256:$src)), + (VMOVSHDUPYrr VR256:$src)>; + def : Pat<(v8i32 (X86Movshdup (bc_v8i32 (loadv4i64 
addr:$src)))), + (VMOVSHDUPYrm addr:$src)>; + def : Pat<(v8i32 (X86Movsldup VR256:$src)), + (VMOVSLDUPYrr VR256:$src)>; + def : Pat<(v8i32 (X86Movsldup (bc_v8i32 (loadv4i64 addr:$src)))), + (VMOVSLDUPYrm addr:$src)>; +} + +let Predicates = [UseSSE3] in { + def : Pat<(v4i32 (X86Movshdup VR128:$src)), + (MOVSHDUPrr VR128:$src)>; + def : Pat<(v4i32 (X86Movshdup (bc_v4i32 (memopv2i64 addr:$src)))), + (MOVSHDUPrm addr:$src)>; + def : Pat<(v4i32 (X86Movsldup VR128:$src)), + (MOVSLDUPrr VR128:$src)>; + def : Pat<(v4i32 (X86Movsldup (bc_v4i32 (memopv2i64 addr:$src)))), + (MOVSLDUPrm addr:$src)>; +} + +//===---------------------------------------------------------------------===// +// SSE3 - Replicate Double FP - MOVDDUP +//===---------------------------------------------------------------------===// + +multiclass sse3_replicate_dfp<string OpcodeStr> { +def rr : S3DI<0x12, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (v2f64 (X86Movddup VR128:$src)))], + IIC_SSE_MOV_LH>, Sched<[WriteFShuffle]>; +def rm : S3DI<0x12, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, + (v2f64 (X86Movddup + (scalar_to_vector (loadf64 addr:$src)))))], + IIC_SSE_MOV_LH>, Sched<[WriteLoad]>; +} + +// FIXME: Merge with above classe when there're patterns for the ymm version +multiclass sse3_replicate_dfp_y<string OpcodeStr> { +def rr : S3DI<0x12, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR256:$dst, (v4f64 (X86Movddup VR256:$src)))]>, + Sched<[WriteFShuffle]>; +def rm : S3DI<0x12, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR256:$dst, + (v4f64 (X86Movddup (loadv4f64 addr:$src))))]>, + Sched<[WriteLoad]>; +} + +let Predicates = [HasAVX, NoVLX] in { + defm VMOVDDUP : sse3_replicate_dfp<"vmovddup">, VEX; + defm VMOVDDUPY : sse3_replicate_dfp_y<"vmovddup">, VEX, VEX_L; +} + +defm MOVDDUP : sse3_replicate_dfp<"movddup">; + + +let Predicates = [HasAVX, NoVLX] in { + def : Pat<(X86Movddup (loadv2f64 addr:$src)), + (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>; + + // 256-bit version + def : Pat<(X86Movddup (loadv4i64 addr:$src)), + (VMOVDDUPYrm addr:$src)>; + def : Pat<(X86Movddup (v4i64 VR256:$src)), + (VMOVDDUPYrr VR256:$src)>; +} + +let Predicates = [HasAVX] in { + def : Pat<(X86Movddup (bc_v2f64 (loadv4f32 addr:$src))), + (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>; + def : Pat<(X86Movddup (bc_v2f64 (loadv2i64 addr:$src))), + (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>; + def : Pat<(X86Movddup (bc_v2f64 + (v2i64 (scalar_to_vector (loadi64 addr:$src))))), + (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>; +} + +let Predicates = [UseAVX, OptForSize] in { + def : Pat<(v2f64 (X86VBroadcast (loadf64 addr:$src))), + (VMOVDDUPrm addr:$src)>; + def : Pat<(v2i64 (X86VBroadcast (loadi64 addr:$src))), + (VMOVDDUPrm addr:$src)>; +} + +let Predicates = [UseSSE3] in { + def : Pat<(X86Movddup (memopv2f64 addr:$src)), + (MOVDDUPrm addr:$src)>; + def : Pat<(X86Movddup (bc_v2f64 (memopv4f32 addr:$src))), + (MOVDDUPrm addr:$src)>; + def : Pat<(X86Movddup (bc_v2f64 (memopv2i64 addr:$src))), + (MOVDDUPrm addr:$src)>; + def : Pat<(X86Movddup (bc_v2f64 + (v2i64 (scalar_to_vector (loadi64 addr:$src))))), + (MOVDDUPrm addr:$src)>; +} + +//===---------------------------------------------------------------------===// +// SSE3 - Move Unaligned Integer 
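For orientation, the SSE3 replicate instructions defined above duplicate alternating single-precision lanes (movshdup, movsldup) or the low double (movddup). Their intrinsics make the lane mapping explicit; the snippet below is an editorial illustration only (assumes SSE3 is enabled, not part of this patch).

#include <immintrin.h>
#include <cassert>

int main() {
  __m128 v = _mm_setr_ps(0.f, 1.f, 2.f, 3.f);
  float hi[4], lo[4];
  _mm_storeu_ps(hi, _mm_movehdup_ps(v));   // movshdup: {1, 1, 3, 3}
  _mm_storeu_ps(lo, _mm_moveldup_ps(v));   // movsldup: {0, 0, 2, 2}
  assert(hi[0] == 1.f && hi[1] == 1.f && hi[2] == 3.f && hi[3] == 3.f);
  assert(lo[0] == 0.f && lo[1] == 0.f && lo[2] == 2.f && lo[3] == 2.f);

  double d[2];
  _mm_storeu_pd(d, _mm_movedup_pd(_mm_setr_pd(5.0, 6.0)));  // movddup: {5, 5}
  assert(d[0] == 5.0 && d[1] == 5.0);
  return 0;
}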
+//===---------------------------------------------------------------------===// + +let SchedRW = [WriteLoad] in { +let Predicates = [HasAVX] in { + def VLDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), + "vlddqu\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>, VEX; + def VLDDQUYrm : S3DI<0xF0, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src), + "vlddqu\t{$src, $dst|$dst, $src}", + [(set VR256:$dst, (int_x86_avx_ldu_dq_256 addr:$src))]>, + VEX, VEX_L; +} +def LDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), + "lddqu\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))], + IIC_SSE_LDDQU>; +} + +//===---------------------------------------------------------------------===// +// SSE3 - Arithmetic +//===---------------------------------------------------------------------===// + +multiclass sse3_addsub<Intrinsic Int, string OpcodeStr, RegisterClass RC, + X86MemOperand x86memop, OpndItins itins, + PatFrag ld_frag, bit Is2Addr = 1> { + def rr : I<0xD0, MRMSrcReg, + (outs RC:$dst), (ins RC:$src1, RC:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (Int RC:$src1, RC:$src2))], itins.rr>, + Sched<[itins.Sched]>; + def rm : I<0xD0, MRMSrcMem, + (outs RC:$dst), (ins RC:$src1, x86memop:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (Int RC:$src1, (ld_frag addr:$src2)))], itins.rr>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; +} + +let Predicates = [HasAVX] in { + let ExeDomain = SSEPackedSingle in { + defm VADDSUBPS : sse3_addsub<int_x86_sse3_addsub_ps, "vaddsubps", VR128, + f128mem, SSE_ALU_F32P, loadv4f32, 0>, XD, VEX_4V; + defm VADDSUBPSY : sse3_addsub<int_x86_avx_addsub_ps_256, "vaddsubps", VR256, + f256mem, SSE_ALU_F32P, loadv8f32, 0>, XD, VEX_4V, VEX_L; + } + let ExeDomain = SSEPackedDouble in { + defm VADDSUBPD : sse3_addsub<int_x86_sse3_addsub_pd, "vaddsubpd", VR128, + f128mem, SSE_ALU_F64P, loadv2f64, 0>, PD, VEX_4V; + defm VADDSUBPDY : sse3_addsub<int_x86_avx_addsub_pd_256, "vaddsubpd", VR256, + f256mem, SSE_ALU_F64P, loadv4f64, 0>, PD, VEX_4V, VEX_L; + } +} +let Constraints = "$src1 = $dst", Predicates = [UseSSE3] in { + let ExeDomain = SSEPackedSingle in + defm ADDSUBPS : sse3_addsub<int_x86_sse3_addsub_ps, "addsubps", VR128, + f128mem, SSE_ALU_F32P, memopv4f32>, XD; + let ExeDomain = SSEPackedDouble in + defm ADDSUBPD : sse3_addsub<int_x86_sse3_addsub_pd, "addsubpd", VR128, + f128mem, SSE_ALU_F64P, memopv2f64>, PD; +} + +// Patterns used to select 'addsub' instructions. 
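The patterns below map the X86Addsub node onto these instructions. addsubps/addsubpd subtract in the even lanes and add in the odd lanes, which is what makes them useful for interleaved complex arithmetic. A small illustrative C++ example of the SSE3 intrinsic (editor's sketch, not part of the diff):

#include <immintrin.h>
#include <cassert>

int main() {
  __m128 a = _mm_setr_ps(10.f, 10.f, 10.f, 10.f);
  __m128 b = _mm_setr_ps( 1.f,  2.f,  3.f,  4.f);
  // addsubps: lane0 = a0 - b0, lane1 = a1 + b1, lane2 = a2 - b2, lane3 = a3 + b3.
  float r[4];
  _mm_storeu_ps(r, _mm_addsub_ps(a, b));
  assert(r[0] == 9.f && r[1] == 12.f && r[2] == 7.f && r[3] == 14.f);
  return 0;
}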
+let Predicates = [HasAVX] in { + def : Pat<(v4f32 (X86Addsub (v4f32 VR128:$lhs), (v4f32 VR128:$rhs))), + (VADDSUBPSrr VR128:$lhs, VR128:$rhs)>; + def : Pat<(v4f32 (X86Addsub (v4f32 VR128:$lhs), (loadv4f32 addr:$rhs))), + (VADDSUBPSrm VR128:$lhs, f128mem:$rhs)>; + def : Pat<(v2f64 (X86Addsub (v2f64 VR128:$lhs), (v2f64 VR128:$rhs))), + (VADDSUBPDrr VR128:$lhs, VR128:$rhs)>; + def : Pat<(v2f64 (X86Addsub (v2f64 VR128:$lhs), (loadv2f64 addr:$rhs))), + (VADDSUBPDrm VR128:$lhs, f128mem:$rhs)>; + + def : Pat<(v8f32 (X86Addsub (v8f32 VR256:$lhs), (v8f32 VR256:$rhs))), + (VADDSUBPSYrr VR256:$lhs, VR256:$rhs)>; + def : Pat<(v8f32 (X86Addsub (v8f32 VR256:$lhs), (loadv8f32 addr:$rhs))), + (VADDSUBPSYrm VR256:$lhs, f256mem:$rhs)>; + def : Pat<(v4f64 (X86Addsub (v4f64 VR256:$lhs), (v4f64 VR256:$rhs))), + (VADDSUBPDYrr VR256:$lhs, VR256:$rhs)>; + def : Pat<(v4f64 (X86Addsub (v4f64 VR256:$lhs), (loadv4f64 addr:$rhs))), + (VADDSUBPDYrm VR256:$lhs, f256mem:$rhs)>; +} + +let Predicates = [UseSSE3] in { + def : Pat<(v4f32 (X86Addsub (v4f32 VR128:$lhs), (v4f32 VR128:$rhs))), + (ADDSUBPSrr VR128:$lhs, VR128:$rhs)>; + def : Pat<(v4f32 (X86Addsub (v4f32 VR128:$lhs), (memopv4f32 addr:$rhs))), + (ADDSUBPSrm VR128:$lhs, f128mem:$rhs)>; + def : Pat<(v2f64 (X86Addsub (v2f64 VR128:$lhs), (v2f64 VR128:$rhs))), + (ADDSUBPDrr VR128:$lhs, VR128:$rhs)>; + def : Pat<(v2f64 (X86Addsub (v2f64 VR128:$lhs), (memopv2f64 addr:$rhs))), + (ADDSUBPDrm VR128:$lhs, f128mem:$rhs)>; +} + +//===---------------------------------------------------------------------===// +// SSE3 Instructions +//===---------------------------------------------------------------------===// + +// Horizontal ops +multiclass S3D_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC, + X86MemOperand x86memop, SDNode OpNode, PatFrag ld_frag, + bit Is2Addr = 1> { + def rr : S3DI<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], IIC_SSE_HADDSUB_RR>, + Sched<[WriteFAdd]>; + + def rm : S3DI<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))], + IIC_SSE_HADDSUB_RM>, Sched<[WriteFAddLd, ReadAfterLd]>; +} +multiclass S3_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC, + X86MemOperand x86memop, SDNode OpNode, PatFrag ld_frag, + bit Is2Addr = 1> { + def rr : S3I<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], IIC_SSE_HADDSUB_RR>, + Sched<[WriteFAdd]>; + + def rm : S3I<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))], + IIC_SSE_HADDSUB_RM>, Sched<[WriteFAddLd, ReadAfterLd]>; +} + +let Predicates = [HasAVX] in { + let ExeDomain = SSEPackedSingle in { + defm VHADDPS : S3D_Int<0x7C, "vhaddps", v4f32, VR128, f128mem, + X86fhadd, loadv4f32, 0>, VEX_4V; + defm VHSUBPS : S3D_Int<0x7D, "vhsubps", v4f32, VR128, f128mem, + X86fhsub, 
loadv4f32, 0>, VEX_4V; + defm VHADDPSY : S3D_Int<0x7C, "vhaddps", v8f32, VR256, f256mem, + X86fhadd, loadv8f32, 0>, VEX_4V, VEX_L; + defm VHSUBPSY : S3D_Int<0x7D, "vhsubps", v8f32, VR256, f256mem, + X86fhsub, loadv8f32, 0>, VEX_4V, VEX_L; + } + let ExeDomain = SSEPackedDouble in { + defm VHADDPD : S3_Int <0x7C, "vhaddpd", v2f64, VR128, f128mem, + X86fhadd, loadv2f64, 0>, VEX_4V; + defm VHSUBPD : S3_Int <0x7D, "vhsubpd", v2f64, VR128, f128mem, + X86fhsub, loadv2f64, 0>, VEX_4V; + defm VHADDPDY : S3_Int <0x7C, "vhaddpd", v4f64, VR256, f256mem, + X86fhadd, loadv4f64, 0>, VEX_4V, VEX_L; + defm VHSUBPDY : S3_Int <0x7D, "vhsubpd", v4f64, VR256, f256mem, + X86fhsub, loadv4f64, 0>, VEX_4V, VEX_L; + } +} + +let Constraints = "$src1 = $dst" in { + let ExeDomain = SSEPackedSingle in { + defm HADDPS : S3D_Int<0x7C, "haddps", v4f32, VR128, f128mem, X86fhadd, + memopv4f32>; + defm HSUBPS : S3D_Int<0x7D, "hsubps", v4f32, VR128, f128mem, X86fhsub, + memopv4f32>; + } + let ExeDomain = SSEPackedDouble in { + defm HADDPD : S3_Int<0x7C, "haddpd", v2f64, VR128, f128mem, X86fhadd, + memopv2f64>; + defm HSUBPD : S3_Int<0x7D, "hsubpd", v2f64, VR128, f128mem, X86fhsub, + memopv2f64>; + } +} + +//===---------------------------------------------------------------------===// +// SSSE3 - Packed Absolute Instructions +//===---------------------------------------------------------------------===// + + +/// SS3I_unop_rm_int - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}. +multiclass SS3I_unop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId128, + PatFrag ld_frag> { + def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (IntId128 VR128:$src))], IIC_SSE_PABS_RR>, + Sched<[WriteVecALU]>; + + def rm128 : SS38I<opc, MRMSrcMem, (outs VR128:$dst), + (ins i128mem:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, + (IntId128 + (bitconvert (ld_frag addr:$src))))], IIC_SSE_PABS_RM>, + Sched<[WriteVecALULd]>; +} + +/// SS3I_unop_rm_int_y - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}. +multiclass SS3I_unop_rm_int_y<bits<8> opc, string OpcodeStr, + Intrinsic IntId256> { + def rr256 : SS38I<opc, MRMSrcReg, (outs VR256:$dst), + (ins VR256:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR256:$dst, (IntId256 VR256:$src))]>, + Sched<[WriteVecALU]>; + + def rm256 : SS38I<opc, MRMSrcMem, (outs VR256:$dst), + (ins i256mem:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR256:$dst, + (IntId256 + (bitconvert (loadv4i64 addr:$src))))]>, + Sched<[WriteVecALULd]>; +} + +// Helper fragments to match sext vXi1 to vXiY. 
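The sign-extension fragments below feed the pabs patterns further down: they recognize the classic branch-free absolute-value idiom (for 32-bit lanes, abs(x) = (x + m) ^ m with m = x >> 31, arithmetic shift) and replace it with a single pabsb/pabsw/pabsd. A scalar and vector illustration in C++ (editorial sketch, assumes SSSE3; not part of the patch):

#include <immintrin.h>
#include <cassert>
#include <cstdint>

// The idiom matched by the xor/add patterns: m is all-ones for negative x, zero otherwise.
static int32_t abs_idiom(int32_t x) {
  int32_t m = x >> 31;          // arithmetic shift, the "sext vXi1" fragment
  return (x + m) ^ m;           // add then xor, the DAG shape being matched
}

int main() {
  assert(abs_idiom(-5) == 5 && abs_idiom(7) == 7);

  // The vector form of the same idiom, which the patterns rewrite into one pabsd.
  __m128i x = _mm_setr_epi32(-5, 7, -9, 0);
  __m128i m = _mm_srai_epi32(x, 31);
  __m128i idiom = _mm_xor_si128(_mm_add_epi32(x, m), m);
  __m128i pabs  = _mm_abs_epi32(x);
  assert(_mm_movemask_epi8(_mm_cmpeq_epi32(idiom, pabs)) == 0xFFFF);
  return 0;
}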
+def v16i1sextv16i8 : PatLeaf<(v16i8 (X86pcmpgt (bc_v16i8 (v4i32 immAllZerosV)), + VR128:$src))>; +def v8i1sextv8i16 : PatLeaf<(v8i16 (X86vsrai VR128:$src, (i8 15)))>; +def v4i1sextv4i32 : PatLeaf<(v4i32 (X86vsrai VR128:$src, (i8 31)))>; +def v32i1sextv32i8 : PatLeaf<(v32i8 (X86pcmpgt (bc_v32i8 (v8i32 immAllZerosV)), + VR256:$src))>; +def v16i1sextv16i16: PatLeaf<(v16i16 (X86vsrai VR256:$src, (i8 15)))>; +def v8i1sextv8i32 : PatLeaf<(v8i32 (X86vsrai VR256:$src, (i8 31)))>; + +let Predicates = [HasAVX] in { + defm VPABSB : SS3I_unop_rm_int<0x1C, "vpabsb", int_x86_ssse3_pabs_b_128, + loadv2i64>, VEX; + defm VPABSW : SS3I_unop_rm_int<0x1D, "vpabsw", int_x86_ssse3_pabs_w_128, + loadv2i64>, VEX; + defm VPABSD : SS3I_unop_rm_int<0x1E, "vpabsd", int_x86_ssse3_pabs_d_128, + loadv2i64>, VEX; + + def : Pat<(xor + (bc_v2i64 (v16i1sextv16i8)), + (bc_v2i64 (add (v16i8 VR128:$src), (v16i1sextv16i8)))), + (VPABSBrr128 VR128:$src)>; + def : Pat<(xor + (bc_v2i64 (v8i1sextv8i16)), + (bc_v2i64 (add (v8i16 VR128:$src), (v8i1sextv8i16)))), + (VPABSWrr128 VR128:$src)>; + def : Pat<(xor + (bc_v2i64 (v4i1sextv4i32)), + (bc_v2i64 (add (v4i32 VR128:$src), (v4i1sextv4i32)))), + (VPABSDrr128 VR128:$src)>; +} + +let Predicates = [HasAVX2] in { + defm VPABSB : SS3I_unop_rm_int_y<0x1C, "vpabsb", + int_x86_avx2_pabs_b>, VEX, VEX_L; + defm VPABSW : SS3I_unop_rm_int_y<0x1D, "vpabsw", + int_x86_avx2_pabs_w>, VEX, VEX_L; + defm VPABSD : SS3I_unop_rm_int_y<0x1E, "vpabsd", + int_x86_avx2_pabs_d>, VEX, VEX_L; + + def : Pat<(xor + (bc_v4i64 (v32i1sextv32i8)), + (bc_v4i64 (add (v32i8 VR256:$src), (v32i1sextv32i8)))), + (VPABSBrr256 VR256:$src)>; + def : Pat<(xor + (bc_v4i64 (v16i1sextv16i16)), + (bc_v4i64 (add (v16i16 VR256:$src), (v16i1sextv16i16)))), + (VPABSWrr256 VR256:$src)>; + def : Pat<(xor + (bc_v4i64 (v8i1sextv8i32)), + (bc_v4i64 (add (v8i32 VR256:$src), (v8i1sextv8i32)))), + (VPABSDrr256 VR256:$src)>; +} + +defm PABSB : SS3I_unop_rm_int<0x1C, "pabsb", int_x86_ssse3_pabs_b_128, + memopv2i64>; +defm PABSW : SS3I_unop_rm_int<0x1D, "pabsw", int_x86_ssse3_pabs_w_128, + memopv2i64>; +defm PABSD : SS3I_unop_rm_int<0x1E, "pabsd", int_x86_ssse3_pabs_d_128, + memopv2i64>; + +let Predicates = [HasSSSE3] in { + def : Pat<(xor + (bc_v2i64 (v16i1sextv16i8)), + (bc_v2i64 (add (v16i8 VR128:$src), (v16i1sextv16i8)))), + (PABSBrr128 VR128:$src)>; + def : Pat<(xor + (bc_v2i64 (v8i1sextv8i16)), + (bc_v2i64 (add (v8i16 VR128:$src), (v8i1sextv8i16)))), + (PABSWrr128 VR128:$src)>; + def : Pat<(xor + (bc_v2i64 (v4i1sextv4i32)), + (bc_v2i64 (add (v4i32 VR128:$src), (v4i1sextv4i32)))), + (PABSDrr128 VR128:$src)>; +} + +//===---------------------------------------------------------------------===// +// SSSE3 - Packed Binary Operator Instructions +//===---------------------------------------------------------------------===// + +let Sched = WriteVecALU in { +def SSE_PHADDSUBD : OpndItins< + IIC_SSE_PHADDSUBD_RR, IIC_SSE_PHADDSUBD_RM +>; +def SSE_PHADDSUBSW : OpndItins< + IIC_SSE_PHADDSUBSW_RR, IIC_SSE_PHADDSUBSW_RM +>; +def SSE_PHADDSUBW : OpndItins< + IIC_SSE_PHADDSUBW_RR, IIC_SSE_PHADDSUBW_RM +>; +} +let Sched = WriteShuffle in +def SSE_PSHUFB : OpndItins< + IIC_SSE_PSHUFB_RR, IIC_SSE_PSHUFB_RM +>; +let Sched = WriteVecALU in +def SSE_PSIGN : OpndItins< + IIC_SSE_PSIGN_RR, IIC_SSE_PSIGN_RM +>; +let Sched = WriteVecIMul in +def SSE_PMULHRSW : OpndItins< + IIC_SSE_PMULHRSW, IIC_SSE_PMULHRSW +>; + +/// SS3I_binop_rm - Simple SSSE3 bin op +multiclass SS3I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, + ValueType OpVT, RegisterClass RC, 
PatFrag memop_frag, + X86MemOperand x86memop, OpndItins itins, + bit Is2Addr = 1> { + let isCommutable = 1 in + def rr : SS38I<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))], itins.rr>, + Sched<[itins.Sched]>; + def rm : SS38I<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, x86memop:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, + (OpVT (OpNode RC:$src1, + (bitconvert (memop_frag addr:$src2)))))], itins.rm>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; +} + +/// SS3I_binop_rm_int - Simple SSSE3 bin op whose type can be v*{i8,i16,i32}. +multiclass SS3I_binop_rm_int<bits<8> opc, string OpcodeStr, + Intrinsic IntId128, OpndItins itins, + PatFrag ld_frag, bit Is2Addr = 1> { + let isCommutable = 1 in + def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>, + Sched<[itins.Sched]>; + def rm128 : SS38I<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i128mem:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set VR128:$dst, + (IntId128 VR128:$src1, + (bitconvert (ld_frag addr:$src2))))]>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; +} + +multiclass SS3I_binop_rm_int_y<bits<8> opc, string OpcodeStr, + Intrinsic IntId256, + X86FoldableSchedWrite Sched> { + let isCommutable = 1 in + def rr256 : SS38I<opc, MRMSrcReg, (outs VR256:$dst), + (ins VR256:$src1, VR256:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR256:$dst, (IntId256 VR256:$src1, VR256:$src2))]>, + Sched<[Sched]>; + def rm256 : SS38I<opc, MRMSrcMem, (outs VR256:$dst), + (ins VR256:$src1, i256mem:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR256:$dst, + (IntId256 VR256:$src1, (bitconvert (loadv4i64 addr:$src2))))]>, + Sched<[Sched.Folded, ReadAfterLd]>; +} + +let ImmT = NoImm, Predicates = [HasAVX] in { +let isCommutable = 0 in { + defm VPHADDW : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v8i16, VR128, + loadv2i64, i128mem, + SSE_PHADDSUBW, 0>, VEX_4V; + defm VPHADDD : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v4i32, VR128, + loadv2i64, i128mem, + SSE_PHADDSUBD, 0>, VEX_4V; + defm VPHSUBW : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v8i16, VR128, + loadv2i64, i128mem, + SSE_PHADDSUBW, 0>, VEX_4V; + defm VPHSUBD : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v4i32, VR128, + loadv2i64, i128mem, + SSE_PHADDSUBD, 0>, VEX_4V; + defm VPSIGNB : SS3I_binop_rm<0x08, "vpsignb", X86psign, v16i8, VR128, + loadv2i64, i128mem, + SSE_PSIGN, 0>, VEX_4V; + defm VPSIGNW : SS3I_binop_rm<0x09, "vpsignw", X86psign, v8i16, VR128, + loadv2i64, i128mem, + SSE_PSIGN, 0>, VEX_4V; + defm VPSIGND : SS3I_binop_rm<0x0A, "vpsignd", X86psign, v4i32, VR128, + loadv2i64, i128mem, + SSE_PSIGN, 0>, VEX_4V; + defm VPSHUFB : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v16i8, VR128, + loadv2i64, i128mem, + SSE_PSHUFB, 0>, VEX_4V; + defm VPHADDSW : SS3I_binop_rm_int<0x03, "vphaddsw", + int_x86_ssse3_phadd_sw_128, + SSE_PHADDSUBSW, loadv2i64, 0>, VEX_4V; + 
defm VPHSUBSW : SS3I_binop_rm_int<0x07, "vphsubsw", + int_x86_ssse3_phsub_sw_128, + SSE_PHADDSUBSW, loadv2i64, 0>, VEX_4V; + defm VPMADDUBSW : SS3I_binop_rm_int<0x04, "vpmaddubsw", + int_x86_ssse3_pmadd_ub_sw_128, + SSE_PMADD, loadv2i64, 0>, VEX_4V; +} +defm VPMULHRSW : SS3I_binop_rm_int<0x0B, "vpmulhrsw", + int_x86_ssse3_pmul_hr_sw_128, + SSE_PMULHRSW, loadv2i64, 0>, VEX_4V; +} + +let ImmT = NoImm, Predicates = [HasAVX2] in { +let isCommutable = 0 in { + defm VPHADDWY : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v16i16, VR256, + loadv4i64, i256mem, + SSE_PHADDSUBW, 0>, VEX_4V, VEX_L; + defm VPHADDDY : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v8i32, VR256, + loadv4i64, i256mem, + SSE_PHADDSUBW, 0>, VEX_4V, VEX_L; + defm VPHSUBWY : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v16i16, VR256, + loadv4i64, i256mem, + SSE_PHADDSUBW, 0>, VEX_4V, VEX_L; + defm VPHSUBDY : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v8i32, VR256, + loadv4i64, i256mem, + SSE_PHADDSUBW, 0>, VEX_4V, VEX_L; + defm VPSIGNBY : SS3I_binop_rm<0x08, "vpsignb", X86psign, v32i8, VR256, + loadv4i64, i256mem, + SSE_PHADDSUBW, 0>, VEX_4V, VEX_L; + defm VPSIGNWY : SS3I_binop_rm<0x09, "vpsignw", X86psign, v16i16, VR256, + loadv4i64, i256mem, + SSE_PHADDSUBW, 0>, VEX_4V, VEX_L; + defm VPSIGNDY : SS3I_binop_rm<0x0A, "vpsignd", X86psign, v8i32, VR256, + loadv4i64, i256mem, + SSE_PHADDSUBW, 0>, VEX_4V, VEX_L; + defm VPSHUFBY : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v32i8, VR256, + loadv4i64, i256mem, + SSE_PSHUFB, 0>, VEX_4V, VEX_L; + defm VPHADDSW : SS3I_binop_rm_int_y<0x03, "vphaddsw", + int_x86_avx2_phadd_sw, + WriteVecALU>, VEX_4V, VEX_L; + defm VPHSUBSW : SS3I_binop_rm_int_y<0x07, "vphsubsw", + int_x86_avx2_phsub_sw, + WriteVecALU>, VEX_4V, VEX_L; + defm VPMADDUBSW : SS3I_binop_rm_int_y<0x04, "vpmaddubsw", + int_x86_avx2_pmadd_ub_sw, + WriteVecIMul>, VEX_4V, VEX_L; +} +defm VPMULHRSW : SS3I_binop_rm_int_y<0x0B, "vpmulhrsw", + int_x86_avx2_pmul_hr_sw, + WriteVecIMul>, VEX_4V, VEX_L; +} + +// None of these have i8 immediate fields. 
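The legacy (non-VEX) encodings of the same SSSE3 operations follow. As a reminder of what two of the less obvious ones compute, phaddw sums adjacent word pairs and pshufb is a byte-granular table lookup whose mask high bit writes zero; a brief illustrative C++ snippet (SSSE3 assumed, not part of this patch):

#include <immintrin.h>
#include <cassert>
#include <cstdint>

int main() {
  // phaddw: out = {a0+a1, a2+a3, a4+a5, a6+a7, b0+b1, ...}.
  __m128i a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
  alignas(16) int16_t hw[8];
  _mm_store_si128(reinterpret_cast<__m128i *>(hw), _mm_hadd_epi16(a, a));
  assert(hw[0] == 3 && hw[1] == 7 && hw[2] == 11 && hw[3] == 15);

  // pshufb: each mask byte selects a source byte; a set high bit yields zero instead.
  __m128i bytes = _mm_setr_epi8(10, 11, 12, 13, 14, 15, 16, 17,
                                18, 19, 20, 21, 22, 23, 24, 25);
  __m128i mask  = _mm_setr_epi8(3, 3, 0, 0, -128, 0, 0, 0,
                                0, 0, 0, 0, 0, 0, 0, 0);
  alignas(16) int8_t out[16];
  _mm_store_si128(reinterpret_cast<__m128i *>(out), _mm_shuffle_epi8(bytes, mask));
  assert(out[0] == 13 && out[1] == 13 && out[2] == 10 && out[4] == 0);
  return 0;
}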
+let ImmT = NoImm, Constraints = "$src1 = $dst" in { +let isCommutable = 0 in { + defm PHADDW : SS3I_binop_rm<0x01, "phaddw", X86hadd, v8i16, VR128, + memopv2i64, i128mem, SSE_PHADDSUBW>; + defm PHADDD : SS3I_binop_rm<0x02, "phaddd", X86hadd, v4i32, VR128, + memopv2i64, i128mem, SSE_PHADDSUBD>; + defm PHSUBW : SS3I_binop_rm<0x05, "phsubw", X86hsub, v8i16, VR128, + memopv2i64, i128mem, SSE_PHADDSUBW>; + defm PHSUBD : SS3I_binop_rm<0x06, "phsubd", X86hsub, v4i32, VR128, + memopv2i64, i128mem, SSE_PHADDSUBD>; + defm PSIGNB : SS3I_binop_rm<0x08, "psignb", X86psign, v16i8, VR128, + memopv2i64, i128mem, SSE_PSIGN>; + defm PSIGNW : SS3I_binop_rm<0x09, "psignw", X86psign, v8i16, VR128, + memopv2i64, i128mem, SSE_PSIGN>; + defm PSIGND : SS3I_binop_rm<0x0A, "psignd", X86psign, v4i32, VR128, + memopv2i64, i128mem, SSE_PSIGN>; + defm PSHUFB : SS3I_binop_rm<0x00, "pshufb", X86pshufb, v16i8, VR128, + memopv2i64, i128mem, SSE_PSHUFB>; + defm PHADDSW : SS3I_binop_rm_int<0x03, "phaddsw", + int_x86_ssse3_phadd_sw_128, + SSE_PHADDSUBSW, memopv2i64>; + defm PHSUBSW : SS3I_binop_rm_int<0x07, "phsubsw", + int_x86_ssse3_phsub_sw_128, + SSE_PHADDSUBSW, memopv2i64>; + defm PMADDUBSW : SS3I_binop_rm_int<0x04, "pmaddubsw", + int_x86_ssse3_pmadd_ub_sw_128, + SSE_PMADD, memopv2i64>; +} +defm PMULHRSW : SS3I_binop_rm_int<0x0B, "pmulhrsw", + int_x86_ssse3_pmul_hr_sw_128, + SSE_PMULHRSW, memopv2i64>; +} + +//===---------------------------------------------------------------------===// +// SSSE3 - Packed Align Instruction Patterns +//===---------------------------------------------------------------------===// + +multiclass ssse3_palignr<string asm, bit Is2Addr = 1> { + let hasSideEffects = 0 in { + def R128rr : SS3AI<0x0F, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, u8imm:$src3), + !if(Is2Addr, + !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(asm, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [], IIC_SSE_PALIGNRR>, Sched<[WriteShuffle]>; + let mayLoad = 1 in + def R128rm : SS3AI<0x0F, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i128mem:$src2, u8imm:$src3), + !if(Is2Addr, + !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(asm, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [], IIC_SSE_PALIGNRM>, Sched<[WriteShuffleLd, ReadAfterLd]>; + } +} + +multiclass ssse3_palignr_y<string asm, bit Is2Addr = 1> { + let hasSideEffects = 0 in { + def R256rr : SS3AI<0x0F, MRMSrcReg, (outs VR256:$dst), + (ins VR256:$src1, VR256:$src2, u8imm:$src3), + !strconcat(asm, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + []>, Sched<[WriteShuffle]>; + let mayLoad = 1 in + def R256rm : SS3AI<0x0F, MRMSrcMem, (outs VR256:$dst), + (ins VR256:$src1, i256mem:$src2, u8imm:$src3), + !strconcat(asm, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + []>, Sched<[WriteShuffleLd, ReadAfterLd]>; + } +} + +let Predicates = [HasAVX] in + defm VPALIGN : ssse3_palignr<"vpalignr", 0>, VEX_4V; +let Predicates = [HasAVX2] in + defm VPALIGN : ssse3_palignr_y<"vpalignr", 0>, VEX_4V, VEX_L; +let Constraints = "$src1 = $dst", Predicates = [UseSSSE3] in + defm PALIGN : ssse3_palignr<"palignr">; + +let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { +def : Pat<(v8i32 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))), + (VPALIGNR256rr VR256:$src1, VR256:$src2, imm:$imm)>; +def : Pat<(v8f32 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))), + (VPALIGNR256rr VR256:$src1, VR256:$src2, imm:$imm)>; +def : Pat<(v16i16 (X86PAlignr 
VR256:$src1, VR256:$src2, (i8 imm:$imm))), + (VPALIGNR256rr VR256:$src1, VR256:$src2, imm:$imm)>; +def : Pat<(v32i8 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))), + (VPALIGNR256rr VR256:$src1, VR256:$src2, imm:$imm)>; +} + +let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { +def : Pat<(v4i32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))), + (VPALIGNR128rr VR128:$src1, VR128:$src2, imm:$imm)>; +def : Pat<(v4f32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))), + (VPALIGNR128rr VR128:$src1, VR128:$src2, imm:$imm)>; +def : Pat<(v8i16 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))), + (VPALIGNR128rr VR128:$src1, VR128:$src2, imm:$imm)>; +def : Pat<(v16i8 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))), + (VPALIGNR128rr VR128:$src1, VR128:$src2, imm:$imm)>; +} + +let Predicates = [UseSSSE3] in { +def : Pat<(v4i32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))), + (PALIGNR128rr VR128:$src1, VR128:$src2, imm:$imm)>; +def : Pat<(v4f32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))), + (PALIGNR128rr VR128:$src1, VR128:$src2, imm:$imm)>; +def : Pat<(v8i16 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))), + (PALIGNR128rr VR128:$src1, VR128:$src2, imm:$imm)>; +def : Pat<(v16i8 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))), + (PALIGNR128rr VR128:$src1, VR128:$src2, imm:$imm)>; +} + +//===---------------------------------------------------------------------===// +// SSSE3 - Thread synchronization +//===---------------------------------------------------------------------===// + +let SchedRW = [WriteSystem] in { +let usesCustomInserter = 1 in { +def MONITOR : PseudoI<(outs), (ins i32mem:$src1, GR32:$src2, GR32:$src3), + [(int_x86_sse3_monitor addr:$src1, GR32:$src2, GR32:$src3)]>, + Requires<[HasSSE3]>; +} + +let Uses = [EAX, ECX, EDX] in +def MONITORrrr : I<0x01, MRM_C8, (outs), (ins), "monitor", [], IIC_SSE_MONITOR>, + TB, Requires<[HasSSE3]>; +let Uses = [ECX, EAX] in +def MWAITrr : I<0x01, MRM_C9, (outs), (ins), "mwait", + [(int_x86_sse3_mwait ECX, EAX)], IIC_SSE_MWAIT>, + TB, Requires<[HasSSE3]>; +} // SchedRW + +def : InstAlias<"mwait\t{%eax, %ecx|ecx, eax}", (MWAITrr)>, Requires<[Not64BitMode]>; +def : InstAlias<"mwait\t{%rax, %rcx|rcx, rax}", (MWAITrr)>, Requires<[In64BitMode]>; + +def : InstAlias<"monitor\t{%eax, %ecx, %edx|edx, ecx, eax}", (MONITORrrr)>, + Requires<[Not64BitMode]>; +def : InstAlias<"monitor\t{%rax, %rcx, %rdx|rdx, rcx, rax}", (MONITORrrr)>, + Requires<[In64BitMode]>; + +//===----------------------------------------------------------------------===// +// SSE4.1 - Packed Move with Sign/Zero Extend +//===----------------------------------------------------------------------===// + +multiclass SS41I_pmovx_rrrm<bits<8> opc, string OpcodeStr, X86MemOperand MemOp, + RegisterClass OutRC, RegisterClass InRC, + OpndItins itins> { + def rr : SS48I<opc, MRMSrcReg, (outs OutRC:$dst), (ins InRC:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [], itins.rr>, + Sched<[itins.Sched]>; + + def rm : SS48I<opc, MRMSrcMem, (outs OutRC:$dst), (ins MemOp:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [], + itins.rm>, Sched<[itins.Sched.Folded]>; +} + +multiclass SS41I_pmovx_rm_all<bits<8> opc, string OpcodeStr, + X86MemOperand MemOp, X86MemOperand MemYOp, + OpndItins SSEItins, OpndItins AVXItins, + OpndItins AVX2Itins> { + defm NAME : SS41I_pmovx_rrrm<opc, OpcodeStr, MemOp, VR128, VR128, SSEItins>; + let Predicates = [HasAVX, NoVLX] in + defm V#NAME : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemOp, + VR128, 
VR128, AVXItins>, VEX; + let Predicates = [HasAVX2, NoVLX] in + defm V#NAME#Y : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemYOp, + VR256, VR128, AVX2Itins>, VEX, VEX_L; +} + +multiclass SS41I_pmovx_rm<bits<8> opc, string OpcodeStr, + X86MemOperand MemOp, X86MemOperand MemYOp> { + defm PMOVSX#NAME : SS41I_pmovx_rm_all<opc, !strconcat("pmovsx", OpcodeStr), + MemOp, MemYOp, + SSE_INTALU_ITINS_SHUFF_P, + DEFAULT_ITINS_SHUFFLESCHED, + DEFAULT_ITINS_SHUFFLESCHED>; + defm PMOVZX#NAME : SS41I_pmovx_rm_all<!add(opc, 0x10), + !strconcat("pmovzx", OpcodeStr), + MemOp, MemYOp, + SSE_INTALU_ITINS_SHUFF_P, + DEFAULT_ITINS_SHUFFLESCHED, + DEFAULT_ITINS_SHUFFLESCHED>; +} + +defm BW : SS41I_pmovx_rm<0x20, "bw", i64mem, i128mem>; +defm WD : SS41I_pmovx_rm<0x23, "wd", i64mem, i128mem>; +defm DQ : SS41I_pmovx_rm<0x25, "dq", i64mem, i128mem>; + +defm BD : SS41I_pmovx_rm<0x21, "bd", i32mem, i64mem>; +defm WQ : SS41I_pmovx_rm<0x24, "wq", i32mem, i64mem>; + +defm BQ : SS41I_pmovx_rm<0x22, "bq", i16mem, i32mem>; + +// AVX2 Patterns +multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, string ExtTy, SDNode ExtOp> { + // Register-Register patterns + def : Pat<(v16i16 (ExtOp (v16i8 VR128:$src))), + (!cast<I>(OpcPrefix#BWYrr) VR128:$src)>; + def : Pat<(v8i32 (ExtOp (v16i8 VR128:$src))), + (!cast<I>(OpcPrefix#BDYrr) VR128:$src)>; + def : Pat<(v4i64 (ExtOp (v16i8 VR128:$src))), + (!cast<I>(OpcPrefix#BQYrr) VR128:$src)>; + + def : Pat<(v8i32 (ExtOp (v8i16 VR128:$src))), + (!cast<I>(OpcPrefix#WDYrr) VR128:$src)>; + def : Pat<(v4i64 (ExtOp (v8i16 VR128:$src))), + (!cast<I>(OpcPrefix#WQYrr) VR128:$src)>; + + def : Pat<(v4i64 (ExtOp (v4i32 VR128:$src))), + (!cast<I>(OpcPrefix#DQYrr) VR128:$src)>; + + // On AVX2, we also support 256bit inputs. + def : Pat<(v16i16 (ExtOp (v32i8 VR256:$src))), + (!cast<I>(OpcPrefix#BWYrr) (EXTRACT_SUBREG VR256:$src, sub_xmm))>; + def : Pat<(v8i32 (ExtOp (v32i8 VR256:$src))), + (!cast<I>(OpcPrefix#BDYrr) (EXTRACT_SUBREG VR256:$src, sub_xmm))>; + def : Pat<(v4i64 (ExtOp (v32i8 VR256:$src))), + (!cast<I>(OpcPrefix#BQYrr) (EXTRACT_SUBREG VR256:$src, sub_xmm))>; + + def : Pat<(v8i32 (ExtOp (v16i16 VR256:$src))), + (!cast<I>(OpcPrefix#WDYrr) (EXTRACT_SUBREG VR256:$src, sub_xmm))>; + def : Pat<(v4i64 (ExtOp (v16i16 VR256:$src))), + (!cast<I>(OpcPrefix#WQYrr) (EXTRACT_SUBREG VR256:$src, sub_xmm))>; + + def : Pat<(v4i64 (ExtOp (v8i32 VR256:$src))), + (!cast<I>(OpcPrefix#DQYrr) (EXTRACT_SUBREG VR256:$src, sub_xmm))>; + + // Simple Register-Memory patterns + def : Pat<(v16i16 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)), + (!cast<I>(OpcPrefix#BWYrm) addr:$src)>; + def : Pat<(v8i32 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)), + (!cast<I>(OpcPrefix#BDYrm) addr:$src)>; + def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)), + (!cast<I>(OpcPrefix#BQYrm) addr:$src)>; + + def : Pat<(v8i32 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)), + (!cast<I>(OpcPrefix#WDYrm) addr:$src)>; + def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)), + (!cast<I>(OpcPrefix#WQYrm) addr:$src)>; + + def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi32") addr:$src)), + (!cast<I>(OpcPrefix#DQYrm) addr:$src)>; + + // AVX2 Register-Memory patterns + def : Pat<(v16i16 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BWYrm) addr:$src)>; + def : Pat<(v16i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BWYrm) addr:$src)>; + def : Pat<(v16i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BWYrm) addr:$src)>; + def : Pat<(v16i16 (ExtOp (bc_v16i8 
(loadv2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BWYrm) addr:$src)>; + + def : Pat<(v8i32 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), + (!cast<I>(OpcPrefix#BDYrm) addr:$src)>; + def : Pat<(v8i32 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BDYrm) addr:$src)>; + def : Pat<(v8i32 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BDYrm) addr:$src)>; + def : Pat<(v8i32 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BDYrm) addr:$src)>; + + def : Pat<(v4i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))), + (!cast<I>(OpcPrefix#BQYrm) addr:$src)>; + def : Pat<(v4i64 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))), + (!cast<I>(OpcPrefix#BQYrm) addr:$src)>; + def : Pat<(v4i64 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BQYrm) addr:$src)>; + def : Pat<(v4i64 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BQYrm) addr:$src)>; + + def : Pat<(v8i32 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))), + (!cast<I>(OpcPrefix#WDYrm) addr:$src)>; + def : Pat<(v8i32 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#WDYrm) addr:$src)>; + def : Pat<(v8i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#WDYrm) addr:$src)>; + def : Pat<(v8i32 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))), + (!cast<I>(OpcPrefix#WDYrm) addr:$src)>; + + def : Pat<(v4i64 (ExtOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), + (!cast<I>(OpcPrefix#WQYrm) addr:$src)>; + def : Pat<(v4i64 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#WQYrm) addr:$src)>; + def : Pat<(v4i64 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#WQYrm) addr:$src)>; + def : Pat<(v4i64 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))), + (!cast<I>(OpcPrefix#WQYrm) addr:$src)>; + + def : Pat<(v4i64 (ExtOp (bc_v4i32 (loadv2i64 addr:$src)))), + (!cast<I>(OpcPrefix#DQYrm) addr:$src)>; + def : Pat<(v4i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#DQYrm) addr:$src)>; + def : Pat<(v4i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#DQYrm) addr:$src)>; + def : Pat<(v4i64 (ExtOp (bc_v4i32 (loadv2i64 addr:$src)))), + (!cast<I>(OpcPrefix#DQYrm) addr:$src)>; +} + +let Predicates = [HasAVX2, NoVLX] in { + defm : SS41I_pmovx_avx2_patterns<"VPMOVSX", "s", X86vsext>; + defm : SS41I_pmovx_avx2_patterns<"VPMOVZX", "z", X86vzext>; +} + +// SSE4.1/AVX patterns. 
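These extension patterns fold the various ways a 64-bit (or smaller) chunk can be loaded into the pmovsx/pmovzx instruction itself. At the intrinsic level the distinction is simply _mm_cvtepi8_epi16 (sign extend) versus _mm_cvtepu8_epi16 (zero extend); a hedged C++ illustration (SSE4.1 assumed, not part of the diff):

#include <immintrin.h>
#include <cassert>
#include <cstdint>

int main() {
  // Eight bytes in memory, loaded with a 64-bit movq-style load...
  const int8_t bytes[8] = {-1, 2, -3, 4, -5, 6, -7, 8};
  __m128i lo = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(bytes));

  alignas(16) int16_t s[8], z[8];
  // ...then widened to eight 16-bit lanes: pmovsxbw sign-extends, pmovzxbw zero-extends.
  _mm_store_si128(reinterpret_cast<__m128i *>(s), _mm_cvtepi8_epi16(lo));
  _mm_store_si128(reinterpret_cast<__m128i *>(z), _mm_cvtepu8_epi16(lo));
  assert(s[0] == -1 && z[0] == 255);
  assert(s[6] == -7 && z[6] == 249);
  return 0;
}

With optimization enabled the separate load typically folds into the instruction's memory operand, which is the folding these register-memory patterns describe.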
+multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy, + SDNode ExtOp, PatFrag ExtLoad16> { + def : Pat<(v8i16 (ExtOp (v16i8 VR128:$src))), + (!cast<I>(OpcPrefix#BWrr) VR128:$src)>; + def : Pat<(v4i32 (ExtOp (v16i8 VR128:$src))), + (!cast<I>(OpcPrefix#BDrr) VR128:$src)>; + def : Pat<(v2i64 (ExtOp (v16i8 VR128:$src))), + (!cast<I>(OpcPrefix#BQrr) VR128:$src)>; + + def : Pat<(v4i32 (ExtOp (v8i16 VR128:$src))), + (!cast<I>(OpcPrefix#WDrr) VR128:$src)>; + def : Pat<(v2i64 (ExtOp (v8i16 VR128:$src))), + (!cast<I>(OpcPrefix#WQrr) VR128:$src)>; + + def : Pat<(v2i64 (ExtOp (v4i32 VR128:$src))), + (!cast<I>(OpcPrefix#DQrr) VR128:$src)>; + + def : Pat<(v8i16 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)), + (!cast<I>(OpcPrefix#BWrm) addr:$src)>; + def : Pat<(v4i32 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)), + (!cast<I>(OpcPrefix#BDrm) addr:$src)>; + def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)), + (!cast<I>(OpcPrefix#BQrm) addr:$src)>; + + def : Pat<(v4i32 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)), + (!cast<I>(OpcPrefix#WDrm) addr:$src)>; + def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)), + (!cast<I>(OpcPrefix#WQrm) addr:$src)>; + + def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi32") addr:$src)), + (!cast<I>(OpcPrefix#DQrm) addr:$src)>; + + def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), + (!cast<I>(OpcPrefix#BWrm) addr:$src)>; + def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))), + (!cast<I>(OpcPrefix#BWrm) addr:$src)>; + def : Pat<(v8i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BWrm) addr:$src)>; + def : Pat<(v8i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BWrm) addr:$src)>; + def : Pat<(v8i16 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BWrm) addr:$src)>; + + def : Pat<(v4i32 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))), + (!cast<I>(OpcPrefix#BDrm) addr:$src)>; + def : Pat<(v4i32 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))), + (!cast<I>(OpcPrefix#BDrm) addr:$src)>; + def : Pat<(v4i32 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BDrm) addr:$src)>; + def : Pat<(v4i32 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BDrm) addr:$src)>; + + def : Pat<(v2i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (ExtLoad16 addr:$src)))))), + (!cast<I>(OpcPrefix#BQrm) addr:$src)>; + def : Pat<(v2i64 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))), + (!cast<I>(OpcPrefix#BQrm) addr:$src)>; + def : Pat<(v2i64 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BQrm) addr:$src)>; + def : Pat<(v2i64 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BQrm) addr:$src)>; + + def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), + (!cast<I>(OpcPrefix#WDrm) addr:$src)>; + def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))), + (!cast<I>(OpcPrefix#WDrm) addr:$src)>; + def : Pat<(v4i32 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#WDrm) addr:$src)>; + def : Pat<(v4i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#WDrm) addr:$src)>; + def : Pat<(v4i32 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))), + (!cast<I>(OpcPrefix#WDrm) addr:$src)>; + + def : Pat<(v2i64 (ExtOp (bc_v8i16 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))), + (!cast<I>(OpcPrefix#WQrm) addr:$src)>; + def : Pat<(v2i64 (ExtOp (v8i16 (vzmovl_v4i32 addr:$src)))), + (!cast<I>(OpcPrefix#WQrm) 
addr:$src)>; + def : Pat<(v2i64 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#WQrm) addr:$src)>; + def : Pat<(v2i64 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))), + (!cast<I>(OpcPrefix#WQrm) addr:$src)>; + + def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), + (!cast<I>(OpcPrefix#DQrm) addr:$src)>; + def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))), + (!cast<I>(OpcPrefix#DQrm) addr:$src)>; + def : Pat<(v2i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#DQrm) addr:$src)>; + def : Pat<(v2i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#DQrm) addr:$src)>; + def : Pat<(v2i64 (ExtOp (bc_v4i32 (loadv2i64 addr:$src)))), + (!cast<I>(OpcPrefix#DQrm) addr:$src)>; +} + +let Predicates = [HasAVX, NoVLX] in { + defm : SS41I_pmovx_patterns<"VPMOVSX", "s", X86vsext, extloadi32i16>; + defm : SS41I_pmovx_patterns<"VPMOVZX", "z", X86vzext, loadi16_anyext>; +} + +let Predicates = [UseSSE41] in { + defm : SS41I_pmovx_patterns<"PMOVSX", "s", X86vsext, extloadi32i16>; + defm : SS41I_pmovx_patterns<"PMOVZX", "z", X86vzext, loadi16_anyext>; +} + +//===----------------------------------------------------------------------===// +// SSE4.1 - Extract Instructions +//===----------------------------------------------------------------------===// + +/// SS41I_binop_ext8 - SSE 4.1 extract 8 bits to 32 bit reg or 8 bit mem +multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> { + def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst), + (ins VR128:$src1, u8imm:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set GR32orGR64:$dst, (X86pextrb (v16i8 VR128:$src1), + imm:$src2))]>, + Sched<[WriteShuffle]>; + let hasSideEffects = 0, mayStore = 1, + SchedRW = [WriteShuffleLd, WriteRMW] in + def mr : SS4AIi8<opc, MRMDestMem, (outs), + (ins i8mem:$dst, VR128:$src1, u8imm:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(store (i8 (trunc (assertzext (X86pextrb (v16i8 VR128:$src1), + imm:$src2)))), addr:$dst)]>; +} + +let Predicates = [HasAVX, NoBWI] in + defm VPEXTRB : SS41I_extract8<0x14, "vpextrb">, VEX; + +defm PEXTRB : SS41I_extract8<0x14, "pextrb">; + + +/// SS41I_extract16 - SSE 4.1 extract 16 bits to memory destination +multiclass SS41I_extract16<bits<8> opc, string OpcodeStr> { + let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in + def rr_REV : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst), + (ins VR128:$src1, u8imm:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>, Sched<[WriteShuffle]>; + + let hasSideEffects = 0, mayStore = 1, + SchedRW = [WriteShuffleLd, WriteRMW] in + def mr : SS4AIi8<opc, MRMDestMem, (outs), + (ins i16mem:$dst, VR128:$src1, u8imm:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(store (i16 (trunc (assertzext (X86pextrw (v8i16 VR128:$src1), + imm:$src2)))), addr:$dst)]>; +} + +let Predicates = [HasAVX, NoBWI] in + defm VPEXTRW : SS41I_extract16<0x15, "vpextrw">, VEX; + +defm PEXTRW : SS41I_extract16<0x15, "pextrw">; + + +/// SS41I_extract32 - SSE 4.1 extract 32 bits to int reg or memory destination +multiclass SS41I_extract32<bits<8> opc, string OpcodeStr> { + def rr : SS4AIi8<opc, MRMDestReg, (outs GR32:$dst), + (ins VR128:$src1, u8imm:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set GR32:$dst, + (extractelt (v4i32 VR128:$src1), imm:$src2))]>, + Sched<[WriteShuffle]>; + let SchedRW 
= [WriteShuffleLd, WriteRMW] in + def mr : SS4AIi8<opc, MRMDestMem, (outs), + (ins i32mem:$dst, VR128:$src1, u8imm:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(store (extractelt (v4i32 VR128:$src1), imm:$src2), + addr:$dst)]>; +} + +let Predicates = [HasAVX, NoDQI] in + defm VPEXTRD : SS41I_extract32<0x16, "vpextrd">, VEX; + +defm PEXTRD : SS41I_extract32<0x16, "pextrd">; + +/// SS41I_extract32 - SSE 4.1 extract 32 bits to int reg or memory destination +multiclass SS41I_extract64<bits<8> opc, string OpcodeStr> { + def rr : SS4AIi8<opc, MRMDestReg, (outs GR64:$dst), + (ins VR128:$src1, u8imm:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set GR64:$dst, + (extractelt (v2i64 VR128:$src1), imm:$src2))]>, + Sched<[WriteShuffle]>, REX_W; + let SchedRW = [WriteShuffleLd, WriteRMW] in + def mr : SS4AIi8<opc, MRMDestMem, (outs), + (ins i64mem:$dst, VR128:$src1, u8imm:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(store (extractelt (v2i64 VR128:$src1), imm:$src2), + addr:$dst)]>, REX_W; +} + +let Predicates = [HasAVX, NoDQI] in + defm VPEXTRQ : SS41I_extract64<0x16, "vpextrq">, VEX, VEX_W; + +defm PEXTRQ : SS41I_extract64<0x16, "pextrq">; + +/// SS41I_extractf32 - SSE 4.1 extract 32 bits fp value to int reg or memory +/// destination +multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr, + OpndItins itins = DEFAULT_ITINS> { + def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst), + (ins VR128:$src1, u8imm:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set GR32orGR64:$dst, + (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2))], + itins.rr>, Sched<[WriteFBlend]>; + let SchedRW = [WriteFBlendLd, WriteRMW] in + def mr : SS4AIi8<opc, MRMDestMem, (outs), + (ins f32mem:$dst, VR128:$src1, u8imm:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(store (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2), + addr:$dst)], itins.rm>; +} + +let ExeDomain = SSEPackedSingle in { + let Predicates = [UseAVX] in + defm VEXTRACTPS : SS41I_extractf32<0x17, "vextractps">, VEX; + defm EXTRACTPS : SS41I_extractf32<0x17, "extractps", SSE_EXTRACT_ITINS>; +} + +// Also match an EXTRACTPS store when the store is done as f32 instead of i32. 
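extractps writes the raw 32-bit pattern of the selected float lane to a GPR or to memory, so the resulting store is the same whether the program types the value as i32 or f32; that is what the two patterns below capture. A small C++ illustration using the SSE4.1 intrinsic (editor's example, not part of this patch):

#include <immintrin.h>
#include <cassert>
#include <cstring>

int main() {
  __m128 v = _mm_setr_ps(1.5f, 2.5f, 3.5f, 4.5f);
  // extractps $2, %xmm0, %eax: returns the bit pattern of lane 2 as an int.
  int bits = _mm_extract_ps(v, 2);
  float f;
  std::memcpy(&f, &bits, sizeof f);   // the same 32 bits reinterpreted as a float
  assert(f == 3.5f);
  return 0;
}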
+def : Pat<(store (f32 (bitconvert (extractelt (bc_v4i32 (v4f32 VR128:$src1)), + imm:$src2))), + addr:$dst), + (VEXTRACTPSmr addr:$dst, VR128:$src1, imm:$src2)>, + Requires<[HasAVX]>; +def : Pat<(store (f32 (bitconvert (extractelt (bc_v4i32 (v4f32 VR128:$src1)), + imm:$src2))), + addr:$dst), + (EXTRACTPSmr addr:$dst, VR128:$src1, imm:$src2)>, + Requires<[UseSSE41]>; + +//===----------------------------------------------------------------------===// +// SSE4.1 - Insert Instructions +//===----------------------------------------------------------------------===// + +multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> { + def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, GR32orGR64:$src2, u8imm:$src3), + !if(Is2Addr, + !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(asm, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [(set VR128:$dst, + (X86pinsrb VR128:$src1, GR32orGR64:$src2, imm:$src3))]>, + Sched<[WriteShuffle]>; + def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i8mem:$src2, u8imm:$src3), + !if(Is2Addr, + !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(asm, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [(set VR128:$dst, + (X86pinsrb VR128:$src1, (extloadi8 addr:$src2), + imm:$src3))]>, Sched<[WriteShuffleLd, ReadAfterLd]>; +} + +let Predicates = [HasAVX, NoBWI] in + defm VPINSRB : SS41I_insert8<0x20, "vpinsrb", 0>, VEX_4V; +let Constraints = "$src1 = $dst" in + defm PINSRB : SS41I_insert8<0x20, "pinsrb">; + +multiclass SS41I_insert32<bits<8> opc, string asm, bit Is2Addr = 1> { + def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, GR32:$src2, u8imm:$src3), + !if(Is2Addr, + !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(asm, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [(set VR128:$dst, + (v4i32 (insertelt VR128:$src1, GR32:$src2, imm:$src3)))]>, + Sched<[WriteShuffle]>; + def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i32mem:$src2, u8imm:$src3), + !if(Is2Addr, + !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(asm, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [(set VR128:$dst, + (v4i32 (insertelt VR128:$src1, (loadi32 addr:$src2), + imm:$src3)))]>, Sched<[WriteShuffleLd, ReadAfterLd]>; +} + +let Predicates = [HasAVX, NoDQI] in + defm VPINSRD : SS41I_insert32<0x22, "vpinsrd", 0>, VEX_4V; +let Constraints = "$src1 = $dst" in + defm PINSRD : SS41I_insert32<0x22, "pinsrd">; + +multiclass SS41I_insert64<bits<8> opc, string asm, bit Is2Addr = 1> { + def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, GR64:$src2, u8imm:$src3), + !if(Is2Addr, + !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(asm, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [(set VR128:$dst, + (v2i64 (insertelt VR128:$src1, GR64:$src2, imm:$src3)))]>, + Sched<[WriteShuffle]>; + def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i64mem:$src2, u8imm:$src3), + !if(Is2Addr, + !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(asm, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [(set VR128:$dst, + (v2i64 (insertelt VR128:$src1, (loadi64 addr:$src2), + imm:$src3)))]>, Sched<[WriteShuffleLd, ReadAfterLd]>; +} + +let Predicates = [HasAVX, NoDQI] in + defm VPINSRQ : SS41I_insert64<0x22, "vpinsrq", 0>, VEX_4V, VEX_W; +let 
Constraints = "$src1 = $dst" in + defm PINSRQ : SS41I_insert64<0x22, "pinsrq">, REX_W; + +// insertps has a few different modes, there's the first two here below which +// are optimized inserts that won't zero arbitrary elements in the destination +// vector. The next one matches the intrinsic and could zero arbitrary elements +// in the target vector. +multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1, + OpndItins itins = DEFAULT_ITINS> { + def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, u8imm:$src3), + !if(Is2Addr, + !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(asm, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [(set VR128:$dst, + (X86insertps VR128:$src1, VR128:$src2, imm:$src3))], itins.rr>, + Sched<[WriteFShuffle]>; + def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, f32mem:$src2, u8imm:$src3), + !if(Is2Addr, + !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(asm, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [(set VR128:$dst, + (X86insertps VR128:$src1, + (v4f32 (scalar_to_vector (loadf32 addr:$src2))), + imm:$src3))], itins.rm>, + Sched<[WriteFShuffleLd, ReadAfterLd]>; +} + +let ExeDomain = SSEPackedSingle in { + let Predicates = [UseAVX] in + defm VINSERTPS : SS41I_insertf32<0x21, "vinsertps", 0>, VEX_4V; + let Constraints = "$src1 = $dst" in + defm INSERTPS : SS41I_insertf32<0x21, "insertps", 1, SSE_INSERT_ITINS>; +} + +let Predicates = [UseSSE41] in { + // If we're inserting an element from a load or a null pshuf of a load, + // fold the load into the insertps instruction. + def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1), (X86PShufd (v4f32 + (scalar_to_vector (loadf32 addr:$src2))), (i8 0)), + imm:$src3)), + (INSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>; + def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1), (X86PShufd + (loadv4f32 addr:$src2), (i8 0)), imm:$src3)), + (INSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>; +} + +let Predicates = [UseAVX] in { + // If we're inserting an element from a vbroadcast of a load, fold the + // load into the X86insertps instruction. + def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1), + (X86VBroadcast (loadf32 addr:$src2)), imm:$src3)), + (VINSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>; + def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1), + (X86VBroadcast (loadv4f32 addr:$src2)), imm:$src3)), + (VINSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>; +} + +//===----------------------------------------------------------------------===// +// SSE4.1 - Round Instructions +//===----------------------------------------------------------------------===// + +multiclass sse41_fp_unop_rm<bits<8> opcps, bits<8> opcpd, string OpcodeStr, + X86MemOperand x86memop, RegisterClass RC, + PatFrag mem_frag32, PatFrag mem_frag64, + Intrinsic V4F32Int, Intrinsic V2F64Int> { +let ExeDomain = SSEPackedSingle in { + // Intrinsic operation, reg. 
+ // Vector intrinsic operation, reg + def PSr : SS4AIi8<opcps, MRMSrcReg, + (outs RC:$dst), (ins RC:$src1, i32u8imm:$src2), + !strconcat(OpcodeStr, + "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, (V4F32Int RC:$src1, imm:$src2))], + IIC_SSE_ROUNDPS_REG>, Sched<[WriteFAdd]>; + + // Vector intrinsic operation, mem + def PSm : SS4AIi8<opcps, MRMSrcMem, + (outs RC:$dst), (ins x86memop:$src1, i32u8imm:$src2), + !strconcat(OpcodeStr, + "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, + (V4F32Int (mem_frag32 addr:$src1),imm:$src2))], + IIC_SSE_ROUNDPS_MEM>, Sched<[WriteFAddLd]>; +} // ExeDomain = SSEPackedSingle + +let ExeDomain = SSEPackedDouble in { + // Vector intrinsic operation, reg + def PDr : SS4AIi8<opcpd, MRMSrcReg, + (outs RC:$dst), (ins RC:$src1, i32u8imm:$src2), + !strconcat(OpcodeStr, + "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, (V2F64Int RC:$src1, imm:$src2))], + IIC_SSE_ROUNDPS_REG>, Sched<[WriteFAdd]>; + + // Vector intrinsic operation, mem + def PDm : SS4AIi8<opcpd, MRMSrcMem, + (outs RC:$dst), (ins x86memop:$src1, i32u8imm:$src2), + !strconcat(OpcodeStr, + "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, + (V2F64Int (mem_frag64 addr:$src1),imm:$src2))], + IIC_SSE_ROUNDPS_REG>, Sched<[WriteFAddLd]>; +} // ExeDomain = SSEPackedDouble +} + +multiclass sse41_fp_binop_rm<bits<8> opcss, bits<8> opcsd, + string OpcodeStr, + Intrinsic F32Int, + Intrinsic F64Int, bit Is2Addr = 1> { +let ExeDomain = GenericDomain in { + // Operation, reg. + let hasSideEffects = 0 in + def SSr : SS4AIi8<opcss, MRMSrcReg, + (outs FR32:$dst), (ins FR32:$src1, FR32:$src2, i32u8imm:$src3), + !if(Is2Addr, + !strconcat(OpcodeStr, + "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(OpcodeStr, + "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + []>, Sched<[WriteFAdd]>; + + // Intrinsic operation, reg. + let isCodeGenOnly = 1 in + def SSr_Int : SS4AIi8<opcss, MRMSrcReg, + (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3), + !if(Is2Addr, + !strconcat(OpcodeStr, + "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(OpcodeStr, + "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [(set VR128:$dst, (F32Int VR128:$src1, VR128:$src2, imm:$src3))]>, + Sched<[WriteFAdd]>; + + // Intrinsic operation, mem. + def SSm : SS4AIi8<opcss, MRMSrcMem, + (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2, i32u8imm:$src3), + !if(Is2Addr, + !strconcat(OpcodeStr, + "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(OpcodeStr, + "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [(set VR128:$dst, + (F32Int VR128:$src1, sse_load_f32:$src2, imm:$src3))]>, + Sched<[WriteFAddLd, ReadAfterLd]>; + + // Operation, reg. + let hasSideEffects = 0 in + def SDr : SS4AIi8<opcsd, MRMSrcReg, + (outs FR64:$dst), (ins FR64:$src1, FR64:$src2, i32u8imm:$src3), + !if(Is2Addr, + !strconcat(OpcodeStr, + "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(OpcodeStr, + "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + []>, Sched<[WriteFAdd]>; + + // Intrinsic operation, reg. 
+ let isCodeGenOnly = 1 in + def SDr_Int : SS4AIi8<opcsd, MRMSrcReg, + (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3), + !if(Is2Addr, + !strconcat(OpcodeStr, + "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(OpcodeStr, + "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [(set VR128:$dst, (F64Int VR128:$src1, VR128:$src2, imm:$src3))]>, + Sched<[WriteFAdd]>; + + // Intrinsic operation, mem. + def SDm : SS4AIi8<opcsd, MRMSrcMem, + (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2, i32u8imm:$src3), + !if(Is2Addr, + !strconcat(OpcodeStr, + "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(OpcodeStr, + "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [(set VR128:$dst, + (F64Int VR128:$src1, sse_load_f64:$src2, imm:$src3))]>, + Sched<[WriteFAddLd, ReadAfterLd]>; +} // ExeDomain = GenericDomain +} + +// FP round - roundss, roundps, roundsd, roundpd +let Predicates = [HasAVX] in { + // Intrinsic form + defm VROUND : sse41_fp_unop_rm<0x08, 0x09, "vround", f128mem, VR128, + loadv4f32, loadv2f64, + int_x86_sse41_round_ps, + int_x86_sse41_round_pd>, VEX; + defm VROUNDY : sse41_fp_unop_rm<0x08, 0x09, "vround", f256mem, VR256, + loadv8f32, loadv4f64, + int_x86_avx_round_ps_256, + int_x86_avx_round_pd_256>, VEX, VEX_L; + defm VROUND : sse41_fp_binop_rm<0x0A, 0x0B, "vround", + int_x86_sse41_round_ss, + int_x86_sse41_round_sd, 0>, VEX_4V, VEX_LIG; +} + +let Predicates = [UseAVX] in { + def : Pat<(ffloor FR32:$src), + (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x9))>; + def : Pat<(f64 (ffloor FR64:$src)), + (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x9))>; + def : Pat<(f32 (fnearbyint FR32:$src)), + (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xC))>; + def : Pat<(f64 (fnearbyint FR64:$src)), + (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xC))>; + def : Pat<(f32 (fceil FR32:$src)), + (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xA))>; + def : Pat<(f64 (fceil FR64:$src)), + (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xA))>; + def : Pat<(f32 (frint FR32:$src)), + (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x4))>; + def : Pat<(f64 (frint FR64:$src)), + (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x4))>; + def : Pat<(f32 (ftrunc FR32:$src)), + (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xB))>; + def : Pat<(f64 (ftrunc FR64:$src)), + (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xB))>; +} + +let Predicates = [HasAVX] in { + def : Pat<(v4f32 (ffloor VR128:$src)), + (VROUNDPSr VR128:$src, (i32 0x9))>; + def : Pat<(v4f32 (fnearbyint VR128:$src)), + (VROUNDPSr VR128:$src, (i32 0xC))>; + def : Pat<(v4f32 (fceil VR128:$src)), + (VROUNDPSr VR128:$src, (i32 0xA))>; + def : Pat<(v4f32 (frint VR128:$src)), + (VROUNDPSr VR128:$src, (i32 0x4))>; + def : Pat<(v4f32 (ftrunc VR128:$src)), + (VROUNDPSr VR128:$src, (i32 0xB))>; + + def : Pat<(v2f64 (ffloor VR128:$src)), + (VROUNDPDr VR128:$src, (i32 0x9))>; + def : Pat<(v2f64 (fnearbyint VR128:$src)), + (VROUNDPDr VR128:$src, (i32 0xC))>; + def : Pat<(v2f64 (fceil VR128:$src)), + (VROUNDPDr VR128:$src, (i32 0xA))>; + def : Pat<(v2f64 (frint VR128:$src)), + (VROUNDPDr VR128:$src, (i32 0x4))>; + def : Pat<(v2f64 (ftrunc VR128:$src)), + (VROUNDPDr VR128:$src, (i32 0xB))>; + + def : Pat<(v8f32 (ffloor VR256:$src)), + (VROUNDYPSr VR256:$src, (i32 0x9))>; + def : Pat<(v8f32 (fnearbyint VR256:$src)), + (VROUNDYPSr VR256:$src, (i32 0xC))>; + def : Pat<(v8f32 (fceil VR256:$src)), + (VROUNDYPSr VR256:$src, (i32 0xA))>; + def : Pat<(v8f32 (frint VR256:$src)), + 
(VROUNDYPSr VR256:$src, (i32 0x4))>; + def : Pat<(v8f32 (ftrunc VR256:$src)), + (VROUNDYPSr VR256:$src, (i32 0xB))>; + + def : Pat<(v4f64 (ffloor VR256:$src)), + (VROUNDYPDr VR256:$src, (i32 0x9))>; + def : Pat<(v4f64 (fnearbyint VR256:$src)), + (VROUNDYPDr VR256:$src, (i32 0xC))>; + def : Pat<(v4f64 (fceil VR256:$src)), + (VROUNDYPDr VR256:$src, (i32 0xA))>; + def : Pat<(v4f64 (frint VR256:$src)), + (VROUNDYPDr VR256:$src, (i32 0x4))>; + def : Pat<(v4f64 (ftrunc VR256:$src)), + (VROUNDYPDr VR256:$src, (i32 0xB))>; +} + +defm ROUND : sse41_fp_unop_rm<0x08, 0x09, "round", f128mem, VR128, + memopv4f32, memopv2f64, + int_x86_sse41_round_ps, int_x86_sse41_round_pd>; +let Constraints = "$src1 = $dst" in +defm ROUND : sse41_fp_binop_rm<0x0A, 0x0B, "round", + int_x86_sse41_round_ss, int_x86_sse41_round_sd>; + +let Predicates = [UseSSE41] in { + def : Pat<(ffloor FR32:$src), + (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x9))>; + def : Pat<(f64 (ffloor FR64:$src)), + (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x9))>; + def : Pat<(f32 (fnearbyint FR32:$src)), + (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xC))>; + def : Pat<(f64 (fnearbyint FR64:$src)), + (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xC))>; + def : Pat<(f32 (fceil FR32:$src)), + (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xA))>; + def : Pat<(f64 (fceil FR64:$src)), + (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xA))>; + def : Pat<(f32 (frint FR32:$src)), + (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x4))>; + def : Pat<(f64 (frint FR64:$src)), + (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x4))>; + def : Pat<(f32 (ftrunc FR32:$src)), + (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xB))>; + def : Pat<(f64 (ftrunc FR64:$src)), + (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xB))>; + + def : Pat<(v4f32 (ffloor VR128:$src)), + (ROUNDPSr VR128:$src, (i32 0x9))>; + def : Pat<(v4f32 (fnearbyint VR128:$src)), + (ROUNDPSr VR128:$src, (i32 0xC))>; + def : Pat<(v4f32 (fceil VR128:$src)), + (ROUNDPSr VR128:$src, (i32 0xA))>; + def : Pat<(v4f32 (frint VR128:$src)), + (ROUNDPSr VR128:$src, (i32 0x4))>; + def : Pat<(v4f32 (ftrunc VR128:$src)), + (ROUNDPSr VR128:$src, (i32 0xB))>; + + def : Pat<(v2f64 (ffloor VR128:$src)), + (ROUNDPDr VR128:$src, (i32 0x9))>; + def : Pat<(v2f64 (fnearbyint VR128:$src)), + (ROUNDPDr VR128:$src, (i32 0xC))>; + def : Pat<(v2f64 (fceil VR128:$src)), + (ROUNDPDr VR128:$src, (i32 0xA))>; + def : Pat<(v2f64 (frint VR128:$src)), + (ROUNDPDr VR128:$src, (i32 0x4))>; + def : Pat<(v2f64 (ftrunc VR128:$src)), + (ROUNDPDr VR128:$src, (i32 0xB))>; +} + +//===----------------------------------------------------------------------===// +// SSE4.1 - Packed Bit Test +//===----------------------------------------------------------------------===// + +// ptest instruction we'll lower to this in X86ISelLowering primarily from +// the intel intrinsic that corresponds to this. 
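Illustrative aside, not part of this patch: the "intel intrinsic" the comment above refers to is presumably the SSE4.1 test family (_mm_testz_si128 / _mm_testc_si128, from smmintrin.h), which report whether A AND B, respectively A AND NOT B, is all zeros. A minimal C sketch of source that typically gets selected to a single ptest plus a flag branch; the helper name is made up for the example:

#include <smmintrin.h>   /* SSE4.1 */

/* Returns non-zero iff any bit of v is set.
 * _mm_testz_si128(a, b) yields ZF, i.e. 1 when (a & b) == 0,
 * so testing v against itself asks "is v all zeros?". */
static int any_bit_set(__m128i v) {
    return !_mm_testz_si128(v, v);
}
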
+let Defs = [EFLAGS], Predicates = [HasAVX] in { +def VPTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2), + "vptest\t{$src2, $src1|$src1, $src2}", + [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>, + Sched<[WriteVecLogic]>, VEX; +def VPTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2), + "vptest\t{$src2, $src1|$src1, $src2}", + [(set EFLAGS,(X86ptest VR128:$src1, (loadv2i64 addr:$src2)))]>, + Sched<[WriteVecLogicLd, ReadAfterLd]>, VEX; + +def VPTESTYrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR256:$src1, VR256:$src2), + "vptest\t{$src2, $src1|$src1, $src2}", + [(set EFLAGS, (X86ptest VR256:$src1, (v4i64 VR256:$src2)))]>, + Sched<[WriteVecLogic]>, VEX, VEX_L; +def VPTESTYrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR256:$src1, i256mem:$src2), + "vptest\t{$src2, $src1|$src1, $src2}", + [(set EFLAGS,(X86ptest VR256:$src1, (loadv4i64 addr:$src2)))]>, + Sched<[WriteVecLogicLd, ReadAfterLd]>, VEX, VEX_L; +} + +let Defs = [EFLAGS] in { +def PTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2), + "ptest\t{$src2, $src1|$src1, $src2}", + [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>, + Sched<[WriteVecLogic]>; +def PTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2), + "ptest\t{$src2, $src1|$src1, $src2}", + [(set EFLAGS, (X86ptest VR128:$src1, (memopv2i64 addr:$src2)))]>, + Sched<[WriteVecLogicLd, ReadAfterLd]>; +} + +// The bit test instructions below are AVX only +multiclass avx_bittest<bits<8> opc, string OpcodeStr, RegisterClass RC, + X86MemOperand x86memop, PatFrag mem_frag, ValueType vt> { + def rr : SS48I<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"), + [(set EFLAGS, (X86testp RC:$src1, (vt RC:$src2)))]>, + Sched<[WriteVecLogic]>, VEX; + def rm : SS48I<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"), + [(set EFLAGS, (X86testp RC:$src1, (mem_frag addr:$src2)))]>, + Sched<[WriteVecLogicLd, ReadAfterLd]>, VEX; +} + +let Defs = [EFLAGS], Predicates = [HasAVX] in { +let ExeDomain = SSEPackedSingle in { +defm VTESTPS : avx_bittest<0x0E, "vtestps", VR128, f128mem, loadv4f32, v4f32>; +defm VTESTPSY : avx_bittest<0x0E, "vtestps", VR256, f256mem, loadv8f32, v8f32>, + VEX_L; +} +let ExeDomain = SSEPackedDouble in { +defm VTESTPD : avx_bittest<0x0F, "vtestpd", VR128, f128mem, loadv2f64, v2f64>; +defm VTESTPDY : avx_bittest<0x0F, "vtestpd", VR256, f256mem, loadv4f64, v4f64>, + VEX_L; +} +} + +//===----------------------------------------------------------------------===// +// SSE4.1 - Misc Instructions +//===----------------------------------------------------------------------===// + +let Defs = [EFLAGS], Predicates = [HasPOPCNT] in { + def POPCNT16rr : I<0xB8, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), + "popcnt{w}\t{$src, $dst|$dst, $src}", + [(set GR16:$dst, (ctpop GR16:$src)), (implicit EFLAGS)], + IIC_SSE_POPCNT_RR>, Sched<[WriteFAdd]>, + OpSize16, XS; + def POPCNT16rm : I<0xB8, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), + "popcnt{w}\t{$src, $dst|$dst, $src}", + [(set GR16:$dst, (ctpop (loadi16 addr:$src))), + (implicit EFLAGS)], IIC_SSE_POPCNT_RM>, + Sched<[WriteFAddLd]>, OpSize16, XS; + + def POPCNT32rr : I<0xB8, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), + "popcnt{l}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (ctpop GR32:$src)), (implicit EFLAGS)], + IIC_SSE_POPCNT_RR>, Sched<[WriteFAdd]>, + OpSize32, XS; + + def POPCNT32rm : I<0xB8, MRMSrcMem, (outs 
GR32:$dst), (ins i32mem:$src), + "popcnt{l}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (ctpop (loadi32 addr:$src))), + (implicit EFLAGS)], IIC_SSE_POPCNT_RM>, + Sched<[WriteFAddLd]>, OpSize32, XS; + + def POPCNT64rr : RI<0xB8, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), + "popcnt{q}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (ctpop GR64:$src)), (implicit EFLAGS)], + IIC_SSE_POPCNT_RR>, Sched<[WriteFAdd]>, XS; + def POPCNT64rm : RI<0xB8, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), + "popcnt{q}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (ctpop (loadi64 addr:$src))), + (implicit EFLAGS)], IIC_SSE_POPCNT_RM>, + Sched<[WriteFAddLd]>, XS; +} + + + +// SS41I_unop_rm_int_v16 - SSE 4.1 unary operator whose type is v8i16. +multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr, + Intrinsic IntId128, PatFrag ld_frag, + X86FoldableSchedWrite Sched> { + def rr128 : SS48I<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (IntId128 VR128:$src))]>, + Sched<[Sched]>; + def rm128 : SS48I<opc, MRMSrcMem, (outs VR128:$dst), + (ins i128mem:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, + (IntId128 (bitconvert (ld_frag addr:$src))))]>, + Sched<[Sched.Folded]>; +} + +// PHMIN has the same profile as PSAD, thus we use the same scheduling +// model, although the naming is misleading. +let Predicates = [HasAVX] in +defm VPHMINPOSUW : SS41I_unop_rm_int_v16 <0x41, "vphminposuw", + int_x86_sse41_phminposuw, loadv2i64, + WriteVecIMul>, VEX; +defm PHMINPOSUW : SS41I_unop_rm_int_v16 <0x41, "phminposuw", + int_x86_sse41_phminposuw, memopv2i64, + WriteVecIMul>; + +/// SS48I_binop_rm - Simple SSE41 binary operator. +multiclass SS48I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, + ValueType OpVT, RegisterClass RC, PatFrag memop_frag, + X86MemOperand x86memop, bit Is2Addr = 1, + OpndItins itins = SSE_INTALU_ITINS_P> { + let isCommutable = 1 in + def rr : SS48I<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>, + Sched<[itins.Sched]>; + def rm : SS48I<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, x86memop:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, + (OpVT (OpNode RC:$src1, (bitconvert (memop_frag addr:$src2)))))]>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; +} + +/// SS48I_binop_rm2 - Simple SSE41 binary operator with different src and dst +/// types. 
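Illustrative aside, not part of this patch: the only user of the "different src and dst types" form below is pmuldq, whose inputs are v4i32 but whose result is v2i64. A minimal C sketch of that widening multiply, assuming the standard SSE4.1 intrinsic _mm_mul_epi32 (smmintrin.h); the helper name is made up for the example:

#include <smmintrin.h>   /* SSE4.1 */

/* pmuldq: take the low (even-indexed) signed 32-bit element of each
 * 64-bit lane of a and b, multiply them, and produce two signed
 * 64-bit products, i.e. a v4i32 x v4i32 -> v2i64 operation. */
static __m128i widening_mul(__m128i a, __m128i b) {
    return _mm_mul_epi32(a, b);
}
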
+multiclass SS48I_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode, + ValueType DstVT, ValueType SrcVT, RegisterClass RC, + PatFrag memop_frag, X86MemOperand x86memop, + OpndItins itins, + bit IsCommutable = 0, bit Is2Addr = 1> { + let isCommutable = IsCommutable in + def rr : SS48I<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), RC:$src2)))]>, + Sched<[itins.Sched]>; + def rm : SS48I<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, x86memop:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), + (bitconvert (memop_frag addr:$src2)))))]>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; +} + +let Predicates = [HasAVX, NoVLX] in { + defm VPMINSB : SS48I_binop_rm<0x38, "vpminsb", smin, v16i8, VR128, + loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>, + VEX_4V; + defm VPMINSD : SS48I_binop_rm<0x39, "vpminsd", smin, v4i32, VR128, + loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>, + VEX_4V; + defm VPMINUD : SS48I_binop_rm<0x3B, "vpminud", umin, v4i32, VR128, + loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>, + VEX_4V; + defm VPMINUW : SS48I_binop_rm<0x3A, "vpminuw", umin, v8i16, VR128, + loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>, + VEX_4V; + defm VPMAXSB : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v16i8, VR128, + loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>, + VEX_4V; + defm VPMAXSD : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v4i32, VR128, + loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>, + VEX_4V; + defm VPMAXUD : SS48I_binop_rm<0x3F, "vpmaxud", umax, v4i32, VR128, + loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>, + VEX_4V; + defm VPMAXUW : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v8i16, VR128, + loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>, + VEX_4V; + defm VPMULDQ : SS48I_binop_rm2<0x28, "vpmuldq", X86pmuldq, v2i64, v4i32, + VR128, loadv2i64, i128mem, + SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V; +} + +let Predicates = [HasAVX2, NoVLX] in { + defm VPMINSBY : SS48I_binop_rm<0x38, "vpminsb", smin, v32i8, VR256, + loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>, + VEX_4V, VEX_L; + defm VPMINSDY : SS48I_binop_rm<0x39, "vpminsd", smin, v8i32, VR256, + loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>, + VEX_4V, VEX_L; + defm VPMINUDY : SS48I_binop_rm<0x3B, "vpminud", umin, v8i32, VR256, + loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>, + VEX_4V, VEX_L; + defm VPMINUWY : SS48I_binop_rm<0x3A, "vpminuw", umin, v16i16, VR256, + loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>, + VEX_4V, VEX_L; + defm VPMAXSBY : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v32i8, VR256, + loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>, + VEX_4V, VEX_L; + defm VPMAXSDY : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v8i32, VR256, + loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>, + VEX_4V, VEX_L; + defm VPMAXUDY : SS48I_binop_rm<0x3F, "vpmaxud", umax, v8i32, VR256, + loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>, + VEX_4V, VEX_L; + defm VPMAXUWY : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v16i16, VR256, + loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>, + VEX_4V, VEX_L; + defm VPMULDQY : SS48I_binop_rm2<0x28, "vpmuldq", X86pmuldq, v4i64, v8i32, + VR256, loadv4i64, i256mem, + SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V, VEX_L; +} + +let Constraints = "$src1 = $dst" in { + defm PMINSB : SS48I_binop_rm<0x38, "pminsb", smin, v16i8, VR128, + memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>; + defm 
PMINSD : SS48I_binop_rm<0x39, "pminsd", smin, v4i32, VR128, + memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>; + defm PMINUD : SS48I_binop_rm<0x3B, "pminud", umin, v4i32, VR128, + memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>; + defm PMINUW : SS48I_binop_rm<0x3A, "pminuw", umin, v8i16, VR128, + memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>; + defm PMAXSB : SS48I_binop_rm<0x3C, "pmaxsb", smax, v16i8, VR128, + memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>; + defm PMAXSD : SS48I_binop_rm<0x3D, "pmaxsd", smax, v4i32, VR128, + memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>; + defm PMAXUD : SS48I_binop_rm<0x3F, "pmaxud", umax, v4i32, VR128, + memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>; + defm PMAXUW : SS48I_binop_rm<0x3E, "pmaxuw", umax, v8i16, VR128, + memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>; + defm PMULDQ : SS48I_binop_rm2<0x28, "pmuldq", X86pmuldq, v2i64, v4i32, + VR128, memopv2i64, i128mem, + SSE_INTMUL_ITINS_P, 1>; +} + +let Predicates = [HasAVX, NoVLX] in { + defm VPMULLD : SS48I_binop_rm<0x40, "vpmulld", mul, v4i32, VR128, + memopv2i64, i128mem, 0, SSE_PMULLD_ITINS>, + VEX_4V; + defm VPCMPEQQ : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v2i64, VR128, + memopv2i64, i128mem, 0, SSE_INTALU_ITINS_P>, + VEX_4V; +} +let Predicates = [HasAVX2] in { + defm VPMULLDY : SS48I_binop_rm<0x40, "vpmulld", mul, v8i32, VR256, + loadv4i64, i256mem, 0, SSE_PMULLD_ITINS>, + VEX_4V, VEX_L; + defm VPCMPEQQY : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v4i64, VR256, + loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>, + VEX_4V, VEX_L; +} + +let Constraints = "$src1 = $dst" in { + defm PMULLD : SS48I_binop_rm<0x40, "pmulld", mul, v4i32, VR128, + memopv2i64, i128mem, 1, SSE_PMULLD_ITINS>; + defm PCMPEQQ : SS48I_binop_rm<0x29, "pcmpeqq", X86pcmpeq, v2i64, VR128, + memopv2i64, i128mem, 1, SSE_INTALUQ_ITINS_P>; +} + +/// SS41I_binop_rmi_int - SSE 4.1 binary operator with 8-bit immediate +multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr, + Intrinsic IntId, RegisterClass RC, PatFrag memop_frag, + X86MemOperand x86memop, bit Is2Addr = 1, + OpndItins itins = DEFAULT_ITINS> { + let isCommutable = 1 in + def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2, u8imm:$src3), + !if(Is2Addr, + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [(set RC:$dst, (IntId RC:$src1, RC:$src2, imm:$src3))], itins.rr>, + Sched<[itins.Sched]>; + def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, x86memop:$src2, u8imm:$src3), + !if(Is2Addr, + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [(set RC:$dst, + (IntId RC:$src1, + (bitconvert (memop_frag addr:$src2)), imm:$src3))], itins.rm>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; +} + +/// SS41I_binop_rmi - SSE 4.1 binary operator with 8-bit immediate +multiclass SS41I_binop_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode, + ValueType OpVT, RegisterClass RC, PatFrag memop_frag, + X86MemOperand x86memop, bit Is2Addr = 1, + OpndItins itins = DEFAULT_ITINS> { + let isCommutable = 1 in + def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2, u8imm:$src3), + !if(Is2Addr, + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))], + itins.rr>, 
Sched<[itins.Sched]>; + def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, x86memop:$src2, u8imm:$src3), + !if(Is2Addr, + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [(set RC:$dst, + (OpVT (OpNode RC:$src1, + (bitconvert (memop_frag addr:$src2)), imm:$src3)))], itins.rm>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; +} + +let Predicates = [HasAVX] in { + let isCommutable = 0 in { + defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw, + VR128, loadv2i64, i128mem, 0, + DEFAULT_ITINS_MPSADSCHED>, VEX_4V; + } + + let ExeDomain = SSEPackedSingle in { + defm VBLENDPS : SS41I_binop_rmi<0x0C, "vblendps", X86Blendi, v4f32, + VR128, loadv4f32, f128mem, 0, + DEFAULT_ITINS_FBLENDSCHED>, VEX_4V; + defm VBLENDPSY : SS41I_binop_rmi<0x0C, "vblendps", X86Blendi, v8f32, + VR256, loadv8f32, f256mem, 0, + DEFAULT_ITINS_FBLENDSCHED>, VEX_4V, VEX_L; + } + let ExeDomain = SSEPackedDouble in { + defm VBLENDPD : SS41I_binop_rmi<0x0D, "vblendpd", X86Blendi, v2f64, + VR128, loadv2f64, f128mem, 0, + DEFAULT_ITINS_FBLENDSCHED>, VEX_4V; + defm VBLENDPDY : SS41I_binop_rmi<0x0D, "vblendpd", X86Blendi, v4f64, + VR256, loadv4f64, f256mem, 0, + DEFAULT_ITINS_FBLENDSCHED>, VEX_4V, VEX_L; + } + defm VPBLENDW : SS41I_binop_rmi<0x0E, "vpblendw", X86Blendi, v8i16, + VR128, loadv2i64, i128mem, 0, + DEFAULT_ITINS_BLENDSCHED>, VEX_4V; + + let ExeDomain = SSEPackedSingle in + defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps, + VR128, loadv4f32, f128mem, 0, + SSE_DPPS_ITINS>, VEX_4V; + let ExeDomain = SSEPackedDouble in + defm VDPPD : SS41I_binop_rmi_int<0x41, "vdppd", int_x86_sse41_dppd, + VR128, loadv2f64, f128mem, 0, + SSE_DPPS_ITINS>, VEX_4V; + let ExeDomain = SSEPackedSingle in + defm VDPPSY : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_avx_dp_ps_256, + VR256, loadv8f32, i256mem, 0, + SSE_DPPS_ITINS>, VEX_4V, VEX_L; +} + +let Predicates = [HasAVX2] in { + let isCommutable = 0 in { + defm VMPSADBWY : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_avx2_mpsadbw, + VR256, loadv4i64, i256mem, 0, + DEFAULT_ITINS_MPSADSCHED>, VEX_4V, VEX_L; + } + defm VPBLENDWY : SS41I_binop_rmi<0x0E, "vpblendw", X86Blendi, v16i16, + VR256, loadv4i64, i256mem, 0, + DEFAULT_ITINS_BLENDSCHED>, VEX_4V, VEX_L; +} + +let Constraints = "$src1 = $dst" in { + let isCommutable = 0 in { + defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw", int_x86_sse41_mpsadbw, + VR128, memopv2i64, i128mem, + 1, SSE_MPSADBW_ITINS>; + } + let ExeDomain = SSEPackedSingle in + defm BLENDPS : SS41I_binop_rmi<0x0C, "blendps", X86Blendi, v4f32, + VR128, memopv4f32, f128mem, + 1, SSE_INTALU_ITINS_FBLEND_P>; + let ExeDomain = SSEPackedDouble in + defm BLENDPD : SS41I_binop_rmi<0x0D, "blendpd", X86Blendi, v2f64, + VR128, memopv2f64, f128mem, + 1, SSE_INTALU_ITINS_FBLEND_P>; + defm PBLENDW : SS41I_binop_rmi<0x0E, "pblendw", X86Blendi, v8i16, + VR128, memopv2i64, i128mem, + 1, SSE_INTALU_ITINS_BLEND_P>; + let ExeDomain = SSEPackedSingle in + defm DPPS : SS41I_binop_rmi_int<0x40, "dpps", int_x86_sse41_dpps, + VR128, memopv4f32, f128mem, 1, + SSE_DPPS_ITINS>; + let ExeDomain = SSEPackedDouble in + defm DPPD : SS41I_binop_rmi_int<0x41, "dppd", int_x86_sse41_dppd, + VR128, memopv2f64, f128mem, 1, + SSE_DPPD_ITINS>; +} + +/// SS41I_quaternary_int_avx - AVX SSE 4.1 with 4 operators +multiclass SS41I_quaternary_int_avx<bits<8> opc, string OpcodeStr, + RegisterClass RC, X86MemOperand x86memop, + PatFrag mem_frag, Intrinsic IntId, 
+ X86FoldableSchedWrite Sched> { + def rr : Ii8<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2, RC:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set RC:$dst, (IntId RC:$src1, RC:$src2, RC:$src3))], + NoItinerary, SSEPackedInt>, TAPD, VEX_4V, VEX_I8IMM, + Sched<[Sched]>; + + def rm : Ii8<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, x86memop:$src2, RC:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set RC:$dst, + (IntId RC:$src1, (bitconvert (mem_frag addr:$src2)), + RC:$src3))], + NoItinerary, SSEPackedInt>, TAPD, VEX_4V, VEX_I8IMM, + Sched<[Sched.Folded, ReadAfterLd]>; +} + +let Predicates = [HasAVX] in { +let ExeDomain = SSEPackedDouble in { +defm VBLENDVPD : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR128, f128mem, + loadv2f64, int_x86_sse41_blendvpd, + WriteFVarBlend>; +defm VBLENDVPDY : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR256, f256mem, + loadv4f64, int_x86_avx_blendv_pd_256, + WriteFVarBlend>, VEX_L; +} // ExeDomain = SSEPackedDouble +let ExeDomain = SSEPackedSingle in { +defm VBLENDVPS : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR128, f128mem, + loadv4f32, int_x86_sse41_blendvps, + WriteFVarBlend>; +defm VBLENDVPSY : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR256, f256mem, + loadv8f32, int_x86_avx_blendv_ps_256, + WriteFVarBlend>, VEX_L; +} // ExeDomain = SSEPackedSingle +defm VPBLENDVB : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR128, i128mem, + loadv2i64, int_x86_sse41_pblendvb, + WriteVarBlend>; +} + +let Predicates = [HasAVX2] in { +defm VPBLENDVBY : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR256, i256mem, + loadv4i64, int_x86_avx2_pblendvb, + WriteVarBlend>, VEX_L; +} + +let Predicates = [HasAVX] in { + def : Pat<(v16i8 (vselect (v16i8 VR128:$mask), (v16i8 VR128:$src1), + (v16i8 VR128:$src2))), + (VPBLENDVBrr VR128:$src2, VR128:$src1, VR128:$mask)>; + def : Pat<(v4i32 (vselect (v4i32 VR128:$mask), (v4i32 VR128:$src1), + (v4i32 VR128:$src2))), + (VBLENDVPSrr VR128:$src2, VR128:$src1, VR128:$mask)>; + def : Pat<(v4f32 (vselect (v4i32 VR128:$mask), (v4f32 VR128:$src1), + (v4f32 VR128:$src2))), + (VBLENDVPSrr VR128:$src2, VR128:$src1, VR128:$mask)>; + def : Pat<(v2i64 (vselect (v2i64 VR128:$mask), (v2i64 VR128:$src1), + (v2i64 VR128:$src2))), + (VBLENDVPDrr VR128:$src2, VR128:$src1, VR128:$mask)>; + def : Pat<(v2f64 (vselect (v2i64 VR128:$mask), (v2f64 VR128:$src1), + (v2f64 VR128:$src2))), + (VBLENDVPDrr VR128:$src2, VR128:$src1, VR128:$mask)>; + def : Pat<(v8i32 (vselect (v8i32 VR256:$mask), (v8i32 VR256:$src1), + (v8i32 VR256:$src2))), + (VBLENDVPSYrr VR256:$src2, VR256:$src1, VR256:$mask)>; + def : Pat<(v8f32 (vselect (v8i32 VR256:$mask), (v8f32 VR256:$src1), + (v8f32 VR256:$src2))), + (VBLENDVPSYrr VR256:$src2, VR256:$src1, VR256:$mask)>; + def : Pat<(v4i64 (vselect (v4i64 VR256:$mask), (v4i64 VR256:$src1), + (v4i64 VR256:$src2))), + (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>; + def : Pat<(v4f64 (vselect (v4i64 VR256:$mask), (v4f64 VR256:$src1), + (v4f64 VR256:$src2))), + (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>; +} + +let Predicates = [HasAVX2] in { + def : Pat<(v32i8 (vselect (v32i8 VR256:$mask), (v32i8 VR256:$src1), + (v32i8 VR256:$src2))), + (VPBLENDVBYrr VR256:$src2, VR256:$src1, VR256:$mask)>; +} + +// Patterns +// FIXME: Prefer a movss or movsd over a blendps when optimizing for size or +// on targets where they have equal performance. 
These were changed to use +// blends because blends have better throughput on SandyBridge and Haswell, but +// movs[s/d] are 1-2 byte shorter instructions. +let Predicates = [UseAVX] in { + let AddedComplexity = 15 in { + // Move scalar to XMM zero-extended, zeroing a VR128 then do a + // MOVS{S,D} to the lower bits. + def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))), + (VMOVSSrr (v4f32 (V_SET0)), FR32:$src)>; + def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))), + (VBLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>; + def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))), + (VPBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>; + def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))), + (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)>; + + // Move low f32 and clear high bits. + def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))), + (VBLENDPSYrri (v8f32 (AVX_SET0)), VR256:$src, (i8 1))>; + + // Move low f64 and clear high bits. + def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))), + (VBLENDPDYrri (v4f64 (AVX_SET0)), VR256:$src, (i8 1))>; + } + + def : Pat<(v8f32 (X86vzmovl (insert_subvector undef, + (v4f32 (scalar_to_vector FR32:$src)), (iPTR 0)))), + (SUBREG_TO_REG (i32 0), + (v4f32 (VMOVSSrr (v4f32 (V_SET0)), FR32:$src)), + sub_xmm)>; + def : Pat<(v4f64 (X86vzmovl (insert_subvector undef, + (v2f64 (scalar_to_vector FR64:$src)), (iPTR 0)))), + (SUBREG_TO_REG (i64 0), + (v2f64 (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)), + sub_xmm)>; + + // These will incur an FP/int domain crossing penalty, but it may be the only + // way without AVX2. Do not add any complexity because we may be able to match + // more optimal patterns defined earlier in this file. + def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))), + (VBLENDPSYrri (v8i32 (AVX_SET0)), VR256:$src, (i8 1))>; + def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))), + (VBLENDPDYrri (v4i64 (AVX_SET0)), VR256:$src, (i8 1))>; +} + +// FIXME: Prefer a movss or movsd over a blendps when optimizing for size or +// on targets where they have equal performance. These were changed to use +// blends because blends have better throughput on SandyBridge and Haswell, but +// movs[s/d] are 1-2 byte shorter instructions. +let Predicates = [UseSSE41] in { + // With SSE41 we can use blends for these patterns. 
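Illustrative aside, not part of this patch: in C terms the X86vzmovl patterns in this block mean "keep lane 0, zero the rest", which on SSE4.1 can be expressed as a blend against a zero vector instead of a movss-style move, for the throughput reasons given in the FIXME above. A minimal sketch assuming the standard SSE4.1 blend intrinsic _mm_blend_ps (smmintrin.h); the helper name is made up for the example:

#include <smmintrin.h>   /* SSE4.1 */

/* Keep element 0 of v and zero elements 1..3.  With the immediate 0x1
 * the blend takes lane 0 from v and the remaining lanes from the zero
 * vector, which is what the (BLENDPSrri (V_SET0), $src, 1) selection
 * below produces. */
static __m128 keep_low_lane(__m128 v) {
    return _mm_blend_ps(_mm_setzero_ps(), v, 0x1);
}
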
+ def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))), + (BLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>; + def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))), + (PBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>; + def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))), + (BLENDPDrri (v2f64 (V_SET0)), VR128:$src, (i8 1))>; +} + + +/// SS41I_ternary_int - SSE 4.1 ternary operator +let Uses = [XMM0], Constraints = "$src1 = $dst" in { + multiclass SS41I_ternary_int<bits<8> opc, string OpcodeStr, PatFrag mem_frag, + X86MemOperand x86memop, Intrinsic IntId, + OpndItins itins = DEFAULT_ITINS> { + def rr0 : SS48I<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $dst|$dst, $src2}"), + [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0))], + itins.rr>, Sched<[itins.Sched]>; + + def rm0 : SS48I<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, x86memop:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $dst|$dst, $src2}"), + [(set VR128:$dst, + (IntId VR128:$src1, + (bitconvert (mem_frag addr:$src2)), XMM0))], + itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>; + } +} + +let ExeDomain = SSEPackedDouble in +defm BLENDVPD : SS41I_ternary_int<0x15, "blendvpd", memopv2f64, f128mem, + int_x86_sse41_blendvpd, + DEFAULT_ITINS_FBLENDSCHED>; +let ExeDomain = SSEPackedSingle in +defm BLENDVPS : SS41I_ternary_int<0x14, "blendvps", memopv4f32, f128mem, + int_x86_sse41_blendvps, + DEFAULT_ITINS_FBLENDSCHED>; +defm PBLENDVB : SS41I_ternary_int<0x10, "pblendvb", memopv2i64, i128mem, + int_x86_sse41_pblendvb, + DEFAULT_ITINS_VARBLENDSCHED>; + +// Aliases with the implicit xmm0 argument +def : InstAlias<"blendvpd\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}", + (BLENDVPDrr0 VR128:$dst, VR128:$src2)>; +def : InstAlias<"blendvpd\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}", + (BLENDVPDrm0 VR128:$dst, f128mem:$src2)>; +def : InstAlias<"blendvps\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}", + (BLENDVPSrr0 VR128:$dst, VR128:$src2)>; +def : InstAlias<"blendvps\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}", + (BLENDVPSrm0 VR128:$dst, f128mem:$src2)>; +def : InstAlias<"pblendvb\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}", + (PBLENDVBrr0 VR128:$dst, VR128:$src2)>; +def : InstAlias<"pblendvb\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}", + (PBLENDVBrm0 VR128:$dst, i128mem:$src2)>; + +let Predicates = [UseSSE41] in { + def : Pat<(v16i8 (vselect (v16i8 XMM0), (v16i8 VR128:$src1), + (v16i8 VR128:$src2))), + (PBLENDVBrr0 VR128:$src2, VR128:$src1)>; + def : Pat<(v4i32 (vselect (v4i32 XMM0), (v4i32 VR128:$src1), + (v4i32 VR128:$src2))), + (BLENDVPSrr0 VR128:$src2, VR128:$src1)>; + def : Pat<(v4f32 (vselect (v4i32 XMM0), (v4f32 VR128:$src1), + (v4f32 VR128:$src2))), + (BLENDVPSrr0 VR128:$src2, VR128:$src1)>; + def : Pat<(v2i64 (vselect (v2i64 XMM0), (v2i64 VR128:$src1), + (v2i64 VR128:$src2))), + (BLENDVPDrr0 VR128:$src2, VR128:$src1)>; + def : Pat<(v2f64 (vselect (v2i64 XMM0), (v2f64 VR128:$src1), + (v2f64 VR128:$src2))), + (BLENDVPDrr0 VR128:$src2, VR128:$src1)>; +} + +let SchedRW = [WriteLoad] in { +let Predicates = [HasAVX] in +def VMOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), + "vmovntdqa\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse41_movntdqa addr:$src))]>, + VEX; +let Predicates = [HasAVX2] in +def VMOVNTDQAYrm : SS48I<0x2A, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src), + "vmovntdqa\t{$src, $dst|$dst, $src}", + [(set VR256:$dst, (int_x86_avx2_movntdqa addr:$src))]>, + VEX, VEX_L; +def MOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins 
i128mem:$src), + "movntdqa\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse41_movntdqa addr:$src))]>; +} // SchedRW + +//===----------------------------------------------------------------------===// +// SSE4.2 - Compare Instructions +//===----------------------------------------------------------------------===// + +/// SS42I_binop_rm - Simple SSE 4.2 binary operator +multiclass SS42I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, + ValueType OpVT, RegisterClass RC, PatFrag memop_frag, + X86MemOperand x86memop, bit Is2Addr = 1> { + def rr : SS428I<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>; + def rm : SS428I<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, x86memop:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, + (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>; +} + +let Predicates = [HasAVX] in + defm VPCMPGTQ : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v2i64, VR128, + loadv2i64, i128mem, 0>, VEX_4V; + +let Predicates = [HasAVX2] in + defm VPCMPGTQY : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v4i64, VR256, + loadv4i64, i256mem, 0>, VEX_4V, VEX_L; + +let Constraints = "$src1 = $dst" in + defm PCMPGTQ : SS42I_binop_rm<0x37, "pcmpgtq", X86pcmpgt, v2i64, VR128, + memopv2i64, i128mem>; + +//===----------------------------------------------------------------------===// +// SSE4.2 - String/text Processing Instructions +//===----------------------------------------------------------------------===// + +// Packed Compare Implicit Length Strings, Return Mask +multiclass pseudo_pcmpistrm<string asm, PatFrag ld_frag> { + def REG : PseudoI<(outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, u8imm:$src3), + [(set VR128:$dst, (int_x86_sse42_pcmpistrm128 VR128:$src1, VR128:$src2, + imm:$src3))]>; + def MEM : PseudoI<(outs VR128:$dst), + (ins VR128:$src1, i128mem:$src2, u8imm:$src3), + [(set VR128:$dst, (int_x86_sse42_pcmpistrm128 VR128:$src1, + (bc_v16i8 (ld_frag addr:$src2)), imm:$src3))]>; +} + +let Defs = [EFLAGS], usesCustomInserter = 1 in { + defm VPCMPISTRM128 : pseudo_pcmpistrm<"#VPCMPISTRM128", loadv2i64>, + Requires<[HasAVX]>; + defm PCMPISTRM128 : pseudo_pcmpistrm<"#PCMPISTRM128", memopv2i64>, + Requires<[UseSSE42]>; +} + +multiclass pcmpistrm_SS42AI<string asm> { + def rr : SS42AI<0x62, MRMSrcReg, (outs), + (ins VR128:$src1, VR128:$src2, u8imm:$src3), + !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"), + []>, Sched<[WritePCmpIStrM]>; + let mayLoad = 1 in + def rm :SS42AI<0x62, MRMSrcMem, (outs), + (ins VR128:$src1, i128mem:$src2, u8imm:$src3), + !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"), + []>, Sched<[WritePCmpIStrMLd, ReadAfterLd]>; +} + +let Defs = [XMM0, EFLAGS], hasSideEffects = 0 in { + let Predicates = [HasAVX] in + defm VPCMPISTRM128 : pcmpistrm_SS42AI<"vpcmpistrm">, VEX; + defm PCMPISTRM128 : pcmpistrm_SS42AI<"pcmpistrm"> ; +} + +// Packed Compare Explicit Length Strings, Return Mask +multiclass pseudo_pcmpestrm<string asm, PatFrag ld_frag> { + def REG : PseudoI<(outs VR128:$dst), + (ins VR128:$src1, VR128:$src3, u8imm:$src5), + [(set VR128:$dst, (int_x86_sse42_pcmpestrm128 + VR128:$src1, EAX, VR128:$src3, EDX, imm:$src5))]>; + def MEM : PseudoI<(outs VR128:$dst), + (ins VR128:$src1, 
i128mem:$src3, u8imm:$src5), + [(set VR128:$dst, (int_x86_sse42_pcmpestrm128 VR128:$src1, EAX, + (bc_v16i8 (ld_frag addr:$src3)), EDX, imm:$src5))]>; +} + +let Defs = [EFLAGS], Uses = [EAX, EDX], usesCustomInserter = 1 in { + defm VPCMPESTRM128 : pseudo_pcmpestrm<"#VPCMPESTRM128", loadv2i64>, + Requires<[HasAVX]>; + defm PCMPESTRM128 : pseudo_pcmpestrm<"#PCMPESTRM128", memopv2i64>, + Requires<[UseSSE42]>; +} + +multiclass SS42AI_pcmpestrm<string asm> { + def rr : SS42AI<0x60, MRMSrcReg, (outs), + (ins VR128:$src1, VR128:$src3, u8imm:$src5), + !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"), + []>, Sched<[WritePCmpEStrM]>; + let mayLoad = 1 in + def rm : SS42AI<0x60, MRMSrcMem, (outs), + (ins VR128:$src1, i128mem:$src3, u8imm:$src5), + !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"), + []>, Sched<[WritePCmpEStrMLd, ReadAfterLd]>; +} + +let Defs = [XMM0, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in { + let Predicates = [HasAVX] in + defm VPCMPESTRM128 : SS42AI_pcmpestrm<"vpcmpestrm">, VEX; + defm PCMPESTRM128 : SS42AI_pcmpestrm<"pcmpestrm">; +} + +// Packed Compare Implicit Length Strings, Return Index +multiclass pseudo_pcmpistri<string asm, PatFrag ld_frag> { + def REG : PseudoI<(outs GR32:$dst), + (ins VR128:$src1, VR128:$src2, u8imm:$src3), + [(set GR32:$dst, EFLAGS, + (X86pcmpistri VR128:$src1, VR128:$src2, imm:$src3))]>; + def MEM : PseudoI<(outs GR32:$dst), + (ins VR128:$src1, i128mem:$src2, u8imm:$src3), + [(set GR32:$dst, EFLAGS, (X86pcmpistri VR128:$src1, + (bc_v16i8 (ld_frag addr:$src2)), imm:$src3))]>; +} + +let Defs = [EFLAGS], usesCustomInserter = 1 in { + defm VPCMPISTRI : pseudo_pcmpistri<"#VPCMPISTRI", loadv2i64>, + Requires<[HasAVX]>; + defm PCMPISTRI : pseudo_pcmpistri<"#PCMPISTRI", memopv2i64>, + Requires<[UseSSE42]>; +} + +multiclass SS42AI_pcmpistri<string asm> { + def rr : SS42AI<0x63, MRMSrcReg, (outs), + (ins VR128:$src1, VR128:$src2, u8imm:$src3), + !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"), + []>, Sched<[WritePCmpIStrI]>; + let mayLoad = 1 in + def rm : SS42AI<0x63, MRMSrcMem, (outs), + (ins VR128:$src1, i128mem:$src2, u8imm:$src3), + !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"), + []>, Sched<[WritePCmpIStrILd, ReadAfterLd]>; +} + +let Defs = [ECX, EFLAGS], hasSideEffects = 0 in { + let Predicates = [HasAVX] in + defm VPCMPISTRI : SS42AI_pcmpistri<"vpcmpistri">, VEX; + defm PCMPISTRI : SS42AI_pcmpistri<"pcmpistri">; +} + +// Packed Compare Explicit Length Strings, Return Index +multiclass pseudo_pcmpestri<string asm, PatFrag ld_frag> { + def REG : PseudoI<(outs GR32:$dst), + (ins VR128:$src1, VR128:$src3, u8imm:$src5), + [(set GR32:$dst, EFLAGS, + (X86pcmpestri VR128:$src1, EAX, VR128:$src3, EDX, imm:$src5))]>; + def MEM : PseudoI<(outs GR32:$dst), + (ins VR128:$src1, i128mem:$src3, u8imm:$src5), + [(set GR32:$dst, EFLAGS, + (X86pcmpestri VR128:$src1, EAX, (bc_v16i8 (ld_frag addr:$src3)), EDX, + imm:$src5))]>; +} + +let Defs = [EFLAGS], Uses = [EAX, EDX], usesCustomInserter = 1 in { + defm VPCMPESTRI : pseudo_pcmpestri<"#VPCMPESTRI", loadv2i64>, + Requires<[HasAVX]>; + defm PCMPESTRI : pseudo_pcmpestri<"#PCMPESTRI", memopv2i64>, + Requires<[UseSSE42]>; +} + +multiclass SS42AI_pcmpestri<string asm> { + def rr : SS42AI<0x61, MRMSrcReg, (outs), + (ins VR128:$src1, VR128:$src3, u8imm:$src5), + !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"), + []>, Sched<[WritePCmpEStrI]>; + let mayLoad = 1 in + def rm : SS42AI<0x61, MRMSrcMem, (outs), + (ins VR128:$src1, i128mem:$src3, 
u8imm:$src5), + !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"), + []>, Sched<[WritePCmpEStrILd, ReadAfterLd]>; +} + +let Defs = [ECX, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in { + let Predicates = [HasAVX] in + defm VPCMPESTRI : SS42AI_pcmpestri<"vpcmpestri">, VEX; + defm PCMPESTRI : SS42AI_pcmpestri<"pcmpestri">; +} + +//===----------------------------------------------------------------------===// +// SSE4.2 - CRC Instructions +//===----------------------------------------------------------------------===// + +// No CRC instructions have AVX equivalents + +// crc intrinsic instruction +// This set of instructions are only rm, the only difference is the size +// of r and m. +class SS42I_crc32r<bits<8> opc, string asm, RegisterClass RCOut, + RegisterClass RCIn, SDPatternOperator Int> : + SS42FI<opc, MRMSrcReg, (outs RCOut:$dst), (ins RCOut:$src1, RCIn:$src2), + !strconcat(asm, "\t{$src2, $src1|$src1, $src2}"), + [(set RCOut:$dst, (Int RCOut:$src1, RCIn:$src2))], IIC_CRC32_REG>, + Sched<[WriteFAdd]>; + +class SS42I_crc32m<bits<8> opc, string asm, RegisterClass RCOut, + X86MemOperand x86memop, SDPatternOperator Int> : + SS42FI<opc, MRMSrcMem, (outs RCOut:$dst), (ins RCOut:$src1, x86memop:$src2), + !strconcat(asm, "\t{$src2, $src1|$src1, $src2}"), + [(set RCOut:$dst, (Int RCOut:$src1, (load addr:$src2)))], + IIC_CRC32_MEM>, Sched<[WriteFAddLd, ReadAfterLd]>; + +let Constraints = "$src1 = $dst" in { + def CRC32r32m8 : SS42I_crc32m<0xF0, "crc32{b}", GR32, i8mem, + int_x86_sse42_crc32_32_8>; + def CRC32r32r8 : SS42I_crc32r<0xF0, "crc32{b}", GR32, GR8, + int_x86_sse42_crc32_32_8>; + def CRC32r32m16 : SS42I_crc32m<0xF1, "crc32{w}", GR32, i16mem, + int_x86_sse42_crc32_32_16>, OpSize16; + def CRC32r32r16 : SS42I_crc32r<0xF1, "crc32{w}", GR32, GR16, + int_x86_sse42_crc32_32_16>, OpSize16; + def CRC32r32m32 : SS42I_crc32m<0xF1, "crc32{l}", GR32, i32mem, + int_x86_sse42_crc32_32_32>, OpSize32; + def CRC32r32r32 : SS42I_crc32r<0xF1, "crc32{l}", GR32, GR32, + int_x86_sse42_crc32_32_32>, OpSize32; + def CRC32r64m64 : SS42I_crc32m<0xF1, "crc32{q}", GR64, i64mem, + int_x86_sse42_crc32_64_64>, REX_W; + def CRC32r64r64 : SS42I_crc32r<0xF1, "crc32{q}", GR64, GR64, + int_x86_sse42_crc32_64_64>, REX_W; + let hasSideEffects = 0 in { + let mayLoad = 1 in + def CRC32r64m8 : SS42I_crc32m<0xF0, "crc32{b}", GR64, i8mem, + null_frag>, REX_W; + def CRC32r64r8 : SS42I_crc32r<0xF0, "crc32{b}", GR64, GR8, + null_frag>, REX_W; + } +} + +//===----------------------------------------------------------------------===// +// SHA-NI Instructions +//===----------------------------------------------------------------------===// + +multiclass SHAI_binop<bits<8> Opc, string OpcodeStr, Intrinsic IntId, + bit UsesXMM0 = 0> { + def rr : I<Opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2), + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + [!if(UsesXMM0, + (set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0)), + (set VR128:$dst, (IntId VR128:$src1, VR128:$src2)))]>, T8; + + def rm : I<Opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i128mem:$src2), + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + [!if(UsesXMM0, + (set VR128:$dst, (IntId VR128:$src1, + (bc_v4i32 (memopv2i64 addr:$src2)), XMM0)), + (set VR128:$dst, (IntId VR128:$src1, + (bc_v4i32 (memopv2i64 addr:$src2)))))]>, T8; +} + +let Constraints = "$src1 = $dst", Predicates = [HasSHA] in { + def SHA1RNDS4rri : Ii8<0xCC, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, u8imm:$src3), + "sha1rnds4\t{$src3, 
$src2, $dst|$dst, $src2, $src3}", + [(set VR128:$dst, + (int_x86_sha1rnds4 VR128:$src1, VR128:$src2, + (i8 imm:$src3)))]>, TA; + def SHA1RNDS4rmi : Ii8<0xCC, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i128mem:$src2, u8imm:$src3), + "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}", + [(set VR128:$dst, + (int_x86_sha1rnds4 VR128:$src1, + (bc_v4i32 (memopv2i64 addr:$src2)), + (i8 imm:$src3)))]>, TA; + + defm SHA1NEXTE : SHAI_binop<0xC8, "sha1nexte", int_x86_sha1nexte>; + defm SHA1MSG1 : SHAI_binop<0xC9, "sha1msg1", int_x86_sha1msg1>; + defm SHA1MSG2 : SHAI_binop<0xCA, "sha1msg2", int_x86_sha1msg2>; + + let Uses=[XMM0] in + defm SHA256RNDS2 : SHAI_binop<0xCB, "sha256rnds2", int_x86_sha256rnds2, 1>; + + defm SHA256MSG1 : SHAI_binop<0xCC, "sha256msg1", int_x86_sha256msg1>; + defm SHA256MSG2 : SHAI_binop<0xCD, "sha256msg2", int_x86_sha256msg2>; +} + +// Aliases with explicit %xmm0 +def : InstAlias<"sha256rnds2\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}", + (SHA256RNDS2rr VR128:$dst, VR128:$src2)>; +def : InstAlias<"sha256rnds2\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}", + (SHA256RNDS2rm VR128:$dst, i128mem:$src2)>; + +//===----------------------------------------------------------------------===// +// AES-NI Instructions +//===----------------------------------------------------------------------===// + +multiclass AESI_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId128, + PatFrag ld_frag, bit Is2Addr = 1> { + def rr : AES8I<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>, + Sched<[WriteAESDecEnc]>; + def rm : AES8I<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i128mem:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set VR128:$dst, + (IntId128 VR128:$src1, (ld_frag addr:$src2)))]>, + Sched<[WriteAESDecEncLd, ReadAfterLd]>; +} + +// Perform One Round of an AES Encryption/Decryption Flow +let Predicates = [HasAVX, HasAES] in { + defm VAESENC : AESI_binop_rm_int<0xDC, "vaesenc", + int_x86_aesni_aesenc, loadv2i64, 0>, VEX_4V; + defm VAESENCLAST : AESI_binop_rm_int<0xDD, "vaesenclast", + int_x86_aesni_aesenclast, loadv2i64, 0>, VEX_4V; + defm VAESDEC : AESI_binop_rm_int<0xDE, "vaesdec", + int_x86_aesni_aesdec, loadv2i64, 0>, VEX_4V; + defm VAESDECLAST : AESI_binop_rm_int<0xDF, "vaesdeclast", + int_x86_aesni_aesdeclast, loadv2i64, 0>, VEX_4V; +} + +let Constraints = "$src1 = $dst" in { + defm AESENC : AESI_binop_rm_int<0xDC, "aesenc", + int_x86_aesni_aesenc, memopv2i64>; + defm AESENCLAST : AESI_binop_rm_int<0xDD, "aesenclast", + int_x86_aesni_aesenclast, memopv2i64>; + defm AESDEC : AESI_binop_rm_int<0xDE, "aesdec", + int_x86_aesni_aesdec, memopv2i64>; + defm AESDECLAST : AESI_binop_rm_int<0xDF, "aesdeclast", + int_x86_aesni_aesdeclast, memopv2i64>; +} + +// Perform the AES InvMixColumn Transformation +let Predicates = [HasAVX, HasAES] in { + def VAESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1), + "vaesimc\t{$src1, $dst|$dst, $src1}", + [(set VR128:$dst, + (int_x86_aesni_aesimc VR128:$src1))]>, Sched<[WriteAESIMC]>, + VEX; + def VAESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst), + (ins i128mem:$src1), + "vaesimc\t{$src1, $dst|$dst, $src1}", + [(set VR128:$dst, (int_x86_aesni_aesimc (loadv2i64 addr:$src1)))]>, + 
Sched<[WriteAESIMCLd]>, VEX; +} +def AESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1), + "aesimc\t{$src1, $dst|$dst, $src1}", + [(set VR128:$dst, + (int_x86_aesni_aesimc VR128:$src1))]>, Sched<[WriteAESIMC]>; +def AESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst), + (ins i128mem:$src1), + "aesimc\t{$src1, $dst|$dst, $src1}", + [(set VR128:$dst, (int_x86_aesni_aesimc (memopv2i64 addr:$src1)))]>, + Sched<[WriteAESIMCLd]>; + +// AES Round Key Generation Assist +let Predicates = [HasAVX, HasAES] in { + def VAESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, u8imm:$src2), + "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR128:$dst, + (int_x86_aesni_aeskeygenassist VR128:$src1, imm:$src2))]>, + Sched<[WriteAESKeyGen]>, VEX; + def VAESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst), + (ins i128mem:$src1, u8imm:$src2), + "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR128:$dst, + (int_x86_aesni_aeskeygenassist (loadv2i64 addr:$src1), imm:$src2))]>, + Sched<[WriteAESKeyGenLd]>, VEX; +} +def AESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, u8imm:$src2), + "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR128:$dst, + (int_x86_aesni_aeskeygenassist VR128:$src1, imm:$src2))]>, + Sched<[WriteAESKeyGen]>; +def AESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst), + (ins i128mem:$src1, u8imm:$src2), + "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR128:$dst, + (int_x86_aesni_aeskeygenassist (memopv2i64 addr:$src1), imm:$src2))]>, + Sched<[WriteAESKeyGenLd]>; + +//===----------------------------------------------------------------------===// +// PCLMUL Instructions +//===----------------------------------------------------------------------===// + +// AVX carry-less Multiplication instructions +let isCommutable = 1 in +def VPCLMULQDQrr : AVXPCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, u8imm:$src3), + "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + [(set VR128:$dst, + (int_x86_pclmulqdq VR128:$src1, VR128:$src2, imm:$src3))]>, + Sched<[WriteCLMul]>; + +def VPCLMULQDQrm : AVXPCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i128mem:$src2, u8imm:$src3), + "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + [(set VR128:$dst, (int_x86_pclmulqdq VR128:$src1, + (loadv2i64 addr:$src2), imm:$src3))]>, + Sched<[WriteCLMulLd, ReadAfterLd]>; + +// Carry-less Multiplication instructions +let Constraints = "$src1 = $dst" in { +let isCommutable = 1 in +def PCLMULQDQrr : PCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, u8imm:$src3), + "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}", + [(set VR128:$dst, + (int_x86_pclmulqdq VR128:$src1, VR128:$src2, imm:$src3))], + IIC_SSE_PCLMULQDQ_RR>, Sched<[WriteCLMul]>; + +def PCLMULQDQrm : PCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i128mem:$src2, u8imm:$src3), + "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}", + [(set VR128:$dst, (int_x86_pclmulqdq VR128:$src1, + (memopv2i64 addr:$src2), imm:$src3))], + IIC_SSE_PCLMULQDQ_RM>, + Sched<[WriteCLMulLd, ReadAfterLd]>; +} // Constraints = "$src1 = $dst" + + +multiclass pclmul_alias<string asm, int immop> { + def : InstAlias<!strconcat("pclmul", asm, "dq {$src, $dst|$dst, $src}"), + (PCLMULQDQrr VR128:$dst, VR128:$src, immop), 0>; + + def : InstAlias<!strconcat("pclmul", asm, "dq 
{$src, $dst|$dst, $src}"), + (PCLMULQDQrm VR128:$dst, i128mem:$src, immop), 0>; + + def : InstAlias<!strconcat("vpclmul", asm, + "dq {$src2, $src1, $dst|$dst, $src1, $src2}"), + (VPCLMULQDQrr VR128:$dst, VR128:$src1, VR128:$src2, immop), + 0>; + + def : InstAlias<!strconcat("vpclmul", asm, + "dq {$src2, $src1, $dst|$dst, $src1, $src2}"), + (VPCLMULQDQrm VR128:$dst, VR128:$src1, i128mem:$src2, immop), + 0>; +} +defm : pclmul_alias<"hqhq", 0x11>; +defm : pclmul_alias<"hqlq", 0x01>; +defm : pclmul_alias<"lqhq", 0x10>; +defm : pclmul_alias<"lqlq", 0x00>; + +//===----------------------------------------------------------------------===// +// SSE4A Instructions +//===----------------------------------------------------------------------===// + +let Predicates = [HasSSE4A] in { + +let Constraints = "$src = $dst" in { +def EXTRQI : Ii8<0x78, MRMXr, (outs VR128:$dst), + (ins VR128:$src, u8imm:$len, u8imm:$idx), + "extrq\t{$idx, $len, $src|$src, $len, $idx}", + [(set VR128:$dst, (X86extrqi VR128:$src, imm:$len, + imm:$idx))]>, PD; +def EXTRQ : I<0x79, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src, VR128:$mask), + "extrq\t{$mask, $src|$src, $mask}", + [(set VR128:$dst, (int_x86_sse4a_extrq VR128:$src, + VR128:$mask))]>, PD; + +def INSERTQI : Ii8<0x78, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src, VR128:$src2, u8imm:$len, u8imm:$idx), + "insertq\t{$idx, $len, $src2, $src|$src, $src2, $len, $idx}", + [(set VR128:$dst, (X86insertqi VR128:$src, VR128:$src2, + imm:$len, imm:$idx))]>, XD; +def INSERTQ : I<0x79, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src, VR128:$mask), + "insertq\t{$mask, $src|$src, $mask}", + [(set VR128:$dst, (int_x86_sse4a_insertq VR128:$src, + VR128:$mask))]>, XD; +} + +def MOVNTSS : I<0x2B, MRMDestMem, (outs), (ins f32mem:$dst, VR128:$src), + "movntss\t{$src, $dst|$dst, $src}", + [(int_x86_sse4a_movnt_ss addr:$dst, VR128:$src)]>, XS; + +def MOVNTSD : I<0x2B, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), + "movntsd\t{$src, $dst|$dst, $src}", + [(int_x86_sse4a_movnt_sd addr:$dst, VR128:$src)]>, XD; +} + +//===----------------------------------------------------------------------===// +// AVX Instructions +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// VBROADCAST - Load from memory and broadcast to all elements of the +// destination operand +// +class avx_broadcast_rm<bits<8> opc, string OpcodeStr, RegisterClass RC, + X86MemOperand x86memop, ValueType VT, + PatFrag ld_frag, SchedWrite Sched> : + AVX8I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set RC:$dst, (VT (X86VBroadcast (ld_frag addr:$src))))]>, + Sched<[Sched]>, VEX { + let mayLoad = 1; +} + +// AVX2 adds register forms +class avx2_broadcast_rr<bits<8> opc, string OpcodeStr, RegisterClass RC, + ValueType ResVT, ValueType OpVT, SchedWrite Sched> : + AVX28I<opc, MRMSrcReg, (outs RC:$dst), (ins VR128:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set RC:$dst, (ResVT (X86VBroadcast (OpVT VR128:$src))))]>, + Sched<[Sched]>, VEX; + +let ExeDomain = SSEPackedSingle in { + def VBROADCASTSSrm : avx_broadcast_rm<0x18, "vbroadcastss", VR128, + f32mem, v4f32, loadf32, WriteLoad>; + def VBROADCASTSSYrm : avx_broadcast_rm<0x18, "vbroadcastss", VR256, + f32mem, v8f32, loadf32, + WriteFShuffleLd>, VEX_L; +} +let ExeDomain = SSEPackedDouble in +def VBROADCASTSDYrm : avx_broadcast_rm<0x19, "vbroadcastsd", VR256, f64mem, + v4f64, 
loadf64, WriteFShuffleLd>, VEX_L; + +let ExeDomain = SSEPackedSingle in { + def VBROADCASTSSrr : avx2_broadcast_rr<0x18, "vbroadcastss", VR128, + v4f32, v4f32, WriteFShuffle>; + def VBROADCASTSSYrr : avx2_broadcast_rr<0x18, "vbroadcastss", VR256, + v8f32, v4f32, WriteFShuffle256>, VEX_L; +} +let ExeDomain = SSEPackedDouble in +def VBROADCASTSDYrr : avx2_broadcast_rr<0x19, "vbroadcastsd", VR256, + v4f64, v2f64, WriteFShuffle256>, VEX_L; + +let mayLoad = 1, Predicates = [HasAVX2] in +def VBROADCASTI128 : AVX8I<0x5A, MRMSrcMem, (outs VR256:$dst), + (ins i128mem:$src), + "vbroadcasti128\t{$src, $dst|$dst, $src}", []>, + Sched<[WriteLoad]>, VEX, VEX_L; + +def VBROADCASTF128 : AVX8I<0x1A, MRMSrcMem, (outs VR256:$dst), + (ins f128mem:$src), + "vbroadcastf128\t{$src, $dst|$dst, $src}", + [(set VR256:$dst, + (int_x86_avx_vbroadcastf128_pd_256 addr:$src))]>, + Sched<[WriteFShuffleLd]>, VEX, VEX_L; + +let Predicates = [HasAVX] in +def : Pat<(int_x86_avx_vbroadcastf128_ps_256 addr:$src), + (VBROADCASTF128 addr:$src)>; + + +//===----------------------------------------------------------------------===// +// VINSERTF128 - Insert packed floating-point values +// +let hasSideEffects = 0, ExeDomain = SSEPackedSingle in { +def VINSERTF128rr : AVXAIi8<0x18, MRMSrcReg, (outs VR256:$dst), + (ins VR256:$src1, VR128:$src2, u8imm:$src3), + "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + []>, Sched<[WriteFShuffle]>, VEX_4V, VEX_L; +let mayLoad = 1 in +def VINSERTF128rm : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst), + (ins VR256:$src1, f128mem:$src2, u8imm:$src3), + "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + []>, Sched<[WriteFShuffleLd, ReadAfterLd]>, VEX_4V, VEX_L; +} + +let Predicates = [HasAVX, NoVLX] in { +def : Pat<(vinsert128_insert:$ins (v8f32 VR256:$src1), (v4f32 VR128:$src2), + (iPTR imm)), + (VINSERTF128rr VR256:$src1, VR128:$src2, + (INSERT_get_vinsert128_imm VR256:$ins))>; +def : Pat<(vinsert128_insert:$ins (v4f64 VR256:$src1), (v2f64 VR128:$src2), + (iPTR imm)), + (VINSERTF128rr VR256:$src1, VR128:$src2, + (INSERT_get_vinsert128_imm VR256:$ins))>; + +def : Pat<(vinsert128_insert:$ins (v8f32 VR256:$src1), (loadv4f32 addr:$src2), + (iPTR imm)), + (VINSERTF128rm VR256:$src1, addr:$src2, + (INSERT_get_vinsert128_imm VR256:$ins))>; +def : Pat<(vinsert128_insert:$ins (v4f64 VR256:$src1), (loadv2f64 addr:$src2), + (iPTR imm)), + (VINSERTF128rm VR256:$src1, addr:$src2, + (INSERT_get_vinsert128_imm VR256:$ins))>; +} + +let Predicates = [HasAVX1Only] in { +def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2), + (iPTR imm)), + (VINSERTF128rr VR256:$src1, VR128:$src2, + (INSERT_get_vinsert128_imm VR256:$ins))>; +def : Pat<(vinsert128_insert:$ins (v8i32 VR256:$src1), (v4i32 VR128:$src2), + (iPTR imm)), + (VINSERTF128rr VR256:$src1, VR128:$src2, + (INSERT_get_vinsert128_imm VR256:$ins))>; +def : Pat<(vinsert128_insert:$ins (v32i8 VR256:$src1), (v16i8 VR128:$src2), + (iPTR imm)), + (VINSERTF128rr VR256:$src1, VR128:$src2, + (INSERT_get_vinsert128_imm VR256:$ins))>; +def : Pat<(vinsert128_insert:$ins (v16i16 VR256:$src1), (v8i16 VR128:$src2), + (iPTR imm)), + (VINSERTF128rr VR256:$src1, VR128:$src2, + (INSERT_get_vinsert128_imm VR256:$ins))>; + +def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (loadv2i64 addr:$src2), + (iPTR imm)), + (VINSERTF128rm VR256:$src1, addr:$src2, + (INSERT_get_vinsert128_imm VR256:$ins))>; +def : Pat<(vinsert128_insert:$ins (v8i32 VR256:$src1), + (bc_v4i32 (loadv2i64 addr:$src2)), + (iPTR imm)), + (VINSERTF128rm 
VR256:$src1, addr:$src2, + (INSERT_get_vinsert128_imm VR256:$ins))>; +def : Pat<(vinsert128_insert:$ins (v32i8 VR256:$src1), + (bc_v16i8 (loadv2i64 addr:$src2)), + (iPTR imm)), + (VINSERTF128rm VR256:$src1, addr:$src2, + (INSERT_get_vinsert128_imm VR256:$ins))>; +def : Pat<(vinsert128_insert:$ins (v16i16 VR256:$src1), + (bc_v8i16 (loadv2i64 addr:$src2)), + (iPTR imm)), + (VINSERTF128rm VR256:$src1, addr:$src2, + (INSERT_get_vinsert128_imm VR256:$ins))>; +} + +//===----------------------------------------------------------------------===// +// VEXTRACTF128 - Extract packed floating-point values +// +let hasSideEffects = 0, ExeDomain = SSEPackedSingle in { +def VEXTRACTF128rr : AVXAIi8<0x19, MRMDestReg, (outs VR128:$dst), + (ins VR256:$src1, u8imm:$src2), + "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}", + []>, Sched<[WriteFShuffle]>, VEX, VEX_L; +let mayStore = 1 in +def VEXTRACTF128mr : AVXAIi8<0x19, MRMDestMem, (outs), + (ins f128mem:$dst, VR256:$src1, u8imm:$src2), + "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}", + []>, Sched<[WriteStore]>, VEX, VEX_L; +} + +// AVX1 patterns +let Predicates = [HasAVX] in { +def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)), + (v4f32 (VEXTRACTF128rr + (v8f32 VR256:$src1), + (EXTRACT_get_vextract128_imm VR128:$ext)))>; +def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)), + (v2f64 (VEXTRACTF128rr + (v4f64 VR256:$src1), + (EXTRACT_get_vextract128_imm VR128:$ext)))>; + +def : Pat<(store (v4f32 (vextract128_extract:$ext (v8f32 VR256:$src1), + (iPTR imm))), addr:$dst), + (VEXTRACTF128mr addr:$dst, VR256:$src1, + (EXTRACT_get_vextract128_imm VR128:$ext))>; +def : Pat<(store (v2f64 (vextract128_extract:$ext (v4f64 VR256:$src1), + (iPTR imm))), addr:$dst), + (VEXTRACTF128mr addr:$dst, VR256:$src1, + (EXTRACT_get_vextract128_imm VR128:$ext))>; +} + +let Predicates = [HasAVX1Only] in { +def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)), + (v2i64 (VEXTRACTF128rr + (v4i64 VR256:$src1), + (EXTRACT_get_vextract128_imm VR128:$ext)))>; +def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)), + (v4i32 (VEXTRACTF128rr + (v8i32 VR256:$src1), + (EXTRACT_get_vextract128_imm VR128:$ext)))>; +def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)), + (v8i16 (VEXTRACTF128rr + (v16i16 VR256:$src1), + (EXTRACT_get_vextract128_imm VR128:$ext)))>; +def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)), + (v16i8 (VEXTRACTF128rr + (v32i8 VR256:$src1), + (EXTRACT_get_vextract128_imm VR128:$ext)))>; + +def : Pat<(alignedstore (v2i64 (vextract128_extract:$ext (v4i64 VR256:$src1), + (iPTR imm))), addr:$dst), + (VEXTRACTF128mr addr:$dst, VR256:$src1, + (EXTRACT_get_vextract128_imm VR128:$ext))>; +def : Pat<(alignedstore (v4i32 (vextract128_extract:$ext (v8i32 VR256:$src1), + (iPTR imm))), addr:$dst), + (VEXTRACTF128mr addr:$dst, VR256:$src1, + (EXTRACT_get_vextract128_imm VR128:$ext))>; +def : Pat<(alignedstore (v8i16 (vextract128_extract:$ext (v16i16 VR256:$src1), + (iPTR imm))), addr:$dst), + (VEXTRACTF128mr addr:$dst, VR256:$src1, + (EXTRACT_get_vextract128_imm VR128:$ext))>; +def : Pat<(alignedstore (v16i8 (vextract128_extract:$ext (v32i8 VR256:$src1), + (iPTR imm))), addr:$dst), + (VEXTRACTF128mr addr:$dst, VR256:$src1, + (EXTRACT_get_vextract128_imm VR128:$ext))>; +} + +//===----------------------------------------------------------------------===// +// VMASKMOV - Conditional SIMD Packed Loads and Stores +// +multiclass avx_movmask_rm<bits<8> opc_rm, bits<8> opc_mr, string OpcodeStr, + Intrinsic IntLd, Intrinsic 
IntLd256, + Intrinsic IntSt, Intrinsic IntSt256> { + def rm : AVX8I<opc_rm, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, f128mem:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR128:$dst, (IntLd addr:$src2, VR128:$src1))]>, + VEX_4V; + def Yrm : AVX8I<opc_rm, MRMSrcMem, (outs VR256:$dst), + (ins VR256:$src1, f256mem:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>, + VEX_4V, VEX_L; + def mr : AVX8I<opc_mr, MRMDestMem, (outs), + (ins f128mem:$dst, VR128:$src1, VR128:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(IntSt addr:$dst, VR128:$src1, VR128:$src2)]>, VEX_4V; + def Ymr : AVX8I<opc_mr, MRMDestMem, (outs), + (ins f256mem:$dst, VR256:$src1, VR256:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>, VEX_4V, VEX_L; +} + +let ExeDomain = SSEPackedSingle in +defm VMASKMOVPS : avx_movmask_rm<0x2C, 0x2E, "vmaskmovps", + int_x86_avx_maskload_ps, + int_x86_avx_maskload_ps_256, + int_x86_avx_maskstore_ps, + int_x86_avx_maskstore_ps_256>; +let ExeDomain = SSEPackedDouble in +defm VMASKMOVPD : avx_movmask_rm<0x2D, 0x2F, "vmaskmovpd", + int_x86_avx_maskload_pd, + int_x86_avx_maskload_pd_256, + int_x86_avx_maskstore_pd, + int_x86_avx_maskstore_pd_256>; + +//===----------------------------------------------------------------------===// +// VPERMIL - Permute Single and Double Floating-Point Values +// +multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr, + RegisterClass RC, X86MemOperand x86memop_f, + X86MemOperand x86memop_i, PatFrag i_frag, + Intrinsic IntVar, ValueType vt> { + def rr : AVX8I<opc_rm, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, (IntVar RC:$src1, RC:$src2))]>, VEX_4V, + Sched<[WriteFShuffle]>; + def rm : AVX8I<opc_rm, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, x86memop_i:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, (IntVar RC:$src1, + (bitconvert (i_frag addr:$src2))))]>, VEX_4V, + Sched<[WriteFShuffleLd, ReadAfterLd]>; + + let Predicates = [HasAVX, NoVLX] in { + def ri : AVXAIi8<opc_rmi, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, u8imm:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, (vt (X86VPermilpi RC:$src1, (i8 imm:$src2))))]>, VEX, + Sched<[WriteFShuffle]>; + def mi : AVXAIi8<opc_rmi, MRMSrcMem, (outs RC:$dst), + (ins x86memop_f:$src1, u8imm:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, + (vt (X86VPermilpi (load addr:$src1), (i8 imm:$src2))))]>, VEX, + Sched<[WriteFShuffleLd]>; + }// Predicates = [HasAVX, NoVLX] +} + +let ExeDomain = SSEPackedSingle in { + defm VPERMILPS : avx_permil<0x0C, 0x04, "vpermilps", VR128, f128mem, i128mem, + loadv2i64, int_x86_avx_vpermilvar_ps, v4f32>; + defm VPERMILPSY : avx_permil<0x0C, 0x04, "vpermilps", VR256, f256mem, i256mem, + loadv4i64, int_x86_avx_vpermilvar_ps_256, v8f32>, VEX_L; +} +let ExeDomain = SSEPackedDouble in { + defm VPERMILPD : avx_permil<0x0D, 0x05, "vpermilpd", VR128, f128mem, i128mem, + loadv2i64, int_x86_avx_vpermilvar_pd, v2f64>; + defm VPERMILPDY : avx_permil<0x0D, 0x05, "vpermilpd", VR256, f256mem, i256mem, + loadv4i64, int_x86_avx_vpermilvar_pd_256, v4f64>, VEX_L; +} + +let Predicates = [HasAVX, NoVLX] in { +def : Pat<(v8f32 
(X86VPermilpv VR256:$src1, (v8i32 VR256:$src2))), + (VPERMILPSYrr VR256:$src1, VR256:$src2)>; +def : Pat<(v8f32 (X86VPermilpv VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)))), + (VPERMILPSYrm VR256:$src1, addr:$src2)>; +def : Pat<(v4f64 (X86VPermilpv VR256:$src1, (v4i64 VR256:$src2))), + (VPERMILPDYrr VR256:$src1, VR256:$src2)>; +def : Pat<(v4f64 (X86VPermilpv VR256:$src1, (loadv4i64 addr:$src2))), + (VPERMILPDYrm VR256:$src1, addr:$src2)>; + +def : Pat<(v8i32 (X86VPermilpi VR256:$src1, (i8 imm:$imm))), + (VPERMILPSYri VR256:$src1, imm:$imm)>; +def : Pat<(v4i64 (X86VPermilpi VR256:$src1, (i8 imm:$imm))), + (VPERMILPDYri VR256:$src1, imm:$imm)>; +def : Pat<(v8i32 (X86VPermilpi (bc_v8i32 (loadv4i64 addr:$src1)), + (i8 imm:$imm))), + (VPERMILPSYmi addr:$src1, imm:$imm)>; +def : Pat<(v4i64 (X86VPermilpi (loadv4i64 addr:$src1), (i8 imm:$imm))), + (VPERMILPDYmi addr:$src1, imm:$imm)>; + +def : Pat<(v4f32 (X86VPermilpv VR128:$src1, (v4i32 VR128:$src2))), + (VPERMILPSrr VR128:$src1, VR128:$src2)>; +def : Pat<(v4f32 (X86VPermilpv VR128:$src1, (bc_v4i32 (loadv2i64 addr:$src2)))), + (VPERMILPSrm VR128:$src1, addr:$src2)>; +def : Pat<(v2f64 (X86VPermilpv VR128:$src1, (v2i64 VR128:$src2))), + (VPERMILPDrr VR128:$src1, VR128:$src2)>; +def : Pat<(v2f64 (X86VPermilpv VR128:$src1, (loadv2i64 addr:$src2))), + (VPERMILPDrm VR128:$src1, addr:$src2)>; + +def : Pat<(v2i64 (X86VPermilpi VR128:$src1, (i8 imm:$imm))), + (VPERMILPDri VR128:$src1, imm:$imm)>; +def : Pat<(v2i64 (X86VPermilpi (loadv2i64 addr:$src1), (i8 imm:$imm))), + (VPERMILPDmi addr:$src1, imm:$imm)>; +} + +//===----------------------------------------------------------------------===// +// VPERM2F128 - Permute Floating-Point Values in 128-bit chunks +// +let ExeDomain = SSEPackedSingle in { +def VPERM2F128rr : AVXAIi8<0x06, MRMSrcReg, (outs VR256:$dst), + (ins VR256:$src1, VR256:$src2, u8imm:$src3), + "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + [(set VR256:$dst, (v8f32 (X86VPerm2x128 VR256:$src1, VR256:$src2, + (i8 imm:$src3))))]>, VEX_4V, VEX_L, + Sched<[WriteFShuffle]>; +def VPERM2F128rm : AVXAIi8<0x06, MRMSrcMem, (outs VR256:$dst), + (ins VR256:$src1, f256mem:$src2, u8imm:$src3), + "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv8f32 addr:$src2), + (i8 imm:$src3)))]>, VEX_4V, VEX_L, + Sched<[WriteFShuffleLd, ReadAfterLd]>; +} + +let Predicates = [HasAVX] in { +def : Pat<(v4f64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), + (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>; +def : Pat<(v4f64 (X86VPerm2x128 VR256:$src1, + (loadv4f64 addr:$src2), (i8 imm:$imm))), + (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>; +} + +let Predicates = [HasAVX1Only] in { +def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), + (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>; +def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), + (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>; +def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), + (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>; +def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), + (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>; + +def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, + (bc_v8i32 (loadv4i64 addr:$src2)), (i8 imm:$imm))), + (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>; +def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1, + (loadv4i64 addr:$src2), (i8 imm:$imm))), + (VPERM2F128rm 
VR256:$src1, addr:$src2, imm:$imm)>; +def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, + (bc_v32i8 (loadv4i64 addr:$src2)), (i8 imm:$imm))), + (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>; +def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1, + (bc_v16i16 (loadv4i64 addr:$src2)), (i8 imm:$imm))), + (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>; +} + +//===----------------------------------------------------------------------===// +// VZERO - Zero YMM registers +// +let Defs = [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7, + YMM8, YMM9, YMM10, YMM11, YMM12, YMM13, YMM14, YMM15] in { + // Zero All YMM registers + def VZEROALL : I<0x77, RawFrm, (outs), (ins), "vzeroall", + [(int_x86_avx_vzeroall)]>, PS, VEX, VEX_L, Requires<[HasAVX]>; + + // Zero Upper bits of YMM registers + def VZEROUPPER : I<0x77, RawFrm, (outs), (ins), "vzeroupper", + [(int_x86_avx_vzeroupper)]>, PS, VEX, Requires<[HasAVX]>; +} + +//===----------------------------------------------------------------------===// +// Half precision conversion instructions +//===----------------------------------------------------------------------===// +multiclass f16c_ph2ps<RegisterClass RC, X86MemOperand x86memop, Intrinsic Int> { + def rr : I<0x13, MRMSrcReg, (outs RC:$dst), (ins VR128:$src), + "vcvtph2ps\t{$src, $dst|$dst, $src}", + [(set RC:$dst, (Int VR128:$src))]>, + T8PD, VEX, Sched<[WriteCvtF2F]>; + let hasSideEffects = 0, mayLoad = 1 in + def rm : I<0x13, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), + "vcvtph2ps\t{$src, $dst|$dst, $src}", []>, T8PD, VEX, + Sched<[WriteCvtF2FLd]>; +} + +multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop, Intrinsic Int> { + def rr : Ii8<0x1D, MRMDestReg, (outs VR128:$dst), + (ins RC:$src1, i32u8imm:$src2), + "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR128:$dst, (Int RC:$src1, imm:$src2))]>, + TAPD, VEX, Sched<[WriteCvtF2F]>; + let hasSideEffects = 0, mayStore = 1, + SchedRW = [WriteCvtF2FLd, WriteRMW] in + def mr : Ii8<0x1D, MRMDestMem, (outs), + (ins x86memop:$dst, RC:$src1, i32u8imm:$src2), + "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, + TAPD, VEX; +} + +let Predicates = [HasF16C] in { + defm VCVTPH2PS : f16c_ph2ps<VR128, f64mem, int_x86_vcvtph2ps_128>; + defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem, int_x86_vcvtph2ps_256>, VEX_L; + defm VCVTPS2PH : f16c_ps2ph<VR128, f64mem, int_x86_vcvtps2ph_128>; + defm VCVTPS2PHY : f16c_ps2ph<VR256, f128mem, int_x86_vcvtps2ph_256>, VEX_L; + + // Pattern match vcvtph2ps of a scalar i64 load. + def : Pat<(int_x86_vcvtph2ps_128 (vzmovl_v2i64 addr:$src)), + (VCVTPH2PSrm addr:$src)>; + def : Pat<(int_x86_vcvtph2ps_128 (vzload_v2i64 addr:$src)), + (VCVTPH2PSrm addr:$src)>; + + def : Pat<(store (f64 (extractelt (bc_v2f64 (v8i16 + (int_x86_vcvtps2ph_128 VR128:$src1, i32:$src2))), (iPTR 0))), + addr:$dst), + (VCVTPS2PHmr addr:$dst, VR128:$src1, imm:$src2)>; + def : Pat<(store (i64 (extractelt (bc_v2i64 (v8i16 + (int_x86_vcvtps2ph_128 VR128:$src1, i32:$src2))), (iPTR 0))), + addr:$dst), + (VCVTPS2PHmr addr:$dst, VR128:$src1, imm:$src2)>; + def : Pat<(store (v8i16 (int_x86_vcvtps2ph_256 VR256:$src1, i32:$src2)), + addr:$dst), + (VCVTPS2PHYmr addr:$dst, VR256:$src1, imm:$src2)>; +} + +// Patterns for matching conversions from float to half-float and vice versa. 
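(Editorial note, hedged, not part of the patch: the short C++ sketch below only illustrates, at the source level, the float-to-half and half-to-float conversions that the fp_to_f16/f16_to_fp patterns in the next block lower to via VCVTPS2PH/VCVTPH2PS. The F16C intrinsics used are the documented ones; the helper names and the choice of rounding constant are ours, and the example assumes a compiler invoked with F16C enabled, e.g. -mf16c.)

    #include <immintrin.h>
    #include <cstdint>

    // Scalar float -> IEEE half, round to nearest even (vcvtps2ph).
    static inline std::uint16_t float_to_half(float f) {
      __m128i h = _mm_cvtps_ph(_mm_set_ss(f), _MM_FROUND_TO_NEAREST_INT);
      return static_cast<std::uint16_t>(_mm_extract_epi16(h, 0));
    }

    // Scalar IEEE half -> float (vcvtph2ps reads the half from the low
    // 16 bits of an XMM register).
    static inline float half_to_float(std::uint16_t h) {
      return _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(h)));
    }

The scalar patterns that follow select the same VCVTPS2PH/VCVTPH2PS register forms for the fp_to_f16 and f16_to_fp nodes, moving the 16-bit result in and out of the low lane of an XMM register.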
+let Predicates = [HasF16C] in { + def : Pat<(fp_to_f16 FR32:$src), + (i16 (EXTRACT_SUBREG (VMOVPDI2DIrr (VCVTPS2PHrr + (COPY_TO_REGCLASS FR32:$src, VR128), 0)), sub_16bit))>; + + def : Pat<(f16_to_fp GR16:$src), + (f32 (COPY_TO_REGCLASS (VCVTPH2PSrr + (COPY_TO_REGCLASS (MOVSX32rr16 GR16:$src), VR128)), FR32)) >; + + def : Pat<(f16_to_fp (i16 (fp_to_f16 FR32:$src))), + (f32 (COPY_TO_REGCLASS (VCVTPH2PSrr + (VCVTPS2PHrr (COPY_TO_REGCLASS FR32:$src, VR128), 0)), FR32)) >; +} + +//===----------------------------------------------------------------------===// +// AVX2 Instructions +//===----------------------------------------------------------------------===// + +/// AVX2_binop_rmi - AVX2 binary operator with 8-bit immediate +multiclass AVX2_binop_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode, + ValueType OpVT, RegisterClass RC, PatFrag memop_frag, + X86MemOperand x86memop> { + let isCommutable = 1 in + def rri : AVX2AIi8<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2, u8imm:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))]>, + Sched<[WriteBlend]>, VEX_4V; + def rmi : AVX2AIi8<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, x86memop:$src2, u8imm:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set RC:$dst, + (OpVT (OpNode RC:$src1, + (bitconvert (memop_frag addr:$src2)), imm:$src3)))]>, + Sched<[WriteBlendLd, ReadAfterLd]>, VEX_4V; +} + +defm VPBLENDD : AVX2_binop_rmi<0x02, "vpblendd", X86Blendi, v4i32, + VR128, loadv2i64, i128mem>; +defm VPBLENDDY : AVX2_binop_rmi<0x02, "vpblendd", X86Blendi, v8i32, + VR256, loadv4i64, i256mem>, VEX_L; + +//===----------------------------------------------------------------------===// +// VPBROADCAST - Load from memory and broadcast to all elements of the +// destination operand +// +multiclass avx2_broadcast<bits<8> opc, string OpcodeStr, + X86MemOperand x86memop, PatFrag ld_frag, + ValueType OpVT128, ValueType OpVT256, Predicate prd> { + let Predicates = [HasAVX2, prd] in { + def rr : AVX28I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, + (OpVT128 (X86VBroadcast (OpVT128 VR128:$src))))]>, + Sched<[WriteShuffle]>, VEX; + def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, + (OpVT128 (X86VBroadcast (ld_frag addr:$src))))]>, + Sched<[WriteLoad]>, VEX; + def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR256:$dst, + (OpVT256 (X86VBroadcast (OpVT128 VR128:$src))))]>, + Sched<[WriteShuffle256]>, VEX, VEX_L; + def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), (ins x86memop:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR256:$dst, + (OpVT256 (X86VBroadcast (ld_frag addr:$src))))]>, + Sched<[WriteLoad]>, VEX, VEX_L; + + // Provide aliases for broadcast from the same register class that + // automatically does the extract. 
+ def : Pat<(OpVT256 (X86VBroadcast (OpVT256 VR256:$src))), + (!cast<Instruction>(NAME#"Yrr") + (OpVT128 (EXTRACT_SUBREG (OpVT256 VR256:$src),sub_xmm)))>; + } +} + +defm VPBROADCASTB : avx2_broadcast<0x78, "vpbroadcastb", i8mem, loadi8, + v16i8, v32i8, NoVLX_Or_NoBWI>; +defm VPBROADCASTW : avx2_broadcast<0x79, "vpbroadcastw", i16mem, loadi16, + v8i16, v16i16, NoVLX_Or_NoBWI>; +defm VPBROADCASTD : avx2_broadcast<0x58, "vpbroadcastd", i32mem, loadi32, + v4i32, v8i32, NoVLX>; +defm VPBROADCASTQ : avx2_broadcast<0x59, "vpbroadcastq", i64mem, loadi64, + v2i64, v4i64, NoVLX>; + +let Predicates = [HasAVX2] in { + // loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably. + // This means we'll encounter truncated i32 loads; match that here. + def : Pat<(v8i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))), + (VPBROADCASTWrm addr:$src)>; + def : Pat<(v16i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))), + (VPBROADCASTWYrm addr:$src)>; + def : Pat<(v8i16 (X86VBroadcast + (i16 (trunc (i32 (zextloadi16 addr:$src)))))), + (VPBROADCASTWrm addr:$src)>; + def : Pat<(v16i16 (X86VBroadcast + (i16 (trunc (i32 (zextloadi16 addr:$src)))))), + (VPBROADCASTWYrm addr:$src)>; + + // Provide aliases for broadcast from the same register class that + // automatically does the extract. + def : Pat<(v8f32 (X86VBroadcast (v8f32 VR256:$src))), + (VBROADCASTSSYrr (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), + sub_xmm)))>; + def : Pat<(v4f64 (X86VBroadcast (v4f64 VR256:$src))), + (VBROADCASTSDYrr (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), + sub_xmm)))>; + + // Provide fallback in case the load node that is used in the patterns above + // is used by additional users, which prevents the pattern selection. + let AddedComplexity = 20 in { + def : Pat<(v4f32 (X86VBroadcast FR32:$src)), + (VBROADCASTSSrr (COPY_TO_REGCLASS FR32:$src, VR128))>; + def : Pat<(v8f32 (X86VBroadcast FR32:$src)), + (VBROADCASTSSYrr (COPY_TO_REGCLASS FR32:$src, VR128))>; + def : Pat<(v4f64 (X86VBroadcast FR64:$src)), + (VBROADCASTSDYrr (COPY_TO_REGCLASS FR64:$src, VR128))>; + + def : Pat<(v4i32 (X86VBroadcast GR32:$src)), + (VBROADCASTSSrr (COPY_TO_REGCLASS GR32:$src, VR128))>; + def : Pat<(v8i32 (X86VBroadcast GR32:$src)), + (VBROADCASTSSYrr (COPY_TO_REGCLASS GR32:$src, VR128))>; + def : Pat<(v4i64 (X86VBroadcast GR64:$src)), + (VBROADCASTSDYrr (COPY_TO_REGCLASS GR64:$src, VR128))>; + + def : Pat<(v16i8 (X86VBroadcast GR8:$src)), + (VPBROADCASTBrr (COPY_TO_REGCLASS + (i32 (SUBREG_TO_REG (i32 0), GR8:$src, sub_8bit)), + VR128))>; + def : Pat<(v32i8 (X86VBroadcast GR8:$src)), + (VPBROADCASTBYrr (COPY_TO_REGCLASS + (i32 (SUBREG_TO_REG (i32 0), GR8:$src, sub_8bit)), + VR128))>; + + def : Pat<(v8i16 (X86VBroadcast GR16:$src)), + (VPBROADCASTWrr (COPY_TO_REGCLASS + (i32 (SUBREG_TO_REG (i32 0), GR16:$src, sub_16bit)), + VR128))>; + def : Pat<(v16i16 (X86VBroadcast GR16:$src)), + (VPBROADCASTWYrr (COPY_TO_REGCLASS + (i32 (SUBREG_TO_REG (i32 0), GR16:$src, sub_16bit)), + VR128))>; + + // The patterns for VPBROADCASTD are not needed because they would match + // the exact same thing as VBROADCASTSS patterns. + + def : Pat<(v2i64 (X86VBroadcast GR64:$src)), + (VPBROADCASTQrr (COPY_TO_REGCLASS GR64:$src, VR128))>; + // The v4i64 pattern is not needed because VBROADCASTSDYrr already match. 
+ } +} + +// AVX1 broadcast patterns +let Predicates = [HasAVX1Only] in { +def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))), + (VBROADCASTSSYrm addr:$src)>; +def : Pat<(v4i64 (X86VBroadcast (loadi64 addr:$src))), + (VBROADCASTSDYrm addr:$src)>; +def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))), + (VBROADCASTSSrm addr:$src)>; +} + +let Predicates = [HasAVX] in { + // Provide fallback in case the load node that is used in the patterns above + // is used by additional users, which prevents the pattern selection. + let AddedComplexity = 20 in { + // 128bit broadcasts: + def : Pat<(v4f32 (X86VBroadcast FR32:$src)), + (VPSHUFDri (COPY_TO_REGCLASS FR32:$src, VR128), 0)>; + def : Pat<(v8f32 (X86VBroadcast FR32:$src)), + (VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), + (VPSHUFDri (COPY_TO_REGCLASS FR32:$src, VR128), 0), sub_xmm), + (VPSHUFDri (COPY_TO_REGCLASS FR32:$src, VR128), 0), 1)>; + def : Pat<(v4f64 (X86VBroadcast FR64:$src)), + (VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), + (VPSHUFDri (COPY_TO_REGCLASS FR64:$src, VR128), 0x44), sub_xmm), + (VPSHUFDri (COPY_TO_REGCLASS FR64:$src, VR128), 0x44), 1)>; + + def : Pat<(v4i32 (X86VBroadcast GR32:$src)), + (VPSHUFDri (COPY_TO_REGCLASS GR32:$src, VR128), 0)>; + def : Pat<(v8i32 (X86VBroadcast GR32:$src)), + (VINSERTF128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), + (VPSHUFDri (COPY_TO_REGCLASS GR32:$src, VR128), 0), sub_xmm), + (VPSHUFDri (COPY_TO_REGCLASS GR32:$src, VR128), 0), 1)>; + def : Pat<(v4i64 (X86VBroadcast GR64:$src)), + (VINSERTF128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), + (VPSHUFDri (COPY_TO_REGCLASS GR64:$src, VR128), 0x44), sub_xmm), + (VPSHUFDri (COPY_TO_REGCLASS GR64:$src, VR128), 0x44), 1)>; + } + + def : Pat<(v2f64 (X86VBroadcast f64:$src)), + (VMOVDDUPrr (COPY_TO_REGCLASS FR64:$src, VR128))>; + def : Pat<(v2i64 (X86VBroadcast i64:$src)), + (VMOVDDUPrr (COPY_TO_REGCLASS GR64:$src, VR128))>; +} + +//===----------------------------------------------------------------------===// +// VPERM - Permute instructions +// + +multiclass avx2_perm<bits<8> opc, string OpcodeStr, PatFrag mem_frag, + ValueType OpVT, X86FoldableSchedWrite Sched> { + def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), + (ins VR256:$src1, VR256:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR256:$dst, + (OpVT (X86VPermv VR256:$src1, VR256:$src2)))]>, + Sched<[Sched]>, VEX_4V, VEX_L; + def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), + (ins VR256:$src1, i256mem:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR256:$dst, + (OpVT (X86VPermv VR256:$src1, + (bitconvert (mem_frag addr:$src2)))))]>, + Sched<[Sched.Folded, ReadAfterLd]>, VEX_4V, VEX_L; +} + +defm VPERMD : avx2_perm<0x36, "vpermd", loadv4i64, v8i32, WriteShuffle256>; +let ExeDomain = SSEPackedSingle in +defm VPERMPS : avx2_perm<0x16, "vpermps", loadv8f32, v8f32, WriteFShuffle256>; + +multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag, + ValueType OpVT, X86FoldableSchedWrite Sched> { + def Yri : AVX2AIi8<opc, MRMSrcReg, (outs VR256:$dst), + (ins VR256:$src1, u8imm:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR256:$dst, + (OpVT (X86VPermi VR256:$src1, (i8 imm:$src2))))]>, + Sched<[Sched]>, VEX, VEX_L; + def Ymi : AVX2AIi8<opc, MRMSrcMem, (outs VR256:$dst), + (ins i256mem:$src1, u8imm:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR256:$dst, + (OpVT (X86VPermi (mem_frag addr:$src1), + (i8 
imm:$src2))))]>, + Sched<[Sched.Folded, ReadAfterLd]>, VEX, VEX_L; +} + +defm VPERMQ : avx2_perm_imm<0x00, "vpermq", loadv4i64, v4i64, + WriteShuffle256>, VEX_W; +let ExeDomain = SSEPackedDouble in +defm VPERMPD : avx2_perm_imm<0x01, "vpermpd", loadv4f64, v4f64, + WriteFShuffle256>, VEX_W; + +//===----------------------------------------------------------------------===// +// VPERM2I128 - Permute Floating-Point Values in 128-bit chunks +// +def VPERM2I128rr : AVX2AIi8<0x46, MRMSrcReg, (outs VR256:$dst), + (ins VR256:$src1, VR256:$src2, u8imm:$src3), + "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + [(set VR256:$dst, (v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2, + (i8 imm:$src3))))]>, Sched<[WriteShuffle256]>, + VEX_4V, VEX_L; +def VPERM2I128rm : AVX2AIi8<0x46, MRMSrcMem, (outs VR256:$dst), + (ins VR256:$src1, f256mem:$src2, u8imm:$src3), + "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv4i64 addr:$src2), + (i8 imm:$src3)))]>, + Sched<[WriteShuffle256Ld, ReadAfterLd]>, VEX_4V, VEX_L; + +let Predicates = [HasAVX2] in { +def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), + (VPERM2I128rr VR256:$src1, VR256:$src2, imm:$imm)>; +def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), + (VPERM2I128rr VR256:$src1, VR256:$src2, imm:$imm)>; +def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), + (VPERM2I128rr VR256:$src1, VR256:$src2, imm:$imm)>; + +def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, (bc_v32i8 (loadv4i64 addr:$src2)), + (i8 imm:$imm))), + (VPERM2I128rm VR256:$src1, addr:$src2, imm:$imm)>; +def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1, + (bc_v16i16 (loadv4i64 addr:$src2)), (i8 imm:$imm))), + (VPERM2I128rm VR256:$src1, addr:$src2, imm:$imm)>; +def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)), + (i8 imm:$imm))), + (VPERM2I128rm VR256:$src1, addr:$src2, imm:$imm)>; +} + + +//===----------------------------------------------------------------------===// +// VINSERTI128 - Insert packed integer values +// +let hasSideEffects = 0 in { +def VINSERTI128rr : AVX2AIi8<0x38, MRMSrcReg, (outs VR256:$dst), + (ins VR256:$src1, VR128:$src2, u8imm:$src3), + "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + []>, Sched<[WriteShuffle256]>, VEX_4V, VEX_L; +let mayLoad = 1 in +def VINSERTI128rm : AVX2AIi8<0x38, MRMSrcMem, (outs VR256:$dst), + (ins VR256:$src1, i128mem:$src2, u8imm:$src3), + "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + []>, Sched<[WriteShuffle256Ld, ReadAfterLd]>, VEX_4V, VEX_L; +} + +let Predicates = [HasAVX2, NoVLX] in { +def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2), + (iPTR imm)), + (VINSERTI128rr VR256:$src1, VR128:$src2, + (INSERT_get_vinsert128_imm VR256:$ins))>; +def : Pat<(vinsert128_insert:$ins (v8i32 VR256:$src1), (v4i32 VR128:$src2), + (iPTR imm)), + (VINSERTI128rr VR256:$src1, VR128:$src2, + (INSERT_get_vinsert128_imm VR256:$ins))>; +def : Pat<(vinsert128_insert:$ins (v32i8 VR256:$src1), (v16i8 VR128:$src2), + (iPTR imm)), + (VINSERTI128rr VR256:$src1, VR128:$src2, + (INSERT_get_vinsert128_imm VR256:$ins))>; +def : Pat<(vinsert128_insert:$ins (v16i16 VR256:$src1), (v8i16 VR128:$src2), + (iPTR imm)), + (VINSERTI128rr VR256:$src1, VR128:$src2, + (INSERT_get_vinsert128_imm VR256:$ins))>; + +def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (loadv2i64 addr:$src2), + (iPTR imm)), + (VINSERTI128rm VR256:$src1, 
addr:$src2, + (INSERT_get_vinsert128_imm VR256:$ins))>; +def : Pat<(vinsert128_insert:$ins (v8i32 VR256:$src1), + (bc_v4i32 (loadv2i64 addr:$src2)), + (iPTR imm)), + (VINSERTI128rm VR256:$src1, addr:$src2, + (INSERT_get_vinsert128_imm VR256:$ins))>; +def : Pat<(vinsert128_insert:$ins (v32i8 VR256:$src1), + (bc_v16i8 (loadv2i64 addr:$src2)), + (iPTR imm)), + (VINSERTI128rm VR256:$src1, addr:$src2, + (INSERT_get_vinsert128_imm VR256:$ins))>; +def : Pat<(vinsert128_insert:$ins (v16i16 VR256:$src1), + (bc_v8i16 (loadv2i64 addr:$src2)), + (iPTR imm)), + (VINSERTI128rm VR256:$src1, addr:$src2, + (INSERT_get_vinsert128_imm VR256:$ins))>; +} + +//===----------------------------------------------------------------------===// +// VEXTRACTI128 - Extract packed integer values +// +def VEXTRACTI128rr : AVX2AIi8<0x39, MRMDestReg, (outs VR128:$dst), + (ins VR256:$src1, u8imm:$src2), + "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, + Sched<[WriteShuffle256]>, VEX, VEX_L; +let hasSideEffects = 0, mayStore = 1 in +def VEXTRACTI128mr : AVX2AIi8<0x39, MRMDestMem, (outs), + (ins i128mem:$dst, VR256:$src1, u8imm:$src2), + "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, + Sched<[WriteStore]>, VEX, VEX_L; + +let Predicates = [HasAVX2] in { +def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)), + (v2i64 (VEXTRACTI128rr + (v4i64 VR256:$src1), + (EXTRACT_get_vextract128_imm VR128:$ext)))>; +def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)), + (v4i32 (VEXTRACTI128rr + (v8i32 VR256:$src1), + (EXTRACT_get_vextract128_imm VR128:$ext)))>; +def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)), + (v8i16 (VEXTRACTI128rr + (v16i16 VR256:$src1), + (EXTRACT_get_vextract128_imm VR128:$ext)))>; +def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)), + (v16i8 (VEXTRACTI128rr + (v32i8 VR256:$src1), + (EXTRACT_get_vextract128_imm VR128:$ext)))>; + +def : Pat<(store (v2i64 (vextract128_extract:$ext (v4i64 VR256:$src1), + (iPTR imm))), addr:$dst), + (VEXTRACTI128mr addr:$dst, VR256:$src1, + (EXTRACT_get_vextract128_imm VR128:$ext))>; +def : Pat<(store (v4i32 (vextract128_extract:$ext (v8i32 VR256:$src1), + (iPTR imm))), addr:$dst), + (VEXTRACTI128mr addr:$dst, VR256:$src1, + (EXTRACT_get_vextract128_imm VR128:$ext))>; +def : Pat<(store (v8i16 (vextract128_extract:$ext (v16i16 VR256:$src1), + (iPTR imm))), addr:$dst), + (VEXTRACTI128mr addr:$dst, VR256:$src1, + (EXTRACT_get_vextract128_imm VR128:$ext))>; +def : Pat<(store (v16i8 (vextract128_extract:$ext (v32i8 VR256:$src1), + (iPTR imm))), addr:$dst), + (VEXTRACTI128mr addr:$dst, VR256:$src1, + (EXTRACT_get_vextract128_imm VR128:$ext))>; +} + +//===----------------------------------------------------------------------===// +// VPMASKMOV - Conditional SIMD Integer Packed Loads and Stores +// +multiclass avx2_pmovmask<string OpcodeStr, + Intrinsic IntLd128, Intrinsic IntLd256, + Intrinsic IntSt128, Intrinsic IntSt256> { + def rm : AVX28I<0x8c, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i128mem:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR128:$dst, (IntLd128 addr:$src2, VR128:$src1))]>, VEX_4V; + def Yrm : AVX28I<0x8c, MRMSrcMem, (outs VR256:$dst), + (ins VR256:$src1, i256mem:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>, + VEX_4V, VEX_L; + def mr : AVX28I<0x8e, MRMDestMem, (outs), + (ins i128mem:$dst, VR128:$src1, VR128:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, 
$src2}"), + [(IntSt128 addr:$dst, VR128:$src1, VR128:$src2)]>, VEX_4V; + def Ymr : AVX28I<0x8e, MRMDestMem, (outs), + (ins i256mem:$dst, VR256:$src1, VR256:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>, VEX_4V, VEX_L; +} + +defm VPMASKMOVD : avx2_pmovmask<"vpmaskmovd", + int_x86_avx2_maskload_d, + int_x86_avx2_maskload_d_256, + int_x86_avx2_maskstore_d, + int_x86_avx2_maskstore_d_256>; +defm VPMASKMOVQ : avx2_pmovmask<"vpmaskmovq", + int_x86_avx2_maskload_q, + int_x86_avx2_maskload_q_256, + int_x86_avx2_maskstore_q, + int_x86_avx2_maskstore_q_256>, VEX_W; + +def: Pat<(X86mstore addr:$ptr, (v8i32 VR256:$mask), (v8f32 VR256:$src)), + (VMASKMOVPSYmr addr:$ptr, VR256:$mask, VR256:$src)>; + +def: Pat<(X86mstore addr:$ptr, (v8i32 VR256:$mask), (v8i32 VR256:$src)), + (VPMASKMOVDYmr addr:$ptr, VR256:$mask, VR256:$src)>; + +def: Pat<(X86mstore addr:$ptr, (v4i32 VR128:$mask), (v4f32 VR128:$src)), + (VMASKMOVPSmr addr:$ptr, VR128:$mask, VR128:$src)>; + +def: Pat<(X86mstore addr:$ptr, (v4i32 VR128:$mask), (v4i32 VR128:$src)), + (VPMASKMOVDmr addr:$ptr, VR128:$mask, VR128:$src)>; + +def: Pat<(v8f32 (masked_load addr:$ptr, (v8i32 VR256:$mask), undef)), + (VMASKMOVPSYrm VR256:$mask, addr:$ptr)>; + +def: Pat<(v8f32 (masked_load addr:$ptr, (v8i32 VR256:$mask), + (bc_v8f32 (v8i32 immAllZerosV)))), + (VMASKMOVPSYrm VR256:$mask, addr:$ptr)>; + +def: Pat<(v8f32 (masked_load addr:$ptr, (v8i32 VR256:$mask), (v8f32 VR256:$src0))), + (VBLENDVPSYrr VR256:$src0, (VMASKMOVPSYrm VR256:$mask, addr:$ptr), + VR256:$mask)>; + +def: Pat<(v8i32 (masked_load addr:$ptr, (v8i32 VR256:$mask), undef)), + (VPMASKMOVDYrm VR256:$mask, addr:$ptr)>; + +def: Pat<(v8i32 (masked_load addr:$ptr, (v8i32 VR256:$mask), (v8i32 immAllZerosV))), + (VPMASKMOVDYrm VR256:$mask, addr:$ptr)>; + +def: Pat<(v8i32 (masked_load addr:$ptr, (v8i32 VR256:$mask), (v8i32 VR256:$src0))), + (VBLENDVPSYrr VR256:$src0, (VPMASKMOVDYrm VR256:$mask, addr:$ptr), + VR256:$mask)>; + +def: Pat<(v4f32 (masked_load addr:$ptr, (v4i32 VR128:$mask), undef)), + (VMASKMOVPSrm VR128:$mask, addr:$ptr)>; + +def: Pat<(v4f32 (masked_load addr:$ptr, (v4i32 VR128:$mask), + (bc_v4f32 (v4i32 immAllZerosV)))), + (VMASKMOVPSrm VR128:$mask, addr:$ptr)>; + +def: Pat<(v4f32 (masked_load addr:$ptr, (v4i32 VR128:$mask), (v4f32 VR128:$src0))), + (VBLENDVPSrr VR128:$src0, (VMASKMOVPSrm VR128:$mask, addr:$ptr), + VR128:$mask)>; + +def: Pat<(v4i32 (masked_load addr:$ptr, (v4i32 VR128:$mask), undef)), + (VPMASKMOVDrm VR128:$mask, addr:$ptr)>; + +def: Pat<(v4i32 (masked_load addr:$ptr, (v4i32 VR128:$mask), (v4i32 immAllZerosV))), + (VPMASKMOVDrm VR128:$mask, addr:$ptr)>; + +def: Pat<(v4i32 (masked_load addr:$ptr, (v4i32 VR128:$mask), (v4i32 VR128:$src0))), + (VBLENDVPSrr VR128:$src0, (VPMASKMOVDrm VR128:$mask, addr:$ptr), + VR128:$mask)>; + +def: Pat<(X86mstore addr:$ptr, (v4i64 VR256:$mask), (v4f64 VR256:$src)), + (VMASKMOVPDYmr addr:$ptr, VR256:$mask, VR256:$src)>; + +def: Pat<(X86mstore addr:$ptr, (v4i64 VR256:$mask), (v4i64 VR256:$src)), + (VPMASKMOVQYmr addr:$ptr, VR256:$mask, VR256:$src)>; + +def: Pat<(v4f64 (masked_load addr:$ptr, (v4i64 VR256:$mask), undef)), + (VMASKMOVPDYrm VR256:$mask, addr:$ptr)>; + +def: Pat<(v4f64 (masked_load addr:$ptr, (v4i64 VR256:$mask), + (v4f64 immAllZerosV))), + (VMASKMOVPDYrm VR256:$mask, addr:$ptr)>; + +def: Pat<(v4f64 (masked_load addr:$ptr, (v4i64 VR256:$mask), (v4f64 VR256:$src0))), + (VBLENDVPDYrr VR256:$src0, (VMASKMOVPDYrm VR256:$mask, addr:$ptr), + VR256:$mask)>; + 
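(Editorial note, hedged, not part of the patch: before the remaining 64-bit-element and 128-bit masked_load/masked_store patterns below, here is a source-level sketch of what these VMASKMOV/VPMASKMOV selections correspond to. The intrinsics are the documented AVX/AVX2 ones; the helper names and the prefix-masking scheme are ours.)

    #include <immintrin.h>

    // Load/store only the first n of 8 floats (0 <= n <= 8) with vmaskmovps.
    // Masked-out lanes of the load read as zero, which is why the
    // zero-passthru masked_load patterns above map straight to VMASKMOVPS,
    // while a non-zero passthru needs the extra VBLENDVPS blend.
    static inline __m256 load_prefix_ps(const float *p, int n) {
      const __m256i lane = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
      const __m256i mask = _mm256_cmpgt_epi32(_mm256_set1_epi32(n), lane); // AVX2
      return _mm256_maskload_ps(p, mask);
    }

    static inline void store_prefix_ps(float *p, __m256 v, int n) {
      const __m256i lane = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
      const __m256i mask = _mm256_cmpgt_epi32(_mm256_set1_epi32(n), lane);
      _mm256_maskstore_ps(p, mask, v); // vmaskmovps, store form
    }

The integer variants (_mm256_maskload_epi32 and friends) follow the same shape and are covered by the VPMASKMOVD/VPMASKMOVQ patterns in this block.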
+def: Pat<(v4i64 (masked_load addr:$ptr, (v4i64 VR256:$mask), undef)), + (VPMASKMOVQYrm VR256:$mask, addr:$ptr)>; + +def: Pat<(v4i64 (masked_load addr:$ptr, (v4i64 VR256:$mask), + (bc_v4i64 (v8i32 immAllZerosV)))), + (VPMASKMOVQYrm VR256:$mask, addr:$ptr)>; + +def: Pat<(v4i64 (masked_load addr:$ptr, (v4i64 VR256:$mask), (v4i64 VR256:$src0))), + (VBLENDVPDYrr VR256:$src0, (VPMASKMOVQYrm VR256:$mask, addr:$ptr), + VR256:$mask)>; + +def: Pat<(X86mstore addr:$ptr, (v2i64 VR128:$mask), (v2f64 VR128:$src)), + (VMASKMOVPDmr addr:$ptr, VR128:$mask, VR128:$src)>; + +def: Pat<(X86mstore addr:$ptr, (v2i64 VR128:$mask), (v2i64 VR128:$src)), + (VPMASKMOVQmr addr:$ptr, VR128:$mask, VR128:$src)>; + +def: Pat<(v2f64 (masked_load addr:$ptr, (v2i64 VR128:$mask), undef)), + (VMASKMOVPDrm VR128:$mask, addr:$ptr)>; + +def: Pat<(v2f64 (masked_load addr:$ptr, (v2i64 VR128:$mask), + (v2f64 immAllZerosV))), + (VMASKMOVPDrm VR128:$mask, addr:$ptr)>; + +def: Pat<(v2f64 (masked_load addr:$ptr, (v2i64 VR128:$mask), (v2f64 VR128:$src0))), + (VBLENDVPDrr VR128:$src0, (VMASKMOVPDrm VR128:$mask, addr:$ptr), + VR128:$mask)>; + +def: Pat<(v2i64 (masked_load addr:$ptr, (v2i64 VR128:$mask), undef)), + (VPMASKMOVQrm VR128:$mask, addr:$ptr)>; + +def: Pat<(v2i64 (masked_load addr:$ptr, (v2i64 VR128:$mask), + (bc_v2i64 (v4i32 immAllZerosV)))), + (VPMASKMOVQrm VR128:$mask, addr:$ptr)>; + +def: Pat<(v2i64 (masked_load addr:$ptr, (v2i64 VR128:$mask), (v2i64 VR128:$src0))), + (VBLENDVPDrr VR128:$src0, (VPMASKMOVQrm VR128:$mask, addr:$ptr), + VR128:$mask)>; + +//===----------------------------------------------------------------------===// +// Variable Bit Shifts +// +multiclass avx2_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode, + ValueType vt128, ValueType vt256> { + def rr : AVX28I<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR128:$dst, + (vt128 (OpNode VR128:$src1, (vt128 VR128:$src2))))]>, + VEX_4V, Sched<[WriteVarVecShift]>; + def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i128mem:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR128:$dst, + (vt128 (OpNode VR128:$src1, + (vt128 (bitconvert (loadv2i64 addr:$src2))))))]>, + VEX_4V, Sched<[WriteVarVecShiftLd, ReadAfterLd]>; + def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), + (ins VR256:$src1, VR256:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR256:$dst, + (vt256 (OpNode VR256:$src1, (vt256 VR256:$src2))))]>, + VEX_4V, VEX_L, Sched<[WriteVarVecShift]>; + def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), + (ins VR256:$src1, i256mem:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR256:$dst, + (vt256 (OpNode VR256:$src1, + (vt256 (bitconvert (loadv4i64 addr:$src2))))))]>, + VEX_4V, VEX_L, Sched<[WriteVarVecShiftLd, ReadAfterLd]>; +} + +let Predicates = [HasAVX2, NoVLX] in { + defm VPSLLVD : avx2_var_shift<0x47, "vpsllvd", shl, v4i32, v8i32>; + defm VPSLLVQ : avx2_var_shift<0x47, "vpsllvq", shl, v2i64, v4i64>, VEX_W; + defm VPSRLVD : avx2_var_shift<0x45, "vpsrlvd", srl, v4i32, v8i32>; + defm VPSRLVQ : avx2_var_shift<0x45, "vpsrlvq", srl, v2i64, v4i64>, VEX_W; + defm VPSRAVD : avx2_var_shift<0x46, "vpsravd", sra, v4i32, v8i32>; +} +//===----------------------------------------------------------------------===// +// VGATHER - GATHER Operations +multiclass avx2_gather<bits<8> opc, string OpcodeStr, RegisterClass RC256, + 
X86MemOperand memop128, X86MemOperand memop256> { + def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst, VR128:$mask_wb), + (ins VR128:$src1, memop128:$src2, VR128:$mask), + !strconcat(OpcodeStr, + "\t{$mask, $src2, $dst|$dst, $src2, $mask}"), + []>, VEX_4VOp3; + def Yrm : AVX28I<opc, MRMSrcMem, (outs RC256:$dst, RC256:$mask_wb), + (ins RC256:$src1, memop256:$src2, RC256:$mask), + !strconcat(OpcodeStr, + "\t{$mask, $src2, $dst|$dst, $src2, $mask}"), + []>, VEX_4VOp3, VEX_L; +} + +let mayLoad = 1, Constraints + = "@earlyclobber $dst,@earlyclobber $mask_wb, $src1 = $dst, $mask = $mask_wb" + in { + defm VPGATHERDQ : avx2_gather<0x90, "vpgatherdq", VR256, vx64mem, vx64mem>, VEX_W; + defm VPGATHERQQ : avx2_gather<0x91, "vpgatherqq", VR256, vx64mem, vy64mem>, VEX_W; + defm VPGATHERDD : avx2_gather<0x90, "vpgatherdd", VR256, vx32mem, vy32mem>; + defm VPGATHERQD : avx2_gather<0x91, "vpgatherqd", VR128, vx32mem, vy32mem>; + + let ExeDomain = SSEPackedDouble in { + defm VGATHERDPD : avx2_gather<0x92, "vgatherdpd", VR256, vx64mem, vx64mem>, VEX_W; + defm VGATHERQPD : avx2_gather<0x93, "vgatherqpd", VR256, vx64mem, vy64mem>, VEX_W; + } + + let ExeDomain = SSEPackedSingle in { + defm VGATHERDPS : avx2_gather<0x92, "vgatherdps", VR256, vx32mem, vy32mem>; + defm VGATHERQPS : avx2_gather<0x93, "vgatherqps", VR128, vx32mem, vy32mem>; + } +} + +//===----------------------------------------------------------------------===// +// Extra selection patterns for FR128, f128, f128mem + +// movaps is shorter than movdqa. movaps is in SSE and movdqa is in SSE2. +def : Pat<(store (f128 FR128:$src), addr:$dst), + (MOVAPSmr addr:$dst, (COPY_TO_REGCLASS (f128 FR128:$src), VR128))>; + +def : Pat<(loadf128 addr:$src), + (COPY_TO_REGCLASS (MOVAPSrm addr:$src), FR128)>; + +// andps is shorter than andpd or pand. 
andps is SSE and andpd/pand are in SSE2 +def : Pat<(X86fand FR128:$src1, (loadf128 addr:$src2)), + (COPY_TO_REGCLASS + (ANDPSrm (COPY_TO_REGCLASS FR128:$src1, VR128), f128mem:$src2), + FR128)>; + +def : Pat<(X86fand FR128:$src1, FR128:$src2), + (COPY_TO_REGCLASS + (ANDPSrr (COPY_TO_REGCLASS FR128:$src1, VR128), + (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>; + +def : Pat<(and FR128:$src1, FR128:$src2), + (COPY_TO_REGCLASS + (ANDPSrr (COPY_TO_REGCLASS FR128:$src1, VR128), + (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>; + +def : Pat<(X86for FR128:$src1, (loadf128 addr:$src2)), + (COPY_TO_REGCLASS + (ORPSrm (COPY_TO_REGCLASS FR128:$src1, VR128), f128mem:$src2), + FR128)>; + +def : Pat<(X86for FR128:$src1, FR128:$src2), + (COPY_TO_REGCLASS + (ORPSrr (COPY_TO_REGCLASS FR128:$src1, VR128), + (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>; + +def : Pat<(or FR128:$src1, FR128:$src2), + (COPY_TO_REGCLASS + (ORPSrr (COPY_TO_REGCLASS FR128:$src1, VR128), + (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>; + +def : Pat<(X86fxor FR128:$src1, (loadf128 addr:$src2)), + (COPY_TO_REGCLASS + (XORPSrm (COPY_TO_REGCLASS FR128:$src1, VR128), f128mem:$src2), + FR128)>; + +def : Pat<(X86fxor FR128:$src1, FR128:$src2), + (COPY_TO_REGCLASS + (XORPSrr (COPY_TO_REGCLASS FR128:$src1, VR128), + (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>; + +def : Pat<(xor FR128:$src1, FR128:$src2), + (COPY_TO_REGCLASS + (XORPSrr (COPY_TO_REGCLASS FR128:$src1, VR128), + (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>; diff --git a/contrib/llvm/lib/Target/X86/X86InstrSVM.td b/contrib/llvm/lib/Target/X86/X86InstrSVM.td new file mode 100644 index 0000000..c847be7e --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86InstrSVM.td @@ -0,0 +1,62 @@ +//===-- X86InstrSVM.td - SVM Instruction Set Extension -----*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the instructions that make up the AMD SVM instruction +// set. 
+// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// SVM instructions + +// 0F 01 D9 +def VMMCALL : I<0x01, MRM_D9, (outs), (ins), "vmmcall", []>, TB; + +// 0F 01 DC +def STGI : I<0x01, MRM_DC, (outs), (ins), "stgi", []>, TB; + +// 0F 01 DD +def CLGI : I<0x01, MRM_DD, (outs), (ins), "clgi", []>, TB; + +// 0F 01 DE +let Uses = [EAX] in +def SKINIT : I<0x01, MRM_DE, (outs), (ins), "skinit\t{%eax|eax}", []>, TB; + +// 0F 01 D8 +let Uses = [EAX] in +def VMRUN32 : I<0x01, MRM_D8, (outs), (ins), + "vmrun\t{%eax|eax}", []>, TB, Requires<[Not64BitMode]>; +let Uses = [RAX] in +def VMRUN64 : I<0x01, MRM_D8, (outs), (ins), + "vmrun\t{%rax|rax}", []>, TB, Requires<[In64BitMode]>; + +// 0F 01 DA +let Uses = [EAX] in +def VMLOAD32 : I<0x01, MRM_DA, (outs), (ins), + "vmload\t{%eax|eax}", []>, TB, Requires<[Not64BitMode]>; +let Uses = [RAX] in +def VMLOAD64 : I<0x01, MRM_DA, (outs), (ins), + "vmload\t{%rax|rax}", []>, TB, Requires<[In64BitMode]>; + +// 0F 01 DB +let Uses = [EAX] in +def VMSAVE32 : I<0x01, MRM_DB, (outs), (ins), + "vmsave\t{%eax|eax}", []>, TB, Requires<[Not64BitMode]>; +let Uses = [RAX] in +def VMSAVE64 : I<0x01, MRM_DB, (outs), (ins), + "vmsave\t{%rax|rax}", []>, TB, Requires<[In64BitMode]>; + +// 0F 01 DF +let Uses = [EAX, ECX] in +def INVLPGA32 : I<0x01, MRM_DF, (outs), (ins), + "invlpga\t{%ecx, %eax|eax, ecx}", []>, TB, Requires<[Not64BitMode]>; +let Uses = [RAX, ECX] in +def INVLPGA64 : I<0x01, MRM_DF, (outs), (ins), + "invlpga\t{%ecx, %rax|rax, ecx}", []>, TB, Requires<[In64BitMode]>; + diff --git a/contrib/llvm/lib/Target/X86/X86InstrShiftRotate.td b/contrib/llvm/lib/Target/X86/X86InstrShiftRotate.td new file mode 100644 index 0000000..c1df978 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86InstrShiftRotate.td @@ -0,0 +1,969 @@ +//===-- X86InstrShiftRotate.td - Shift and Rotate Instrs ---*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the shift and rotate instructions. +// +//===----------------------------------------------------------------------===// + +// FIXME: Someone needs to smear multipattern goodness all over this file. + +let Defs = [EFLAGS] in { + +let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in { +let Uses = [CL] in { +def SHL8rCL : I<0xD2, MRM4r, (outs GR8 :$dst), (ins GR8 :$src1), + "shl{b}\t{%cl, $dst|$dst, cl}", + [(set GR8:$dst, (shl GR8:$src1, CL))], IIC_SR>; +def SHL16rCL : I<0xD3, MRM4r, (outs GR16:$dst), (ins GR16:$src1), + "shl{w}\t{%cl, $dst|$dst, cl}", + [(set GR16:$dst, (shl GR16:$src1, CL))], IIC_SR>, OpSize16; +def SHL32rCL : I<0xD3, MRM4r, (outs GR32:$dst), (ins GR32:$src1), + "shl{l}\t{%cl, $dst|$dst, cl}", + [(set GR32:$dst, (shl GR32:$src1, CL))], IIC_SR>, OpSize32; +def SHL64rCL : RI<0xD3, MRM4r, (outs GR64:$dst), (ins GR64:$src1), + "shl{q}\t{%cl, $dst|$dst, cl}", + [(set GR64:$dst, (shl GR64:$src1, CL))], IIC_SR>; +} // Uses = [CL] + +def SHL8ri : Ii8<0xC0, MRM4r, (outs GR8 :$dst), (ins GR8 :$src1, u8imm:$src2), + "shl{b}\t{$src2, $dst|$dst, $src2}", + [(set GR8:$dst, (shl GR8:$src1, (i8 imm:$src2)))], IIC_SR>; + +let isConvertibleToThreeAddress = 1 in { // Can transform into LEA. 
+def SHL16ri : Ii8<0xC1, MRM4r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$src2), + "shl{w}\t{$src2, $dst|$dst, $src2}", + [(set GR16:$dst, (shl GR16:$src1, (i8 imm:$src2)))], IIC_SR>, + OpSize16; +def SHL32ri : Ii8<0xC1, MRM4r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$src2), + "shl{l}\t{$src2, $dst|$dst, $src2}", + [(set GR32:$dst, (shl GR32:$src1, (i8 imm:$src2)))], IIC_SR>, + OpSize32; +def SHL64ri : RIi8<0xC1, MRM4r, (outs GR64:$dst), + (ins GR64:$src1, u8imm:$src2), + "shl{q}\t{$src2, $dst|$dst, $src2}", + [(set GR64:$dst, (shl GR64:$src1, (i8 imm:$src2)))], + IIC_SR>; +} // isConvertibleToThreeAddress = 1 + +// NOTE: We don't include patterns for shifts of a register by one, because +// 'add reg,reg' is cheaper (and we have a Pat pattern for shift-by-one). +let hasSideEffects = 0 in { +def SHL8r1 : I<0xD0, MRM4r, (outs GR8:$dst), (ins GR8:$src1), + "shl{b}\t$dst", [], IIC_SR>; +def SHL16r1 : I<0xD1, MRM4r, (outs GR16:$dst), (ins GR16:$src1), + "shl{w}\t$dst", [], IIC_SR>, OpSize16; +def SHL32r1 : I<0xD1, MRM4r, (outs GR32:$dst), (ins GR32:$src1), + "shl{l}\t$dst", [], IIC_SR>, OpSize32; +def SHL64r1 : RI<0xD1, MRM4r, (outs GR64:$dst), (ins GR64:$src1), + "shl{q}\t$dst", [], IIC_SR>; +} // hasSideEffects = 0 +} // Constraints = "$src = $dst", SchedRW + + +let SchedRW = [WriteShiftLd, WriteRMW] in { +// FIXME: Why do we need an explicit "Uses = [CL]" when the instr has a pattern +// using CL? +let Uses = [CL] in { +def SHL8mCL : I<0xD2, MRM4m, (outs), (ins i8mem :$dst), + "shl{b}\t{%cl, $dst|$dst, cl}", + [(store (shl (loadi8 addr:$dst), CL), addr:$dst)], IIC_SR>; +def SHL16mCL : I<0xD3, MRM4m, (outs), (ins i16mem:$dst), + "shl{w}\t{%cl, $dst|$dst, cl}", + [(store (shl (loadi16 addr:$dst), CL), addr:$dst)], IIC_SR>, + OpSize16; +def SHL32mCL : I<0xD3, MRM4m, (outs), (ins i32mem:$dst), + "shl{l}\t{%cl, $dst|$dst, cl}", + [(store (shl (loadi32 addr:$dst), CL), addr:$dst)], IIC_SR>, + OpSize32; +def SHL64mCL : RI<0xD3, MRM4m, (outs), (ins i64mem:$dst), + "shl{q}\t{%cl, $dst|$dst, cl}", + [(store (shl (loadi64 addr:$dst), CL), addr:$dst)], IIC_SR>; +} +def SHL8mi : Ii8<0xC0, MRM4m, (outs), (ins i8mem :$dst, u8imm:$src), + "shl{b}\t{$src, $dst|$dst, $src}", + [(store (shl (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)], + IIC_SR>; +def SHL16mi : Ii8<0xC1, MRM4m, (outs), (ins i16mem:$dst, u8imm:$src), + "shl{w}\t{$src, $dst|$dst, $src}", + [(store (shl (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)], + IIC_SR>, OpSize16; +def SHL32mi : Ii8<0xC1, MRM4m, (outs), (ins i32mem:$dst, u8imm:$src), + "shl{l}\t{$src, $dst|$dst, $src}", + [(store (shl (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)], + IIC_SR>, OpSize32; +def SHL64mi : RIi8<0xC1, MRM4m, (outs), (ins i64mem:$dst, u8imm:$src), + "shl{q}\t{$src, $dst|$dst, $src}", + [(store (shl (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)], + IIC_SR>; + +// Shift by 1 +def SHL8m1 : I<0xD0, MRM4m, (outs), (ins i8mem :$dst), + "shl{b}\t$dst", + [(store (shl (loadi8 addr:$dst), (i8 1)), addr:$dst)], + IIC_SR>; +def SHL16m1 : I<0xD1, MRM4m, (outs), (ins i16mem:$dst), + "shl{w}\t$dst", + [(store (shl (loadi16 addr:$dst), (i8 1)), addr:$dst)], + IIC_SR>, OpSize16; +def SHL32m1 : I<0xD1, MRM4m, (outs), (ins i32mem:$dst), + "shl{l}\t$dst", + [(store (shl (loadi32 addr:$dst), (i8 1)), addr:$dst)], + IIC_SR>, OpSize32; +def SHL64m1 : RI<0xD1, MRM4m, (outs), (ins i64mem:$dst), + "shl{q}\t$dst", + [(store (shl (loadi64 addr:$dst), (i8 1)), addr:$dst)], + IIC_SR>; +} // SchedRW + +let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in { +let Uses = [CL] in 
{ +def SHR8rCL : I<0xD2, MRM5r, (outs GR8 :$dst), (ins GR8 :$src1), + "shr{b}\t{%cl, $dst|$dst, cl}", + [(set GR8:$dst, (srl GR8:$src1, CL))], IIC_SR>; +def SHR16rCL : I<0xD3, MRM5r, (outs GR16:$dst), (ins GR16:$src1), + "shr{w}\t{%cl, $dst|$dst, cl}", + [(set GR16:$dst, (srl GR16:$src1, CL))], IIC_SR>, OpSize16; +def SHR32rCL : I<0xD3, MRM5r, (outs GR32:$dst), (ins GR32:$src1), + "shr{l}\t{%cl, $dst|$dst, cl}", + [(set GR32:$dst, (srl GR32:$src1, CL))], IIC_SR>, OpSize32; +def SHR64rCL : RI<0xD3, MRM5r, (outs GR64:$dst), (ins GR64:$src1), + "shr{q}\t{%cl, $dst|$dst, cl}", + [(set GR64:$dst, (srl GR64:$src1, CL))], IIC_SR>; +} + +def SHR8ri : Ii8<0xC0, MRM5r, (outs GR8:$dst), (ins GR8:$src1, u8imm:$src2), + "shr{b}\t{$src2, $dst|$dst, $src2}", + [(set GR8:$dst, (srl GR8:$src1, (i8 imm:$src2)))], IIC_SR>; +def SHR16ri : Ii8<0xC1, MRM5r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$src2), + "shr{w}\t{$src2, $dst|$dst, $src2}", + [(set GR16:$dst, (srl GR16:$src1, (i8 imm:$src2)))], + IIC_SR>, OpSize16; +def SHR32ri : Ii8<0xC1, MRM5r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$src2), + "shr{l}\t{$src2, $dst|$dst, $src2}", + [(set GR32:$dst, (srl GR32:$src1, (i8 imm:$src2)))], + IIC_SR>, OpSize32; +def SHR64ri : RIi8<0xC1, MRM5r, (outs GR64:$dst), (ins GR64:$src1, u8imm:$src2), + "shr{q}\t{$src2, $dst|$dst, $src2}", + [(set GR64:$dst, (srl GR64:$src1, (i8 imm:$src2)))], IIC_SR>; + +// Shift right by 1 +def SHR8r1 : I<0xD0, MRM5r, (outs GR8:$dst), (ins GR8:$src1), + "shr{b}\t$dst", + [(set GR8:$dst, (srl GR8:$src1, (i8 1)))], IIC_SR>; +def SHR16r1 : I<0xD1, MRM5r, (outs GR16:$dst), (ins GR16:$src1), + "shr{w}\t$dst", + [(set GR16:$dst, (srl GR16:$src1, (i8 1)))], IIC_SR>, OpSize16; +def SHR32r1 : I<0xD1, MRM5r, (outs GR32:$dst), (ins GR32:$src1), + "shr{l}\t$dst", + [(set GR32:$dst, (srl GR32:$src1, (i8 1)))], IIC_SR>, OpSize32; +def SHR64r1 : RI<0xD1, MRM5r, (outs GR64:$dst), (ins GR64:$src1), + "shr{q}\t$dst", + [(set GR64:$dst, (srl GR64:$src1, (i8 1)))], IIC_SR>; +} // Constraints = "$src = $dst", SchedRW + + +let SchedRW = [WriteShiftLd, WriteRMW] in { +let Uses = [CL] in { +def SHR8mCL : I<0xD2, MRM5m, (outs), (ins i8mem :$dst), + "shr{b}\t{%cl, $dst|$dst, cl}", + [(store (srl (loadi8 addr:$dst), CL), addr:$dst)], IIC_SR>; +def SHR16mCL : I<0xD3, MRM5m, (outs), (ins i16mem:$dst), + "shr{w}\t{%cl, $dst|$dst, cl}", + [(store (srl (loadi16 addr:$dst), CL), addr:$dst)], IIC_SR>, + OpSize16; +def SHR32mCL : I<0xD3, MRM5m, (outs), (ins i32mem:$dst), + "shr{l}\t{%cl, $dst|$dst, cl}", + [(store (srl (loadi32 addr:$dst), CL), addr:$dst)], IIC_SR>, + OpSize32; +def SHR64mCL : RI<0xD3, MRM5m, (outs), (ins i64mem:$dst), + "shr{q}\t{%cl, $dst|$dst, cl}", + [(store (srl (loadi64 addr:$dst), CL), addr:$dst)], IIC_SR>; +} +def SHR8mi : Ii8<0xC0, MRM5m, (outs), (ins i8mem :$dst, u8imm:$src), + "shr{b}\t{$src, $dst|$dst, $src}", + [(store (srl (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)], + IIC_SR>; +def SHR16mi : Ii8<0xC1, MRM5m, (outs), (ins i16mem:$dst, u8imm:$src), + "shr{w}\t{$src, $dst|$dst, $src}", + [(store (srl (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)], + IIC_SR>, OpSize16; +def SHR32mi : Ii8<0xC1, MRM5m, (outs), (ins i32mem:$dst, u8imm:$src), + "shr{l}\t{$src, $dst|$dst, $src}", + [(store (srl (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)], + IIC_SR>, OpSize32; +def SHR64mi : RIi8<0xC1, MRM5m, (outs), (ins i64mem:$dst, u8imm:$src), + "shr{q}\t{$src, $dst|$dst, $src}", + [(store (srl (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)], + IIC_SR>; + +// Shift by 1 +def SHR8m1 : I<0xD0, MRM5m, 
(outs), (ins i8mem :$dst), + "shr{b}\t$dst", + [(store (srl (loadi8 addr:$dst), (i8 1)), addr:$dst)], + IIC_SR>; +def SHR16m1 : I<0xD1, MRM5m, (outs), (ins i16mem:$dst), + "shr{w}\t$dst", + [(store (srl (loadi16 addr:$dst), (i8 1)), addr:$dst)], + IIC_SR>, OpSize16; +def SHR32m1 : I<0xD1, MRM5m, (outs), (ins i32mem:$dst), + "shr{l}\t$dst", + [(store (srl (loadi32 addr:$dst), (i8 1)), addr:$dst)], + IIC_SR>, OpSize32; +def SHR64m1 : RI<0xD1, MRM5m, (outs), (ins i64mem:$dst), + "shr{q}\t$dst", + [(store (srl (loadi64 addr:$dst), (i8 1)), addr:$dst)], + IIC_SR>; +} // SchedRW + +let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in { +let Uses = [CL] in { +def SAR8rCL : I<0xD2, MRM7r, (outs GR8 :$dst), (ins GR8 :$src1), + "sar{b}\t{%cl, $dst|$dst, cl}", + [(set GR8:$dst, (sra GR8:$src1, CL))], + IIC_SR>; +def SAR16rCL : I<0xD3, MRM7r, (outs GR16:$dst), (ins GR16:$src1), + "sar{w}\t{%cl, $dst|$dst, cl}", + [(set GR16:$dst, (sra GR16:$src1, CL))], + IIC_SR>, OpSize16; +def SAR32rCL : I<0xD3, MRM7r, (outs GR32:$dst), (ins GR32:$src1), + "sar{l}\t{%cl, $dst|$dst, cl}", + [(set GR32:$dst, (sra GR32:$src1, CL))], + IIC_SR>, OpSize32; +def SAR64rCL : RI<0xD3, MRM7r, (outs GR64:$dst), (ins GR64:$src1), + "sar{q}\t{%cl, $dst|$dst, cl}", + [(set GR64:$dst, (sra GR64:$src1, CL))], + IIC_SR>; +} + +def SAR8ri : Ii8<0xC0, MRM7r, (outs GR8 :$dst), (ins GR8 :$src1, u8imm:$src2), + "sar{b}\t{$src2, $dst|$dst, $src2}", + [(set GR8:$dst, (sra GR8:$src1, (i8 imm:$src2)))], + IIC_SR>; +def SAR16ri : Ii8<0xC1, MRM7r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$src2), + "sar{w}\t{$src2, $dst|$dst, $src2}", + [(set GR16:$dst, (sra GR16:$src1, (i8 imm:$src2)))], + IIC_SR>, OpSize16; +def SAR32ri : Ii8<0xC1, MRM7r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$src2), + "sar{l}\t{$src2, $dst|$dst, $src2}", + [(set GR32:$dst, (sra GR32:$src1, (i8 imm:$src2)))], + IIC_SR>, OpSize32; +def SAR64ri : RIi8<0xC1, MRM7r, (outs GR64:$dst), + (ins GR64:$src1, u8imm:$src2), + "sar{q}\t{$src2, $dst|$dst, $src2}", + [(set GR64:$dst, (sra GR64:$src1, (i8 imm:$src2)))], + IIC_SR>; + +// Shift by 1 +def SAR8r1 : I<0xD0, MRM7r, (outs GR8 :$dst), (ins GR8 :$src1), + "sar{b}\t$dst", + [(set GR8:$dst, (sra GR8:$src1, (i8 1)))], + IIC_SR>; +def SAR16r1 : I<0xD1, MRM7r, (outs GR16:$dst), (ins GR16:$src1), + "sar{w}\t$dst", + [(set GR16:$dst, (sra GR16:$src1, (i8 1)))], + IIC_SR>, OpSize16; +def SAR32r1 : I<0xD1, MRM7r, (outs GR32:$dst), (ins GR32:$src1), + "sar{l}\t$dst", + [(set GR32:$dst, (sra GR32:$src1, (i8 1)))], + IIC_SR>, OpSize32; +def SAR64r1 : RI<0xD1, MRM7r, (outs GR64:$dst), (ins GR64:$src1), + "sar{q}\t$dst", + [(set GR64:$dst, (sra GR64:$src1, (i8 1)))], + IIC_SR>; +} // Constraints = "$src = $dst", SchedRW + + +let SchedRW = [WriteShiftLd, WriteRMW] in { +let Uses = [CL] in { +def SAR8mCL : I<0xD2, MRM7m, (outs), (ins i8mem :$dst), + "sar{b}\t{%cl, $dst|$dst, cl}", + [(store (sra (loadi8 addr:$dst), CL), addr:$dst)], + IIC_SR>; +def SAR16mCL : I<0xD3, MRM7m, (outs), (ins i16mem:$dst), + "sar{w}\t{%cl, $dst|$dst, cl}", + [(store (sra (loadi16 addr:$dst), CL), addr:$dst)], + IIC_SR>, OpSize16; +def SAR32mCL : I<0xD3, MRM7m, (outs), (ins i32mem:$dst), + "sar{l}\t{%cl, $dst|$dst, cl}", + [(store (sra (loadi32 addr:$dst), CL), addr:$dst)], + IIC_SR>, OpSize32; +def SAR64mCL : RI<0xD3, MRM7m, (outs), (ins i64mem:$dst), + "sar{q}\t{%cl, $dst|$dst, cl}", + [(store (sra (loadi64 addr:$dst), CL), addr:$dst)], + IIC_SR>; +} +def SAR8mi : Ii8<0xC0, MRM7m, (outs), (ins i8mem :$dst, u8imm:$src), + "sar{b}\t{$src, $dst|$dst, $src}", + 
[(store (sra (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)], + IIC_SR>; +def SAR16mi : Ii8<0xC1, MRM7m, (outs), (ins i16mem:$dst, u8imm:$src), + "sar{w}\t{$src, $dst|$dst, $src}", + [(store (sra (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)], + IIC_SR>, OpSize16; +def SAR32mi : Ii8<0xC1, MRM7m, (outs), (ins i32mem:$dst, u8imm:$src), + "sar{l}\t{$src, $dst|$dst, $src}", + [(store (sra (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)], + IIC_SR>, OpSize32; +def SAR64mi : RIi8<0xC1, MRM7m, (outs), (ins i64mem:$dst, u8imm:$src), + "sar{q}\t{$src, $dst|$dst, $src}", + [(store (sra (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)], + IIC_SR>; + +// Shift by 1 +def SAR8m1 : I<0xD0, MRM7m, (outs), (ins i8mem :$dst), + "sar{b}\t$dst", + [(store (sra (loadi8 addr:$dst), (i8 1)), addr:$dst)], + IIC_SR>; +def SAR16m1 : I<0xD1, MRM7m, (outs), (ins i16mem:$dst), + "sar{w}\t$dst", + [(store (sra (loadi16 addr:$dst), (i8 1)), addr:$dst)], + IIC_SR>, OpSize16; +def SAR32m1 : I<0xD1, MRM7m, (outs), (ins i32mem:$dst), + "sar{l}\t$dst", + [(store (sra (loadi32 addr:$dst), (i8 1)), addr:$dst)], + IIC_SR>, OpSize32; +def SAR64m1 : RI<0xD1, MRM7m, (outs), (ins i64mem:$dst), + "sar{q}\t$dst", + [(store (sra (loadi64 addr:$dst), (i8 1)), addr:$dst)], + IIC_SR>; +} // SchedRW + +//===----------------------------------------------------------------------===// +// Rotate instructions +//===----------------------------------------------------------------------===// + +let hasSideEffects = 0 in { +let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in { +def RCL8r1 : I<0xD0, MRM2r, (outs GR8:$dst), (ins GR8:$src1), + "rcl{b}\t$dst", [], IIC_SR>; +def RCL8ri : Ii8<0xC0, MRM2r, (outs GR8:$dst), (ins GR8:$src1, u8imm:$cnt), + "rcl{b}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>; +let Uses = [CL] in +def RCL8rCL : I<0xD2, MRM2r, (outs GR8:$dst), (ins GR8:$src1), + "rcl{b}\t{%cl, $dst|$dst, cl}", [], IIC_SR>; + +def RCL16r1 : I<0xD1, MRM2r, (outs GR16:$dst), (ins GR16:$src1), + "rcl{w}\t$dst", [], IIC_SR>, OpSize16; +def RCL16ri : Ii8<0xC1, MRM2r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$cnt), + "rcl{w}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize16; +let Uses = [CL] in +def RCL16rCL : I<0xD3, MRM2r, (outs GR16:$dst), (ins GR16:$src1), + "rcl{w}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize16; + +def RCL32r1 : I<0xD1, MRM2r, (outs GR32:$dst), (ins GR32:$src1), + "rcl{l}\t$dst", [], IIC_SR>, OpSize32; +def RCL32ri : Ii8<0xC1, MRM2r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$cnt), + "rcl{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize32; +let Uses = [CL] in +def RCL32rCL : I<0xD3, MRM2r, (outs GR32:$dst), (ins GR32:$src1), + "rcl{l}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize32; + + +def RCL64r1 : RI<0xD1, MRM2r, (outs GR64:$dst), (ins GR64:$src1), + "rcl{q}\t$dst", [], IIC_SR>; +def RCL64ri : RIi8<0xC1, MRM2r, (outs GR64:$dst), (ins GR64:$src1, u8imm:$cnt), + "rcl{q}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>; +let Uses = [CL] in +def RCL64rCL : RI<0xD3, MRM2r, (outs GR64:$dst), (ins GR64:$src1), + "rcl{q}\t{%cl, $dst|$dst, cl}", [], IIC_SR>; + + +def RCR8r1 : I<0xD0, MRM3r, (outs GR8:$dst), (ins GR8:$src1), + "rcr{b}\t$dst", [], IIC_SR>; +def RCR8ri : Ii8<0xC0, MRM3r, (outs GR8:$dst), (ins GR8:$src1, u8imm:$cnt), + "rcr{b}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>; +let Uses = [CL] in +def RCR8rCL : I<0xD2, MRM3r, (outs GR8:$dst), (ins GR8:$src1), + "rcr{b}\t{%cl, $dst|$dst, cl}", [], IIC_SR>; + +def RCR16r1 : I<0xD1, MRM3r, (outs GR16:$dst), (ins GR16:$src1), + "rcr{w}\t$dst", [], IIC_SR>, OpSize16; +def RCR16ri : Ii8<0xC1, 
MRM3r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$cnt), + "rcr{w}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize16; +let Uses = [CL] in +def RCR16rCL : I<0xD3, MRM3r, (outs GR16:$dst), (ins GR16:$src1), + "rcr{w}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize16; + +def RCR32r1 : I<0xD1, MRM3r, (outs GR32:$dst), (ins GR32:$src1), + "rcr{l}\t$dst", [], IIC_SR>, OpSize32; +def RCR32ri : Ii8<0xC1, MRM3r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$cnt), + "rcr{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize32; +let Uses = [CL] in +def RCR32rCL : I<0xD3, MRM3r, (outs GR32:$dst), (ins GR32:$src1), + "rcr{l}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize32; + +def RCR64r1 : RI<0xD1, MRM3r, (outs GR64:$dst), (ins GR64:$src1), + "rcr{q}\t$dst", [], IIC_SR>; +def RCR64ri : RIi8<0xC1, MRM3r, (outs GR64:$dst), (ins GR64:$src1, u8imm:$cnt), + "rcr{q}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>; +let Uses = [CL] in +def RCR64rCL : RI<0xD3, MRM3r, (outs GR64:$dst), (ins GR64:$src1), + "rcr{q}\t{%cl, $dst|$dst, cl}", [], IIC_SR>; + +} // Constraints = "$src = $dst" + +let SchedRW = [WriteShiftLd, WriteRMW] in { +def RCL8m1 : I<0xD0, MRM2m, (outs), (ins i8mem:$dst), + "rcl{b}\t$dst", [], IIC_SR>; +def RCL8mi : Ii8<0xC0, MRM2m, (outs), (ins i8mem:$dst, u8imm:$cnt), + "rcl{b}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>; +def RCL16m1 : I<0xD1, MRM2m, (outs), (ins i16mem:$dst), + "rcl{w}\t$dst", [], IIC_SR>, OpSize16; +def RCL16mi : Ii8<0xC1, MRM2m, (outs), (ins i16mem:$dst, u8imm:$cnt), + "rcl{w}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize16; +def RCL32m1 : I<0xD1, MRM2m, (outs), (ins i32mem:$dst), + "rcl{l}\t$dst", [], IIC_SR>, OpSize32; +def RCL32mi : Ii8<0xC1, MRM2m, (outs), (ins i32mem:$dst, u8imm:$cnt), + "rcl{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize32; +def RCL64m1 : RI<0xD1, MRM2m, (outs), (ins i64mem:$dst), + "rcl{q}\t$dst", [], IIC_SR>; +def RCL64mi : RIi8<0xC1, MRM2m, (outs), (ins i64mem:$dst, u8imm:$cnt), + "rcl{q}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>; + +def RCR8m1 : I<0xD0, MRM3m, (outs), (ins i8mem:$dst), + "rcr{b}\t$dst", [], IIC_SR>; +def RCR8mi : Ii8<0xC0, MRM3m, (outs), (ins i8mem:$dst, u8imm:$cnt), + "rcr{b}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>; +def RCR16m1 : I<0xD1, MRM3m, (outs), (ins i16mem:$dst), + "rcr{w}\t$dst", [], IIC_SR>, OpSize16; +def RCR16mi : Ii8<0xC1, MRM3m, (outs), (ins i16mem:$dst, u8imm:$cnt), + "rcr{w}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize16; +def RCR32m1 : I<0xD1, MRM3m, (outs), (ins i32mem:$dst), + "rcr{l}\t$dst", [], IIC_SR>, OpSize32; +def RCR32mi : Ii8<0xC1, MRM3m, (outs), (ins i32mem:$dst, u8imm:$cnt), + "rcr{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize32; +def RCR64m1 : RI<0xD1, MRM3m, (outs), (ins i64mem:$dst), + "rcr{q}\t$dst", [], IIC_SR>; +def RCR64mi : RIi8<0xC1, MRM3m, (outs), (ins i64mem:$dst, u8imm:$cnt), + "rcr{q}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>; + +let Uses = [CL] in { +def RCL8mCL : I<0xD2, MRM2m, (outs), (ins i8mem:$dst), + "rcl{b}\t{%cl, $dst|$dst, cl}", [], IIC_SR>; +def RCL16mCL : I<0xD3, MRM2m, (outs), (ins i16mem:$dst), + "rcl{w}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize16; +def RCL32mCL : I<0xD3, MRM2m, (outs), (ins i32mem:$dst), + "rcl{l}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize32; +def RCL64mCL : RI<0xD3, MRM2m, (outs), (ins i64mem:$dst), + "rcl{q}\t{%cl, $dst|$dst, cl}", [], IIC_SR>; + +def RCR8mCL : I<0xD2, MRM3m, (outs), (ins i8mem:$dst), + "rcr{b}\t{%cl, $dst|$dst, cl}", [], IIC_SR>; +def RCR16mCL : I<0xD3, MRM3m, (outs), (ins i16mem:$dst), + "rcr{w}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize16; +def RCR32mCL : 
I<0xD3, MRM3m, (outs), (ins i32mem:$dst), + "rcr{l}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize32; +def RCR64mCL : RI<0xD3, MRM3m, (outs), (ins i64mem:$dst), + "rcr{q}\t{%cl, $dst|$dst, cl}", [], IIC_SR>; +} +} // SchedRW +} // hasSideEffects = 0 + +let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in { +// FIXME: provide shorter instructions when imm8 == 1 +let Uses = [CL] in { +def ROL8rCL : I<0xD2, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1), + "rol{b}\t{%cl, $dst|$dst, cl}", + [(set GR8:$dst, (rotl GR8:$src1, CL))], IIC_SR>; +def ROL16rCL : I<0xD3, MRM0r, (outs GR16:$dst), (ins GR16:$src1), + "rol{w}\t{%cl, $dst|$dst, cl}", + [(set GR16:$dst, (rotl GR16:$src1, CL))], IIC_SR>, OpSize16; +def ROL32rCL : I<0xD3, MRM0r, (outs GR32:$dst), (ins GR32:$src1), + "rol{l}\t{%cl, $dst|$dst, cl}", + [(set GR32:$dst, (rotl GR32:$src1, CL))], IIC_SR>, OpSize32; +def ROL64rCL : RI<0xD3, MRM0r, (outs GR64:$dst), (ins GR64:$src1), + "rol{q}\t{%cl, $dst|$dst, cl}", + [(set GR64:$dst, (rotl GR64:$src1, CL))], IIC_SR>; +} + +def ROL8ri : Ii8<0xC0, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1, u8imm:$src2), + "rol{b}\t{$src2, $dst|$dst, $src2}", + [(set GR8:$dst, (rotl GR8:$src1, (i8 imm:$src2)))], IIC_SR>; +def ROL16ri : Ii8<0xC1, MRM0r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$src2), + "rol{w}\t{$src2, $dst|$dst, $src2}", + [(set GR16:$dst, (rotl GR16:$src1, (i8 imm:$src2)))], + IIC_SR>, OpSize16; +def ROL32ri : Ii8<0xC1, MRM0r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$src2), + "rol{l}\t{$src2, $dst|$dst, $src2}", + [(set GR32:$dst, (rotl GR32:$src1, (i8 imm:$src2)))], + IIC_SR>, OpSize32; +def ROL64ri : RIi8<0xC1, MRM0r, (outs GR64:$dst), + (ins GR64:$src1, u8imm:$src2), + "rol{q}\t{$src2, $dst|$dst, $src2}", + [(set GR64:$dst, (rotl GR64:$src1, (i8 imm:$src2)))], + IIC_SR>; + +// Rotate by 1 +def ROL8r1 : I<0xD0, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1), + "rol{b}\t$dst", + [(set GR8:$dst, (rotl GR8:$src1, (i8 1)))], + IIC_SR>; +def ROL16r1 : I<0xD1, MRM0r, (outs GR16:$dst), (ins GR16:$src1), + "rol{w}\t$dst", + [(set GR16:$dst, (rotl GR16:$src1, (i8 1)))], + IIC_SR>, OpSize16; +def ROL32r1 : I<0xD1, MRM0r, (outs GR32:$dst), (ins GR32:$src1), + "rol{l}\t$dst", + [(set GR32:$dst, (rotl GR32:$src1, (i8 1)))], + IIC_SR>, OpSize32; +def ROL64r1 : RI<0xD1, MRM0r, (outs GR64:$dst), (ins GR64:$src1), + "rol{q}\t$dst", + [(set GR64:$dst, (rotl GR64:$src1, (i8 1)))], + IIC_SR>; +} // Constraints = "$src = $dst", SchedRW + +let SchedRW = [WriteShiftLd, WriteRMW] in { +let Uses = [CL] in { +def ROL8mCL : I<0xD2, MRM0m, (outs), (ins i8mem :$dst), + "rol{b}\t{%cl, $dst|$dst, cl}", + [(store (rotl (loadi8 addr:$dst), CL), addr:$dst)], + IIC_SR>; +def ROL16mCL : I<0xD3, MRM0m, (outs), (ins i16mem:$dst), + "rol{w}\t{%cl, $dst|$dst, cl}", + [(store (rotl (loadi16 addr:$dst), CL), addr:$dst)], + IIC_SR>, OpSize16; +def ROL32mCL : I<0xD3, MRM0m, (outs), (ins i32mem:$dst), + "rol{l}\t{%cl, $dst|$dst, cl}", + [(store (rotl (loadi32 addr:$dst), CL), addr:$dst)], + IIC_SR>, OpSize32; +def ROL64mCL : RI<0xD3, MRM0m, (outs), (ins i64mem:$dst), + "rol{q}\t{%cl, $dst|$dst, cl}", + [(store (rotl (loadi64 addr:$dst), CL), addr:$dst)], + IIC_SR>; +} +def ROL8mi : Ii8<0xC0, MRM0m, (outs), (ins i8mem :$dst, u8imm:$src1), + "rol{b}\t{$src1, $dst|$dst, $src1}", + [(store (rotl (loadi8 addr:$dst), (i8 imm:$src1)), addr:$dst)], + IIC_SR>; +def ROL16mi : Ii8<0xC1, MRM0m, (outs), (ins i16mem:$dst, u8imm:$src1), + "rol{w}\t{$src1, $dst|$dst, $src1}", + [(store (rotl (loadi16 addr:$dst), (i8 imm:$src1)), addr:$dst)], + IIC_SR>, OpSize16; 
+def ROL32mi : Ii8<0xC1, MRM0m, (outs), (ins i32mem:$dst, u8imm:$src1), + "rol{l}\t{$src1, $dst|$dst, $src1}", + [(store (rotl (loadi32 addr:$dst), (i8 imm:$src1)), addr:$dst)], + IIC_SR>, OpSize32; +def ROL64mi : RIi8<0xC1, MRM0m, (outs), (ins i64mem:$dst, u8imm:$src1), + "rol{q}\t{$src1, $dst|$dst, $src1}", + [(store (rotl (loadi64 addr:$dst), (i8 imm:$src1)), addr:$dst)], + IIC_SR>; + +// Rotate by 1 +def ROL8m1 : I<0xD0, MRM0m, (outs), (ins i8mem :$dst), + "rol{b}\t$dst", + [(store (rotl (loadi8 addr:$dst), (i8 1)), addr:$dst)], + IIC_SR>; +def ROL16m1 : I<0xD1, MRM0m, (outs), (ins i16mem:$dst), + "rol{w}\t$dst", + [(store (rotl (loadi16 addr:$dst), (i8 1)), addr:$dst)], + IIC_SR>, OpSize16; +def ROL32m1 : I<0xD1, MRM0m, (outs), (ins i32mem:$dst), + "rol{l}\t$dst", + [(store (rotl (loadi32 addr:$dst), (i8 1)), addr:$dst)], + IIC_SR>, OpSize32; +def ROL64m1 : RI<0xD1, MRM0m, (outs), (ins i64mem:$dst), + "rol{q}\t$dst", + [(store (rotl (loadi64 addr:$dst), (i8 1)), addr:$dst)], + IIC_SR>; +} // SchedRW + +let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in { +let Uses = [CL] in { +def ROR8rCL : I<0xD2, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1), + "ror{b}\t{%cl, $dst|$dst, cl}", + [(set GR8:$dst, (rotr GR8:$src1, CL))], IIC_SR>; +def ROR16rCL : I<0xD3, MRM1r, (outs GR16:$dst), (ins GR16:$src1), + "ror{w}\t{%cl, $dst|$dst, cl}", + [(set GR16:$dst, (rotr GR16:$src1, CL))], IIC_SR>, OpSize16; +def ROR32rCL : I<0xD3, MRM1r, (outs GR32:$dst), (ins GR32:$src1), + "ror{l}\t{%cl, $dst|$dst, cl}", + [(set GR32:$dst, (rotr GR32:$src1, CL))], IIC_SR>, OpSize32; +def ROR64rCL : RI<0xD3, MRM1r, (outs GR64:$dst), (ins GR64:$src1), + "ror{q}\t{%cl, $dst|$dst, cl}", + [(set GR64:$dst, (rotr GR64:$src1, CL))], IIC_SR>; +} + +def ROR8ri : Ii8<0xC0, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1, u8imm:$src2), + "ror{b}\t{$src2, $dst|$dst, $src2}", + [(set GR8:$dst, (rotr GR8:$src1, (i8 imm:$src2)))], IIC_SR>; +def ROR16ri : Ii8<0xC1, MRM1r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$src2), + "ror{w}\t{$src2, $dst|$dst, $src2}", + [(set GR16:$dst, (rotr GR16:$src1, (i8 imm:$src2)))], + IIC_SR>, OpSize16; +def ROR32ri : Ii8<0xC1, MRM1r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$src2), + "ror{l}\t{$src2, $dst|$dst, $src2}", + [(set GR32:$dst, (rotr GR32:$src1, (i8 imm:$src2)))], + IIC_SR>, OpSize32; +def ROR64ri : RIi8<0xC1, MRM1r, (outs GR64:$dst), + (ins GR64:$src1, u8imm:$src2), + "ror{q}\t{$src2, $dst|$dst, $src2}", + [(set GR64:$dst, (rotr GR64:$src1, (i8 imm:$src2)))], + IIC_SR>; + +// Rotate by 1 +def ROR8r1 : I<0xD0, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1), + "ror{b}\t$dst", + [(set GR8:$dst, (rotr GR8:$src1, (i8 1)))], + IIC_SR>; +def ROR16r1 : I<0xD1, MRM1r, (outs GR16:$dst), (ins GR16:$src1), + "ror{w}\t$dst", + [(set GR16:$dst, (rotr GR16:$src1, (i8 1)))], + IIC_SR>, OpSize16; +def ROR32r1 : I<0xD1, MRM1r, (outs GR32:$dst), (ins GR32:$src1), + "ror{l}\t$dst", + [(set GR32:$dst, (rotr GR32:$src1, (i8 1)))], + IIC_SR>, OpSize32; +def ROR64r1 : RI<0xD1, MRM1r, (outs GR64:$dst), (ins GR64:$src1), + "ror{q}\t$dst", + [(set GR64:$dst, (rotr GR64:$src1, (i8 1)))], + IIC_SR>; +} // Constraints = "$src = $dst", SchedRW + +let SchedRW = [WriteShiftLd, WriteRMW] in { +let Uses = [CL] in { +def ROR8mCL : I<0xD2, MRM1m, (outs), (ins i8mem :$dst), + "ror{b}\t{%cl, $dst|$dst, cl}", + [(store (rotr (loadi8 addr:$dst), CL), addr:$dst)], + IIC_SR>; +def ROR16mCL : I<0xD3, MRM1m, (outs), (ins i16mem:$dst), + "ror{w}\t{%cl, $dst|$dst, cl}", + [(store (rotr (loadi16 addr:$dst), CL), addr:$dst)], + IIC_SR>, 
OpSize16; +def ROR32mCL : I<0xD3, MRM1m, (outs), (ins i32mem:$dst), + "ror{l}\t{%cl, $dst|$dst, cl}", + [(store (rotr (loadi32 addr:$dst), CL), addr:$dst)], + IIC_SR>, OpSize32; +def ROR64mCL : RI<0xD3, MRM1m, (outs), (ins i64mem:$dst), + "ror{q}\t{%cl, $dst|$dst, cl}", + [(store (rotr (loadi64 addr:$dst), CL), addr:$dst)], + IIC_SR>; +} +def ROR8mi : Ii8<0xC0, MRM1m, (outs), (ins i8mem :$dst, u8imm:$src), + "ror{b}\t{$src, $dst|$dst, $src}", + [(store (rotr (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)], + IIC_SR>; +def ROR16mi : Ii8<0xC1, MRM1m, (outs), (ins i16mem:$dst, u8imm:$src), + "ror{w}\t{$src, $dst|$dst, $src}", + [(store (rotr (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)], + IIC_SR>, OpSize16; +def ROR32mi : Ii8<0xC1, MRM1m, (outs), (ins i32mem:$dst, u8imm:$src), + "ror{l}\t{$src, $dst|$dst, $src}", + [(store (rotr (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)], + IIC_SR>, OpSize32; +def ROR64mi : RIi8<0xC1, MRM1m, (outs), (ins i64mem:$dst, u8imm:$src), + "ror{q}\t{$src, $dst|$dst, $src}", + [(store (rotr (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)], + IIC_SR>; + +// Rotate by 1 +def ROR8m1 : I<0xD0, MRM1m, (outs), (ins i8mem :$dst), + "ror{b}\t$dst", + [(store (rotr (loadi8 addr:$dst), (i8 1)), addr:$dst)], + IIC_SR>; +def ROR16m1 : I<0xD1, MRM1m, (outs), (ins i16mem:$dst), + "ror{w}\t$dst", + [(store (rotr (loadi16 addr:$dst), (i8 1)), addr:$dst)], + IIC_SR>, OpSize16; +def ROR32m1 : I<0xD1, MRM1m, (outs), (ins i32mem:$dst), + "ror{l}\t$dst", + [(store (rotr (loadi32 addr:$dst), (i8 1)), addr:$dst)], + IIC_SR>, OpSize32; +def ROR64m1 : RI<0xD1, MRM1m, (outs), (ins i64mem:$dst), + "ror{q}\t$dst", + [(store (rotr (loadi64 addr:$dst), (i8 1)), addr:$dst)], + IIC_SR>; +} // SchedRW + + +//===----------------------------------------------------------------------===// +// Double shift instructions (generalizations of rotate) +//===----------------------------------------------------------------------===// + +let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in { + +let Uses = [CL] in { +def SHLD16rrCL : I<0xA5, MRMDestReg, (outs GR16:$dst), + (ins GR16:$src1, GR16:$src2), + "shld{w}\t{%cl, $src2, $dst|$dst, $src2, cl}", + [(set GR16:$dst, (X86shld GR16:$src1, GR16:$src2, CL))], + IIC_SHD16_REG_CL>, + TB, OpSize16; +def SHRD16rrCL : I<0xAD, MRMDestReg, (outs GR16:$dst), + (ins GR16:$src1, GR16:$src2), + "shrd{w}\t{%cl, $src2, $dst|$dst, $src2, cl}", + [(set GR16:$dst, (X86shrd GR16:$src1, GR16:$src2, CL))], + IIC_SHD16_REG_CL>, + TB, OpSize16; +def SHLD32rrCL : I<0xA5, MRMDestReg, (outs GR32:$dst), + (ins GR32:$src1, GR32:$src2), + "shld{l}\t{%cl, $src2, $dst|$dst, $src2, cl}", + [(set GR32:$dst, (X86shld GR32:$src1, GR32:$src2, CL))], + IIC_SHD32_REG_CL>, TB, OpSize32; +def SHRD32rrCL : I<0xAD, MRMDestReg, (outs GR32:$dst), + (ins GR32:$src1, GR32:$src2), + "shrd{l}\t{%cl, $src2, $dst|$dst, $src2, cl}", + [(set GR32:$dst, (X86shrd GR32:$src1, GR32:$src2, CL))], + IIC_SHD32_REG_CL>, TB, OpSize32; +def SHLD64rrCL : RI<0xA5, MRMDestReg, (outs GR64:$dst), + (ins GR64:$src1, GR64:$src2), + "shld{q}\t{%cl, $src2, $dst|$dst, $src2, cl}", + [(set GR64:$dst, (X86shld GR64:$src1, GR64:$src2, CL))], + IIC_SHD64_REG_CL>, + TB; +def SHRD64rrCL : RI<0xAD, MRMDestReg, (outs GR64:$dst), + (ins GR64:$src1, GR64:$src2), + "shrd{q}\t{%cl, $src2, $dst|$dst, $src2, cl}", + [(set GR64:$dst, (X86shrd GR64:$src1, GR64:$src2, CL))], + IIC_SHD64_REG_CL>, + TB; +} + +let isCommutable = 1 in { // These instructions commute to each other. 
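// A rough illustration of why these commute (an ISA identity, AT&T syntax): a
// double shift left by k yields the same bit pattern as a double shift right
// by (size - k) with the two register operands swapped, e.g. for 16-bit
// operands
//
//   shld $5,  %bx, %ax        # %ax = (%ax << 5)  | (%bx >> 11)
//   shrd $11, %ax, %bx        # %bx = (%bx >> 11) | (%ax << 5)
//
// Both forms compute the same value, just into different destination registers.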
+def SHLD16rri8 : Ii8<0xA4, MRMDestReg, + (outs GR16:$dst), + (ins GR16:$src1, GR16:$src2, u8imm:$src3), + "shld{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}", + [(set GR16:$dst, (X86shld GR16:$src1, GR16:$src2, + (i8 imm:$src3)))], IIC_SHD16_REG_IM>, + TB, OpSize16; +def SHRD16rri8 : Ii8<0xAC, MRMDestReg, + (outs GR16:$dst), + (ins GR16:$src1, GR16:$src2, u8imm:$src3), + "shrd{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}", + [(set GR16:$dst, (X86shrd GR16:$src1, GR16:$src2, + (i8 imm:$src3)))], IIC_SHD16_REG_IM>, + TB, OpSize16; +def SHLD32rri8 : Ii8<0xA4, MRMDestReg, + (outs GR32:$dst), + (ins GR32:$src1, GR32:$src2, u8imm:$src3), + "shld{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}", + [(set GR32:$dst, (X86shld GR32:$src1, GR32:$src2, + (i8 imm:$src3)))], IIC_SHD32_REG_IM>, + TB, OpSize32; +def SHRD32rri8 : Ii8<0xAC, MRMDestReg, + (outs GR32:$dst), + (ins GR32:$src1, GR32:$src2, u8imm:$src3), + "shrd{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}", + [(set GR32:$dst, (X86shrd GR32:$src1, GR32:$src2, + (i8 imm:$src3)))], IIC_SHD32_REG_IM>, + TB, OpSize32; +def SHLD64rri8 : RIi8<0xA4, MRMDestReg, + (outs GR64:$dst), + (ins GR64:$src1, GR64:$src2, u8imm:$src3), + "shld{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}", + [(set GR64:$dst, (X86shld GR64:$src1, GR64:$src2, + (i8 imm:$src3)))], IIC_SHD64_REG_IM>, + TB; +def SHRD64rri8 : RIi8<0xAC, MRMDestReg, + (outs GR64:$dst), + (ins GR64:$src1, GR64:$src2, u8imm:$src3), + "shrd{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}", + [(set GR64:$dst, (X86shrd GR64:$src1, GR64:$src2, + (i8 imm:$src3)))], IIC_SHD64_REG_IM>, + TB; +} +} // Constraints = "$src = $dst", SchedRW + +let SchedRW = [WriteShiftLd, WriteRMW] in { +let Uses = [CL] in { +def SHLD16mrCL : I<0xA5, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2), + "shld{w}\t{%cl, $src2, $dst|$dst, $src2, cl}", + [(store (X86shld (loadi16 addr:$dst), GR16:$src2, CL), + addr:$dst)], IIC_SHD16_MEM_CL>, TB, OpSize16; +def SHRD16mrCL : I<0xAD, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2), + "shrd{w}\t{%cl, $src2, $dst|$dst, $src2, cl}", + [(store (X86shrd (loadi16 addr:$dst), GR16:$src2, CL), + addr:$dst)], IIC_SHD16_MEM_CL>, TB, OpSize16; + +def SHLD32mrCL : I<0xA5, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2), + "shld{l}\t{%cl, $src2, $dst|$dst, $src2, cl}", + [(store (X86shld (loadi32 addr:$dst), GR32:$src2, CL), + addr:$dst)], IIC_SHD32_MEM_CL>, TB, OpSize32; +def SHRD32mrCL : I<0xAD, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2), + "shrd{l}\t{%cl, $src2, $dst|$dst, $src2, cl}", + [(store (X86shrd (loadi32 addr:$dst), GR32:$src2, CL), + addr:$dst)], IIC_SHD32_MEM_CL>, TB, OpSize32; + +def SHLD64mrCL : RI<0xA5, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2), + "shld{q}\t{%cl, $src2, $dst|$dst, $src2, cl}", + [(store (X86shld (loadi64 addr:$dst), GR64:$src2, CL), + addr:$dst)], IIC_SHD64_MEM_CL>, TB; +def SHRD64mrCL : RI<0xAD, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2), + "shrd{q}\t{%cl, $src2, $dst|$dst, $src2, cl}", + [(store (X86shrd (loadi64 addr:$dst), GR64:$src2, CL), + addr:$dst)], IIC_SHD64_MEM_CL>, TB; +} + +def SHLD16mri8 : Ii8<0xA4, MRMDestMem, + (outs), (ins i16mem:$dst, GR16:$src2, u8imm:$src3), + "shld{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}", + [(store (X86shld (loadi16 addr:$dst), GR16:$src2, + (i8 imm:$src3)), addr:$dst)], + IIC_SHD16_MEM_IM>, + TB, OpSize16; +def SHRD16mri8 : Ii8<0xAC, MRMDestMem, + (outs), (ins i16mem:$dst, GR16:$src2, u8imm:$src3), + "shrd{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}", + [(store (X86shrd (loadi16 addr:$dst), GR16:$src2, + (i8 
imm:$src3)), addr:$dst)], + IIC_SHD16_MEM_IM>, + TB, OpSize16; + +def SHLD32mri8 : Ii8<0xA4, MRMDestMem, + (outs), (ins i32mem:$dst, GR32:$src2, u8imm:$src3), + "shld{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}", + [(store (X86shld (loadi32 addr:$dst), GR32:$src2, + (i8 imm:$src3)), addr:$dst)], + IIC_SHD32_MEM_IM>, + TB, OpSize32; +def SHRD32mri8 : Ii8<0xAC, MRMDestMem, + (outs), (ins i32mem:$dst, GR32:$src2, u8imm:$src3), + "shrd{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}", + [(store (X86shrd (loadi32 addr:$dst), GR32:$src2, + (i8 imm:$src3)), addr:$dst)], + IIC_SHD32_MEM_IM>, + TB, OpSize32; + +def SHLD64mri8 : RIi8<0xA4, MRMDestMem, + (outs), (ins i64mem:$dst, GR64:$src2, u8imm:$src3), + "shld{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}", + [(store (X86shld (loadi64 addr:$dst), GR64:$src2, + (i8 imm:$src3)), addr:$dst)], + IIC_SHD64_MEM_IM>, + TB; +def SHRD64mri8 : RIi8<0xAC, MRMDestMem, + (outs), (ins i64mem:$dst, GR64:$src2, u8imm:$src3), + "shrd{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}", + [(store (X86shrd (loadi64 addr:$dst), GR64:$src2, + (i8 imm:$src3)), addr:$dst)], + IIC_SHD64_MEM_IM>, + TB; +} // SchedRW + +} // Defs = [EFLAGS] + +def ROT32L2R_imm8 : SDNodeXForm<imm, [{ + // Convert a ROTL shamt to a ROTR shamt on 32-bit integer. + return getI8Imm(32 - N->getZExtValue(), SDLoc(N)); +}]>; + +def ROT64L2R_imm8 : SDNodeXForm<imm, [{ + // Convert a ROTL shamt to a ROTR shamt on 64-bit integer. + return getI8Imm(64 - N->getZExtValue(), SDLoc(N)); +}]>; + +multiclass bmi_rotate<string asm, RegisterClass RC, X86MemOperand x86memop> { +let hasSideEffects = 0 in { + def ri : Ii8<0xF0, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, u8imm:$src2), + !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>, TAXD, VEX, Sched<[WriteShift]>; + let mayLoad = 1 in + def mi : Ii8<0xF0, MRMSrcMem, (outs RC:$dst), + (ins x86memop:$src1, u8imm:$src2), + !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>, TAXD, VEX, Sched<[WriteShiftLd]>; +} +} + +multiclass bmi_shift<string asm, RegisterClass RC, X86MemOperand x86memop> { +let hasSideEffects = 0 in { + def rr : I<0xF7, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), + !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, + VEX_4VOp3, Sched<[WriteShift]>; + let mayLoad = 1 in + def rm : I<0xF7, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src1, RC:$src2), + !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, + VEX_4VOp3, + Sched<[WriteShiftLd, + // x86memop:$src1 + ReadDefault, ReadDefault, ReadDefault, ReadDefault, + ReadDefault, + // RC:$src1 + ReadAfterLd]>; +} +} + +let Predicates = [HasBMI2] in { + defm RORX32 : bmi_rotate<"rorx{l}", GR32, i32mem>; + defm RORX64 : bmi_rotate<"rorx{q}", GR64, i64mem>, VEX_W; + defm SARX32 : bmi_shift<"sarx{l}", GR32, i32mem>, T8XS; + defm SARX64 : bmi_shift<"sarx{q}", GR64, i64mem>, T8XS, VEX_W; + defm SHRX32 : bmi_shift<"shrx{l}", GR32, i32mem>, T8XD; + defm SHRX64 : bmi_shift<"shrx{q}", GR64, i64mem>, T8XD, VEX_W; + defm SHLX32 : bmi_shift<"shlx{l}", GR32, i32mem>, T8PD; + defm SHLX64 : bmi_shift<"shlx{q}", GR64, i64mem>, T8PD, VEX_W; + + // Prefer RORX which is non-destructive and doesn't update EFLAGS. 
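// A rough illustration (AT&T syntax): RORX only rotates right, which is why the
// ROT32L2R_imm8/ROT64L2R_imm8 transforms above rewrite a left-rotate amount k
// as (32 - k) or (64 - k), e.g.
//
//   rol  $5, %eax             # read-modify-write on %eax, updates EFLAGS
//   rorx $27, %eax, %ecx      # same 32-bit rotation (32 - 5 = 27), separate
//                             # destination register, EFLAGS untouched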
+  let AddedComplexity = 10 in {
+    def : Pat<(rotl GR32:$src, (i8 imm:$shamt)),
+              (RORX32ri GR32:$src, (ROT32L2R_imm8 imm:$shamt))>;
+    def : Pat<(rotl GR64:$src, (i8 imm:$shamt)),
+              (RORX64ri GR64:$src, (ROT64L2R_imm8 imm:$shamt))>;
+  }
+
+  def : Pat<(rotl (loadi32 addr:$src), (i8 imm:$shamt)),
+            (RORX32mi addr:$src, (ROT32L2R_imm8 imm:$shamt))>;
+  def : Pat<(rotl (loadi64 addr:$src), (i8 imm:$shamt)),
+            (RORX64mi addr:$src, (ROT64L2R_imm8 imm:$shamt))>;
+
+  // Prefer SARX/SHRX/SHLX over SAR/SHR/SHL with variable shift BUT not
+  // immediate shift, i.e. the following code is considered better
+  //
+  //  mov %edi, %esi
+  //  shl $imm, %esi
+  //  ... %edi, ...
+  //
+  // than
+  //
+  //  movb $imm, %sil
+  //  shlx %sil, %edi, %esi
+  //  ... %edi, ...
+  //
+  let AddedComplexity = 1 in {
+    def : Pat<(sra GR32:$src1, GR8:$src2),
+              (SARX32rr GR32:$src1,
+                        (INSERT_SUBREG
+                          (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+    def : Pat<(sra GR64:$src1, GR8:$src2),
+              (SARX64rr GR64:$src1,
+                        (INSERT_SUBREG
+                          (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+
+    def : Pat<(srl GR32:$src1, GR8:$src2),
+              (SHRX32rr GR32:$src1,
+                        (INSERT_SUBREG
+                          (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+    def : Pat<(srl GR64:$src1, GR8:$src2),
+              (SHRX64rr GR64:$src1,
+                        (INSERT_SUBREG
+                          (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+
+    def : Pat<(shl GR32:$src1, GR8:$src2),
+              (SHLX32rr GR32:$src1,
+                        (INSERT_SUBREG
+                          (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+    def : Pat<(shl GR64:$src1, GR8:$src2),
+              (SHLX64rr GR64:$src1,
+                        (INSERT_SUBREG
+                          (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+  }
+
+  // Patterns on SARXrm/SHRXrm/SHLXrm are explicitly omitted to favor
+  //
+  //  mov (%ecx), %esi
+  //  shl $imm, %esi
+  //
+  // over
+  //
+  //  movb $imm, %al
+  //  shlx %al, (%ecx), %esi
+  //
+  // As SARXrr/SHRXrr/SHLXrr is favored on variable shift, the peephole
+  // optimization will fold them into SARXrm/SHRXrm/SHLXrm if possible.
+}
diff --git a/contrib/llvm/lib/Target/X86/X86InstrSystem.td b/contrib/llvm/lib/Target/X86/X86InstrSystem.td
new file mode 100644
index 0000000..a97d1e5
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrSystem.td
@@ -0,0 +1,615 @@
+//===-- X86InstrSystem.td - System Instructions ------------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 instructions that are generally used in
+// privileged modes. These are not typically used by the compiler, but are
+// supported for the assembler and disassembler.
+//
+//===----------------------------------------------------------------------===//
+
+let SchedRW = [WriteSystem] in {
+let Defs = [RAX, RDX] in
+  def RDTSC : I<0x31, RawFrm, (outs), (ins), "rdtsc", [(X86rdtsc)], IIC_RDTSC>,
+              TB;
+
+let Defs = [RAX, RCX, RDX] in
+  def RDTSCP : I<0x01, MRM_F9, (outs), (ins), "rdtscp", [(X86rdtscp)]>, TB;
+
+// CPU flow control instructions
+
+let isTerminator = 1, isBarrier = 1, hasCtrlDep = 1 in {
+  def TRAP : I<0x0B, RawFrm, (outs), (ins), "ud2", [(trap)]>, TB;
+  def UD2B : I<0xB9, RawFrm, (outs), (ins), "ud2b", []>, TB;
+}
+
+def HLT : I<0xF4, RawFrm, (outs), (ins), "hlt", [], IIC_HLT>;
+def RSM : I<0xAA, RawFrm, (outs), (ins), "rsm", [], IIC_RSM>, TB;
+
+// Interrupt and SysCall Instructions.
+let Uses = [EFLAGS] in + def INTO : I<0xce, RawFrm, (outs), (ins), "into", []>; +def INT3 : I<0xcc, RawFrm, (outs), (ins), "int3", + [(int_x86_int (i8 3))], IIC_INT3>; +} // SchedRW + +// The long form of "int $3" turns into int3 as a size optimization. +// FIXME: This doesn't work because InstAlias can't match immediate constants. +//def : InstAlias<"int\t$3", (INT3)>; + +let SchedRW = [WriteSystem] in { + +def INT : Ii8<0xcd, RawFrm, (outs), (ins u8imm:$trap), "int\t$trap", + [(int_x86_int imm:$trap)], IIC_INT>; + + +def SYSCALL : I<0x05, RawFrm, (outs), (ins), "syscall", [], IIC_SYSCALL>, TB; +def SYSRET : I<0x07, RawFrm, (outs), (ins), "sysret{l}", [], IIC_SYSCALL>, TB; +def SYSRET64 :RI<0x07, RawFrm, (outs), (ins), "sysret{q}", [], IIC_SYSCALL>, TB, + Requires<[In64BitMode]>; + +def SYSENTER : I<0x34, RawFrm, (outs), (ins), "sysenter", [], + IIC_SYS_ENTER_EXIT>, TB; + +def SYSEXIT : I<0x35, RawFrm, (outs), (ins), "sysexit{l}", [], + IIC_SYS_ENTER_EXIT>, TB; +def SYSEXIT64 :RI<0x35, RawFrm, (outs), (ins), "sysexit{q}", [], + IIC_SYS_ENTER_EXIT>, TB, Requires<[In64BitMode]>; +} // SchedRW + +def : Pat<(debugtrap), + (INT3)>, Requires<[NotPS4]>; +def : Pat<(debugtrap), + (INT (i8 0x41))>, Requires<[IsPS4]>; + +//===----------------------------------------------------------------------===// +// Input/Output Instructions. +// +let SchedRW = [WriteSystem] in { +let Defs = [AL], Uses = [DX] in +def IN8rr : I<0xEC, RawFrm, (outs), (ins), + "in{b}\t{%dx, %al|al, dx}", [], IIC_IN_RR>; +let Defs = [AX], Uses = [DX] in +def IN16rr : I<0xED, RawFrm, (outs), (ins), + "in{w}\t{%dx, %ax|ax, dx}", [], IIC_IN_RR>, OpSize16; +let Defs = [EAX], Uses = [DX] in +def IN32rr : I<0xED, RawFrm, (outs), (ins), + "in{l}\t{%dx, %eax|eax, dx}", [], IIC_IN_RR>, OpSize32; + +let Defs = [AL] in +def IN8ri : Ii8<0xE4, RawFrm, (outs), (ins u8imm:$port), + "in{b}\t{$port, %al|al, $port}", [], IIC_IN_RI>; +let Defs = [AX] in +def IN16ri : Ii8<0xE5, RawFrm, (outs), (ins u8imm:$port), + "in{w}\t{$port, %ax|ax, $port}", [], IIC_IN_RI>, OpSize16; +let Defs = [EAX] in +def IN32ri : Ii8<0xE5, RawFrm, (outs), (ins u8imm:$port), + "in{l}\t{$port, %eax|eax, $port}", [], IIC_IN_RI>, OpSize32; + +let Uses = [DX, AL] in +def OUT8rr : I<0xEE, RawFrm, (outs), (ins), + "out{b}\t{%al, %dx|dx, al}", [], IIC_OUT_RR>; +let Uses = [DX, AX] in +def OUT16rr : I<0xEF, RawFrm, (outs), (ins), + "out{w}\t{%ax, %dx|dx, ax}", [], IIC_OUT_RR>, OpSize16; +let Uses = [DX, EAX] in +def OUT32rr : I<0xEF, RawFrm, (outs), (ins), + "out{l}\t{%eax, %dx|dx, eax}", [], IIC_OUT_RR>, OpSize32; + +let Uses = [AL] in +def OUT8ir : Ii8<0xE6, RawFrm, (outs), (ins u8imm:$port), + "out{b}\t{%al, $port|$port, al}", [], IIC_OUT_IR>; +let Uses = [AX] in +def OUT16ir : Ii8<0xE7, RawFrm, (outs), (ins u8imm:$port), + "out{w}\t{%ax, $port|$port, ax}", [], IIC_OUT_IR>, OpSize16; +let Uses = [EAX] in +def OUT32ir : Ii8<0xE7, RawFrm, (outs), (ins u8imm:$port), + "out{l}\t{%eax, $port|$port, eax}", [], IIC_OUT_IR>, OpSize32; + +} // SchedRW + +//===----------------------------------------------------------------------===// +// Moves to and from debug registers + +let SchedRW = [WriteSystem] in { +def MOV32rd : I<0x21, MRMDestReg, (outs GR32:$dst), (ins DEBUG_REG:$src), + "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_DR>, TB, + Requires<[Not64BitMode]>; +def MOV64rd : I<0x21, MRMDestReg, (outs GR64:$dst), (ins DEBUG_REG:$src), + "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_DR>, TB, + Requires<[In64BitMode]>; + +def MOV32dr : I<0x23, MRMSrcReg, (outs DEBUG_REG:$dst), 
(ins GR32:$src), + "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_DR_REG>, TB, + Requires<[Not64BitMode]>; +def MOV64dr : I<0x23, MRMSrcReg, (outs DEBUG_REG:$dst), (ins GR64:$src), + "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_DR_REG>, TB, + Requires<[In64BitMode]>; +} // SchedRW + +//===----------------------------------------------------------------------===// +// Moves to and from control registers + +let SchedRW = [WriteSystem] in { +def MOV32rc : I<0x20, MRMDestReg, (outs GR32:$dst), (ins CONTROL_REG:$src), + "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_CR>, TB, + Requires<[Not64BitMode]>; +def MOV64rc : I<0x20, MRMDestReg, (outs GR64:$dst), (ins CONTROL_REG:$src), + "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_CR>, TB, + Requires<[In64BitMode]>; + +def MOV32cr : I<0x22, MRMSrcReg, (outs CONTROL_REG:$dst), (ins GR32:$src), + "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_CR_REG>, TB, + Requires<[Not64BitMode]>; +def MOV64cr : I<0x22, MRMSrcReg, (outs CONTROL_REG:$dst), (ins GR64:$src), + "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_CR_REG>, TB, + Requires<[In64BitMode]>; +} // SchedRW + +//===----------------------------------------------------------------------===// +// Segment override instruction prefixes + +def CS_PREFIX : I<0x2E, RawFrm, (outs), (ins), "cs", []>; +def SS_PREFIX : I<0x36, RawFrm, (outs), (ins), "ss", []>; +def DS_PREFIX : I<0x3E, RawFrm, (outs), (ins), "ds", []>; +def ES_PREFIX : I<0x26, RawFrm, (outs), (ins), "es", []>; +def FS_PREFIX : I<0x64, RawFrm, (outs), (ins), "fs", []>; +def GS_PREFIX : I<0x65, RawFrm, (outs), (ins), "gs", []>; + + +//===----------------------------------------------------------------------===// +// Moves to and from segment registers. +// + +let SchedRW = [WriteMove] in { +def MOV16rs : I<0x8C, MRMDestReg, (outs GR16:$dst), (ins SEGMENT_REG:$src), + "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_SR>, OpSize16; +def MOV32rs : I<0x8C, MRMDestReg, (outs GR32:$dst), (ins SEGMENT_REG:$src), + "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_SR>, OpSize32; +def MOV64rs : RI<0x8C, MRMDestReg, (outs GR64:$dst), (ins SEGMENT_REG:$src), + "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_SR>; + +def MOV16ms : I<0x8C, MRMDestMem, (outs i16mem:$dst), (ins SEGMENT_REG:$src), + "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV_MEM_SR>, OpSize16; +def MOV32ms : I<0x8C, MRMDestMem, (outs i32mem:$dst), (ins SEGMENT_REG:$src), + "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_MEM_SR>, OpSize32; +def MOV64ms : RI<0x8C, MRMDestMem, (outs i64mem:$dst), (ins SEGMENT_REG:$src), + "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_MEM_SR>; + +def MOV16sr : I<0x8E, MRMSrcReg, (outs SEGMENT_REG:$dst), (ins GR16:$src), + "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_REG>, OpSize16; +def MOV32sr : I<0x8E, MRMSrcReg, (outs SEGMENT_REG:$dst), (ins GR32:$src), + "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_REG>, OpSize32; +def MOV64sr : RI<0x8E, MRMSrcReg, (outs SEGMENT_REG:$dst), (ins GR64:$src), + "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_REG>; + +def MOV16sm : I<0x8E, MRMSrcMem, (outs SEGMENT_REG:$dst), (ins i16mem:$src), + "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_MEM>, OpSize16; +def MOV32sm : I<0x8E, MRMSrcMem, (outs SEGMENT_REG:$dst), (ins i32mem:$src), + "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_MEM>, OpSize32; +def MOV64sm : RI<0x8E, MRMSrcMem, (outs SEGMENT_REG:$dst), (ins i64mem:$src), + "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_MEM>; +} // SchedRW + 
+//===----------------------------------------------------------------------===// +// Segmentation support instructions. + +let SchedRW = [WriteSystem] in { +def SWAPGS : I<0x01, MRM_F8, (outs), (ins), "swapgs", [], IIC_SWAPGS>, TB; + +def LAR16rm : I<0x02, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), + "lar{w}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RM>, TB, + OpSize16; +def LAR16rr : I<0x02, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), + "lar{w}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RR>, TB, + OpSize16; + +// i16mem operand in LAR32rm and GR32 operand in LAR32rr is not a typo. +def LAR32rm : I<0x02, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src), + "lar{l}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RM>, TB, + OpSize32; +def LAR32rr : I<0x02, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), + "lar{l}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RR>, TB, + OpSize32; +// i16mem operand in LAR64rm and GR32 operand in LAR32rr is not a typo. +def LAR64rm : RI<0x02, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src), + "lar{q}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RM>, TB; +def LAR64rr : RI<0x02, MRMSrcReg, (outs GR64:$dst), (ins GR32:$src), + "lar{q}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RR>, TB; + +def LSL16rm : I<0x03, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), + "lsl{w}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RM>, TB, + OpSize16; +def LSL16rr : I<0x03, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), + "lsl{w}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RR>, TB, + OpSize16; +def LSL32rm : I<0x03, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), + "lsl{l}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RM>, TB, + OpSize32; +def LSL32rr : I<0x03, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), + "lsl{l}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RR>, TB, + OpSize32; +def LSL64rm : RI<0x03, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), + "lsl{q}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RM>, TB; +def LSL64rr : RI<0x03, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), + "lsl{q}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RR>, TB; + +def INVLPG : I<0x01, MRM7m, (outs), (ins i8mem:$addr), "invlpg\t$addr", + [], IIC_INVLPG>, TB; + +def STR16r : I<0x00, MRM1r, (outs GR16:$dst), (ins), + "str{w}\t$dst", [], IIC_STR>, TB, OpSize16; +def STR32r : I<0x00, MRM1r, (outs GR32:$dst), (ins), + "str{l}\t$dst", [], IIC_STR>, TB, OpSize32; +def STR64r : RI<0x00, MRM1r, (outs GR64:$dst), (ins), + "str{q}\t$dst", [], IIC_STR>, TB; +def STRm : I<0x00, MRM1m, (outs i16mem:$dst), (ins), + "str{w}\t$dst", [], IIC_STR>, TB; + +def LTRr : I<0x00, MRM3r, (outs), (ins GR16:$src), + "ltr{w}\t$src", [], IIC_LTR>, TB; +def LTRm : I<0x00, MRM3m, (outs), (ins i16mem:$src), + "ltr{w}\t$src", [], IIC_LTR>, TB; + +def PUSHCS16 : I<0x0E, RawFrm, (outs), (ins), + "push{w}\t{%cs|cs}", [], IIC_PUSH_SR>, + OpSize16, Requires<[Not64BitMode]>; +def PUSHCS32 : I<0x0E, RawFrm, (outs), (ins), + "push{l}\t{%cs|cs}", [], IIC_PUSH_CS>, + OpSize32, Requires<[Not64BitMode]>; +def PUSHSS16 : I<0x16, RawFrm, (outs), (ins), + "push{w}\t{%ss|ss}", [], IIC_PUSH_SR>, + OpSize16, Requires<[Not64BitMode]>; +def PUSHSS32 : I<0x16, RawFrm, (outs), (ins), + "push{l}\t{%ss|ss}", [], IIC_PUSH_SR>, + OpSize32, Requires<[Not64BitMode]>; +def PUSHDS16 : I<0x1E, RawFrm, (outs), (ins), + "push{w}\t{%ds|ds}", [], IIC_PUSH_SR>, + OpSize16, Requires<[Not64BitMode]>; +def PUSHDS32 : I<0x1E, RawFrm, (outs), (ins), + "push{l}\t{%ds|ds}", [], IIC_PUSH_SR>, + OpSize32, Requires<[Not64BitMode]>; +def PUSHES16 : I<0x06, RawFrm, (outs), (ins), + "push{w}\t{%es|es}", [], IIC_PUSH_SR>, + OpSize16, 
Requires<[Not64BitMode]>; +def PUSHES32 : I<0x06, RawFrm, (outs), (ins), + "push{l}\t{%es|es}", [], IIC_PUSH_SR>, + OpSize32, Requires<[Not64BitMode]>; +def PUSHFS16 : I<0xa0, RawFrm, (outs), (ins), + "push{w}\t{%fs|fs}", [], IIC_PUSH_SR>, OpSize16, TB; +def PUSHFS32 : I<0xa0, RawFrm, (outs), (ins), + "push{l}\t{%fs|fs}", [], IIC_PUSH_SR>, TB, + OpSize32, Requires<[Not64BitMode]>; +def PUSHGS16 : I<0xa8, RawFrm, (outs), (ins), + "push{w}\t{%gs|gs}", [], IIC_PUSH_SR>, OpSize16, TB; +def PUSHGS32 : I<0xa8, RawFrm, (outs), (ins), + "push{l}\t{%gs|gs}", [], IIC_PUSH_SR>, TB, + OpSize32, Requires<[Not64BitMode]>; +def PUSHFS64 : I<0xa0, RawFrm, (outs), (ins), + "push{q}\t{%fs|fs}", [], IIC_PUSH_SR>, TB, + OpSize32, Requires<[In64BitMode]>; +def PUSHGS64 : I<0xa8, RawFrm, (outs), (ins), + "push{q}\t{%gs|gs}", [], IIC_PUSH_SR>, TB, + OpSize32, Requires<[In64BitMode]>; + +// No "pop cs" instruction. +def POPSS16 : I<0x17, RawFrm, (outs), (ins), + "pop{w}\t{%ss|ss}", [], IIC_POP_SR_SS>, + OpSize16, Requires<[Not64BitMode]>; +def POPSS32 : I<0x17, RawFrm, (outs), (ins), + "pop{l}\t{%ss|ss}", [], IIC_POP_SR_SS>, + OpSize32, Requires<[Not64BitMode]>; + +def POPDS16 : I<0x1F, RawFrm, (outs), (ins), + "pop{w}\t{%ds|ds}", [], IIC_POP_SR>, + OpSize16, Requires<[Not64BitMode]>; +def POPDS32 : I<0x1F, RawFrm, (outs), (ins), + "pop{l}\t{%ds|ds}", [], IIC_POP_SR>, + OpSize32, Requires<[Not64BitMode]>; + +def POPES16 : I<0x07, RawFrm, (outs), (ins), + "pop{w}\t{%es|es}", [], IIC_POP_SR>, + OpSize16, Requires<[Not64BitMode]>; +def POPES32 : I<0x07, RawFrm, (outs), (ins), + "pop{l}\t{%es|es}", [], IIC_POP_SR>, + OpSize32, Requires<[Not64BitMode]>; + +def POPFS16 : I<0xa1, RawFrm, (outs), (ins), + "pop{w}\t{%fs|fs}", [], IIC_POP_SR>, OpSize16, TB; +def POPFS32 : I<0xa1, RawFrm, (outs), (ins), + "pop{l}\t{%fs|fs}", [], IIC_POP_SR>, TB, + OpSize32, Requires<[Not64BitMode]>; +def POPFS64 : I<0xa1, RawFrm, (outs), (ins), + "pop{q}\t{%fs|fs}", [], IIC_POP_SR>, TB, + OpSize32, Requires<[In64BitMode]>; + +def POPGS16 : I<0xa9, RawFrm, (outs), (ins), + "pop{w}\t{%gs|gs}", [], IIC_POP_SR>, OpSize16, TB; +def POPGS32 : I<0xa9, RawFrm, (outs), (ins), + "pop{l}\t{%gs|gs}", [], IIC_POP_SR>, TB, + OpSize32, Requires<[Not64BitMode]>; +def POPGS64 : I<0xa9, RawFrm, (outs), (ins), + "pop{q}\t{%gs|gs}", [], IIC_POP_SR>, TB, + OpSize32, Requires<[In64BitMode]>; + + +def LDS16rm : I<0xc5, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src), + "lds{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, OpSize16; +def LDS32rm : I<0xc5, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src), + "lds{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, OpSize32; + +def LSS16rm : I<0xb2, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src), + "lss{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB, OpSize16; +def LSS32rm : I<0xb2, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src), + "lss{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB, OpSize32; +def LSS64rm : RI<0xb2, MRMSrcMem, (outs GR64:$dst), (ins opaque80mem:$src), + "lss{q}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB; + +def LES16rm : I<0xc4, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src), + "les{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, OpSize16; +def LES32rm : I<0xc4, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src), + "les{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, OpSize32; + +def LFS16rm : I<0xb4, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src), + "lfs{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB, OpSize16; +def LFS32rm : I<0xb4, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src), + 
"lfs{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB, OpSize32; +def LFS64rm : RI<0xb4, MRMSrcMem, (outs GR64:$dst), (ins opaque80mem:$src), + "lfs{q}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB; + +def LGS16rm : I<0xb5, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src), + "lgs{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB, OpSize16; +def LGS32rm : I<0xb5, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src), + "lgs{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB, OpSize32; + +def LGS64rm : RI<0xb5, MRMSrcMem, (outs GR64:$dst), (ins opaque80mem:$src), + "lgs{q}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB; + + +def VERRr : I<0x00, MRM4r, (outs), (ins GR16:$seg), + "verr\t$seg", [], IIC_VERR>, TB; +def VERRm : I<0x00, MRM4m, (outs), (ins i16mem:$seg), + "verr\t$seg", [], IIC_VERR>, TB; +def VERWr : I<0x00, MRM5r, (outs), (ins GR16:$seg), + "verw\t$seg", [], IIC_VERW_MEM>, TB; +def VERWm : I<0x00, MRM5m, (outs), (ins i16mem:$seg), + "verw\t$seg", [], IIC_VERW_REG>, TB; +} // SchedRW + +//===----------------------------------------------------------------------===// +// Descriptor-table support instructions + +let SchedRW = [WriteSystem] in { +def SGDT16m : I<0x01, MRM0m, (outs opaque48mem:$dst), (ins), + "sgdt{w}\t$dst", [], IIC_SGDT>, TB, OpSize16, Requires<[Not64BitMode]>; +def SGDT32m : I<0x01, MRM0m, (outs opaque48mem:$dst), (ins), + "sgdt{l}\t$dst", [], IIC_SGDT>, OpSize32, TB, Requires <[Not64BitMode]>; +def SGDT64m : I<0x01, MRM0m, (outs opaque80mem:$dst), (ins), + "sgdt{q}\t$dst", [], IIC_SGDT>, TB, Requires <[In64BitMode]>; +def SIDT16m : I<0x01, MRM1m, (outs opaque48mem:$dst), (ins), + "sidt{w}\t$dst", [], IIC_SIDT>, TB, OpSize16, Requires<[Not64BitMode]>; +def SIDT32m : I<0x01, MRM1m, (outs opaque48mem:$dst), (ins), + "sidt{l}\t$dst", []>, OpSize32, TB, Requires <[Not64BitMode]>; +def SIDT64m : I<0x01, MRM1m, (outs opaque80mem:$dst), (ins), + "sidt{q}\t$dst", []>, TB, Requires <[In64BitMode]>; +def SLDT16r : I<0x00, MRM0r, (outs GR16:$dst), (ins), + "sldt{w}\t$dst", [], IIC_SLDT>, TB, OpSize16; +def SLDT16m : I<0x00, MRM0m, (outs i16mem:$dst), (ins), + "sldt{w}\t$dst", [], IIC_SLDT>, TB; +def SLDT32r : I<0x00, MRM0r, (outs GR32:$dst), (ins), + "sldt{l}\t$dst", [], IIC_SLDT>, OpSize32, TB; + +// LLDT is not interpreted specially in 64-bit mode because there is no sign +// extension. 
+def SLDT64r : RI<0x00, MRM0r, (outs GR64:$dst), (ins), + "sldt{q}\t$dst", [], IIC_SLDT>, TB; +def SLDT64m : RI<0x00, MRM0m, (outs i16mem:$dst), (ins), + "sldt{q}\t$dst", [], IIC_SLDT>, TB; + +def LGDT16m : I<0x01, MRM2m, (outs), (ins opaque48mem:$src), + "lgdt{w}\t$src", [], IIC_LGDT>, TB, OpSize16, Requires<[Not64BitMode]>; +def LGDT32m : I<0x01, MRM2m, (outs), (ins opaque48mem:$src), + "lgdt{l}\t$src", [], IIC_LGDT>, OpSize32, TB, Requires<[Not64BitMode]>; +def LGDT64m : I<0x01, MRM2m, (outs), (ins opaque80mem:$src), + "lgdt{q}\t$src", [], IIC_LGDT>, TB, Requires<[In64BitMode]>; +def LIDT16m : I<0x01, MRM3m, (outs), (ins opaque48mem:$src), + "lidt{w}\t$src", [], IIC_LIDT>, TB, OpSize16, Requires<[Not64BitMode]>; +def LIDT32m : I<0x01, MRM3m, (outs), (ins opaque48mem:$src), + "lidt{l}\t$src", [], IIC_LIDT>, OpSize32, TB, Requires<[Not64BitMode]>; +def LIDT64m : I<0x01, MRM3m, (outs), (ins opaque80mem:$src), + "lidt{q}\t$src", [], IIC_LIDT>, TB, Requires<[In64BitMode]>; +def LLDT16r : I<0x00, MRM2r, (outs), (ins GR16:$src), + "lldt{w}\t$src", [], IIC_LLDT_REG>, TB; +def LLDT16m : I<0x00, MRM2m, (outs), (ins i16mem:$src), + "lldt{w}\t$src", [], IIC_LLDT_MEM>, TB; +} // SchedRW + +//===----------------------------------------------------------------------===// +// Specialized register support +let SchedRW = [WriteSystem] in { +let Uses = [EAX, ECX, EDX] in +def WRMSR : I<0x30, RawFrm, (outs), (ins), "wrmsr", [], IIC_WRMSR>, TB; +let Defs = [EAX, EDX], Uses = [ECX] in +def RDMSR : I<0x32, RawFrm, (outs), (ins), "rdmsr", [], IIC_RDMSR>, TB; + +let Defs = [RAX, RDX], Uses = [ECX] in + def RDPMC : I<0x33, RawFrm, (outs), (ins), "rdpmc", [(X86rdpmc)], IIC_RDPMC>, + TB; + +def SMSW16r : I<0x01, MRM4r, (outs GR16:$dst), (ins), + "smsw{w}\t$dst", [], IIC_SMSW>, OpSize16, TB; +def SMSW32r : I<0x01, MRM4r, (outs GR32:$dst), (ins), + "smsw{l}\t$dst", [], IIC_SMSW>, OpSize32, TB; +// no m form encodable; use SMSW16m +def SMSW64r : RI<0x01, MRM4r, (outs GR64:$dst), (ins), + "smsw{q}\t$dst", [], IIC_SMSW>, TB; + +// For memory operands, there is only a 16-bit form +def SMSW16m : I<0x01, MRM4m, (outs i16mem:$dst), (ins), + "smsw{w}\t$dst", [], IIC_SMSW>, TB; + +def LMSW16r : I<0x01, MRM6r, (outs), (ins GR16:$src), + "lmsw{w}\t$src", [], IIC_LMSW_MEM>, TB; +def LMSW16m : I<0x01, MRM6m, (outs), (ins i16mem:$src), + "lmsw{w}\t$src", [], IIC_LMSW_REG>, TB; + +let Defs = [EAX, EBX, ECX, EDX], Uses = [EAX, ECX] in + def CPUID : I<0xA2, RawFrm, (outs), (ins), "cpuid", [], IIC_CPUID>, TB; +} // SchedRW + +//===----------------------------------------------------------------------===// +// Cache instructions +let SchedRW = [WriteSystem] in { +def INVD : I<0x08, RawFrm, (outs), (ins), "invd", [], IIC_INVD>, TB; +def WBINVD : I<0x09, RawFrm, (outs), (ins), "wbinvd", [], IIC_INVD>, TB; +} // SchedRW + +//===----------------------------------------------------------------------===// +// XSAVE instructions +let SchedRW = [WriteSystem] in { +let Predicates = [HasXSAVE] in { +let Defs = [EDX, EAX], Uses = [ECX] in + def XGETBV : I<0x01, MRM_D0, (outs), (ins), "xgetbv", []>, TB; + +let Uses = [EDX, EAX, ECX] in + def XSETBV : I<0x01, MRM_D1, (outs), (ins), "xsetbv", []>, TB; +} + +let Uses = [EDX, EAX] in { +let Predicates = [HasXSAVE] in { + def XSAVE : I<0xAE, MRM4m, (outs), (ins opaque512mem:$dst), + "xsave\t$dst", + [(int_x86_xsave addr:$dst, EDX, EAX)]>, TB; + def XSAVE64 : RI<0xAE, MRM4m, (outs), (ins opaque512mem:$dst), + "xsave64\t$dst", + [(int_x86_xsave64 addr:$dst, EDX, EAX)]>, TB, Requires<[In64BitMode]>; + 
def XRSTOR : I<0xAE, MRM5m, (outs), (ins opaque512mem:$dst), + "xrstor\t$dst", + [(int_x86_xrstor addr:$dst, EDX, EAX)]>, TB; + def XRSTOR64 : RI<0xAE, MRM5m, (outs), (ins opaque512mem:$dst), + "xrstor64\t$dst", + [(int_x86_xrstor64 addr:$dst, EDX, EAX)]>, TB, Requires<[In64BitMode]>; +} +let Predicates = [HasXSAVEOPT] in { + def XSAVEOPT : I<0xAE, MRM6m, (outs), (ins opaque512mem:$dst), + "xsaveopt\t$dst", + [(int_x86_xsaveopt addr:$dst, EDX, EAX)]>, PS; + def XSAVEOPT64 : RI<0xAE, MRM6m, (outs), (ins opaque512mem:$dst), + "xsaveopt64\t$dst", + [(int_x86_xsaveopt64 addr:$dst, EDX, EAX)]>, PS, Requires<[In64BitMode]>; +} +let Predicates = [HasXSAVEC] in { + def XSAVEC : I<0xC7, MRM4m, (outs), (ins opaque512mem:$dst), + "xsavec\t$dst", + [(int_x86_xsavec addr:$dst, EDX, EAX)]>, TB; + def XSAVEC64 : RI<0xC7, MRM4m, (outs), (ins opaque512mem:$dst), + "xsavec64\t$dst", + [(int_x86_xsavec64 addr:$dst, EDX, EAX)]>, TB, Requires<[In64BitMode]>; +} +let Predicates = [HasXSAVES] in { + def XSAVES : I<0xC7, MRM5m, (outs), (ins opaque512mem:$dst), + "xsaves\t$dst", + [(int_x86_xsaves addr:$dst, EDX, EAX)]>, TB; + def XSAVES64 : RI<0xC7, MRM5m, (outs), (ins opaque512mem:$dst), + "xsaves64\t$dst", + [(int_x86_xsaves64 addr:$dst, EDX, EAX)]>, TB, Requires<[In64BitMode]>; + def XRSTORS : I<0xC7, MRM3m, (outs), (ins opaque512mem:$dst), + "xrstors\t$dst", + [(int_x86_xrstors addr:$dst, EDX, EAX)]>, TB; + def XRSTORS64 : RI<0xC7, MRM3m, (outs), (ins opaque512mem:$dst), + "xrstors64\t$dst", + [(int_x86_xrstors64 addr:$dst, EDX, EAX)]>, TB, Requires<[In64BitMode]>; +} +} // Uses +} // SchedRW + +//===----------------------------------------------------------------------===// +// VIA PadLock crypto instructions +let Defs = [RAX, RDI], Uses = [RDX, RDI] in + def XSTORE : I<0xa7, MRM_C0, (outs), (ins), "xstore", []>, TB; + +def : InstAlias<"xstorerng", (XSTORE)>; + +let Defs = [RSI, RDI], Uses = [RBX, RDX, RSI, RDI] in { + def XCRYPTECB : I<0xa7, MRM_C8, (outs), (ins), "xcryptecb", []>, TB; + def XCRYPTCBC : I<0xa7, MRM_D0, (outs), (ins), "xcryptcbc", []>, TB; + def XCRYPTCTR : I<0xa7, MRM_D8, (outs), (ins), "xcryptctr", []>, TB; + def XCRYPTCFB : I<0xa7, MRM_E0, (outs), (ins), "xcryptcfb", []>, TB; + def XCRYPTOFB : I<0xa7, MRM_E8, (outs), (ins), "xcryptofb", []>, TB; +} + +let Defs = [RAX, RSI, RDI], Uses = [RAX, RSI, RDI] in { + def XSHA1 : I<0xa6, MRM_C8, (outs), (ins), "xsha1", []>, TB; + def XSHA256 : I<0xa6, MRM_D0, (outs), (ins), "xsha256", []>, TB; +} +let Defs = [RAX, RDX, RSI], Uses = [RAX, RSI] in + def MONTMUL : I<0xa6, MRM_C0, (outs), (ins), "montmul", []>, TB; +//==-----------------------------------------------------------------------===// +// PKU - enable protection key +let usesCustomInserter = 1 in { + def WRPKRU : PseudoI<(outs), (ins GR32:$src), + [(int_x86_wrpkru GR32:$src)]>; + def RDPKRU : PseudoI<(outs GR32:$dst), (ins), + [(set GR32:$dst, (int_x86_rdpkru))]>; +} + +let Defs = [EAX, EDX], Uses = [ECX] in + def RDPKRUr : I<0x01, MRM_EE, (outs), (ins), "rdpkru", []>, TB; +let Uses = [EAX, ECX, EDX] in + def WRPKRUr : I<0x01, MRM_EF, (outs), (ins), "wrpkru", []>, TB; + +//===----------------------------------------------------------------------===// +// FS/GS Base Instructions +let Predicates = [HasFSGSBase, In64BitMode] in { + def RDFSBASE : I<0xAE, MRM0r, (outs GR32:$dst), (ins), + "rdfsbase{l}\t$dst", + [(set GR32:$dst, (int_x86_rdfsbase_32))]>, XS; + def RDFSBASE64 : RI<0xAE, MRM0r, (outs GR64:$dst), (ins), + "rdfsbase{q}\t$dst", + [(set GR64:$dst, (int_x86_rdfsbase_64))]>, XS; + def 
RDGSBASE : I<0xAE, MRM1r, (outs GR32:$dst), (ins),
+                   "rdgsbase{l}\t$dst",
+                   [(set GR32:$dst, (int_x86_rdgsbase_32))]>, XS;
+  def RDGSBASE64 : RI<0xAE, MRM1r, (outs GR64:$dst), (ins),
+                      "rdgsbase{q}\t$dst",
+                      [(set GR64:$dst, (int_x86_rdgsbase_64))]>, XS;
+  def WRFSBASE : I<0xAE, MRM2r, (outs), (ins GR32:$src),
+                   "wrfsbase{l}\t$src",
+                   [(int_x86_wrfsbase_32 GR32:$src)]>, XS;
+  def WRFSBASE64 : RI<0xAE, MRM2r, (outs), (ins GR64:$src),
+                      "wrfsbase{q}\t$src",
+                      [(int_x86_wrfsbase_64 GR64:$src)]>, XS;
+  def WRGSBASE : I<0xAE, MRM3r, (outs), (ins GR32:$src),
+                   "wrgsbase{l}\t$src",
+                   [(int_x86_wrgsbase_32 GR32:$src)]>, XS;
+  def WRGSBASE64 : RI<0xAE, MRM3r, (outs), (ins GR64:$src),
+                      "wrgsbase{q}\t$src",
+                      [(int_x86_wrgsbase_64 GR64:$src)]>, XS;
+}
+
+//===----------------------------------------------------------------------===//
+// INVPCID Instruction
+def INVPCID32 : I<0x82, MRMSrcMem, (outs), (ins GR32:$src1, i128mem:$src2),
+                  "invpcid\t{$src2, $src1|$src1, $src2}", []>, T8PD,
+                  Requires<[Not64BitMode]>;
+def INVPCID64 : I<0x82, MRMSrcMem, (outs), (ins GR64:$src1, i128mem:$src2),
+                  "invpcid\t{$src2, $src1|$src1, $src2}", []>, T8PD,
+                  Requires<[In64BitMode]>;
+
+//===----------------------------------------------------------------------===//
+// SMAP Instruction
+let Defs = [EFLAGS] in {
+  def CLAC : I<0x01, MRM_CA, (outs), (ins), "clac", []>, TB;
+  def STAC : I<0x01, MRM_CB, (outs), (ins), "stac", []>, TB;
+}
+
+//===----------------------------------------------------------------------===//
+// SMX Instruction
+let Uses = [RAX, RBX, RCX, RDX], Defs = [RAX, RBX, RCX] in {
+  def GETSEC : I<0x37, RawFrm, (outs), (ins), "getsec", []>, TB;
+}
diff --git a/contrib/llvm/lib/Target/X86/X86InstrTSX.td b/contrib/llvm/lib/Target/X86/X86InstrTSX.td
new file mode 100644
index 0000000..7267d75
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrTSX.td
@@ -0,0 +1,50 @@
+//===-- X86InstrTSX.td - TSX Instruction Set Extension -----*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the instructions that make up the Intel TSX instruction
+// set.
+// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// TSX instructions + +def X86xtest: SDNode<"X86ISD::XTEST", SDTypeProfile<1, 0, [SDTCisVT<0, i32>]>, + [SDNPHasChain, SDNPSideEffect]>; + +let usesCustomInserter = 1 in +def XBEGIN : I<0, Pseudo, (outs GR32:$dst), (ins), + "# XBEGIN", [(set GR32:$dst, (int_x86_xbegin))]>, + Requires<[HasRTM]>; + +let isBranch = 1, isTerminator = 1, Defs = [EAX] in { +def XBEGIN_2 : Ii16PCRel<0xc7, MRM_F8, (outs), (ins brtarget16:$dst), + "xbegin\t$dst", []>, OpSize16, Requires<[HasRTM]>; +def XBEGIN_4 : Ii32PCRel<0xc7, MRM_F8, (outs), (ins brtarget32:$dst), + "xbegin\t$dst", []>, OpSize32, Requires<[HasRTM]>; +} + +def XEND : I<0x01, MRM_D5, (outs), (ins), + "xend", [(int_x86_xend)]>, TB, Requires<[HasRTM]>; + +let Defs = [EFLAGS] in +def XTEST : I<0x01, MRM_D6, (outs), (ins), + "xtest", [(set EFLAGS, (X86xtest))]>, TB, Requires<[HasTSX]>; + +def XABORT : Ii8<0xc6, MRM_F8, (outs), (ins i8imm:$imm), + "xabort\t$imm", + [(int_x86_xabort imm:$imm)]>, Requires<[HasRTM]>; + +// HLE prefixes + +let isAsmParserOnly = 1 in { +def XACQUIRE_PREFIX : I<0xF2, RawFrm, (outs), (ins), "xacquire", []>, Requires<[HasHLE]>; +def XRELEASE_PREFIX : I<0xF3, RawFrm, (outs), (ins), "xrelease", []>, Requires<[HasHLE]>; +} + diff --git a/contrib/llvm/lib/Target/X86/X86InstrVMX.td b/contrib/llvm/lib/Target/X86/X86InstrVMX.td new file mode 100644 index 0000000..79afe9a --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86InstrVMX.td @@ -0,0 +1,66 @@ +//===-- X86InstrVMX.td - VMX Instruction Set Extension -----*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the instructions that make up the Intel VMX instruction +// set. 
+// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// VMX instructions + +// 66 0F 38 80 +def INVEPT32 : I<0x80, MRMSrcMem, (outs), (ins GR32:$src1, i128mem:$src2), + "invept\t{$src2, $src1|$src1, $src2}", []>, T8PD, + Requires<[Not64BitMode]>; +def INVEPT64 : I<0x80, MRMSrcMem, (outs), (ins GR64:$src1, i128mem:$src2), + "invept\t{$src2, $src1|$src1, $src2}", []>, T8PD, + Requires<[In64BitMode]>; +// 66 0F 38 81 +def INVVPID32 : I<0x81, MRMSrcMem, (outs), (ins GR32:$src1, i128mem:$src2), + "invvpid\t{$src2, $src1|$src1, $src2}", []>, T8PD, + Requires<[Not64BitMode]>; +def INVVPID64 : I<0x81, MRMSrcMem, (outs), (ins GR64:$src1, i128mem:$src2), + "invvpid\t{$src2, $src1|$src1, $src2}", []>, T8PD, + Requires<[In64BitMode]>; +// 0F 01 C1 +def VMCALL : I<0x01, MRM_C1, (outs), (ins), "vmcall", []>, TB; +def VMCLEARm : I<0xC7, MRM6m, (outs), (ins i64mem:$vmcs), + "vmclear\t$vmcs", []>, PD; +// OF 01 D4 +def VMFUNC : I<0x01, MRM_D4, (outs), (ins), "vmfunc", []>, TB; +// 0F 01 C2 +def VMLAUNCH : I<0x01, MRM_C2, (outs), (ins), "vmlaunch", []>, TB; +// 0F 01 C3 +def VMRESUME : I<0x01, MRM_C3, (outs), (ins), "vmresume", []>, TB; +def VMPTRLDm : I<0xC7, MRM6m, (outs), (ins i64mem:$vmcs), + "vmptrld\t$vmcs", []>, PS; +def VMPTRSTm : I<0xC7, MRM7m, (outs i64mem:$vmcs), (ins), + "vmptrst\t$vmcs", []>, TB; +def VMREAD64rm : I<0x78, MRMDestMem, (outs i64mem:$dst), (ins GR64:$src), + "vmread{q}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[In64BitMode]>; +def VMREAD64rr : I<0x78, MRMDestReg, (outs GR64:$dst), (ins GR64:$src), + "vmread{q}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[In64BitMode]>; +def VMREAD32rm : I<0x78, MRMDestMem, (outs i32mem:$dst), (ins GR32:$src), + "vmread{l}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[Not64BitMode]>; +def VMREAD32rr : I<0x78, MRMDestReg, (outs GR32:$dst), (ins GR32:$src), + "vmread{l}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[Not64BitMode]>; +def VMWRITE64rm : I<0x79, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), + "vmwrite{q}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[In64BitMode]>; +def VMWRITE64rr : I<0x79, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), + "vmwrite{q}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[In64BitMode]>; +def VMWRITE32rm : I<0x79, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), + "vmwrite{l}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[Not64BitMode]>; +def VMWRITE32rr : I<0x79, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), + "vmwrite{l}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[Not64BitMode]>; +// 0F 01 C4 +def VMXOFF : I<0x01, MRM_C4, (outs), (ins), "vmxoff", []>, TB; +def VMXON : I<0xC7, MRM6m, (outs), (ins i64mem:$vmxon), + "vmxon\t$vmxon", []>, XS; + diff --git a/contrib/llvm/lib/Target/X86/X86InstrXOP.td b/contrib/llvm/lib/Target/X86/X86InstrXOP.td new file mode 100644 index 0000000..4cb2304 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86InstrXOP.td @@ -0,0 +1,344 @@ +//===-- X86InstrXOP.td - XOP Instruction Set ---------------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This file describes XOP (eXtended OPerations) +// +//===----------------------------------------------------------------------===// + +multiclass xop2op<bits<8> opc, string OpcodeStr, Intrinsic Int, PatFrag memop> { + def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (Int VR128:$src))]>, XOP; + def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (Int (bitconvert (memop addr:$src))))]>, XOP; +} + +let ExeDomain = SSEPackedInt in { + defm VPHSUBWD : xop2op<0xE2, "vphsubwd", int_x86_xop_vphsubwd, loadv2i64>; + defm VPHSUBDQ : xop2op<0xE3, "vphsubdq", int_x86_xop_vphsubdq, loadv2i64>; + defm VPHSUBBW : xop2op<0xE1, "vphsubbw", int_x86_xop_vphsubbw, loadv2i64>; + defm VPHADDWQ : xop2op<0xC7, "vphaddwq", int_x86_xop_vphaddwq, loadv2i64>; + defm VPHADDWD : xop2op<0xC6, "vphaddwd", int_x86_xop_vphaddwd, loadv2i64>; + defm VPHADDUWQ : xop2op<0xD7, "vphadduwq", int_x86_xop_vphadduwq, loadv2i64>; + defm VPHADDUWD : xop2op<0xD6, "vphadduwd", int_x86_xop_vphadduwd, loadv2i64>; + defm VPHADDUDQ : xop2op<0xDB, "vphaddudq", int_x86_xop_vphaddudq, loadv2i64>; + defm VPHADDUBW : xop2op<0xD1, "vphaddubw", int_x86_xop_vphaddubw, loadv2i64>; + defm VPHADDUBQ : xop2op<0xD3, "vphaddubq", int_x86_xop_vphaddubq, loadv2i64>; + defm VPHADDUBD : xop2op<0xD2, "vphaddubd", int_x86_xop_vphaddubd, loadv2i64>; + defm VPHADDDQ : xop2op<0xCB, "vphadddq", int_x86_xop_vphadddq, loadv2i64>; + defm VPHADDBW : xop2op<0xC1, "vphaddbw", int_x86_xop_vphaddbw, loadv2i64>; + defm VPHADDBQ : xop2op<0xC3, "vphaddbq", int_x86_xop_vphaddbq, loadv2i64>; + defm VPHADDBD : xop2op<0xC2, "vphaddbd", int_x86_xop_vphaddbd, loadv2i64>; +} + +// Scalar load 2 addr operand instructions +multiclass xop2opsld<bits<8> opc, string OpcodeStr, Intrinsic Int, + Operand memop, ComplexPattern mem_cpat> { + def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (Int VR128:$src))]>, XOP; + def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins memop:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (Int (bitconvert mem_cpat:$src)))]>, XOP; +} + +multiclass xop2op128<bits<8> opc, string OpcodeStr, Intrinsic Int, + PatFrag memop> { + def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (Int VR128:$src))]>, XOP; + def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (Int (bitconvert (memop addr:$src))))]>, XOP; +} + +multiclass xop2op256<bits<8> opc, string OpcodeStr, Intrinsic Int, + PatFrag memop> { + def rrY : IXOP<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR256:$dst, (Int VR256:$src))]>, XOP, VEX_L; + def rmY : IXOP<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR256:$dst, (Int (bitconvert (memop addr:$src))))]>, XOP, VEX_L; +} + +let ExeDomain = SSEPackedSingle in { + defm VFRCZSS : xop2opsld<0x82, "vfrczss", int_x86_xop_vfrcz_ss, + ssmem, sse_load_f32>; + defm VFRCZPS : xop2op128<0x80, "vfrczps", int_x86_xop_vfrcz_ps, loadv4f32>; + defm VFRCZPS : 
xop2op256<0x80, "vfrczps", int_x86_xop_vfrcz_ps_256, loadv8f32>; +} + +let ExeDomain = SSEPackedDouble in { + defm VFRCZSD : xop2opsld<0x83, "vfrczsd", int_x86_xop_vfrcz_sd, + sdmem, sse_load_f64>; + defm VFRCZPD : xop2op128<0x81, "vfrczpd", int_x86_xop_vfrcz_pd, loadv2f64>; + defm VFRCZPD : xop2op256<0x81, "vfrczpd", int_x86_xop_vfrcz_pd_256, loadv4f64>; +} + +multiclass xop3op<bits<8> opc, string OpcodeStr, SDNode OpNode, + ValueType vt128> { + def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR128:$dst, + (vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2))))]>, + XOP_4VOp3, Sched<[WriteVarVecShift]>; + def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i128mem:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR128:$dst, + (vt128 (OpNode (vt128 VR128:$src1), + (vt128 (bitconvert (loadv2i64 addr:$src2))))))]>, + XOP_4V, VEX_W, Sched<[WriteVarVecShift, ReadAfterLd]>; + def mr : IXOP<opc, MRMSrcMem, (outs VR128:$dst), + (ins i128mem:$src1, VR128:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR128:$dst, + (vt128 (OpNode (vt128 (bitconvert (loadv2i64 addr:$src1))), + (vt128 VR128:$src2))))]>, + XOP_4VOp3, Sched<[WriteVarVecShift, ReadAfterLd]>; +} + +let ExeDomain = SSEPackedInt in { + defm VPROTB : xop3op<0x90, "vprotb", X86vprot, v16i8>; + defm VPROTD : xop3op<0x92, "vprotd", X86vprot, v4i32>; + defm VPROTQ : xop3op<0x93, "vprotq", X86vprot, v2i64>; + defm VPROTW : xop3op<0x91, "vprotw", X86vprot, v8i16>; + defm VPSHAB : xop3op<0x98, "vpshab", X86vpsha, v16i8>; + defm VPSHAD : xop3op<0x9A, "vpshad", X86vpsha, v4i32>; + defm VPSHAQ : xop3op<0x9B, "vpshaq", X86vpsha, v2i64>; + defm VPSHAW : xop3op<0x99, "vpshaw", X86vpsha, v8i16>; + defm VPSHLB : xop3op<0x94, "vpshlb", X86vpshl, v16i8>; + defm VPSHLD : xop3op<0x96, "vpshld", X86vpshl, v4i32>; + defm VPSHLQ : xop3op<0x97, "vpshlq", X86vpshl, v2i64>; + defm VPSHLW : xop3op<0x95, "vpshlw", X86vpshl, v8i16>; +} + +multiclass xop3opimm<bits<8> opc, string OpcodeStr, SDNode OpNode, + ValueType vt128> { + def ri : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, u8imm:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR128:$dst, + (vt128 (OpNode (vt128 VR128:$src1), imm:$src2)))]>, XOP; + def mi : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst), + (ins i128mem:$src1, u8imm:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR128:$dst, + (vt128 (OpNode (vt128 (bitconvert (loadv2i64 addr:$src1))), imm:$src2)))]>, XOP; +} + +let ExeDomain = SSEPackedInt in { + defm VPROTB : xop3opimm<0xC0, "vprotb", X86vproti, v16i8>; + defm VPROTD : xop3opimm<0xC2, "vprotd", X86vproti, v4i32>; + defm VPROTQ : xop3opimm<0xC3, "vprotq", X86vproti, v2i64>; + defm VPROTW : xop3opimm<0xC1, "vprotw", X86vproti, v8i16>; +} + +// Instruction where second source can be memory, but third must be register +multiclass xop4opm2<bits<8> opc, string OpcodeStr, Intrinsic Int> { + let isCommutable = 1 in + def rr : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, VR128:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR128:$dst, + (Int VR128:$src1, VR128:$src2, VR128:$src3))]>, XOP_4V, VEX_I8IMM; + def rm : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i128mem:$src2, VR128:$src3), + !strconcat(OpcodeStr, + 
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR128:$dst, + (Int VR128:$src1, (bitconvert (loadv2i64 addr:$src2)), + VR128:$src3))]>, XOP_4V, VEX_I8IMM; +} + +let ExeDomain = SSEPackedInt in { + defm VPMADCSWD : xop4opm2<0xB6, "vpmadcswd", int_x86_xop_vpmadcswd>; + defm VPMADCSSWD : xop4opm2<0xA6, "vpmadcsswd", int_x86_xop_vpmadcsswd>; + defm VPMACSWW : xop4opm2<0x95, "vpmacsww", int_x86_xop_vpmacsww>; + defm VPMACSWD : xop4opm2<0x96, "vpmacswd", int_x86_xop_vpmacswd>; + defm VPMACSSWW : xop4opm2<0x85, "vpmacssww", int_x86_xop_vpmacssww>; + defm VPMACSSWD : xop4opm2<0x86, "vpmacsswd", int_x86_xop_vpmacsswd>; + defm VPMACSSDQL : xop4opm2<0x87, "vpmacssdql", int_x86_xop_vpmacssdql>; + defm VPMACSSDQH : xop4opm2<0x8F, "vpmacssdqh", int_x86_xop_vpmacssdqh>; + defm VPMACSSDD : xop4opm2<0x8E, "vpmacssdd", int_x86_xop_vpmacssdd>; + defm VPMACSDQL : xop4opm2<0x97, "vpmacsdql", int_x86_xop_vpmacsdql>; + defm VPMACSDQH : xop4opm2<0x9F, "vpmacsdqh", int_x86_xop_vpmacsdqh>; + defm VPMACSDD : xop4opm2<0x9E, "vpmacsdd", int_x86_xop_vpmacsdd>; +} + +// Instruction where second source can be memory, third must be imm8 +multiclass xopvpcom<bits<8> opc, string Suffix, SDNode OpNode, ValueType vt128> { + let isCommutable = 1 in + def ri : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, XOPCC:$cc), + !strconcat("vpcom${cc}", Suffix, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR128:$dst, + (vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2), + i8immZExt3:$cc)))]>, + XOP_4V; + def mi : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i128mem:$src2, XOPCC:$cc), + !strconcat("vpcom${cc}", Suffix, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR128:$dst, + (vt128 (OpNode (vt128 VR128:$src1), + (vt128 (bitconvert (loadv2i64 addr:$src2))), + i8immZExt3:$cc)))]>, + XOP_4V; + let isAsmParserOnly = 1, hasSideEffects = 0 in { + def ri_alt : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, u8imm:$src3), + !strconcat("vpcom", Suffix, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + []>, XOP_4V; + let mayLoad = 1 in + def mi_alt : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i128mem:$src2, u8imm:$src3), + !strconcat("vpcom", Suffix, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + []>, XOP_4V; + } +} + +let ExeDomain = SSEPackedInt in { // SSE integer instructions + defm VPCOMB : xopvpcom<0xCC, "b", X86vpcom, v16i8>; + defm VPCOMW : xopvpcom<0xCD, "w", X86vpcom, v8i16>; + defm VPCOMD : xopvpcom<0xCE, "d", X86vpcom, v4i32>; + defm VPCOMQ : xopvpcom<0xCF, "q", X86vpcom, v2i64>; + defm VPCOMUB : xopvpcom<0xEC, "ub", X86vpcomu, v16i8>; + defm VPCOMUW : xopvpcom<0xED, "uw", X86vpcomu, v8i16>; + defm VPCOMUD : xopvpcom<0xEE, "ud", X86vpcomu, v4i32>; + defm VPCOMUQ : xopvpcom<0xEF, "uq", X86vpcomu, v2i64>; +} + +// Instruction where either second or third source can be memory +multiclass xop4op<bits<8> opc, string OpcodeStr, Intrinsic Int> { + def rr : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, VR128:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR128:$dst, (Int VR128:$src1, VR128:$src2, VR128:$src3))]>, + XOP_4V, VEX_I8IMM; + def rm : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, i128mem:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR128:$dst, + (Int VR128:$src1, VR128:$src2, + (bitconvert 
(loadv2i64 addr:$src3))))]>, + XOP_4V, VEX_I8IMM, VEX_W, MemOp4; + def mr : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i128mem:$src2, VR128:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR128:$dst, + (Int VR128:$src1, (bitconvert (loadv2i64 addr:$src2)), + VR128:$src3))]>, + XOP_4V, VEX_I8IMM; +} + +let ExeDomain = SSEPackedInt in { + defm VPPERM : xop4op<0xA3, "vpperm", int_x86_xop_vpperm>; + defm VPCMOV : xop4op<0xA2, "vpcmov", int_x86_xop_vpcmov>; +} + +multiclass xop4op256<bits<8> opc, string OpcodeStr, Intrinsic Int> { + def rrY : IXOPi8<opc, MRMSrcReg, (outs VR256:$dst), + (ins VR256:$src1, VR256:$src2, VR256:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR256:$dst, (Int VR256:$src1, VR256:$src2, VR256:$src3))]>, + XOP_4V, VEX_I8IMM, VEX_L; + def rmY : IXOPi8<opc, MRMSrcMem, (outs VR256:$dst), + (ins VR256:$src1, VR256:$src2, i256mem:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR256:$dst, + (Int VR256:$src1, VR256:$src2, + (bitconvert (loadv4i64 addr:$src3))))]>, + XOP_4V, VEX_I8IMM, VEX_W, MemOp4, VEX_L; + def mrY : IXOPi8<opc, MRMSrcMem, (outs VR256:$dst), + (ins VR256:$src1, f256mem:$src2, VR256:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR256:$dst, + (Int VR256:$src1, (bitconvert (loadv4i64 addr:$src2)), + VR256:$src3))]>, + XOP_4V, VEX_I8IMM, VEX_L; +} + +let ExeDomain = SSEPackedInt in + defm VPCMOV : xop4op256<0xA2, "vpcmov", int_x86_xop_vpcmov_256>; + +let Predicates = [HasXOP] in { + def : Pat<(v2i64 (or (and VR128:$src3, VR128:$src1), + (X86andnp VR128:$src3, VR128:$src2))), + (VPCMOVrr VR128:$src1, VR128:$src2, VR128:$src3)>; + + def : Pat<(v4i64 (or (and VR256:$src3, VR256:$src1), + (X86andnp VR256:$src3, VR256:$src2))), + (VPCMOVrrY VR256:$src1, VR256:$src2, VR256:$src3)>; +} + +multiclass xop5op<bits<8> opc, string OpcodeStr, Intrinsic Int128, + Intrinsic Int256, PatFrag ld_128, PatFrag ld_256> { + def rr : IXOP5<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, VR128:$src3, u8imm:$src4), + !strconcat(OpcodeStr, + "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"), + [(set VR128:$dst, + (Int128 VR128:$src1, VR128:$src2, VR128:$src3, imm:$src4))]>; + def rm : IXOP5<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, f128mem:$src3, u8imm:$src4), + !strconcat(OpcodeStr, + "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"), + [(set VR128:$dst, + (Int128 VR128:$src1, VR128:$src2, (ld_128 addr:$src3), imm:$src4))]>, + VEX_W, MemOp4; + def mr : IXOP5<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, f128mem:$src2, VR128:$src3, u8imm:$src4), + !strconcat(OpcodeStr, + "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"), + [(set VR128:$dst, + (Int128 VR128:$src1, (ld_128 addr:$src2), VR128:$src3, imm:$src4))]>; + def rrY : IXOP5<opc, MRMSrcReg, (outs VR256:$dst), + (ins VR256:$src1, VR256:$src2, VR256:$src3, u8imm:$src4), + !strconcat(OpcodeStr, + "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"), + [(set VR256:$dst, + (Int256 VR256:$src1, VR256:$src2, VR256:$src3, imm:$src4))]>, VEX_L; + def rmY : IXOP5<opc, MRMSrcMem, (outs VR256:$dst), + (ins VR256:$src1, VR256:$src2, f256mem:$src3, u8imm:$src4), + !strconcat(OpcodeStr, + "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"), + [(set 
VR256:$dst, + (Int256 VR256:$src1, VR256:$src2, (ld_256 addr:$src3), imm:$src4))]>, + VEX_W, MemOp4, VEX_L; + def mrY : IXOP5<opc, MRMSrcMem, (outs VR256:$dst), + (ins VR256:$src1, f256mem:$src2, VR256:$src3, u8imm:$src4), + !strconcat(OpcodeStr, + "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"), + [(set VR256:$dst, + (Int256 VR256:$src1, (ld_256 addr:$src2), VR256:$src3, imm:$src4))]>, + VEX_L; +} + +let ExeDomain = SSEPackedDouble in + defm VPERMIL2PD : xop5op<0x49, "vpermil2pd", int_x86_xop_vpermil2pd, + int_x86_xop_vpermil2pd_256, loadv2f64, loadv4f64>; + +let ExeDomain = SSEPackedSingle in + defm VPERMIL2PS : xop5op<0x48, "vpermil2ps", int_x86_xop_vpermil2ps, + int_x86_xop_vpermil2ps_256, loadv4f32, loadv8f32>; + diff --git a/contrib/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/contrib/llvm/lib/Target/X86/X86IntrinsicsInfo.h new file mode 100644 index 0000000..646b556 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86IntrinsicsInfo.h @@ -0,0 +1,2042 @@ +//===-- X86IntrinsicsInfo.h - X86 Intrinsics ------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the details for lowering X86 intrinsics +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_X86_X86INTRINSICSINFO_H +#define LLVM_LIB_TARGET_X86_X86INTRINSICSINFO_H + +namespace llvm { + +enum IntrinsicType { + INTR_NO_TYPE, + GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDPMC, RDTSC, XTEST, ADX, FPCLASS, FPCLASSS, + INTR_TYPE_1OP, INTR_TYPE_2OP, INTR_TYPE_2OP_IMM8, INTR_TYPE_3OP, INTR_TYPE_4OP, + CMP_MASK, CMP_MASK_CC,CMP_MASK_SCALAR_CC, VSHIFT, VSHIFT_MASK, COMI, COMI_RM, + INTR_TYPE_1OP_MASK, INTR_TYPE_1OP_MASK_RM, + INTR_TYPE_2OP_MASK, INTR_TYPE_2OP_MASK_RM, INTR_TYPE_2OP_IMM8_MASK, + INTR_TYPE_3OP_MASK, INTR_TYPE_3OP_MASK_RM, INTR_TYPE_3OP_IMM8_MASK, + FMA_OP_MASK, FMA_OP_MASKZ, FMA_OP_MASK3, VPERM_3OP_MASK, + VPERM_3OP_MASKZ, INTR_TYPE_SCALAR_MASK, + INTR_TYPE_SCALAR_MASK_RM, INTR_TYPE_3OP_SCALAR_MASK_RM, + COMPRESS_EXPAND_IN_REG, COMPRESS_TO_MEM, BRCST_SUBVEC_TO_VEC, + TRUNCATE_TO_MEM_VI8, TRUNCATE_TO_MEM_VI16, TRUNCATE_TO_MEM_VI32, + EXPAND_FROM_MEM, BLEND, INSERT_SUBVEC, + TERLOG_OP_MASK, TERLOG_OP_MASKZ, BROADCASTM, KUNPCK, CONVERT_MASK_TO_VEC, CONVERT_TO_MASK +}; + +struct IntrinsicData { + + unsigned Id; + IntrinsicType Type; + unsigned Opc0; + unsigned Opc1; + + bool operator<(const IntrinsicData &RHS) const { + return Id < RHS.Id; + } + bool operator==(const IntrinsicData &RHS) const { + return RHS.Id == Id; + } +}; + +#define X86_INTRINSIC_DATA(id, type, op0, op1) \ + { Intrinsic::x86_##id, type, op0, op1 } + +/* + * IntrinsicsWithChain - the table should be sorted by Intrinsic ID - in + * the alphabetical order. 
+ */ +static const IntrinsicData IntrinsicsWithChain[] = { + X86_INTRINSIC_DATA(addcarry_u32, ADX, X86ISD::ADC, 0), + X86_INTRINSIC_DATA(addcarry_u64, ADX, X86ISD::ADC, 0), + X86_INTRINSIC_DATA(addcarryx_u32, ADX, X86ISD::ADC, 0), + X86_INTRINSIC_DATA(addcarryx_u64, ADX, X86ISD::ADC, 0), + + X86_INTRINSIC_DATA(avx512_gather3div2_df, GATHER, X86::VGATHERQPDZ128rm, 0), + X86_INTRINSIC_DATA(avx512_gather3div2_di, GATHER, X86::VPGATHERQQZ128rm, 0), + X86_INTRINSIC_DATA(avx512_gather3div4_df, GATHER, X86::VGATHERQPDZ256rm, 0), + X86_INTRINSIC_DATA(avx512_gather3div4_di, GATHER, X86::VPGATHERQQZ256rm, 0), + X86_INTRINSIC_DATA(avx512_gather3div4_sf, GATHER, X86::VGATHERQPSZ128rm, 0), + X86_INTRINSIC_DATA(avx512_gather3div4_si, GATHER, X86::VPGATHERQDZ128rm, 0), + X86_INTRINSIC_DATA(avx512_gather3div8_sf, GATHER, X86::VGATHERQPSZ256rm, 0), + X86_INTRINSIC_DATA(avx512_gather3div8_si, GATHER, X86::VPGATHERQDZ256rm, 0), + X86_INTRINSIC_DATA(avx512_gather3siv2_df, GATHER, X86::VGATHERDPDZ128rm, 0), + X86_INTRINSIC_DATA(avx512_gather3siv2_di, GATHER, X86::VPGATHERDQZ128rm, 0), + X86_INTRINSIC_DATA(avx512_gather3siv4_df, GATHER, X86::VGATHERDPDZ256rm, 0), + X86_INTRINSIC_DATA(avx512_gather3siv4_di, GATHER, X86::VPGATHERDQZ256rm, 0), + X86_INTRINSIC_DATA(avx512_gather3siv4_sf, GATHER, X86::VGATHERDPSZ128rm, 0), + X86_INTRINSIC_DATA(avx512_gather3siv4_si, GATHER, X86::VPGATHERDDZ128rm, 0), + X86_INTRINSIC_DATA(avx512_gather3siv8_sf, GATHER, X86::VGATHERDPSZ256rm, 0), + X86_INTRINSIC_DATA(avx512_gather3siv8_si, GATHER, X86::VPGATHERDDZ256rm, 0), + X86_INTRINSIC_DATA(avx512_gather_dpd_512, GATHER, X86::VGATHERDPDZrm, 0), + X86_INTRINSIC_DATA(avx512_gather_dpi_512, GATHER, X86::VPGATHERDDZrm, 0), + X86_INTRINSIC_DATA(avx512_gather_dpq_512, GATHER, X86::VPGATHERDQZrm, 0), + X86_INTRINSIC_DATA(avx512_gather_dps_512, GATHER, X86::VGATHERDPSZrm, 0), + X86_INTRINSIC_DATA(avx512_gather_qpd_512, GATHER, X86::VGATHERQPDZrm, 0), + X86_INTRINSIC_DATA(avx512_gather_qpi_512, GATHER, X86::VPGATHERQDZrm, 0), + X86_INTRINSIC_DATA(avx512_gather_qpq_512, GATHER, X86::VPGATHERQQZrm, 0), + X86_INTRINSIC_DATA(avx512_gather_qps_512, GATHER, X86::VGATHERQPSZrm, 0), + + X86_INTRINSIC_DATA(avx512_gatherpf_dpd_512, PREFETCH, + X86::VGATHERPF0DPDm, X86::VGATHERPF1DPDm), + X86_INTRINSIC_DATA(avx512_gatherpf_dps_512, PREFETCH, + X86::VGATHERPF0DPSm, X86::VGATHERPF1DPSm), + X86_INTRINSIC_DATA(avx512_gatherpf_qpd_512, PREFETCH, + X86::VGATHERPF0QPDm, X86::VGATHERPF1QPDm), + X86_INTRINSIC_DATA(avx512_gatherpf_qps_512, PREFETCH, + X86::VGATHERPF0QPSm, X86::VGATHERPF1QPSm), + + X86_INTRINSIC_DATA(avx512_mask_compress_store_d_128, + COMPRESS_TO_MEM, X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_store_d_256, + COMPRESS_TO_MEM, X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_store_d_512, + COMPRESS_TO_MEM, X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_store_pd_128, + COMPRESS_TO_MEM, X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_store_pd_256, + COMPRESS_TO_MEM, X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_store_pd_512, + COMPRESS_TO_MEM, X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_store_ps_128, + COMPRESS_TO_MEM, X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_store_ps_256, + COMPRESS_TO_MEM, X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_store_ps_512, + COMPRESS_TO_MEM, X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_store_q_128, + COMPRESS_TO_MEM, X86ISD::COMPRESS, 0), + 
X86_INTRINSIC_DATA(avx512_mask_compress_store_q_256, + COMPRESS_TO_MEM, X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_store_q_512, + COMPRESS_TO_MEM, X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_load_d_128, + EXPAND_FROM_MEM, X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_load_d_256, + EXPAND_FROM_MEM, X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_load_d_512, + EXPAND_FROM_MEM, X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_load_pd_128, + EXPAND_FROM_MEM, X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_load_pd_256, + EXPAND_FROM_MEM, X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_load_pd_512, + EXPAND_FROM_MEM, X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_load_ps_128, + EXPAND_FROM_MEM, X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_load_ps_256, + EXPAND_FROM_MEM, X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_load_ps_512, + EXPAND_FROM_MEM, X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_load_q_128, + EXPAND_FROM_MEM, X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_load_q_256, + EXPAND_FROM_MEM, X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_load_q_512, + EXPAND_FROM_MEM, X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_db_mem_128, TRUNCATE_TO_MEM_VI8, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_db_mem_256, TRUNCATE_TO_MEM_VI8, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_db_mem_512, TRUNCATE_TO_MEM_VI8, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_dw_mem_128, TRUNCATE_TO_MEM_VI16, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_dw_mem_256, TRUNCATE_TO_MEM_VI16, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_dw_mem_512, TRUNCATE_TO_MEM_VI16, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qb_mem_128, TRUNCATE_TO_MEM_VI8, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qb_mem_256, TRUNCATE_TO_MEM_VI8, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qb_mem_512, TRUNCATE_TO_MEM_VI8, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qd_mem_128, TRUNCATE_TO_MEM_VI32, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qd_mem_256, TRUNCATE_TO_MEM_VI32, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qd_mem_512, TRUNCATE_TO_MEM_VI32, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qw_mem_128, TRUNCATE_TO_MEM_VI16, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qw_mem_256, TRUNCATE_TO_MEM_VI16, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qw_mem_512, TRUNCATE_TO_MEM_VI16, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_wb_mem_128, TRUNCATE_TO_MEM_VI8, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_wb_mem_256, TRUNCATE_TO_MEM_VI8, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_wb_mem_512, TRUNCATE_TO_MEM_VI8, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_scatter_dpd_512, SCATTER, X86::VSCATTERDPDZmr, 0), + X86_INTRINSIC_DATA(avx512_scatter_dpi_512, SCATTER, X86::VPSCATTERDDZmr, 0), + X86_INTRINSIC_DATA(avx512_scatter_dpq_512, SCATTER, X86::VPSCATTERDQZmr, 0), + X86_INTRINSIC_DATA(avx512_scatter_dps_512, SCATTER, X86::VSCATTERDPSZmr, 0), + X86_INTRINSIC_DATA(avx512_scatter_qpd_512, SCATTER, X86::VSCATTERQPDZmr, 0), + X86_INTRINSIC_DATA(avx512_scatter_qpi_512, SCATTER, X86::VPSCATTERQDZmr, 0), + X86_INTRINSIC_DATA(avx512_scatter_qpq_512, SCATTER, X86::VPSCATTERQQZmr, 0), + 
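The GATHER and SCATTER rows in this table describe the memory-touching AVX-512 intrinsics (hence "with chain") and map them straight to machine-instruction opcodes instead of ISD nodes. Purely to illustrate what one such row corresponds to at the source level, and not taken from the imported header, a masked 32-bit-index gather can be written as follows (assumes AVX-512F and <immintrin.h>):

#include <immintrin.h>

// Gather 16 floats from base[idx[i]]; lanes whose mask bit is clear keep the value from 'old'.
__m512 gather_masked(const float *base, __m512i idx, __mmask16 k, __m512 old) {
  return _mm512_mask_i32gather_ps(old, k, idx, base, 4);  // scale 4: indices count float elements
}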
X86_INTRINSIC_DATA(avx512_scatter_qps_512, SCATTER, X86::VSCATTERQPSZmr, 0), + X86_INTRINSIC_DATA(avx512_scatterdiv2_df, SCATTER, X86::VSCATTERQPDZ128mr, 0), + X86_INTRINSIC_DATA(avx512_scatterdiv2_di, SCATTER, X86::VPSCATTERQQZ128mr, 0), + X86_INTRINSIC_DATA(avx512_scatterdiv4_df, SCATTER, X86::VSCATTERQPDZ256mr, 0), + X86_INTRINSIC_DATA(avx512_scatterdiv4_di, SCATTER, X86::VPSCATTERQQZ256mr, 0), + X86_INTRINSIC_DATA(avx512_scatterdiv4_sf, SCATTER, X86::VSCATTERQPSZ128mr, 0), + X86_INTRINSIC_DATA(avx512_scatterdiv4_si, SCATTER, X86::VPSCATTERQDZ128mr, 0), + X86_INTRINSIC_DATA(avx512_scatterdiv8_sf, SCATTER, X86::VSCATTERQPSZ256mr, 0), + X86_INTRINSIC_DATA(avx512_scatterdiv8_si, SCATTER, X86::VPSCATTERQDZ256mr, 0), + X86_INTRINSIC_DATA(avx512_scatterpf_dpd_512, PREFETCH, X86::VSCATTERPF0DPDm, + X86::VSCATTERPF1DPDm), + X86_INTRINSIC_DATA(avx512_scatterpf_dps_512, PREFETCH, X86::VSCATTERPF0DPSm, + X86::VSCATTERPF1DPSm), + X86_INTRINSIC_DATA(avx512_scatterpf_qpd_512, PREFETCH, X86::VSCATTERPF0QPDm, + X86::VSCATTERPF1QPDm), + X86_INTRINSIC_DATA(avx512_scatterpf_qps_512, PREFETCH, X86::VSCATTERPF0QPSm, + X86::VSCATTERPF1QPSm), + X86_INTRINSIC_DATA(avx512_scattersiv2_df, SCATTER, X86::VSCATTERDPDZ128mr, 0), + X86_INTRINSIC_DATA(avx512_scattersiv2_di, SCATTER, X86::VPSCATTERDQZ128mr, 0), + X86_INTRINSIC_DATA(avx512_scattersiv4_df, SCATTER, X86::VSCATTERDPDZ256mr, 0), + X86_INTRINSIC_DATA(avx512_scattersiv4_di, SCATTER, X86::VPSCATTERDQZ256mr, 0), + X86_INTRINSIC_DATA(avx512_scattersiv4_sf, SCATTER, X86::VSCATTERDPSZ128mr, 0), + X86_INTRINSIC_DATA(avx512_scattersiv4_si, SCATTER, X86::VPSCATTERDDZ128mr, 0), + X86_INTRINSIC_DATA(avx512_scattersiv8_sf, SCATTER, X86::VSCATTERDPSZ256mr, 0), + X86_INTRINSIC_DATA(avx512_scattersiv8_si, SCATTER, X86::VPSCATTERDDZ256mr, 0), + + X86_INTRINSIC_DATA(rdpmc, RDPMC, X86ISD::RDPMC_DAG, 0), + X86_INTRINSIC_DATA(rdrand_16, RDRAND, X86ISD::RDRAND, 0), + X86_INTRINSIC_DATA(rdrand_32, RDRAND, X86ISD::RDRAND, 0), + X86_INTRINSIC_DATA(rdrand_64, RDRAND, X86ISD::RDRAND, 0), + X86_INTRINSIC_DATA(rdseed_16, RDSEED, X86ISD::RDSEED, 0), + X86_INTRINSIC_DATA(rdseed_32, RDSEED, X86ISD::RDSEED, 0), + X86_INTRINSIC_DATA(rdseed_64, RDSEED, X86ISD::RDSEED, 0), + X86_INTRINSIC_DATA(rdtsc, RDTSC, X86ISD::RDTSC_DAG, 0), + X86_INTRINSIC_DATA(rdtscp, RDTSC, X86ISD::RDTSCP_DAG, 0), + + X86_INTRINSIC_DATA(subborrow_u32, ADX, X86ISD::SBB, 0), + X86_INTRINSIC_DATA(subborrow_u64, ADX, X86ISD::SBB, 0), + X86_INTRINSIC_DATA(xtest, XTEST, X86ISD::XTEST, 0), +}; + +/* + * Find Intrinsic data by intrinsic ID + */ +static const IntrinsicData* getIntrinsicWithChain(unsigned IntNo) { + + IntrinsicData IntrinsicToFind = {IntNo, INTR_NO_TYPE, 0, 0 }; + const IntrinsicData *Data = std::lower_bound(std::begin(IntrinsicsWithChain), + std::end(IntrinsicsWithChain), + IntrinsicToFind); + if (Data != std::end(IntrinsicsWithChain) && *Data == IntrinsicToFind) + return Data; + return nullptr; +} + +/* + * IntrinsicsWithoutChain - the table should be sorted by Intrinsic ID - in + * the alphabetical order. 
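getIntrinsicWithChain above relies on the table being sorted by Id: std::lower_bound performs a binary search, so an out-of-order entry would simply never be found. A small stand-alone sketch of the same lookup pattern (plain C++, not taken from this header) makes that contract explicit:

#include <algorithm>
#include <iterator>

struct Entry {
  unsigned Id;
  int Payload;
  bool operator<(const Entry &RHS) const { return Id < RHS.Id; }
};

// Must stay sorted by Id, exactly like the intrinsic tables.
static const Entry Table[] = { {10, 1}, {20, 2}, {30, 3} };

const Entry *lookup(unsigned Id) {
  Entry Key = {Id, 0};
  const Entry *I = std::lower_bound(std::begin(Table), std::end(Table), Key);
  if (I != std::end(Table) && I->Id == Id)
    return I;        // exact match
  return nullptr;    // not present (or the table was left unsorted)
}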
+ */ +static const IntrinsicData IntrinsicsWithoutChain[] = { + X86_INTRINSIC_DATA(avx2_packssdw, INTR_TYPE_2OP, X86ISD::PACKSS, 0), + X86_INTRINSIC_DATA(avx2_packsswb, INTR_TYPE_2OP, X86ISD::PACKSS, 0), + X86_INTRINSIC_DATA(avx2_packusdw, INTR_TYPE_2OP, X86ISD::PACKUS, 0), + X86_INTRINSIC_DATA(avx2_packuswb, INTR_TYPE_2OP, X86ISD::PACKUS, 0), + X86_INTRINSIC_DATA(avx2_pavg_b, INTR_TYPE_2OP, X86ISD::AVG, 0), + X86_INTRINSIC_DATA(avx2_pavg_w, INTR_TYPE_2OP, X86ISD::AVG, 0), + X86_INTRINSIC_DATA(avx2_phadd_d, INTR_TYPE_2OP, X86ISD::HADD, 0), + X86_INTRINSIC_DATA(avx2_phadd_w, INTR_TYPE_2OP, X86ISD::HADD, 0), + X86_INTRINSIC_DATA(avx2_phsub_d, INTR_TYPE_2OP, X86ISD::HSUB, 0), + X86_INTRINSIC_DATA(avx2_phsub_w, INTR_TYPE_2OP, X86ISD::HSUB, 0), + X86_INTRINSIC_DATA(avx2_pmaxs_b, INTR_TYPE_2OP, ISD::SMAX, 0), + X86_INTRINSIC_DATA(avx2_pmaxs_d, INTR_TYPE_2OP, ISD::SMAX, 0), + X86_INTRINSIC_DATA(avx2_pmaxs_w, INTR_TYPE_2OP, ISD::SMAX, 0), + X86_INTRINSIC_DATA(avx2_pmaxu_b, INTR_TYPE_2OP, ISD::UMAX, 0), + X86_INTRINSIC_DATA(avx2_pmaxu_d, INTR_TYPE_2OP, ISD::UMAX, 0), + X86_INTRINSIC_DATA(avx2_pmaxu_w, INTR_TYPE_2OP, ISD::UMAX, 0), + X86_INTRINSIC_DATA(avx2_pmins_b, INTR_TYPE_2OP, ISD::SMIN, 0), + X86_INTRINSIC_DATA(avx2_pmins_d, INTR_TYPE_2OP, ISD::SMIN, 0), + X86_INTRINSIC_DATA(avx2_pmins_w, INTR_TYPE_2OP, ISD::SMIN, 0), + X86_INTRINSIC_DATA(avx2_pminu_b, INTR_TYPE_2OP, ISD::UMIN, 0), + X86_INTRINSIC_DATA(avx2_pminu_d, INTR_TYPE_2OP, ISD::UMIN, 0), + X86_INTRINSIC_DATA(avx2_pminu_w, INTR_TYPE_2OP, ISD::UMIN, 0), + X86_INTRINSIC_DATA(avx2_pmovsxbd, INTR_TYPE_1OP, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx2_pmovsxbq, INTR_TYPE_1OP, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx2_pmovsxbw, INTR_TYPE_1OP, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx2_pmovsxdq, INTR_TYPE_1OP, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx2_pmovsxwd, INTR_TYPE_1OP, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx2_pmovsxwq, INTR_TYPE_1OP, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx2_pmovzxbd, INTR_TYPE_1OP, X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx2_pmovzxbq, INTR_TYPE_1OP, X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx2_pmovzxbw, INTR_TYPE_1OP, X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx2_pmovzxdq, INTR_TYPE_1OP, X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx2_pmovzxwd, INTR_TYPE_1OP, X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx2_pmovzxwq, INTR_TYPE_1OP, X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx2_pmul_dq, INTR_TYPE_2OP, X86ISD::PMULDQ, 0), + X86_INTRINSIC_DATA(avx2_pmulh_w, INTR_TYPE_2OP, ISD::MULHS, 0), + X86_INTRINSIC_DATA(avx2_pmulhu_w, INTR_TYPE_2OP, ISD::MULHU, 0), + X86_INTRINSIC_DATA(avx2_pmulu_dq, INTR_TYPE_2OP, X86ISD::PMULUDQ, 0), + X86_INTRINSIC_DATA(avx2_psad_bw, INTR_TYPE_2OP, X86ISD::PSADBW, 0), + X86_INTRINSIC_DATA(avx2_pshuf_b, INTR_TYPE_2OP, X86ISD::PSHUFB, 0), + X86_INTRINSIC_DATA(avx2_psign_b, INTR_TYPE_2OP, X86ISD::PSIGN, 0), + X86_INTRINSIC_DATA(avx2_psign_d, INTR_TYPE_2OP, X86ISD::PSIGN, 0), + X86_INTRINSIC_DATA(avx2_psign_w, INTR_TYPE_2OP, X86ISD::PSIGN, 0), + X86_INTRINSIC_DATA(avx2_psll_d, INTR_TYPE_2OP, X86ISD::VSHL, 0), + X86_INTRINSIC_DATA(avx2_psll_q, INTR_TYPE_2OP, X86ISD::VSHL, 0), + X86_INTRINSIC_DATA(avx2_psll_w, INTR_TYPE_2OP, X86ISD::VSHL, 0), + X86_INTRINSIC_DATA(avx2_pslli_d, VSHIFT, X86ISD::VSHLI, 0), + X86_INTRINSIC_DATA(avx2_pslli_q, VSHIFT, X86ISD::VSHLI, 0), + X86_INTRINSIC_DATA(avx2_pslli_w, VSHIFT, X86ISD::VSHLI, 0), + X86_INTRINSIC_DATA(avx2_psllv_d, INTR_TYPE_2OP, ISD::SHL, 0), + X86_INTRINSIC_DATA(avx2_psllv_d_256, INTR_TYPE_2OP, ISD::SHL, 0), + X86_INTRINSIC_DATA(avx2_psllv_q, 
INTR_TYPE_2OP, ISD::SHL, 0), + X86_INTRINSIC_DATA(avx2_psllv_q_256, INTR_TYPE_2OP, ISD::SHL, 0), + X86_INTRINSIC_DATA(avx2_psra_d, INTR_TYPE_2OP, X86ISD::VSRA, 0), + X86_INTRINSIC_DATA(avx2_psra_w, INTR_TYPE_2OP, X86ISD::VSRA, 0), + X86_INTRINSIC_DATA(avx2_psrai_d, VSHIFT, X86ISD::VSRAI, 0), + X86_INTRINSIC_DATA(avx2_psrai_w, VSHIFT, X86ISD::VSRAI, 0), + X86_INTRINSIC_DATA(avx2_psrav_d, INTR_TYPE_2OP, ISD::SRA, 0), + X86_INTRINSIC_DATA(avx2_psrav_d_256, INTR_TYPE_2OP, ISD::SRA, 0), + X86_INTRINSIC_DATA(avx2_psrl_d, INTR_TYPE_2OP, X86ISD::VSRL, 0), + X86_INTRINSIC_DATA(avx2_psrl_q, INTR_TYPE_2OP, X86ISD::VSRL, 0), + X86_INTRINSIC_DATA(avx2_psrl_w, INTR_TYPE_2OP, X86ISD::VSRL, 0), + X86_INTRINSIC_DATA(avx2_psrli_d, VSHIFT, X86ISD::VSRLI, 0), + X86_INTRINSIC_DATA(avx2_psrli_q, VSHIFT, X86ISD::VSRLI, 0), + X86_INTRINSIC_DATA(avx2_psrli_w, VSHIFT, X86ISD::VSRLI, 0), + X86_INTRINSIC_DATA(avx2_psrlv_d, INTR_TYPE_2OP, ISD::SRL, 0), + X86_INTRINSIC_DATA(avx2_psrlv_d_256, INTR_TYPE_2OP, ISD::SRL, 0), + X86_INTRINSIC_DATA(avx2_psrlv_q, INTR_TYPE_2OP, ISD::SRL, 0), + X86_INTRINSIC_DATA(avx2_psrlv_q_256, INTR_TYPE_2OP, ISD::SRL, 0), + X86_INTRINSIC_DATA(avx2_psubus_b, INTR_TYPE_2OP, X86ISD::SUBUS, 0), + X86_INTRINSIC_DATA(avx2_psubus_w, INTR_TYPE_2OP, X86ISD::SUBUS, 0), + X86_INTRINSIC_DATA(avx2_vperm2i128, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0), + X86_INTRINSIC_DATA(avx512_broadcastmb_128, BROADCASTM, X86ISD::VBROADCASTM, 0), + X86_INTRINSIC_DATA(avx512_broadcastmb_256, BROADCASTM, X86ISD::VBROADCASTM, 0), + X86_INTRINSIC_DATA(avx512_broadcastmb_512, BROADCASTM, X86ISD::VBROADCASTM, 0), + X86_INTRINSIC_DATA(avx512_broadcastmw_128, BROADCASTM, X86ISD::VBROADCASTM, 0), + X86_INTRINSIC_DATA(avx512_broadcastmw_256, BROADCASTM, X86ISD::VBROADCASTM, 0), + X86_INTRINSIC_DATA(avx512_broadcastmw_512, BROADCASTM, X86ISD::VBROADCASTM, 0), + X86_INTRINSIC_DATA(avx512_cvtb2mask_128, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), + X86_INTRINSIC_DATA(avx512_cvtb2mask_256, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), + X86_INTRINSIC_DATA(avx512_cvtb2mask_512, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), + X86_INTRINSIC_DATA(avx512_cvtd2mask_128, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), + X86_INTRINSIC_DATA(avx512_cvtd2mask_256, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), + X86_INTRINSIC_DATA(avx512_cvtd2mask_512, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), + X86_INTRINSIC_DATA(avx512_cvtmask2b_128, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_cvtmask2b_256, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_cvtmask2b_512, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_cvtmask2d_128, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_cvtmask2d_256, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_cvtmask2d_512, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_cvtmask2q_128, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_cvtmask2q_256, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_cvtmask2q_512, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_cvtmask2w_128, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_cvtmask2w_256, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_cvtmask2w_512, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_cvtq2mask_128, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), + X86_INTRINSIC_DATA(avx512_cvtq2mask_256, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), + X86_INTRINSIC_DATA(avx512_cvtq2mask_512, 
CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), + X86_INTRINSIC_DATA(avx512_cvtsi2sd32, INTR_TYPE_3OP, X86ISD::SINT_TO_FP_RND, 0), + X86_INTRINSIC_DATA(avx512_cvtsi2sd64, INTR_TYPE_3OP, X86ISD::SINT_TO_FP_RND, 0), + X86_INTRINSIC_DATA(avx512_cvtsi2ss32, INTR_TYPE_3OP, X86ISD::SINT_TO_FP_RND, 0), + X86_INTRINSIC_DATA(avx512_cvtsi2ss64, INTR_TYPE_3OP, X86ISD::SINT_TO_FP_RND, 0), + X86_INTRINSIC_DATA(avx512_cvttsd2si, INTR_TYPE_2OP, X86ISD::FP_TO_SINT_RND, 0), + X86_INTRINSIC_DATA(avx512_cvttsd2si64, INTR_TYPE_2OP, X86ISD::FP_TO_SINT_RND, 0), + X86_INTRINSIC_DATA(avx512_cvttsd2usi, INTR_TYPE_2OP, X86ISD::FP_TO_UINT_RND, 0), + X86_INTRINSIC_DATA(avx512_cvttsd2usi64, INTR_TYPE_2OP, X86ISD::FP_TO_UINT_RND, 0), + X86_INTRINSIC_DATA(avx512_cvttss2si, INTR_TYPE_2OP, X86ISD::FP_TO_SINT_RND, 0), + X86_INTRINSIC_DATA(avx512_cvttss2si64, INTR_TYPE_2OP, X86ISD::FP_TO_SINT_RND, 0), + X86_INTRINSIC_DATA(avx512_cvttss2usi, INTR_TYPE_2OP, X86ISD::FP_TO_UINT_RND, 0), + X86_INTRINSIC_DATA(avx512_cvttss2usi64, INTR_TYPE_2OP, X86ISD::FP_TO_UINT_RND, 0), + X86_INTRINSIC_DATA(avx512_cvtusi2ss, INTR_TYPE_3OP, X86ISD::UINT_TO_FP_RND, 0), + X86_INTRINSIC_DATA(avx512_cvtusi642sd, INTR_TYPE_3OP, X86ISD::UINT_TO_FP_RND, 0), + X86_INTRINSIC_DATA(avx512_cvtusi642ss, INTR_TYPE_3OP, X86ISD::UINT_TO_FP_RND, 0), + X86_INTRINSIC_DATA(avx512_cvtw2mask_128, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), + X86_INTRINSIC_DATA(avx512_cvtw2mask_256, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), + X86_INTRINSIC_DATA(avx512_cvtw2mask_512, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), + X86_INTRINSIC_DATA(avx512_exp2_pd, INTR_TYPE_1OP_MASK_RM, X86ISD::EXP2, 0), + X86_INTRINSIC_DATA(avx512_exp2_ps, INTR_TYPE_1OP_MASK_RM, X86ISD::EXP2, 0), + X86_INTRINSIC_DATA(avx512_kunpck_bw, KUNPCK, ISD::CONCAT_VECTORS, 0), + X86_INTRINSIC_DATA(avx512_kunpck_dq, KUNPCK, ISD::CONCAT_VECTORS, 0), + X86_INTRINSIC_DATA(avx512_kunpck_wd, KUNPCK, ISD::CONCAT_VECTORS, 0), + X86_INTRINSIC_DATA(avx512_mask3_vfmadd_pd_128, FMA_OP_MASK3, X86ISD::FMADD, 0), + X86_INTRINSIC_DATA(avx512_mask3_vfmadd_pd_256, FMA_OP_MASK3, X86ISD::FMADD, 0), + X86_INTRINSIC_DATA(avx512_mask3_vfmadd_pd_512, FMA_OP_MASK3, X86ISD::FMADD, + X86ISD::FMADD_RND), + X86_INTRINSIC_DATA(avx512_mask3_vfmadd_ps_128, FMA_OP_MASK3, X86ISD::FMADD, 0), + X86_INTRINSIC_DATA(avx512_mask3_vfmadd_ps_256, FMA_OP_MASK3, X86ISD::FMADD, 0), + X86_INTRINSIC_DATA(avx512_mask3_vfmadd_ps_512, FMA_OP_MASK3, X86ISD::FMADD, + X86ISD::FMADD_RND), + + X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_pd_128, FMA_OP_MASK3, X86ISD::FMADDSUB, 0), + X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_pd_256, FMA_OP_MASK3, X86ISD::FMADDSUB, 0), + X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_pd_512, FMA_OP_MASK3, X86ISD::FMADDSUB, + X86ISD::FMADDSUB_RND), + X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_ps_128, FMA_OP_MASK3, X86ISD::FMADDSUB, 0), + X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_ps_256, FMA_OP_MASK3, X86ISD::FMADDSUB, 0), + X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_ps_512, FMA_OP_MASK3, X86ISD::FMADDSUB, + X86ISD::FMADDSUB_RND), + + X86_INTRINSIC_DATA(avx512_mask3_vfmsub_pd_128, FMA_OP_MASK3, X86ISD::FMSUB, 0), + X86_INTRINSIC_DATA(avx512_mask3_vfmsub_pd_256, FMA_OP_MASK3, X86ISD::FMSUB, 0), + X86_INTRINSIC_DATA(avx512_mask3_vfmsub_pd_512, FMA_OP_MASK3, X86ISD::FMSUB, + X86ISD::FMSUB_RND), + X86_INTRINSIC_DATA(avx512_mask3_vfmsub_ps_128, FMA_OP_MASK3, X86ISD::FMSUB, 0), + X86_INTRINSIC_DATA(avx512_mask3_vfmsub_ps_256, FMA_OP_MASK3, X86ISD::FMSUB, 0), + X86_INTRINSIC_DATA(avx512_mask3_vfmsub_ps_512, FMA_OP_MASK3, X86ISD::FMSUB, + X86ISD::FMSUB_RND), + + 
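FMA_OP_MASK3 marks the *_mask3 forms, where the write-mask selects between the fused result and the third (addend) operand rather than a separate pass-through source. For orientation only, and not part of the imported header, the matching user-level intrinsic looks roughly like this (assumes AVX-512F):

#include <immintrin.h>

// result[i] = k[i] ? a[i] * b[i] + c[i] : c[i]
__m512 fmadd_mask3(__m512 a, __m512 b, __m512 c, __mmask16 k) {
  return _mm512_mask3_fmadd_ps(a, b, c, k);
}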
X86_INTRINSIC_DATA(avx512_mask3_vfmsubadd_pd_128, FMA_OP_MASK3, X86ISD::FMSUBADD, 0), + X86_INTRINSIC_DATA(avx512_mask3_vfmsubadd_pd_256, FMA_OP_MASK3, X86ISD::FMSUBADD, 0), + X86_INTRINSIC_DATA(avx512_mask3_vfmsubadd_pd_512, FMA_OP_MASK3, X86ISD::FMSUBADD, + X86ISD::FMSUBADD_RND), + X86_INTRINSIC_DATA(avx512_mask3_vfmsubadd_ps_128, FMA_OP_MASK3, X86ISD::FMSUBADD, 0), + X86_INTRINSIC_DATA(avx512_mask3_vfmsubadd_ps_256, FMA_OP_MASK3, X86ISD::FMSUBADD, 0), + X86_INTRINSIC_DATA(avx512_mask3_vfmsubadd_ps_512, FMA_OP_MASK3, X86ISD::FMSUBADD, + X86ISD::FMSUBADD_RND), + + X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_pd_128, FMA_OP_MASK3, X86ISD::FNMSUB, 0), + X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_pd_256, FMA_OP_MASK3, X86ISD::FNMSUB, 0), + X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_pd_512, FMA_OP_MASK3, X86ISD::FNMSUB, + X86ISD::FNMSUB_RND), + X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_ps_128, FMA_OP_MASK3, X86ISD::FNMSUB, 0), + X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_ps_256, FMA_OP_MASK3, X86ISD::FNMSUB, 0), + X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_ps_512, FMA_OP_MASK3, X86ISD::FNMSUB, + X86ISD::FNMSUB_RND), + + X86_INTRINSIC_DATA(avx512_mask_add_pd_128, INTR_TYPE_2OP_MASK, ISD::FADD, 0), + X86_INTRINSIC_DATA(avx512_mask_add_pd_256, INTR_TYPE_2OP_MASK, ISD::FADD, 0), + X86_INTRINSIC_DATA(avx512_mask_add_pd_512, INTR_TYPE_2OP_MASK, ISD::FADD, + X86ISD::FADD_RND), + X86_INTRINSIC_DATA(avx512_mask_add_ps_128, INTR_TYPE_2OP_MASK, ISD::FADD, 0), + X86_INTRINSIC_DATA(avx512_mask_add_ps_256, INTR_TYPE_2OP_MASK, ISD::FADD, 0), + X86_INTRINSIC_DATA(avx512_mask_add_ps_512, INTR_TYPE_2OP_MASK, ISD::FADD, + X86ISD::FADD_RND), + X86_INTRINSIC_DATA(avx512_mask_add_sd_round, INTR_TYPE_SCALAR_MASK_RM, ISD::FADD, + X86ISD::FADD_RND), + X86_INTRINSIC_DATA(avx512_mask_add_ss_round, INTR_TYPE_SCALAR_MASK_RM, ISD::FADD, + X86ISD::FADD_RND), + X86_INTRINSIC_DATA(avx512_mask_and_pd_128, INTR_TYPE_2OP_MASK, X86ISD::FAND, 0), + X86_INTRINSIC_DATA(avx512_mask_and_pd_256, INTR_TYPE_2OP_MASK, X86ISD::FAND, 0), + X86_INTRINSIC_DATA(avx512_mask_and_pd_512, INTR_TYPE_2OP_MASK, X86ISD::FAND, 0), + X86_INTRINSIC_DATA(avx512_mask_and_ps_128, INTR_TYPE_2OP_MASK, X86ISD::FAND, 0), + X86_INTRINSIC_DATA(avx512_mask_and_ps_256, INTR_TYPE_2OP_MASK, X86ISD::FAND, 0), + X86_INTRINSIC_DATA(avx512_mask_and_ps_512, INTR_TYPE_2OP_MASK, X86ISD::FAND, 0), + X86_INTRINSIC_DATA(avx512_mask_andn_pd_128, INTR_TYPE_2OP_MASK, X86ISD::FANDN, 0), + X86_INTRINSIC_DATA(avx512_mask_andn_pd_256, INTR_TYPE_2OP_MASK, X86ISD::FANDN, 0), + X86_INTRINSIC_DATA(avx512_mask_andn_pd_512, INTR_TYPE_2OP_MASK, X86ISD::FANDN, 0), + X86_INTRINSIC_DATA(avx512_mask_andn_ps_128, INTR_TYPE_2OP_MASK, X86ISD::FANDN, 0), + X86_INTRINSIC_DATA(avx512_mask_andn_ps_256, INTR_TYPE_2OP_MASK, X86ISD::FANDN, 0), + X86_INTRINSIC_DATA(avx512_mask_andn_ps_512, INTR_TYPE_2OP_MASK, X86ISD::FANDN, 0), + X86_INTRINSIC_DATA(avx512_mask_blend_b_128, BLEND, X86ISD::SELECT, 0), + X86_INTRINSIC_DATA(avx512_mask_blend_b_256, BLEND, X86ISD::SELECT, 0), + X86_INTRINSIC_DATA(avx512_mask_blend_b_512, BLEND, X86ISD::SELECT, 0), + X86_INTRINSIC_DATA(avx512_mask_blend_d_128, BLEND, X86ISD::SELECT, 0), + X86_INTRINSIC_DATA(avx512_mask_blend_d_256, BLEND, X86ISD::SELECT, 0), + X86_INTRINSIC_DATA(avx512_mask_blend_d_512, BLEND, X86ISD::SELECT, 0), + X86_INTRINSIC_DATA(avx512_mask_blend_pd_128, BLEND, X86ISD::SELECT, 0), + X86_INTRINSIC_DATA(avx512_mask_blend_pd_256, BLEND, X86ISD::SELECT, 0), + X86_INTRINSIC_DATA(avx512_mask_blend_pd_512, BLEND, X86ISD::SELECT, 0), + X86_INTRINSIC_DATA(avx512_mask_blend_ps_128, 
BLEND, X86ISD::SELECT, 0), + X86_INTRINSIC_DATA(avx512_mask_blend_ps_256, BLEND, X86ISD::SELECT, 0), + X86_INTRINSIC_DATA(avx512_mask_blend_ps_512, BLEND, X86ISD::SELECT, 0), + X86_INTRINSIC_DATA(avx512_mask_blend_q_128, BLEND, X86ISD::SELECT, 0), + X86_INTRINSIC_DATA(avx512_mask_blend_q_256, BLEND, X86ISD::SELECT, 0), + X86_INTRINSIC_DATA(avx512_mask_blend_q_512, BLEND, X86ISD::SELECT, 0), + X86_INTRINSIC_DATA(avx512_mask_blend_w_128, BLEND, X86ISD::SELECT, 0), + X86_INTRINSIC_DATA(avx512_mask_blend_w_256, BLEND, X86ISD::SELECT, 0), + X86_INTRINSIC_DATA(avx512_mask_blend_w_512, BLEND, X86ISD::SELECT, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcast_sd_pd_256, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcast_sd_pd_512, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcast_ss_ps_128, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcast_ss_ps_256, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcast_ss_ps_512, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcastf32x2_256, INTR_TYPE_1OP_MASK, + X86ISD::SUBV_BROADCAST, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcastf32x2_512, INTR_TYPE_1OP_MASK, + X86ISD::SUBV_BROADCAST, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcastf32x4_256, BRCST_SUBVEC_TO_VEC, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcastf32x4_512, BRCST_SUBVEC_TO_VEC, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcastf32x8_512, BRCST_SUBVEC_TO_VEC, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcastf64x2_256, BRCST_SUBVEC_TO_VEC, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcastf64x2_512, BRCST_SUBVEC_TO_VEC, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcastf64x4_512, BRCST_SUBVEC_TO_VEC, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcasti32x2_128, INTR_TYPE_1OP_MASK, + X86ISD::SUBV_BROADCAST, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcasti32x2_256, INTR_TYPE_1OP_MASK, + X86ISD::SUBV_BROADCAST, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcasti32x2_512, INTR_TYPE_1OP_MASK, + X86ISD::SUBV_BROADCAST, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcasti32x4_256, BRCST_SUBVEC_TO_VEC, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcasti32x4_512, BRCST_SUBVEC_TO_VEC, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcasti32x8_512, BRCST_SUBVEC_TO_VEC, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcasti64x2_256, BRCST_SUBVEC_TO_VEC, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcasti64x2_512, BRCST_SUBVEC_TO_VEC, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcasti64x4_512, BRCST_SUBVEC_TO_VEC, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_cmp_b_128, CMP_MASK_CC, X86ISD::CMPM, 0), + X86_INTRINSIC_DATA(avx512_mask_cmp_b_256, CMP_MASK_CC, X86ISD::CMPM, 0), + X86_INTRINSIC_DATA(avx512_mask_cmp_b_512, CMP_MASK_CC, X86ISD::CMPM, 0), + X86_INTRINSIC_DATA(avx512_mask_cmp_d_128, CMP_MASK_CC, X86ISD::CMPM, 0), + X86_INTRINSIC_DATA(avx512_mask_cmp_d_256, CMP_MASK_CC, X86ISD::CMPM, 0), + X86_INTRINSIC_DATA(avx512_mask_cmp_d_512, CMP_MASK_CC, X86ISD::CMPM, 0), + X86_INTRINSIC_DATA(avx512_mask_cmp_pd_128, CMP_MASK_CC, X86ISD::CMPM, 0), + X86_INTRINSIC_DATA(avx512_mask_cmp_pd_256, CMP_MASK_CC, X86ISD::CMPM, 0), + X86_INTRINSIC_DATA(avx512_mask_cmp_pd_512, CMP_MASK_CC, X86ISD::CMPM, + X86ISD::CMPM_RND), + X86_INTRINSIC_DATA(avx512_mask_cmp_ps_128, CMP_MASK_CC, 
X86ISD::CMPM, 0), + X86_INTRINSIC_DATA(avx512_mask_cmp_ps_256, CMP_MASK_CC, X86ISD::CMPM, 0), + X86_INTRINSIC_DATA(avx512_mask_cmp_ps_512, CMP_MASK_CC, X86ISD::CMPM, + X86ISD::CMPM_RND), + X86_INTRINSIC_DATA(avx512_mask_cmp_q_128, CMP_MASK_CC, X86ISD::CMPM, 0), + X86_INTRINSIC_DATA(avx512_mask_cmp_q_256, CMP_MASK_CC, X86ISD::CMPM, 0), + X86_INTRINSIC_DATA(avx512_mask_cmp_q_512, CMP_MASK_CC, X86ISD::CMPM, 0), + X86_INTRINSIC_DATA(avx512_mask_cmp_sd, CMP_MASK_SCALAR_CC, X86ISD::FSETCC, + X86ISD::FSETCC), + X86_INTRINSIC_DATA(avx512_mask_cmp_ss, CMP_MASK_SCALAR_CC, X86ISD::FSETCC, + X86ISD::FSETCC), + X86_INTRINSIC_DATA(avx512_mask_cmp_w_128, CMP_MASK_CC, X86ISD::CMPM, 0), + X86_INTRINSIC_DATA(avx512_mask_cmp_w_256, CMP_MASK_CC, X86ISD::CMPM, 0), + X86_INTRINSIC_DATA(avx512_mask_cmp_w_512, CMP_MASK_CC, X86ISD::CMPM, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_d_128, COMPRESS_EXPAND_IN_REG, + X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_d_256, COMPRESS_EXPAND_IN_REG, + X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_d_512, COMPRESS_EXPAND_IN_REG, + X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_pd_128, COMPRESS_EXPAND_IN_REG, + X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_pd_256, COMPRESS_EXPAND_IN_REG, + X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_pd_512, COMPRESS_EXPAND_IN_REG, + X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_ps_128, COMPRESS_EXPAND_IN_REG, + X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_ps_256, COMPRESS_EXPAND_IN_REG, + X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_ps_512, COMPRESS_EXPAND_IN_REG, + X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_q_128, COMPRESS_EXPAND_IN_REG, + X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_q_256, COMPRESS_EXPAND_IN_REG, + X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_q_512, COMPRESS_EXPAND_IN_REG, + X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_conflict_d_128, INTR_TYPE_1OP_MASK, + X86ISD::CONFLICT, 0), + X86_INTRINSIC_DATA(avx512_mask_conflict_d_256, INTR_TYPE_1OP_MASK, + X86ISD::CONFLICT, 0), + X86_INTRINSIC_DATA(avx512_mask_conflict_d_512, INTR_TYPE_1OP_MASK, + X86ISD::CONFLICT, 0), + X86_INTRINSIC_DATA(avx512_mask_conflict_q_128, INTR_TYPE_1OP_MASK, + X86ISD::CONFLICT, 0), + X86_INTRINSIC_DATA(avx512_mask_conflict_q_256, INTR_TYPE_1OP_MASK, + X86ISD::CONFLICT, 0), + X86_INTRINSIC_DATA(avx512_mask_conflict_q_512, INTR_TYPE_1OP_MASK, + X86ISD::CONFLICT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtdq2pd_128, INTR_TYPE_1OP_MASK, + X86ISD::CVTDQ2PD, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtdq2pd_256, INTR_TYPE_1OP_MASK, + ISD::SINT_TO_FP, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtdq2pd_512, INTR_TYPE_1OP_MASK, + ISD::SINT_TO_FP, 0), // no rm + X86_INTRINSIC_DATA(avx512_mask_cvtdq2ps_128, INTR_TYPE_1OP_MASK, + ISD::SINT_TO_FP, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtdq2ps_256, INTR_TYPE_1OP_MASK, + ISD::SINT_TO_FP, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtdq2ps_512, INTR_TYPE_1OP_MASK, + ISD::SINT_TO_FP, ISD::SINT_TO_FP), //er + X86_INTRINSIC_DATA(avx512_mask_cvtpd2dq_128, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_SINT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtpd2dq_256, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_SINT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtpd2dq_512, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_SINT_RND, X86ISD::FP_TO_SINT_RND), + X86_INTRINSIC_DATA(avx512_mask_cvtpd2ps, INTR_TYPE_1OP_MASK, + X86ISD::VFPROUND, 0), + 
X86_INTRINSIC_DATA(avx512_mask_cvtpd2ps_256, INTR_TYPE_1OP_MASK_RM, + ISD::FP_ROUND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtpd2ps_512, INTR_TYPE_1OP_MASK_RM, + ISD::FP_ROUND, X86ISD::VFPROUND), + X86_INTRINSIC_DATA(avx512_mask_cvtpd2qq_128, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_SINT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtpd2qq_256, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_SINT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtpd2qq_512, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_SINT_RND, X86ISD::FP_TO_SINT_RND), + X86_INTRINSIC_DATA(avx512_mask_cvtpd2udq_128, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_UINT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtpd2udq_256, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_UINT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtpd2udq_512, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_UINT_RND, X86ISD::FP_TO_UINT_RND), + X86_INTRINSIC_DATA(avx512_mask_cvtpd2uqq_128, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_UINT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtpd2uqq_256, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_UINT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtpd2uqq_512, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_UINT_RND, X86ISD::FP_TO_UINT_RND), + X86_INTRINSIC_DATA(avx512_mask_cvtps2dq_128, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_SINT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtps2dq_256, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_SINT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtps2dq_512, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_SINT_RND, X86ISD::FP_TO_SINT_RND), + X86_INTRINSIC_DATA(avx512_mask_cvtps2pd_128, INTR_TYPE_1OP_MASK, + X86ISD::VFPEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtps2pd_256, INTR_TYPE_1OP_MASK, + ISD::FP_EXTEND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtps2pd_512, INTR_TYPE_1OP_MASK, + ISD::FP_EXTEND, X86ISD::VFPEXT), + X86_INTRINSIC_DATA(avx512_mask_cvtps2qq_128, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_SINT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtps2qq_256, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_SINT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtps2qq_512, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_SINT_RND, X86ISD::FP_TO_SINT_RND), + X86_INTRINSIC_DATA(avx512_mask_cvtps2udq_128, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_UINT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtps2udq_256, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_UINT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtps2udq_512, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_UINT_RND, X86ISD::FP_TO_UINT_RND), + X86_INTRINSIC_DATA(avx512_mask_cvtps2uqq_128, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_UINT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtps2uqq_256, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_UINT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtps2uqq_512, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_UINT_RND, X86ISD::FP_TO_UINT_RND), + X86_INTRINSIC_DATA(avx512_mask_cvtqq2pd_128, INTR_TYPE_1OP_MASK, + ISD::SINT_TO_FP, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtqq2pd_256, INTR_TYPE_1OP_MASK, + ISD::SINT_TO_FP, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtqq2pd_512, INTR_TYPE_1OP_MASK, + ISD::SINT_TO_FP, ISD::SINT_TO_FP), + X86_INTRINSIC_DATA(avx512_mask_cvtqq2ps_128, INTR_TYPE_1OP_MASK, + ISD::SINT_TO_FP, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtqq2ps_256, INTR_TYPE_1OP_MASK, + ISD::SINT_TO_FP, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtqq2ps_512, INTR_TYPE_1OP_MASK, + ISD::SINT_TO_FP, ISD::SINT_TO_FP), + X86_INTRINSIC_DATA(avx512_mask_cvtsd2ss_round, INTR_TYPE_SCALAR_MASK_RM, + X86ISD::VFPROUND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtss2sd_round, INTR_TYPE_SCALAR_MASK_RM, + X86ISD::VFPEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvttpd2dq_128, INTR_TYPE_1OP_MASK, + ISD::FP_TO_SINT, 0), + 
X86_INTRINSIC_DATA(avx512_mask_cvttpd2dq_256, INTR_TYPE_1OP_MASK, + ISD::FP_TO_SINT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvttpd2dq_512, INTR_TYPE_1OP_MASK, + ISD::FP_TO_SINT, ISD::FP_TO_SINT), + X86_INTRINSIC_DATA(avx512_mask_cvttpd2qq_128, INTR_TYPE_1OP_MASK, + ISD::FP_TO_SINT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvttpd2qq_256, INTR_TYPE_1OP_MASK, + ISD::FP_TO_SINT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvttpd2qq_512, INTR_TYPE_1OP_MASK, + ISD::FP_TO_SINT, ISD::FP_TO_SINT), + X86_INTRINSIC_DATA(avx512_mask_cvttpd2udq_128, INTR_TYPE_1OP_MASK, + ISD::FP_TO_UINT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvttpd2udq_256, INTR_TYPE_1OP_MASK, + ISD::FP_TO_UINT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvttpd2udq_512, INTR_TYPE_1OP_MASK, + ISD::FP_TO_UINT, ISD::FP_TO_UINT), + X86_INTRINSIC_DATA(avx512_mask_cvttpd2uqq_128, INTR_TYPE_1OP_MASK, + ISD::FP_TO_UINT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvttpd2uqq_256, INTR_TYPE_1OP_MASK, + ISD::FP_TO_UINT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvttpd2uqq_512, INTR_TYPE_1OP_MASK, + ISD::FP_TO_UINT, ISD::FP_TO_UINT), + X86_INTRINSIC_DATA(avx512_mask_cvttps2dq_128, INTR_TYPE_1OP_MASK, + ISD::FP_TO_SINT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvttps2dq_256, INTR_TYPE_1OP_MASK, + ISD::FP_TO_SINT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvttps2dq_512, INTR_TYPE_1OP_MASK, + ISD::FP_TO_SINT, ISD::FP_TO_SINT), + X86_INTRINSIC_DATA(avx512_mask_cvttps2qq_128, INTR_TYPE_1OP_MASK, + ISD::FP_TO_SINT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvttps2qq_256, INTR_TYPE_1OP_MASK, + ISD::FP_TO_SINT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvttps2qq_512, INTR_TYPE_1OP_MASK, + ISD::FP_TO_SINT, ISD::FP_TO_SINT), + X86_INTRINSIC_DATA(avx512_mask_cvttps2udq_128, INTR_TYPE_1OP_MASK, + ISD::FP_TO_UINT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvttps2udq_256, INTR_TYPE_1OP_MASK, + ISD::FP_TO_UINT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvttps2udq_512, INTR_TYPE_1OP_MASK, + ISD::FP_TO_UINT, ISD::FP_TO_UINT), + X86_INTRINSIC_DATA(avx512_mask_cvttps2uqq_128, INTR_TYPE_1OP_MASK, + ISD::FP_TO_UINT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvttps2uqq_256, INTR_TYPE_1OP_MASK, + ISD::FP_TO_UINT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvttps2uqq_512, INTR_TYPE_1OP_MASK, + ISD::FP_TO_UINT, ISD::FP_TO_UINT), + X86_INTRINSIC_DATA(avx512_mask_cvtudq2pd_128, INTR_TYPE_1OP_MASK, + X86ISD::CVTUDQ2PD, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtudq2pd_256, INTR_TYPE_1OP_MASK, + ISD::UINT_TO_FP, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtudq2pd_512, INTR_TYPE_1OP_MASK, + ISD::UINT_TO_FP, 0), // no rm + X86_INTRINSIC_DATA(avx512_mask_cvtudq2ps_128, INTR_TYPE_1OP_MASK, + ISD::UINT_TO_FP, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtudq2ps_256, INTR_TYPE_1OP_MASK, + ISD::UINT_TO_FP, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtudq2ps_512, INTR_TYPE_1OP_MASK, + ISD::UINT_TO_FP, ISD::UINT_TO_FP), + X86_INTRINSIC_DATA(avx512_mask_cvtuqq2pd_128, INTR_TYPE_1OP_MASK, + ISD::UINT_TO_FP, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtuqq2pd_256, INTR_TYPE_1OP_MASK, + ISD::UINT_TO_FP, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtuqq2pd_512, INTR_TYPE_1OP_MASK, + ISD::UINT_TO_FP, ISD::UINT_TO_FP), + X86_INTRINSIC_DATA(avx512_mask_cvtuqq2ps_128, INTR_TYPE_1OP_MASK, + ISD::UINT_TO_FP, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtuqq2ps_256, INTR_TYPE_1OP_MASK, + ISD::UINT_TO_FP, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtuqq2ps_512, INTR_TYPE_1OP_MASK, + ISD::UINT_TO_FP, ISD::UINT_TO_FP), + X86_INTRINSIC_DATA(avx512_mask_dbpsadbw_128, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::DBPSADBW, 0), + X86_INTRINSIC_DATA(avx512_mask_dbpsadbw_256, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::DBPSADBW, 0), + 
X86_INTRINSIC_DATA(avx512_mask_dbpsadbw_512, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::DBPSADBW, 0), + X86_INTRINSIC_DATA(avx512_mask_div_pd_128, INTR_TYPE_2OP_MASK, ISD::FDIV, 0), + X86_INTRINSIC_DATA(avx512_mask_div_pd_256, INTR_TYPE_2OP_MASK, ISD::FDIV, 0), + X86_INTRINSIC_DATA(avx512_mask_div_pd_512, INTR_TYPE_2OP_MASK, ISD::FDIV, + X86ISD::FDIV_RND), + X86_INTRINSIC_DATA(avx512_mask_div_ps_128, INTR_TYPE_2OP_MASK, ISD::FDIV, 0), + X86_INTRINSIC_DATA(avx512_mask_div_ps_256, INTR_TYPE_2OP_MASK, ISD::FDIV, 0), + X86_INTRINSIC_DATA(avx512_mask_div_ps_512, INTR_TYPE_2OP_MASK, ISD::FDIV, + X86ISD::FDIV_RND), + X86_INTRINSIC_DATA(avx512_mask_div_sd_round, INTR_TYPE_SCALAR_MASK_RM, ISD::FDIV, + X86ISD::FDIV_RND), + X86_INTRINSIC_DATA(avx512_mask_div_ss_round, INTR_TYPE_SCALAR_MASK_RM, ISD::FDIV, + X86ISD::FDIV_RND), + X86_INTRINSIC_DATA(avx512_mask_expand_d_128, COMPRESS_EXPAND_IN_REG, + X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_d_256, COMPRESS_EXPAND_IN_REG, + X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_d_512, COMPRESS_EXPAND_IN_REG, + X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_pd_128, COMPRESS_EXPAND_IN_REG, + X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_pd_256, COMPRESS_EXPAND_IN_REG, + X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_pd_512, COMPRESS_EXPAND_IN_REG, + X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_ps_128, COMPRESS_EXPAND_IN_REG, + X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_ps_256, COMPRESS_EXPAND_IN_REG, + X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_ps_512, COMPRESS_EXPAND_IN_REG, + X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_q_128, COMPRESS_EXPAND_IN_REG, + X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_q_256, COMPRESS_EXPAND_IN_REG, + X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_q_512, COMPRESS_EXPAND_IN_REG, + X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_fpclass_pd_128, FPCLASS, X86ISD::VFPCLASS, 0), + X86_INTRINSIC_DATA(avx512_mask_fpclass_pd_256, FPCLASS, X86ISD::VFPCLASS, 0), + X86_INTRINSIC_DATA(avx512_mask_fpclass_pd_512, FPCLASS, X86ISD::VFPCLASS, 0), + X86_INTRINSIC_DATA(avx512_mask_fpclass_ps_128, FPCLASS, X86ISD::VFPCLASS, 0), + X86_INTRINSIC_DATA(avx512_mask_fpclass_ps_256, FPCLASS, X86ISD::VFPCLASS, 0), + X86_INTRINSIC_DATA(avx512_mask_fpclass_ps_512, FPCLASS, X86ISD::VFPCLASS, 0), + X86_INTRINSIC_DATA(avx512_mask_fpclass_sd, FPCLASSS, X86ISD::VFPCLASSS, 0), + X86_INTRINSIC_DATA(avx512_mask_fpclass_ss, FPCLASSS, X86ISD::VFPCLASSS, 0), + X86_INTRINSIC_DATA(avx512_mask_getexp_pd_128, INTR_TYPE_1OP_MASK_RM, + X86ISD::FGETEXP_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_getexp_pd_256, INTR_TYPE_1OP_MASK_RM, + X86ISD::FGETEXP_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_getexp_pd_512, INTR_TYPE_1OP_MASK_RM, + X86ISD::FGETEXP_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_getexp_ps_128, INTR_TYPE_1OP_MASK_RM, + X86ISD::FGETEXP_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_getexp_ps_256, INTR_TYPE_1OP_MASK_RM, + X86ISD::FGETEXP_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_getexp_ps_512, INTR_TYPE_1OP_MASK_RM, + X86ISD::FGETEXP_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_getexp_sd, INTR_TYPE_SCALAR_MASK_RM, + X86ISD::FGETEXP_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_getexp_ss, INTR_TYPE_SCALAR_MASK_RM, + X86ISD::FGETEXP_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_getmant_pd_128, INTR_TYPE_2OP_MASK_RM, + X86ISD::VGETMANT, 0), + X86_INTRINSIC_DATA(avx512_mask_getmant_pd_256, INTR_TYPE_2OP_MASK_RM, + X86ISD::VGETMANT, 0), + 
X86_INTRINSIC_DATA(avx512_mask_getmant_pd_512, INTR_TYPE_2OP_MASK_RM, + X86ISD::VGETMANT, 0), + X86_INTRINSIC_DATA(avx512_mask_getmant_ps_128, INTR_TYPE_2OP_MASK_RM, + X86ISD::VGETMANT, 0), + X86_INTRINSIC_DATA(avx512_mask_getmant_ps_256, INTR_TYPE_2OP_MASK_RM, + X86ISD::VGETMANT, 0), + X86_INTRINSIC_DATA(avx512_mask_getmant_ps_512, INTR_TYPE_2OP_MASK_RM, + X86ISD::VGETMANT, 0), + X86_INTRINSIC_DATA(avx512_mask_getmant_sd, INTR_TYPE_3OP_SCALAR_MASK_RM, + X86ISD::VGETMANT, 0), + X86_INTRINSIC_DATA(avx512_mask_getmant_ss, INTR_TYPE_3OP_SCALAR_MASK_RM, + X86ISD::VGETMANT, 0), + X86_INTRINSIC_DATA(avx512_mask_insertf32x4_256, INSERT_SUBVEC, + ISD::INSERT_SUBVECTOR, 0), + X86_INTRINSIC_DATA(avx512_mask_insertf32x4_512, INSERT_SUBVEC, + ISD::INSERT_SUBVECTOR, 0), + X86_INTRINSIC_DATA(avx512_mask_insertf32x8_512, INSERT_SUBVEC, + ISD::INSERT_SUBVECTOR, 0), + X86_INTRINSIC_DATA(avx512_mask_insertf64x2_256, INSERT_SUBVEC, + ISD::INSERT_SUBVECTOR, 0), + X86_INTRINSIC_DATA(avx512_mask_insertf64x2_512, INSERT_SUBVEC, + ISD::INSERT_SUBVECTOR, 0), + X86_INTRINSIC_DATA(avx512_mask_insertf64x4_512, INSERT_SUBVEC, + ISD::INSERT_SUBVECTOR, 0), + X86_INTRINSIC_DATA(avx512_mask_inserti32x4_256, INSERT_SUBVEC, + ISD::INSERT_SUBVECTOR, 0), + X86_INTRINSIC_DATA(avx512_mask_inserti32x4_512, INSERT_SUBVEC, + ISD::INSERT_SUBVECTOR, 0), + X86_INTRINSIC_DATA(avx512_mask_inserti32x8_512, INSERT_SUBVEC, + ISD::INSERT_SUBVECTOR, 0), + X86_INTRINSIC_DATA(avx512_mask_inserti64x2_256, INSERT_SUBVEC, + ISD::INSERT_SUBVECTOR, 0), + X86_INTRINSIC_DATA(avx512_mask_inserti64x2_512, INSERT_SUBVEC, + ISD::INSERT_SUBVECTOR, 0), + X86_INTRINSIC_DATA(avx512_mask_inserti64x4_512, INSERT_SUBVEC, + ISD::INSERT_SUBVECTOR, 0), + X86_INTRINSIC_DATA(avx512_mask_lzcnt_d_128, INTR_TYPE_1OP_MASK, + ISD::CTLZ, 0), + X86_INTRINSIC_DATA(avx512_mask_lzcnt_d_256, INTR_TYPE_1OP_MASK, + ISD::CTLZ, 0), + X86_INTRINSIC_DATA(avx512_mask_lzcnt_d_512, INTR_TYPE_1OP_MASK, + ISD::CTLZ, 0), + X86_INTRINSIC_DATA(avx512_mask_lzcnt_q_128, INTR_TYPE_1OP_MASK, + ISD::CTLZ, 0), + X86_INTRINSIC_DATA(avx512_mask_lzcnt_q_256, INTR_TYPE_1OP_MASK, + ISD::CTLZ, 0), + X86_INTRINSIC_DATA(avx512_mask_lzcnt_q_512, INTR_TYPE_1OP_MASK, + ISD::CTLZ, 0), + X86_INTRINSIC_DATA(avx512_mask_max_pd_128, INTR_TYPE_2OP_MASK, X86ISD::FMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_max_pd_256, INTR_TYPE_2OP_MASK, X86ISD::FMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_max_pd_512, INTR_TYPE_2OP_MASK, X86ISD::FMAX, + X86ISD::FMAX_RND), + X86_INTRINSIC_DATA(avx512_mask_max_ps_128, INTR_TYPE_2OP_MASK, X86ISD::FMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_max_ps_256, INTR_TYPE_2OP_MASK, X86ISD::FMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_max_ps_512, INTR_TYPE_2OP_MASK, X86ISD::FMAX, + X86ISD::FMAX_RND), + X86_INTRINSIC_DATA(avx512_mask_max_sd_round, INTR_TYPE_SCALAR_MASK_RM, + X86ISD::FMAX, X86ISD::FMAX_RND), + X86_INTRINSIC_DATA(avx512_mask_max_ss_round, INTR_TYPE_SCALAR_MASK_RM, + X86ISD::FMAX, X86ISD::FMAX_RND), + X86_INTRINSIC_DATA(avx512_mask_min_pd_128, INTR_TYPE_2OP_MASK, X86ISD::FMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_min_pd_256, INTR_TYPE_2OP_MASK, X86ISD::FMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_min_pd_512, INTR_TYPE_2OP_MASK, X86ISD::FMIN, + X86ISD::FMIN_RND), + X86_INTRINSIC_DATA(avx512_mask_min_ps_128, INTR_TYPE_2OP_MASK, X86ISD::FMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_min_ps_256, INTR_TYPE_2OP_MASK, X86ISD::FMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_min_ps_512, INTR_TYPE_2OP_MASK, X86ISD::FMIN, + X86ISD::FMIN_RND), + X86_INTRINSIC_DATA(avx512_mask_min_sd_round, 
INTR_TYPE_SCALAR_MASK_RM, + X86ISD::FMIN, X86ISD::FMIN_RND), + X86_INTRINSIC_DATA(avx512_mask_min_ss_round, INTR_TYPE_SCALAR_MASK_RM, + X86ISD::FMIN, X86ISD::FMIN_RND), + X86_INTRINSIC_DATA(avx512_mask_movddup_128, INTR_TYPE_1OP_MASK, + X86ISD::MOVDDUP, 0), + X86_INTRINSIC_DATA(avx512_mask_movddup_256, INTR_TYPE_1OP_MASK, + X86ISD::MOVDDUP, 0), + X86_INTRINSIC_DATA(avx512_mask_movddup_512, INTR_TYPE_1OP_MASK, + X86ISD::MOVDDUP, 0), + X86_INTRINSIC_DATA(avx512_mask_move_sd, INTR_TYPE_SCALAR_MASK, + X86ISD::MOVSD, 0), + X86_INTRINSIC_DATA(avx512_mask_move_ss, INTR_TYPE_SCALAR_MASK, + X86ISD::MOVSS, 0), + X86_INTRINSIC_DATA(avx512_mask_movshdup_128, INTR_TYPE_1OP_MASK, + X86ISD::MOVSHDUP, 0), + X86_INTRINSIC_DATA(avx512_mask_movshdup_256, INTR_TYPE_1OP_MASK, + X86ISD::MOVSHDUP, 0), + X86_INTRINSIC_DATA(avx512_mask_movshdup_512, INTR_TYPE_1OP_MASK, + X86ISD::MOVSHDUP, 0), + X86_INTRINSIC_DATA(avx512_mask_movsldup_128, INTR_TYPE_1OP_MASK, + X86ISD::MOVSLDUP, 0), + X86_INTRINSIC_DATA(avx512_mask_movsldup_256, INTR_TYPE_1OP_MASK, + X86ISD::MOVSLDUP, 0), + X86_INTRINSIC_DATA(avx512_mask_movsldup_512, INTR_TYPE_1OP_MASK, + X86ISD::MOVSLDUP, 0), + X86_INTRINSIC_DATA(avx512_mask_mul_pd_128, INTR_TYPE_2OP_MASK, ISD::FMUL, 0), + X86_INTRINSIC_DATA(avx512_mask_mul_pd_256, INTR_TYPE_2OP_MASK, ISD::FMUL, 0), + X86_INTRINSIC_DATA(avx512_mask_mul_pd_512, INTR_TYPE_2OP_MASK, ISD::FMUL, + X86ISD::FMUL_RND), + X86_INTRINSIC_DATA(avx512_mask_mul_ps_128, INTR_TYPE_2OP_MASK, ISD::FMUL, 0), + X86_INTRINSIC_DATA(avx512_mask_mul_ps_256, INTR_TYPE_2OP_MASK, ISD::FMUL, 0), + X86_INTRINSIC_DATA(avx512_mask_mul_ps_512, INTR_TYPE_2OP_MASK, ISD::FMUL, + X86ISD::FMUL_RND), + X86_INTRINSIC_DATA(avx512_mask_mul_sd_round, INTR_TYPE_SCALAR_MASK_RM, ISD::FMUL, + X86ISD::FMUL_RND), + X86_INTRINSIC_DATA(avx512_mask_mul_ss_round, INTR_TYPE_SCALAR_MASK_RM, ISD::FMUL, + X86ISD::FMUL_RND), + X86_INTRINSIC_DATA(avx512_mask_or_pd_128, INTR_TYPE_2OP_MASK, X86ISD::FOR, 0), + X86_INTRINSIC_DATA(avx512_mask_or_pd_256, INTR_TYPE_2OP_MASK, X86ISD::FOR, 0), + X86_INTRINSIC_DATA(avx512_mask_or_pd_512, INTR_TYPE_2OP_MASK, X86ISD::FOR, 0), + X86_INTRINSIC_DATA(avx512_mask_or_ps_128, INTR_TYPE_2OP_MASK, X86ISD::FOR, 0), + X86_INTRINSIC_DATA(avx512_mask_or_ps_256, INTR_TYPE_2OP_MASK, X86ISD::FOR, 0), + X86_INTRINSIC_DATA(avx512_mask_or_ps_512, INTR_TYPE_2OP_MASK, X86ISD::FOR, 0), + X86_INTRINSIC_DATA(avx512_mask_pabs_b_128, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0), + X86_INTRINSIC_DATA(avx512_mask_pabs_b_256, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0), + X86_INTRINSIC_DATA(avx512_mask_pabs_b_512, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0), + X86_INTRINSIC_DATA(avx512_mask_pabs_d_128, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0), + X86_INTRINSIC_DATA(avx512_mask_pabs_d_256, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0), + X86_INTRINSIC_DATA(avx512_mask_pabs_d_512, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0), + X86_INTRINSIC_DATA(avx512_mask_pabs_q_128, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0), + X86_INTRINSIC_DATA(avx512_mask_pabs_q_256, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0), + X86_INTRINSIC_DATA(avx512_mask_pabs_q_512, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0), + X86_INTRINSIC_DATA(avx512_mask_pabs_w_128, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0), + X86_INTRINSIC_DATA(avx512_mask_pabs_w_256, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0), + X86_INTRINSIC_DATA(avx512_mask_pabs_w_512, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0), + X86_INTRINSIC_DATA(avx512_mask_packssdw_128, INTR_TYPE_2OP_MASK, X86ISD::PACKSS, 0), + X86_INTRINSIC_DATA(avx512_mask_packssdw_256, INTR_TYPE_2OP_MASK, X86ISD::PACKSS, 0), + 
X86_INTRINSIC_DATA(avx512_mask_packssdw_512, INTR_TYPE_2OP_MASK, X86ISD::PACKSS, 0), + X86_INTRINSIC_DATA(avx512_mask_packsswb_128, INTR_TYPE_2OP_MASK, X86ISD::PACKSS, 0), + X86_INTRINSIC_DATA(avx512_mask_packsswb_256, INTR_TYPE_2OP_MASK, X86ISD::PACKSS, 0), + X86_INTRINSIC_DATA(avx512_mask_packsswb_512, INTR_TYPE_2OP_MASK, X86ISD::PACKSS, 0), + X86_INTRINSIC_DATA(avx512_mask_packusdw_128, INTR_TYPE_2OP_MASK, X86ISD::PACKUS, 0), + X86_INTRINSIC_DATA(avx512_mask_packusdw_256, INTR_TYPE_2OP_MASK, X86ISD::PACKUS, 0), + X86_INTRINSIC_DATA(avx512_mask_packusdw_512, INTR_TYPE_2OP_MASK, X86ISD::PACKUS, 0), + X86_INTRINSIC_DATA(avx512_mask_packuswb_128, INTR_TYPE_2OP_MASK, X86ISD::PACKUS, 0), + X86_INTRINSIC_DATA(avx512_mask_packuswb_256, INTR_TYPE_2OP_MASK, X86ISD::PACKUS, 0), + X86_INTRINSIC_DATA(avx512_mask_packuswb_512, INTR_TYPE_2OP_MASK, X86ISD::PACKUS, 0), + X86_INTRINSIC_DATA(avx512_mask_padd_b_128, INTR_TYPE_2OP_MASK, ISD::ADD, 0), + X86_INTRINSIC_DATA(avx512_mask_padd_b_256, INTR_TYPE_2OP_MASK, ISD::ADD, 0), + X86_INTRINSIC_DATA(avx512_mask_padd_b_512, INTR_TYPE_2OP_MASK, ISD::ADD, 0), + X86_INTRINSIC_DATA(avx512_mask_padd_d_128, INTR_TYPE_2OP_MASK, ISD::ADD, 0), + X86_INTRINSIC_DATA(avx512_mask_padd_d_256, INTR_TYPE_2OP_MASK, ISD::ADD, 0), + X86_INTRINSIC_DATA(avx512_mask_padd_d_512, INTR_TYPE_2OP_MASK, ISD::ADD, 0), + X86_INTRINSIC_DATA(avx512_mask_padd_q_128, INTR_TYPE_2OP_MASK, ISD::ADD, 0), + X86_INTRINSIC_DATA(avx512_mask_padd_q_256, INTR_TYPE_2OP_MASK, ISD::ADD, 0), + X86_INTRINSIC_DATA(avx512_mask_padd_q_512, INTR_TYPE_2OP_MASK, ISD::ADD, 0), + X86_INTRINSIC_DATA(avx512_mask_padd_w_128, INTR_TYPE_2OP_MASK, ISD::ADD, 0), + X86_INTRINSIC_DATA(avx512_mask_padd_w_256, INTR_TYPE_2OP_MASK, ISD::ADD, 0), + X86_INTRINSIC_DATA(avx512_mask_padd_w_512, INTR_TYPE_2OP_MASK, ISD::ADD, 0), + X86_INTRINSIC_DATA(avx512_mask_padds_b_128, INTR_TYPE_2OP_MASK, X86ISD::ADDS, 0), + X86_INTRINSIC_DATA(avx512_mask_padds_b_256, INTR_TYPE_2OP_MASK, X86ISD::ADDS, 0), + X86_INTRINSIC_DATA(avx512_mask_padds_b_512, INTR_TYPE_2OP_MASK, X86ISD::ADDS, 0), + X86_INTRINSIC_DATA(avx512_mask_padds_w_128, INTR_TYPE_2OP_MASK, X86ISD::ADDS, 0), + X86_INTRINSIC_DATA(avx512_mask_padds_w_256, INTR_TYPE_2OP_MASK, X86ISD::ADDS, 0), + X86_INTRINSIC_DATA(avx512_mask_padds_w_512, INTR_TYPE_2OP_MASK, X86ISD::ADDS, 0), + X86_INTRINSIC_DATA(avx512_mask_paddus_b_128, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0), + X86_INTRINSIC_DATA(avx512_mask_paddus_b_256, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0), + X86_INTRINSIC_DATA(avx512_mask_paddus_b_512, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0), + X86_INTRINSIC_DATA(avx512_mask_paddus_w_128, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0), + X86_INTRINSIC_DATA(avx512_mask_paddus_w_256, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0), + X86_INTRINSIC_DATA(avx512_mask_paddus_w_512, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0), + X86_INTRINSIC_DATA(avx512_mask_palignr_128, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::PALIGNR, 0), + X86_INTRINSIC_DATA(avx512_mask_palignr_256, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::PALIGNR, 0), + X86_INTRINSIC_DATA(avx512_mask_palignr_512, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::PALIGNR, 0), + X86_INTRINSIC_DATA(avx512_mask_pand_d_128, INTR_TYPE_2OP_MASK, ISD::AND, 0), + X86_INTRINSIC_DATA(avx512_mask_pand_d_256, INTR_TYPE_2OP_MASK, ISD::AND, 0), + X86_INTRINSIC_DATA(avx512_mask_pand_d_512, INTR_TYPE_2OP_MASK, ISD::AND, 0), + X86_INTRINSIC_DATA(avx512_mask_pand_q_128, INTR_TYPE_2OP_MASK, ISD::AND, 0), + X86_INTRINSIC_DATA(avx512_mask_pand_q_256, INTR_TYPE_2OP_MASK, ISD::AND, 0), + 
X86_INTRINSIC_DATA(avx512_mask_pand_q_512, INTR_TYPE_2OP_MASK, ISD::AND, 0), + X86_INTRINSIC_DATA(avx512_mask_pandn_d_128, INTR_TYPE_2OP_MASK, X86ISD::ANDNP, 0), + X86_INTRINSIC_DATA(avx512_mask_pandn_d_256, INTR_TYPE_2OP_MASK, X86ISD::ANDNP, 0), + X86_INTRINSIC_DATA(avx512_mask_pandn_d_512, INTR_TYPE_2OP_MASK, X86ISD::ANDNP, 0), + X86_INTRINSIC_DATA(avx512_mask_pandn_q_128, INTR_TYPE_2OP_MASK, X86ISD::ANDNP, 0), + X86_INTRINSIC_DATA(avx512_mask_pandn_q_256, INTR_TYPE_2OP_MASK, X86ISD::ANDNP, 0), + X86_INTRINSIC_DATA(avx512_mask_pandn_q_512, INTR_TYPE_2OP_MASK, X86ISD::ANDNP, 0), + X86_INTRINSIC_DATA(avx512_mask_pavg_b_128, INTR_TYPE_2OP_MASK, X86ISD::AVG, 0), + X86_INTRINSIC_DATA(avx512_mask_pavg_b_256, INTR_TYPE_2OP_MASK, X86ISD::AVG, 0), + X86_INTRINSIC_DATA(avx512_mask_pavg_b_512, INTR_TYPE_2OP_MASK, X86ISD::AVG, 0), + X86_INTRINSIC_DATA(avx512_mask_pavg_w_128, INTR_TYPE_2OP_MASK, X86ISD::AVG, 0), + X86_INTRINSIC_DATA(avx512_mask_pavg_w_256, INTR_TYPE_2OP_MASK, X86ISD::AVG, 0), + X86_INTRINSIC_DATA(avx512_mask_pavg_w_512, INTR_TYPE_2OP_MASK, X86ISD::AVG, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpeq_b_128, CMP_MASK, X86ISD::PCMPEQM, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpeq_b_256, CMP_MASK, X86ISD::PCMPEQM, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpeq_b_512, CMP_MASK, X86ISD::PCMPEQM, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpeq_d_128, CMP_MASK, X86ISD::PCMPEQM, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpeq_d_256, CMP_MASK, X86ISD::PCMPEQM, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpeq_d_512, CMP_MASK, X86ISD::PCMPEQM, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpeq_q_128, CMP_MASK, X86ISD::PCMPEQM, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpeq_q_256, CMP_MASK, X86ISD::PCMPEQM, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpeq_q_512, CMP_MASK, X86ISD::PCMPEQM, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpeq_w_128, CMP_MASK, X86ISD::PCMPEQM, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpeq_w_256, CMP_MASK, X86ISD::PCMPEQM, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpeq_w_512, CMP_MASK, X86ISD::PCMPEQM, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpgt_b_128, CMP_MASK, X86ISD::PCMPGTM, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpgt_b_256, CMP_MASK, X86ISD::PCMPGTM, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpgt_b_512, CMP_MASK, X86ISD::PCMPGTM, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpgt_d_128, CMP_MASK, X86ISD::PCMPGTM, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpgt_d_256, CMP_MASK, X86ISD::PCMPGTM, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpgt_d_512, CMP_MASK, X86ISD::PCMPGTM, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpgt_q_128, CMP_MASK, X86ISD::PCMPGTM, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpgt_q_256, CMP_MASK, X86ISD::PCMPGTM, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpgt_q_512, CMP_MASK, X86ISD::PCMPGTM, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpgt_w_128, CMP_MASK, X86ISD::PCMPGTM, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpgt_w_256, CMP_MASK, X86ISD::PCMPGTM, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpgt_w_512, CMP_MASK, X86ISD::PCMPGTM, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaddubs_w_128, INTR_TYPE_2OP_MASK, + X86ISD::VPMADDUBSW, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaddubs_w_256, INTR_TYPE_2OP_MASK, + X86ISD::VPMADDUBSW, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaddubs_w_512, INTR_TYPE_2OP_MASK, + X86ISD::VPMADDUBSW, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaddw_d_128, INTR_TYPE_2OP_MASK, + X86ISD::VPMADDWD, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaddw_d_256, INTR_TYPE_2OP_MASK, + X86ISD::VPMADDWD, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaddw_d_512, INTR_TYPE_2OP_MASK, + X86ISD::VPMADDWD, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxs_b_128, 
INTR_TYPE_2OP_MASK, ISD::SMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxs_b_256, INTR_TYPE_2OP_MASK, ISD::SMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxs_b_512, INTR_TYPE_2OP_MASK, ISD::SMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxs_d_128, INTR_TYPE_2OP_MASK, ISD::SMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxs_d_256, INTR_TYPE_2OP_MASK, ISD::SMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxs_d_512, INTR_TYPE_2OP_MASK, ISD::SMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxs_q_128, INTR_TYPE_2OP_MASK, ISD::SMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxs_q_256, INTR_TYPE_2OP_MASK, ISD::SMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxs_q_512, INTR_TYPE_2OP_MASK, ISD::SMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxs_w_128, INTR_TYPE_2OP_MASK, ISD::SMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxs_w_256, INTR_TYPE_2OP_MASK, ISD::SMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxs_w_512, INTR_TYPE_2OP_MASK, ISD::SMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxu_b_128, INTR_TYPE_2OP_MASK, ISD::UMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxu_b_256, INTR_TYPE_2OP_MASK, ISD::UMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxu_b_512, INTR_TYPE_2OP_MASK, ISD::UMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxu_d_128, INTR_TYPE_2OP_MASK, ISD::UMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxu_d_256, INTR_TYPE_2OP_MASK, ISD::UMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxu_d_512, INTR_TYPE_2OP_MASK, ISD::UMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxu_q_128, INTR_TYPE_2OP_MASK, ISD::UMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxu_q_256, INTR_TYPE_2OP_MASK, ISD::UMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxu_q_512, INTR_TYPE_2OP_MASK, ISD::UMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxu_w_128, INTR_TYPE_2OP_MASK, ISD::UMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxu_w_256, INTR_TYPE_2OP_MASK, ISD::UMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaxu_w_512, INTR_TYPE_2OP_MASK, ISD::UMAX, 0), + X86_INTRINSIC_DATA(avx512_mask_pmins_b_128, INTR_TYPE_2OP_MASK, ISD::SMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pmins_b_256, INTR_TYPE_2OP_MASK, ISD::SMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pmins_b_512, INTR_TYPE_2OP_MASK, ISD::SMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pmins_d_128, INTR_TYPE_2OP_MASK, ISD::SMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pmins_d_256, INTR_TYPE_2OP_MASK, ISD::SMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pmins_d_512, INTR_TYPE_2OP_MASK, ISD::SMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pmins_q_128, INTR_TYPE_2OP_MASK, ISD::SMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pmins_q_256, INTR_TYPE_2OP_MASK, ISD::SMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pmins_q_512, INTR_TYPE_2OP_MASK, ISD::SMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pmins_w_128, INTR_TYPE_2OP_MASK, ISD::SMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pmins_w_256, INTR_TYPE_2OP_MASK, ISD::SMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pmins_w_512, INTR_TYPE_2OP_MASK, ISD::SMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pminu_b_128, INTR_TYPE_2OP_MASK, ISD::UMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pminu_b_256, INTR_TYPE_2OP_MASK, ISD::UMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pminu_b_512, INTR_TYPE_2OP_MASK, ISD::UMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pminu_d_128, INTR_TYPE_2OP_MASK, ISD::UMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pminu_d_256, INTR_TYPE_2OP_MASK, ISD::UMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pminu_d_512, INTR_TYPE_2OP_MASK, ISD::UMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pminu_q_128, INTR_TYPE_2OP_MASK, ISD::UMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pminu_q_256, INTR_TYPE_2OP_MASK, ISD::UMIN, 0), + 
X86_INTRINSIC_DATA(avx512_mask_pminu_q_512, INTR_TYPE_2OP_MASK, ISD::UMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pminu_w_128, INTR_TYPE_2OP_MASK, ISD::UMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pminu_w_256, INTR_TYPE_2OP_MASK, ISD::UMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pminu_w_512, INTR_TYPE_2OP_MASK, ISD::UMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_db_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_db_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_db_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_dw_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_dw_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_dw_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qb_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qb_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qb_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qd_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qd_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qd_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qw_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qw_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qw_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_wb_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_wb_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_wb_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_db_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_db_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_db_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_dw_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_dw_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_dw_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_qb_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_qb_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_qb_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_qd_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_qd_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_qd_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_qw_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_qw_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_qw_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_wb_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_wb_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_wb_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + 
X86_INTRINSIC_DATA(avx512_mask_pmovus_db_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_db_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_db_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_dw_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_dw_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_dw_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_qb_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_qb_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_qb_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_qd_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_qd_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_qd_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_qw_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_qw_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_qw_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_wb_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_wb_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_wb_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmul_dq_128, INTR_TYPE_2OP_MASK, + X86ISD::PMULDQ, 0), + X86_INTRINSIC_DATA(avx512_mask_pmul_dq_256, INTR_TYPE_2OP_MASK, + X86ISD::PMULDQ, 0), + X86_INTRINSIC_DATA(avx512_mask_pmul_dq_512, INTR_TYPE_2OP_MASK, + X86ISD::PMULDQ, 0), + X86_INTRINSIC_DATA(avx512_mask_pmul_hr_sw_128, INTR_TYPE_2OP_MASK, X86ISD::MULHRS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmul_hr_sw_256, INTR_TYPE_2OP_MASK, X86ISD::MULHRS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmul_hr_sw_512, INTR_TYPE_2OP_MASK, X86ISD::MULHRS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmulh_w_128, INTR_TYPE_2OP_MASK, ISD::MULHS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmulh_w_256, INTR_TYPE_2OP_MASK, ISD::MULHS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmulh_w_512, INTR_TYPE_2OP_MASK, ISD::MULHS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmulhu_w_128, INTR_TYPE_2OP_MASK, ISD::MULHU, 0), + X86_INTRINSIC_DATA(avx512_mask_pmulhu_w_256, INTR_TYPE_2OP_MASK, ISD::MULHU, 0), + X86_INTRINSIC_DATA(avx512_mask_pmulhu_w_512, INTR_TYPE_2OP_MASK, ISD::MULHU, 0), + X86_INTRINSIC_DATA(avx512_mask_pmull_d_128, INTR_TYPE_2OP_MASK, ISD::MUL, 0), + X86_INTRINSIC_DATA(avx512_mask_pmull_d_256, INTR_TYPE_2OP_MASK, ISD::MUL, 0), + X86_INTRINSIC_DATA(avx512_mask_pmull_d_512, INTR_TYPE_2OP_MASK, ISD::MUL, 0), + X86_INTRINSIC_DATA(avx512_mask_pmull_q_128, INTR_TYPE_2OP_MASK, ISD::MUL, 0), + X86_INTRINSIC_DATA(avx512_mask_pmull_q_256, INTR_TYPE_2OP_MASK, ISD::MUL, 0), + X86_INTRINSIC_DATA(avx512_mask_pmull_q_512, INTR_TYPE_2OP_MASK, ISD::MUL, 0), + X86_INTRINSIC_DATA(avx512_mask_pmull_w_128, INTR_TYPE_2OP_MASK, ISD::MUL, 0), + X86_INTRINSIC_DATA(avx512_mask_pmull_w_256, INTR_TYPE_2OP_MASK, ISD::MUL, 0), + X86_INTRINSIC_DATA(avx512_mask_pmull_w_512, INTR_TYPE_2OP_MASK, ISD::MUL, 0), + X86_INTRINSIC_DATA(avx512_mask_pmulu_dq_128, INTR_TYPE_2OP_MASK, + X86ISD::PMULUDQ, 0), + X86_INTRINSIC_DATA(avx512_mask_pmulu_dq_256, 
INTR_TYPE_2OP_MASK, + X86ISD::PMULUDQ, 0), + X86_INTRINSIC_DATA(avx512_mask_pmulu_dq_512, INTR_TYPE_2OP_MASK, + X86ISD::PMULUDQ, 0), + X86_INTRINSIC_DATA(avx512_mask_por_d_128, INTR_TYPE_2OP_MASK, ISD::OR, 0), + X86_INTRINSIC_DATA(avx512_mask_por_d_256, INTR_TYPE_2OP_MASK, ISD::OR, 0), + X86_INTRINSIC_DATA(avx512_mask_por_d_512, INTR_TYPE_2OP_MASK, ISD::OR, 0), + X86_INTRINSIC_DATA(avx512_mask_por_q_128, INTR_TYPE_2OP_MASK, ISD::OR, 0), + X86_INTRINSIC_DATA(avx512_mask_por_q_256, INTR_TYPE_2OP_MASK, ISD::OR, 0), + X86_INTRINSIC_DATA(avx512_mask_por_q_512, INTR_TYPE_2OP_MASK, ISD::OR, 0), + X86_INTRINSIC_DATA(avx512_mask_pshuf_b_128, INTR_TYPE_2OP_MASK, + X86ISD::PSHUFB, 0), + X86_INTRINSIC_DATA(avx512_mask_pshuf_b_256, INTR_TYPE_2OP_MASK, + X86ISD::PSHUFB, 0), + X86_INTRINSIC_DATA(avx512_mask_pshuf_b_512, INTR_TYPE_2OP_MASK, + X86ISD::PSHUFB, 0), + X86_INTRINSIC_DATA(avx512_mask_psll_d, INTR_TYPE_2OP_MASK, X86ISD::VSHL, 0), + X86_INTRINSIC_DATA(avx512_mask_psll_d_128, INTR_TYPE_2OP_MASK, X86ISD::VSHL, 0), + X86_INTRINSIC_DATA(avx512_mask_psll_d_256, INTR_TYPE_2OP_MASK, X86ISD::VSHL, 0), + X86_INTRINSIC_DATA(avx512_mask_psll_di_128, INTR_TYPE_2OP_MASK, X86ISD::VSHLI, 0), + X86_INTRINSIC_DATA(avx512_mask_psll_di_256, INTR_TYPE_2OP_MASK, X86ISD::VSHLI, 0), + X86_INTRINSIC_DATA(avx512_mask_psll_di_512, INTR_TYPE_2OP_MASK, X86ISD::VSHLI, 0), + X86_INTRINSIC_DATA(avx512_mask_psll_q, INTR_TYPE_2OP_MASK, X86ISD::VSHL, 0), + X86_INTRINSIC_DATA(avx512_mask_psll_q_128, INTR_TYPE_2OP_MASK, X86ISD::VSHL, 0), + X86_INTRINSIC_DATA(avx512_mask_psll_q_256, INTR_TYPE_2OP_MASK, X86ISD::VSHL, 0), + X86_INTRINSIC_DATA(avx512_mask_psll_qi_128, INTR_TYPE_2OP_MASK, X86ISD::VSHLI, 0), + X86_INTRINSIC_DATA(avx512_mask_psll_qi_256, INTR_TYPE_2OP_MASK, X86ISD::VSHLI, 0), + X86_INTRINSIC_DATA(avx512_mask_psll_qi_512, INTR_TYPE_2OP_MASK, X86ISD::VSHLI, 0), + X86_INTRINSIC_DATA(avx512_mask_pslli_d, VSHIFT_MASK, X86ISD::VSHLI, 0), + X86_INTRINSIC_DATA(avx512_mask_pslli_q, VSHIFT_MASK, X86ISD::VSHLI, 0), + X86_INTRINSIC_DATA(avx512_mask_psllv_d, INTR_TYPE_2OP_MASK, ISD::SHL, 0), + X86_INTRINSIC_DATA(avx512_mask_psllv_q, INTR_TYPE_2OP_MASK, ISD::SHL, 0), + X86_INTRINSIC_DATA(avx512_mask_psra_d, INTR_TYPE_2OP_MASK, X86ISD::VSRA, 0), + X86_INTRINSIC_DATA(avx512_mask_psra_d_128, INTR_TYPE_2OP_MASK, X86ISD::VSRA, 0), + X86_INTRINSIC_DATA(avx512_mask_psra_d_256, INTR_TYPE_2OP_MASK, X86ISD::VSRA, 0), + X86_INTRINSIC_DATA(avx512_mask_psra_di_128, INTR_TYPE_2OP_MASK, X86ISD::VSRAI, 0), + X86_INTRINSIC_DATA(avx512_mask_psra_di_256, INTR_TYPE_2OP_MASK, X86ISD::VSRAI, 0), + X86_INTRINSIC_DATA(avx512_mask_psra_di_512, INTR_TYPE_2OP_MASK, X86ISD::VSRAI, 0), + X86_INTRINSIC_DATA(avx512_mask_psra_q, INTR_TYPE_2OP_MASK, X86ISD::VSRA, 0), + X86_INTRINSIC_DATA(avx512_mask_psra_q_128, INTR_TYPE_2OP_MASK, X86ISD::VSRA, 0), + X86_INTRINSIC_DATA(avx512_mask_psra_q_256, INTR_TYPE_2OP_MASK, X86ISD::VSRA, 0), + X86_INTRINSIC_DATA(avx512_mask_psra_qi_128, INTR_TYPE_2OP_MASK, X86ISD::VSRAI, 0), + X86_INTRINSIC_DATA(avx512_mask_psra_qi_256, INTR_TYPE_2OP_MASK, X86ISD::VSRAI, 0), + X86_INTRINSIC_DATA(avx512_mask_psra_qi_512, INTR_TYPE_2OP_MASK, X86ISD::VSRAI, 0), + X86_INTRINSIC_DATA(avx512_mask_psra_w_128, INTR_TYPE_2OP_MASK, X86ISD::VSRA, 0), + X86_INTRINSIC_DATA(avx512_mask_psra_w_256, INTR_TYPE_2OP_MASK, X86ISD::VSRA, 0), + X86_INTRINSIC_DATA(avx512_mask_psra_w_512, INTR_TYPE_2OP_MASK, X86ISD::VSRA, 0), + X86_INTRINSIC_DATA(avx512_mask_psra_wi_128, INTR_TYPE_2OP_MASK, X86ISD::VSRAI, 0), + X86_INTRINSIC_DATA(avx512_mask_psra_wi_256, 
INTR_TYPE_2OP_MASK, X86ISD::VSRAI, 0), + X86_INTRINSIC_DATA(avx512_mask_psra_wi_512, INTR_TYPE_2OP_MASK, X86ISD::VSRAI, 0), + X86_INTRINSIC_DATA(avx512_mask_psrai_d, VSHIFT_MASK, X86ISD::VSRAI, 0), + X86_INTRINSIC_DATA(avx512_mask_psrai_q, VSHIFT_MASK, X86ISD::VSRAI, 0), + X86_INTRINSIC_DATA(avx512_mask_psrav_d, INTR_TYPE_2OP_MASK, ISD::SRA, 0), + X86_INTRINSIC_DATA(avx512_mask_psrav_q, INTR_TYPE_2OP_MASK, ISD::SRA, 0), + X86_INTRINSIC_DATA(avx512_mask_psrl_d, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0), + X86_INTRINSIC_DATA(avx512_mask_psrl_d_128, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0), + X86_INTRINSIC_DATA(avx512_mask_psrl_d_256, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0), + X86_INTRINSIC_DATA(avx512_mask_psrl_di_128, INTR_TYPE_2OP_MASK, X86ISD::VSRLI, 0), + X86_INTRINSIC_DATA(avx512_mask_psrl_di_256, INTR_TYPE_2OP_MASK, X86ISD::VSRLI, 0), + X86_INTRINSIC_DATA(avx512_mask_psrl_di_512, INTR_TYPE_2OP_MASK, X86ISD::VSRLI, 0), + X86_INTRINSIC_DATA(avx512_mask_psrl_q, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0), + X86_INTRINSIC_DATA(avx512_mask_psrl_q_128, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0), + X86_INTRINSIC_DATA(avx512_mask_psrl_q_256, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0), + X86_INTRINSIC_DATA(avx512_mask_psrl_qi_128, INTR_TYPE_2OP_MASK, X86ISD::VSRLI, 0), + X86_INTRINSIC_DATA(avx512_mask_psrl_qi_256, INTR_TYPE_2OP_MASK, X86ISD::VSRLI, 0), + X86_INTRINSIC_DATA(avx512_mask_psrl_qi_512, INTR_TYPE_2OP_MASK, X86ISD::VSRLI, 0), + X86_INTRINSIC_DATA(avx512_mask_psrl_w_128, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0), + X86_INTRINSIC_DATA(avx512_mask_psrl_w_256, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0), + X86_INTRINSIC_DATA(avx512_mask_psrl_w_512, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0), + X86_INTRINSIC_DATA(avx512_mask_psrl_wi_128, INTR_TYPE_2OP_MASK, X86ISD::VSRLI, 0), + X86_INTRINSIC_DATA(avx512_mask_psrl_wi_256, INTR_TYPE_2OP_MASK, X86ISD::VSRLI, 0), + X86_INTRINSIC_DATA(avx512_mask_psrl_wi_512, INTR_TYPE_2OP_MASK, X86ISD::VSRLI, 0), + X86_INTRINSIC_DATA(avx512_mask_psrli_d, VSHIFT_MASK, X86ISD::VSRLI, 0), + X86_INTRINSIC_DATA(avx512_mask_psrli_q, VSHIFT_MASK, X86ISD::VSRLI, 0), + X86_INTRINSIC_DATA(avx512_mask_psrlv16_hi, INTR_TYPE_2OP_MASK, ISD::SRL, 0), + X86_INTRINSIC_DATA(avx512_mask_psrlv2_di, INTR_TYPE_2OP_MASK, ISD::SRL, 0), + X86_INTRINSIC_DATA(avx512_mask_psrlv32hi, INTR_TYPE_2OP_MASK, ISD::SRL, 0), + X86_INTRINSIC_DATA(avx512_mask_psrlv4_di, INTR_TYPE_2OP_MASK, ISD::SRL, 0), + X86_INTRINSIC_DATA(avx512_mask_psrlv4_si, INTR_TYPE_2OP_MASK, ISD::SRL, 0), + X86_INTRINSIC_DATA(avx512_mask_psrlv8_hi, INTR_TYPE_2OP_MASK, ISD::SRL, 0), + X86_INTRINSIC_DATA(avx512_mask_psrlv8_si, INTR_TYPE_2OP_MASK, ISD::SRL, 0), + X86_INTRINSIC_DATA(avx512_mask_psrlv_d, INTR_TYPE_2OP_MASK, ISD::SRL, 0), + X86_INTRINSIC_DATA(avx512_mask_psrlv_q, INTR_TYPE_2OP_MASK, ISD::SRL, 0), + X86_INTRINSIC_DATA(avx512_mask_psub_b_128, INTR_TYPE_2OP_MASK, ISD::SUB, 0), + X86_INTRINSIC_DATA(avx512_mask_psub_b_256, INTR_TYPE_2OP_MASK, ISD::SUB, 0), + X86_INTRINSIC_DATA(avx512_mask_psub_b_512, INTR_TYPE_2OP_MASK, ISD::SUB, 0), + X86_INTRINSIC_DATA(avx512_mask_psub_d_128, INTR_TYPE_2OP_MASK, ISD::SUB, 0), + X86_INTRINSIC_DATA(avx512_mask_psub_d_256, INTR_TYPE_2OP_MASK, ISD::SUB, 0), + X86_INTRINSIC_DATA(avx512_mask_psub_d_512, INTR_TYPE_2OP_MASK, ISD::SUB, 0), + X86_INTRINSIC_DATA(avx512_mask_psub_q_128, INTR_TYPE_2OP_MASK, ISD::SUB, 0), + X86_INTRINSIC_DATA(avx512_mask_psub_q_256, INTR_TYPE_2OP_MASK, ISD::SUB, 0), + X86_INTRINSIC_DATA(avx512_mask_psub_q_512, INTR_TYPE_2OP_MASK, ISD::SUB, 0), + X86_INTRINSIC_DATA(avx512_mask_psub_w_128, INTR_TYPE_2OP_MASK, 
ISD::SUB, 0), + X86_INTRINSIC_DATA(avx512_mask_psub_w_256, INTR_TYPE_2OP_MASK, ISD::SUB, 0), + X86_INTRINSIC_DATA(avx512_mask_psub_w_512, INTR_TYPE_2OP_MASK, ISD::SUB, 0), + X86_INTRINSIC_DATA(avx512_mask_psubs_b_128, INTR_TYPE_2OP_MASK, X86ISD::SUBS, 0), + X86_INTRINSIC_DATA(avx512_mask_psubs_b_256, INTR_TYPE_2OP_MASK, X86ISD::SUBS, 0), + X86_INTRINSIC_DATA(avx512_mask_psubs_b_512, INTR_TYPE_2OP_MASK, X86ISD::SUBS, 0), + X86_INTRINSIC_DATA(avx512_mask_psubs_w_128, INTR_TYPE_2OP_MASK, X86ISD::SUBS, 0), + X86_INTRINSIC_DATA(avx512_mask_psubs_w_256, INTR_TYPE_2OP_MASK, X86ISD::SUBS, 0), + X86_INTRINSIC_DATA(avx512_mask_psubs_w_512, INTR_TYPE_2OP_MASK, X86ISD::SUBS, 0), + X86_INTRINSIC_DATA(avx512_mask_psubus_b_128, INTR_TYPE_2OP_MASK, X86ISD::SUBUS, 0), + X86_INTRINSIC_DATA(avx512_mask_psubus_b_256, INTR_TYPE_2OP_MASK, X86ISD::SUBUS, 0), + X86_INTRINSIC_DATA(avx512_mask_psubus_b_512, INTR_TYPE_2OP_MASK, X86ISD::SUBUS, 0), + X86_INTRINSIC_DATA(avx512_mask_psubus_w_128, INTR_TYPE_2OP_MASK, X86ISD::SUBUS, 0), + X86_INTRINSIC_DATA(avx512_mask_psubus_w_256, INTR_TYPE_2OP_MASK, X86ISD::SUBUS, 0), + X86_INTRINSIC_DATA(avx512_mask_psubus_w_512, INTR_TYPE_2OP_MASK, X86ISD::SUBUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pternlog_d_128, TERLOG_OP_MASK, + X86ISD::VPTERNLOG, 0), + X86_INTRINSIC_DATA(avx512_mask_pternlog_d_256, TERLOG_OP_MASK, + X86ISD::VPTERNLOG, 0), + X86_INTRINSIC_DATA(avx512_mask_pternlog_d_512, TERLOG_OP_MASK, + X86ISD::VPTERNLOG, 0), + X86_INTRINSIC_DATA(avx512_mask_pternlog_q_128, TERLOG_OP_MASK, + X86ISD::VPTERNLOG, 0), + X86_INTRINSIC_DATA(avx512_mask_pternlog_q_256, TERLOG_OP_MASK, + X86ISD::VPTERNLOG, 0), + X86_INTRINSIC_DATA(avx512_mask_pternlog_q_512, TERLOG_OP_MASK, + X86ISD::VPTERNLOG, 0), + X86_INTRINSIC_DATA(avx512_mask_punpckhb_w_128, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_punpckhb_w_256, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_punpckhb_w_512, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_punpckhd_q_128, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_punpckhd_q_256, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_punpckhd_q_512, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_punpckhqd_q_128, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_punpckhqd_q_256, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_punpckhqd_q_512, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_punpckhw_d_128, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_punpckhw_d_256, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_punpckhw_d_512, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_punpcklb_w_128, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_punpcklb_w_256, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_punpcklb_w_512, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_punpckld_q_128, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_punpckld_q_256, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_punpckld_q_512, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_punpcklqd_q_128, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_punpcklqd_q_256, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 
0), + X86_INTRINSIC_DATA(avx512_mask_punpcklqd_q_512, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_punpcklw_d_128, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_punpcklw_d_256, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_punpcklw_d_512, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_pxor_d_128, INTR_TYPE_2OP_MASK, ISD::XOR, 0), + X86_INTRINSIC_DATA(avx512_mask_pxor_d_256, INTR_TYPE_2OP_MASK, ISD::XOR, 0), + X86_INTRINSIC_DATA(avx512_mask_pxor_d_512, INTR_TYPE_2OP_MASK, ISD::XOR, 0), + X86_INTRINSIC_DATA(avx512_mask_pxor_q_128, INTR_TYPE_2OP_MASK, ISD::XOR, 0), + X86_INTRINSIC_DATA(avx512_mask_pxor_q_256, INTR_TYPE_2OP_MASK, ISD::XOR, 0), + X86_INTRINSIC_DATA(avx512_mask_pxor_q_512, INTR_TYPE_2OP_MASK, ISD::XOR, 0), + X86_INTRINSIC_DATA(avx512_mask_range_pd_128, INTR_TYPE_3OP_MASK_RM, X86ISD::VRANGE, 0), + X86_INTRINSIC_DATA(avx512_mask_range_pd_256, INTR_TYPE_3OP_MASK_RM, X86ISD::VRANGE, 0), + X86_INTRINSIC_DATA(avx512_mask_range_pd_512, INTR_TYPE_3OP_MASK_RM, X86ISD::VRANGE, 0), + X86_INTRINSIC_DATA(avx512_mask_range_ps_128, INTR_TYPE_3OP_MASK_RM, X86ISD::VRANGE, 0), + X86_INTRINSIC_DATA(avx512_mask_range_ps_256, INTR_TYPE_3OP_MASK_RM, X86ISD::VRANGE, 0), + X86_INTRINSIC_DATA(avx512_mask_range_ps_512, INTR_TYPE_3OP_MASK_RM, X86ISD::VRANGE, 0), + X86_INTRINSIC_DATA(avx512_mask_range_sd, INTR_TYPE_SCALAR_MASK_RM, X86ISD::VRANGE, 0), + X86_INTRINSIC_DATA(avx512_mask_range_ss, INTR_TYPE_SCALAR_MASK_RM, X86ISD::VRANGE, 0), + X86_INTRINSIC_DATA(avx512_mask_reduce_pd_128, INTR_TYPE_2OP_MASK_RM, X86ISD::VREDUCE, 0), + X86_INTRINSIC_DATA(avx512_mask_reduce_pd_256, INTR_TYPE_2OP_MASK_RM, X86ISD::VREDUCE, 0), + X86_INTRINSIC_DATA(avx512_mask_reduce_pd_512, INTR_TYPE_2OP_MASK_RM, X86ISD::VREDUCE, 0), + X86_INTRINSIC_DATA(avx512_mask_reduce_ps_128, INTR_TYPE_2OP_MASK_RM, X86ISD::VREDUCE, 0), + X86_INTRINSIC_DATA(avx512_mask_reduce_ps_256, INTR_TYPE_2OP_MASK_RM, X86ISD::VREDUCE, 0), + X86_INTRINSIC_DATA(avx512_mask_reduce_ps_512, INTR_TYPE_2OP_MASK_RM, X86ISD::VREDUCE, 0), + X86_INTRINSIC_DATA(avx512_mask_reduce_sd, INTR_TYPE_SCALAR_MASK_RM, X86ISD::VREDUCE, 0), + X86_INTRINSIC_DATA(avx512_mask_reduce_ss, INTR_TYPE_SCALAR_MASK_RM, X86ISD::VREDUCE, 0), + X86_INTRINSIC_DATA(avx512_mask_rndscale_pd_128, INTR_TYPE_2OP_MASK_RM, X86ISD::VRNDSCALE, 0), + X86_INTRINSIC_DATA(avx512_mask_rndscale_pd_256, INTR_TYPE_2OP_MASK_RM, X86ISD::VRNDSCALE, 0), + X86_INTRINSIC_DATA(avx512_mask_rndscale_pd_512, INTR_TYPE_2OP_MASK_RM, X86ISD::VRNDSCALE, 0), + X86_INTRINSIC_DATA(avx512_mask_rndscale_ps_128, INTR_TYPE_2OP_MASK_RM, X86ISD::VRNDSCALE, 0), + X86_INTRINSIC_DATA(avx512_mask_rndscale_ps_256, INTR_TYPE_2OP_MASK_RM, X86ISD::VRNDSCALE, 0), + X86_INTRINSIC_DATA(avx512_mask_rndscale_ps_512, INTR_TYPE_2OP_MASK_RM, X86ISD::VRNDSCALE, 0), + X86_INTRINSIC_DATA(avx512_mask_rndscale_sd, INTR_TYPE_SCALAR_MASK_RM, + X86ISD::VRNDSCALE, 0), + X86_INTRINSIC_DATA(avx512_mask_rndscale_ss, INTR_TYPE_SCALAR_MASK_RM, + X86ISD::VRNDSCALE, 0), + X86_INTRINSIC_DATA(avx512_mask_scalef_pd_128, INTR_TYPE_2OP_MASK_RM, + X86ISD::SCALEF, 0), + X86_INTRINSIC_DATA(avx512_mask_scalef_pd_256, INTR_TYPE_2OP_MASK_RM, + X86ISD::SCALEF, 0), + X86_INTRINSIC_DATA(avx512_mask_scalef_pd_512, INTR_TYPE_2OP_MASK_RM, + X86ISD::SCALEF, 0), + X86_INTRINSIC_DATA(avx512_mask_scalef_ps_128, INTR_TYPE_2OP_MASK_RM, + X86ISD::SCALEF, 0), + X86_INTRINSIC_DATA(avx512_mask_scalef_ps_256, INTR_TYPE_2OP_MASK_RM, + X86ISD::SCALEF, 0), + 
X86_INTRINSIC_DATA(avx512_mask_scalef_ps_512, INTR_TYPE_2OP_MASK_RM, + X86ISD::SCALEF, 0), + X86_INTRINSIC_DATA(avx512_mask_scalef_sd, INTR_TYPE_SCALAR_MASK_RM, + X86ISD::SCALEF, 0), + X86_INTRINSIC_DATA(avx512_mask_scalef_ss, INTR_TYPE_SCALAR_MASK_RM, + X86ISD::SCALEF, 0), + X86_INTRINSIC_DATA(avx512_mask_shuf_f32x4, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_shuf_f32x4_256, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_shuf_f64x2, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_shuf_f64x2_256, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_shuf_i32x4, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_shuf_i32x4_256, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_shuf_i64x2, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_shuf_i64x2_256, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_shuf_pd_128, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::SHUFP, 0), + X86_INTRINSIC_DATA(avx512_mask_shuf_pd_256, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::SHUFP, 0), + X86_INTRINSIC_DATA(avx512_mask_shuf_pd_512, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::SHUFP, 0), + X86_INTRINSIC_DATA(avx512_mask_shuf_ps_128, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::SHUFP, 0), + X86_INTRINSIC_DATA(avx512_mask_shuf_ps_256, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::SHUFP, 0), + X86_INTRINSIC_DATA(avx512_mask_shuf_ps_512, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::SHUFP, 0), + X86_INTRINSIC_DATA(avx512_mask_sqrt_pd_128, INTR_TYPE_1OP_MASK, ISD::FSQRT, 0), + X86_INTRINSIC_DATA(avx512_mask_sqrt_pd_256, INTR_TYPE_1OP_MASK, ISD::FSQRT, 0), + X86_INTRINSIC_DATA(avx512_mask_sqrt_pd_512, INTR_TYPE_1OP_MASK_RM, ISD::FSQRT, + X86ISD::FSQRT_RND), + X86_INTRINSIC_DATA(avx512_mask_sqrt_ps_128, INTR_TYPE_1OP_MASK, ISD::FSQRT, 0), + X86_INTRINSIC_DATA(avx512_mask_sqrt_ps_256, INTR_TYPE_1OP_MASK, ISD::FSQRT, 0), + X86_INTRINSIC_DATA(avx512_mask_sqrt_ps_512, INTR_TYPE_1OP_MASK_RM, ISD::FSQRT, + X86ISD::FSQRT_RND), + X86_INTRINSIC_DATA(avx512_mask_sqrt_sd, INTR_TYPE_SCALAR_MASK_RM, + X86ISD::FSQRT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_sqrt_ss, INTR_TYPE_SCALAR_MASK_RM, + X86ISD::FSQRT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_sub_pd_128, INTR_TYPE_2OP_MASK, ISD::FSUB, 0), + X86_INTRINSIC_DATA(avx512_mask_sub_pd_256, INTR_TYPE_2OP_MASK, ISD::FSUB, 0), + X86_INTRINSIC_DATA(avx512_mask_sub_pd_512, INTR_TYPE_2OP_MASK, ISD::FSUB, + X86ISD::FSUB_RND), + X86_INTRINSIC_DATA(avx512_mask_sub_ps_128, INTR_TYPE_2OP_MASK, ISD::FSUB, 0), + X86_INTRINSIC_DATA(avx512_mask_sub_ps_256, INTR_TYPE_2OP_MASK, ISD::FSUB, 0), + X86_INTRINSIC_DATA(avx512_mask_sub_ps_512, INTR_TYPE_2OP_MASK, ISD::FSUB, + X86ISD::FSUB_RND), + X86_INTRINSIC_DATA(avx512_mask_sub_sd_round, INTR_TYPE_SCALAR_MASK_RM, ISD::FSUB, + X86ISD::FSUB_RND), + X86_INTRINSIC_DATA(avx512_mask_sub_ss_round, INTR_TYPE_SCALAR_MASK_RM, ISD::FSUB, + X86ISD::FSUB_RND), + X86_INTRINSIC_DATA(avx512_mask_ucmp_b_128, CMP_MASK_CC, X86ISD::CMPMU, 0), + X86_INTRINSIC_DATA(avx512_mask_ucmp_b_256, CMP_MASK_CC, X86ISD::CMPMU, 0), + X86_INTRINSIC_DATA(avx512_mask_ucmp_b_512, CMP_MASK_CC, X86ISD::CMPMU, 0), + X86_INTRINSIC_DATA(avx512_mask_ucmp_d_128, CMP_MASK_CC, X86ISD::CMPMU, 0), + X86_INTRINSIC_DATA(avx512_mask_ucmp_d_256, CMP_MASK_CC, X86ISD::CMPMU, 0), + X86_INTRINSIC_DATA(avx512_mask_ucmp_d_512, CMP_MASK_CC, X86ISD::CMPMU, 0), + X86_INTRINSIC_DATA(avx512_mask_ucmp_q_128, CMP_MASK_CC, 
X86ISD::CMPMU, 0), + X86_INTRINSIC_DATA(avx512_mask_ucmp_q_256, CMP_MASK_CC, X86ISD::CMPMU, 0), + X86_INTRINSIC_DATA(avx512_mask_ucmp_q_512, CMP_MASK_CC, X86ISD::CMPMU, 0), + X86_INTRINSIC_DATA(avx512_mask_ucmp_w_128, CMP_MASK_CC, X86ISD::CMPMU, 0), + X86_INTRINSIC_DATA(avx512_mask_ucmp_w_256, CMP_MASK_CC, X86ISD::CMPMU, 0), + X86_INTRINSIC_DATA(avx512_mask_ucmp_w_512, CMP_MASK_CC, X86ISD::CMPMU, 0), + X86_INTRINSIC_DATA(avx512_mask_unpckh_pd_128, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_unpckh_pd_256, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_unpckh_pd_512, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_unpckh_ps_128, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_unpckh_ps_256, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_unpckh_ps_512, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_unpckl_pd_128, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_unpckl_pd_256, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_unpckl_pd_512, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_unpckl_ps_128, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_unpckl_ps_256, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_unpckl_ps_512, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_valign_d_128, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::VALIGN, 0), + X86_INTRINSIC_DATA(avx512_mask_valign_d_256, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::VALIGN, 0), + X86_INTRINSIC_DATA(avx512_mask_valign_d_512, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::VALIGN, 0), + X86_INTRINSIC_DATA(avx512_mask_valign_q_128, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::VALIGN, 0), + X86_INTRINSIC_DATA(avx512_mask_valign_q_256, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::VALIGN, 0), + X86_INTRINSIC_DATA(avx512_mask_valign_q_512, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::VALIGN, 0), + X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_128, INTR_TYPE_1OP_MASK_RM, + ISD::FP16_TO_FP, 0), + X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_256, INTR_TYPE_1OP_MASK_RM, + ISD::FP16_TO_FP, 0), + X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_512, INTR_TYPE_1OP_MASK_RM, + ISD::FP16_TO_FP, 0), + X86_INTRINSIC_DATA(avx512_mask_vcvtps2ph_128, INTR_TYPE_2OP_MASK_RM, + ISD::FP_TO_FP16, 0), + X86_INTRINSIC_DATA(avx512_mask_vcvtps2ph_256, INTR_TYPE_2OP_MASK_RM, + ISD::FP_TO_FP16, 0), + X86_INTRINSIC_DATA(avx512_mask_vcvtps2ph_512, INTR_TYPE_2OP_MASK_RM, + ISD::FP_TO_FP16, 0), + X86_INTRINSIC_DATA(avx512_mask_vfmadd_pd_128, FMA_OP_MASK, X86ISD::FMADD, 0), + X86_INTRINSIC_DATA(avx512_mask_vfmadd_pd_256, FMA_OP_MASK, X86ISD::FMADD, 0), + X86_INTRINSIC_DATA(avx512_mask_vfmadd_pd_512, FMA_OP_MASK, X86ISD::FMADD, + X86ISD::FMADD_RND), + X86_INTRINSIC_DATA(avx512_mask_vfmadd_ps_128, FMA_OP_MASK, X86ISD::FMADD, 0), + X86_INTRINSIC_DATA(avx512_mask_vfmadd_ps_256, FMA_OP_MASK, X86ISD::FMADD, 0), + X86_INTRINSIC_DATA(avx512_mask_vfmadd_ps_512, FMA_OP_MASK, X86ISD::FMADD, + X86ISD::FMADD_RND), + + X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_pd_128, FMA_OP_MASK, X86ISD::FMADDSUB, 0), + X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_pd_256, FMA_OP_MASK, X86ISD::FMADDSUB, 0), + X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_pd_512, FMA_OP_MASK, X86ISD::FMADDSUB, + X86ISD::FMADDSUB_RND), + X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_ps_128, FMA_OP_MASK, X86ISD::FMADDSUB, 0), + X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_ps_256, 
FMA_OP_MASK, X86ISD::FMADDSUB, 0), + X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_ps_512, FMA_OP_MASK, X86ISD::FMADDSUB, + X86ISD::FMADDSUB_RND), + + X86_INTRINSIC_DATA(avx512_mask_vfnmadd_pd_128, FMA_OP_MASK, X86ISD::FNMADD, 0), + X86_INTRINSIC_DATA(avx512_mask_vfnmadd_pd_256, FMA_OP_MASK, X86ISD::FNMADD, 0), + X86_INTRINSIC_DATA(avx512_mask_vfnmadd_pd_512, FMA_OP_MASK, X86ISD::FNMADD, + X86ISD::FNMADD_RND), + X86_INTRINSIC_DATA(avx512_mask_vfnmadd_ps_128, FMA_OP_MASK, X86ISD::FNMADD, 0), + X86_INTRINSIC_DATA(avx512_mask_vfnmadd_ps_256, FMA_OP_MASK, X86ISD::FNMADD, 0), + X86_INTRINSIC_DATA(avx512_mask_vfnmadd_ps_512, FMA_OP_MASK, X86ISD::FNMADD, + X86ISD::FNMADD_RND), + + X86_INTRINSIC_DATA(avx512_mask_vfnmsub_pd_128, FMA_OP_MASK, X86ISD::FNMSUB, 0), + X86_INTRINSIC_DATA(avx512_mask_vfnmsub_pd_256, FMA_OP_MASK, X86ISD::FNMSUB, 0), + X86_INTRINSIC_DATA(avx512_mask_vfnmsub_pd_512, FMA_OP_MASK, X86ISD::FNMSUB, + X86ISD::FNMSUB_RND), + X86_INTRINSIC_DATA(avx512_mask_vfnmsub_ps_128, FMA_OP_MASK, X86ISD::FNMSUB, 0), + X86_INTRINSIC_DATA(avx512_mask_vfnmsub_ps_256, FMA_OP_MASK, X86ISD::FNMSUB, 0), + X86_INTRINSIC_DATA(avx512_mask_vfnmsub_ps_512, FMA_OP_MASK, X86ISD::FNMSUB, + X86ISD::FNMSUB_RND), + + X86_INTRINSIC_DATA(avx512_mask_vpermi2var_d_128, VPERM_3OP_MASK, + X86ISD::VPERMIV3, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermi2var_d_256, VPERM_3OP_MASK, + X86ISD::VPERMIV3, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermi2var_d_512, VPERM_3OP_MASK, + X86ISD::VPERMIV3, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermi2var_hi_128, VPERM_3OP_MASK, + X86ISD::VPERMIV3, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermi2var_hi_256, VPERM_3OP_MASK, + X86ISD::VPERMIV3, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermi2var_hi_512, VPERM_3OP_MASK, + X86ISD::VPERMIV3, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermi2var_pd_128, VPERM_3OP_MASK, + X86ISD::VPERMIV3, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermi2var_pd_256, VPERM_3OP_MASK, + X86ISD::VPERMIV3, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermi2var_pd_512, VPERM_3OP_MASK, + X86ISD::VPERMIV3, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermi2var_ps_128, VPERM_3OP_MASK, + X86ISD::VPERMIV3, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermi2var_ps_256, VPERM_3OP_MASK, + X86ISD::VPERMIV3, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermi2var_ps_512, VPERM_3OP_MASK, + X86ISD::VPERMIV3, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermi2var_q_128, VPERM_3OP_MASK, + X86ISD::VPERMIV3, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermi2var_q_256, VPERM_3OP_MASK, + X86ISD::VPERMIV3, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermi2var_q_512, VPERM_3OP_MASK, + X86ISD::VPERMIV3, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermil_pd_128, INTR_TYPE_2OP_IMM8_MASK, + X86ISD::VPERMILPI, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermil_pd_256, INTR_TYPE_2OP_IMM8_MASK, + X86ISD::VPERMILPI, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermil_pd_512, INTR_TYPE_2OP_IMM8_MASK, + X86ISD::VPERMILPI, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermil_ps_128, INTR_TYPE_2OP_IMM8_MASK, + X86ISD::VPERMILPI, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermil_ps_256, INTR_TYPE_2OP_IMM8_MASK, + X86ISD::VPERMILPI, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermil_ps_512, INTR_TYPE_2OP_IMM8_MASK, + X86ISD::VPERMILPI, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermilvar_pd_128, INTR_TYPE_2OP_MASK, + X86ISD::VPERMILPV, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermilvar_pd_256, INTR_TYPE_2OP_MASK, + X86ISD::VPERMILPV, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermilvar_pd_512, INTR_TYPE_2OP_MASK, + X86ISD::VPERMILPV, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermilvar_ps_128, INTR_TYPE_2OP_MASK, + 
X86ISD::VPERMILPV, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermilvar_ps_256, INTR_TYPE_2OP_MASK, + X86ISD::VPERMILPV, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermilvar_ps_512, INTR_TYPE_2OP_MASK, + X86ISD::VPERMILPV, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermt2var_d_128, VPERM_3OP_MASK, + X86ISD::VPERMV3, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermt2var_d_256, VPERM_3OP_MASK, + X86ISD::VPERMV3, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermt2var_d_512, VPERM_3OP_MASK, + X86ISD::VPERMV3, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermt2var_hi_128, VPERM_3OP_MASK, + X86ISD::VPERMV3, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermt2var_hi_256, VPERM_3OP_MASK, + X86ISD::VPERMV3, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermt2var_hi_512, VPERM_3OP_MASK, + X86ISD::VPERMV3, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermt2var_pd_128, VPERM_3OP_MASK, + X86ISD::VPERMV3, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermt2var_pd_256, VPERM_3OP_MASK, + X86ISD::VPERMV3, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermt2var_pd_512, VPERM_3OP_MASK, + X86ISD::VPERMV3, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermt2var_ps_128, VPERM_3OP_MASK, + X86ISD::VPERMV3, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermt2var_ps_256, VPERM_3OP_MASK, + X86ISD::VPERMV3, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermt2var_ps_512, VPERM_3OP_MASK, + X86ISD::VPERMV3, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermt2var_q_128, VPERM_3OP_MASK, + X86ISD::VPERMV3, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermt2var_q_256, VPERM_3OP_MASK, + X86ISD::VPERMV3, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermt2var_q_512, VPERM_3OP_MASK, + X86ISD::VPERMV3, 0), + X86_INTRINSIC_DATA(avx512_mask_xor_pd_128, INTR_TYPE_2OP_MASK, X86ISD::FXOR, 0), + X86_INTRINSIC_DATA(avx512_mask_xor_pd_256, INTR_TYPE_2OP_MASK, X86ISD::FXOR, 0), + X86_INTRINSIC_DATA(avx512_mask_xor_pd_512, INTR_TYPE_2OP_MASK, X86ISD::FXOR, 0), + X86_INTRINSIC_DATA(avx512_mask_xor_ps_128, INTR_TYPE_2OP_MASK, X86ISD::FXOR, 0), + X86_INTRINSIC_DATA(avx512_mask_xor_ps_256, INTR_TYPE_2OP_MASK, X86ISD::FXOR, 0), + X86_INTRINSIC_DATA(avx512_mask_xor_ps_512, INTR_TYPE_2OP_MASK, X86ISD::FXOR, 0), + X86_INTRINSIC_DATA(avx512_maskz_pternlog_d_128, TERLOG_OP_MASKZ, + X86ISD::VPTERNLOG, 0), + X86_INTRINSIC_DATA(avx512_maskz_pternlog_d_256, TERLOG_OP_MASKZ, + X86ISD::VPTERNLOG, 0), + X86_INTRINSIC_DATA(avx512_maskz_pternlog_d_512, TERLOG_OP_MASKZ, + X86ISD::VPTERNLOG, 0), + X86_INTRINSIC_DATA(avx512_maskz_pternlog_q_128, TERLOG_OP_MASKZ, + X86ISD::VPTERNLOG, 0), + X86_INTRINSIC_DATA(avx512_maskz_pternlog_q_256, TERLOG_OP_MASKZ, + X86ISD::VPTERNLOG, 0), + X86_INTRINSIC_DATA(avx512_maskz_pternlog_q_512, TERLOG_OP_MASKZ, + X86ISD::VPTERNLOG, 0), + X86_INTRINSIC_DATA(avx512_maskz_vfmadd_pd_128, FMA_OP_MASKZ, X86ISD::FMADD, 0), + X86_INTRINSIC_DATA(avx512_maskz_vfmadd_pd_256, FMA_OP_MASKZ, X86ISD::FMADD, 0), + X86_INTRINSIC_DATA(avx512_maskz_vfmadd_pd_512, FMA_OP_MASKZ, X86ISD::FMADD, + X86ISD::FMADD_RND), + X86_INTRINSIC_DATA(avx512_maskz_vfmadd_ps_128, FMA_OP_MASKZ, X86ISD::FMADD, 0), + X86_INTRINSIC_DATA(avx512_maskz_vfmadd_ps_256, FMA_OP_MASKZ, X86ISD::FMADD, 0), + X86_INTRINSIC_DATA(avx512_maskz_vfmadd_ps_512, FMA_OP_MASKZ, X86ISD::FMADD, + X86ISD::FMADD_RND), + + X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_pd_128, FMA_OP_MASKZ, X86ISD::FMADDSUB, 0), + X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_pd_256, FMA_OP_MASKZ, X86ISD::FMADDSUB, 0), + X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_pd_512, FMA_OP_MASKZ, X86ISD::FMADDSUB, + X86ISD::FMADDSUB_RND), + X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_ps_128, FMA_OP_MASKZ, X86ISD::FMADDSUB, 0), + 
X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_ps_256, FMA_OP_MASKZ, X86ISD::FMADDSUB, 0), + X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_ps_512, FMA_OP_MASKZ, X86ISD::FMADDSUB, + X86ISD::FMADDSUB_RND), + + X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_d_128, VPERM_3OP_MASKZ, + X86ISD::VPERMV3, 0), + X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_d_256, VPERM_3OP_MASKZ, + X86ISD::VPERMV3, 0), + X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_d_512, VPERM_3OP_MASKZ, + X86ISD::VPERMV3, 0), + X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_hi_128, VPERM_3OP_MASKZ, + X86ISD::VPERMV3, 0), + X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_hi_256, VPERM_3OP_MASKZ, + X86ISD::VPERMV3, 0), + X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_hi_512, VPERM_3OP_MASKZ, + X86ISD::VPERMV3, 0), + X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_pd_128, VPERM_3OP_MASKZ, + X86ISD::VPERMV3, 0), + X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_pd_256, VPERM_3OP_MASKZ, + X86ISD::VPERMV3, 0), + X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_pd_512, VPERM_3OP_MASKZ, + X86ISD::VPERMV3, 0), + X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_ps_128, VPERM_3OP_MASKZ, + X86ISD::VPERMV3, 0), + X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_ps_256, VPERM_3OP_MASKZ, + X86ISD::VPERMV3, 0), + X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_ps_512, VPERM_3OP_MASKZ, + X86ISD::VPERMV3, 0), + X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_q_128, VPERM_3OP_MASKZ, + X86ISD::VPERMV3, 0), + X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_q_256, VPERM_3OP_MASKZ, + X86ISD::VPERMV3, 0), + X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_q_512, VPERM_3OP_MASKZ, + X86ISD::VPERMV3, 0), + X86_INTRINSIC_DATA(avx512_pbroadcastb_128, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_pbroadcastb_256, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_pbroadcastb_512, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_pbroadcastd_128, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_pbroadcastd_256, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_pbroadcastd_512, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_pbroadcastq_128, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_pbroadcastq_256, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_pbroadcastq_512, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_pbroadcastw_128, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_pbroadcastw_256, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_pbroadcastw_512, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_psad_bw_512, INTR_TYPE_2OP, X86ISD::PSADBW, 0), + X86_INTRINSIC_DATA(avx512_psll_dq_512, INTR_TYPE_2OP_IMM8, X86ISD::VSHLDQ, 0), + X86_INTRINSIC_DATA(avx512_psrl_dq_512, INTR_TYPE_2OP_IMM8, X86ISD::VSRLDQ, 0), + X86_INTRINSIC_DATA(avx512_rcp14_pd_128, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0), + X86_INTRINSIC_DATA(avx512_rcp14_pd_256, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0), + X86_INTRINSIC_DATA(avx512_rcp14_pd_512, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0), + X86_INTRINSIC_DATA(avx512_rcp14_ps_128, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0), + X86_INTRINSIC_DATA(avx512_rcp14_ps_256, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0), + X86_INTRINSIC_DATA(avx512_rcp14_ps_512, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0), + X86_INTRINSIC_DATA(avx512_rcp14_sd, INTR_TYPE_SCALAR_MASK, X86ISD::FRCP, 0), + X86_INTRINSIC_DATA(avx512_rcp14_ss, INTR_TYPE_SCALAR_MASK, X86ISD::FRCP, 0), 
+ X86_INTRINSIC_DATA(avx512_rcp28_pd, INTR_TYPE_1OP_MASK_RM, X86ISD::RCP28, 0), + X86_INTRINSIC_DATA(avx512_rcp28_ps, INTR_TYPE_1OP_MASK_RM, X86ISD::RCP28, 0), + X86_INTRINSIC_DATA(avx512_rcp28_sd, INTR_TYPE_SCALAR_MASK_RM, X86ISD::RCP28, 0), + X86_INTRINSIC_DATA(avx512_rcp28_ss, INTR_TYPE_SCALAR_MASK_RM, X86ISD::RCP28, 0), + X86_INTRINSIC_DATA(avx512_rsqrt14_pd_128, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0), + X86_INTRINSIC_DATA(avx512_rsqrt14_pd_256, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0), + X86_INTRINSIC_DATA(avx512_rsqrt14_pd_512, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0), + X86_INTRINSIC_DATA(avx512_rsqrt14_ps_128, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0), + X86_INTRINSIC_DATA(avx512_rsqrt14_ps_256, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0), + X86_INTRINSIC_DATA(avx512_rsqrt14_ps_512, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0), + X86_INTRINSIC_DATA(avx512_rsqrt14_sd, INTR_TYPE_SCALAR_MASK, X86ISD::FRSQRT, 0), + X86_INTRINSIC_DATA(avx512_rsqrt14_ss, INTR_TYPE_SCALAR_MASK, X86ISD::FRSQRT, 0), + X86_INTRINSIC_DATA(avx512_rsqrt28_pd, INTR_TYPE_1OP_MASK_RM,X86ISD::RSQRT28, 0), + X86_INTRINSIC_DATA(avx512_rsqrt28_ps, INTR_TYPE_1OP_MASK_RM,X86ISD::RSQRT28, 0), + X86_INTRINSIC_DATA(avx512_rsqrt28_sd, INTR_TYPE_SCALAR_MASK_RM,X86ISD::RSQRT28, 0), + X86_INTRINSIC_DATA(avx512_rsqrt28_ss, INTR_TYPE_SCALAR_MASK_RM,X86ISD::RSQRT28, 0), + X86_INTRINSIC_DATA(avx512_vcomi_sd, COMI_RM, X86ISD::COMI, X86ISD::UCOMI), + X86_INTRINSIC_DATA(avx512_vcomi_ss, COMI_RM, X86ISD::COMI, X86ISD::UCOMI), + X86_INTRINSIC_DATA(avx_hadd_pd_256, INTR_TYPE_2OP, X86ISD::FHADD, 0), + X86_INTRINSIC_DATA(avx_hadd_ps_256, INTR_TYPE_2OP, X86ISD::FHADD, 0), + X86_INTRINSIC_DATA(avx_hsub_pd_256, INTR_TYPE_2OP, X86ISD::FHSUB, 0), + X86_INTRINSIC_DATA(avx_hsub_ps_256, INTR_TYPE_2OP, X86ISD::FHSUB, 0), + X86_INTRINSIC_DATA(avx_max_pd_256, INTR_TYPE_2OP, X86ISD::FMAX, 0), + X86_INTRINSIC_DATA(avx_max_ps_256, INTR_TYPE_2OP, X86ISD::FMAX, 0), + X86_INTRINSIC_DATA(avx_min_pd_256, INTR_TYPE_2OP, X86ISD::FMIN, 0), + X86_INTRINSIC_DATA(avx_min_ps_256, INTR_TYPE_2OP, X86ISD::FMIN, 0), + X86_INTRINSIC_DATA(avx_rcp_ps_256, INTR_TYPE_1OP, X86ISD::FRCP, 0), + X86_INTRINSIC_DATA(avx_rsqrt_ps_256, INTR_TYPE_1OP, X86ISD::FRSQRT, 0), + X86_INTRINSIC_DATA(avx_sqrt_pd_256, INTR_TYPE_1OP, ISD::FSQRT, 0), + X86_INTRINSIC_DATA(avx_sqrt_ps_256, INTR_TYPE_1OP, ISD::FSQRT, 0), + X86_INTRINSIC_DATA(avx_vperm2f128_pd_256, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0), + X86_INTRINSIC_DATA(avx_vperm2f128_ps_256, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0), + X86_INTRINSIC_DATA(avx_vperm2f128_si_256, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0), + X86_INTRINSIC_DATA(fma_vfmadd_pd, INTR_TYPE_3OP, X86ISD::FMADD, 0), + X86_INTRINSIC_DATA(fma_vfmadd_pd_256, INTR_TYPE_3OP, X86ISD::FMADD, 0), + X86_INTRINSIC_DATA(fma_vfmadd_ps, INTR_TYPE_3OP, X86ISD::FMADD, 0), + X86_INTRINSIC_DATA(fma_vfmadd_ps_256, INTR_TYPE_3OP, X86ISD::FMADD, 0), + X86_INTRINSIC_DATA(fma_vfmaddsub_pd, INTR_TYPE_3OP, X86ISD::FMADDSUB, 0), + X86_INTRINSIC_DATA(fma_vfmaddsub_pd_256, INTR_TYPE_3OP, X86ISD::FMADDSUB, 0), + X86_INTRINSIC_DATA(fma_vfmaddsub_ps, INTR_TYPE_3OP, X86ISD::FMADDSUB, 0), + X86_INTRINSIC_DATA(fma_vfmaddsub_ps_256, INTR_TYPE_3OP, X86ISD::FMADDSUB, 0), + X86_INTRINSIC_DATA(fma_vfmsub_pd, INTR_TYPE_3OP, X86ISD::FMSUB, 0), + X86_INTRINSIC_DATA(fma_vfmsub_pd_256, INTR_TYPE_3OP, X86ISD::FMSUB, 0), + X86_INTRINSIC_DATA(fma_vfmsub_ps, INTR_TYPE_3OP, X86ISD::FMSUB, 0), + X86_INTRINSIC_DATA(fma_vfmsub_ps_256, INTR_TYPE_3OP, X86ISD::FMSUB, 0), + X86_INTRINSIC_DATA(fma_vfmsubadd_pd, INTR_TYPE_3OP, X86ISD::FMSUBADD, 0), + 
X86_INTRINSIC_DATA(fma_vfmsubadd_pd_256, INTR_TYPE_3OP, X86ISD::FMSUBADD, 0), + X86_INTRINSIC_DATA(fma_vfmsubadd_ps, INTR_TYPE_3OP, X86ISD::FMSUBADD, 0), + X86_INTRINSIC_DATA(fma_vfmsubadd_ps_256, INTR_TYPE_3OP, X86ISD::FMSUBADD, 0), + X86_INTRINSIC_DATA(fma_vfnmadd_pd, INTR_TYPE_3OP, X86ISD::FNMADD, 0), + X86_INTRINSIC_DATA(fma_vfnmadd_pd_256, INTR_TYPE_3OP, X86ISD::FNMADD, 0), + X86_INTRINSIC_DATA(fma_vfnmadd_ps, INTR_TYPE_3OP, X86ISD::FNMADD, 0), + X86_INTRINSIC_DATA(fma_vfnmadd_ps_256, INTR_TYPE_3OP, X86ISD::FNMADD, 0), + X86_INTRINSIC_DATA(fma_vfnmsub_pd, INTR_TYPE_3OP, X86ISD::FNMSUB, 0), + X86_INTRINSIC_DATA(fma_vfnmsub_pd_256, INTR_TYPE_3OP, X86ISD::FNMSUB, 0), + X86_INTRINSIC_DATA(fma_vfnmsub_ps, INTR_TYPE_3OP, X86ISD::FNMSUB, 0), + X86_INTRINSIC_DATA(fma_vfnmsub_ps_256, INTR_TYPE_3OP, X86ISD::FNMSUB, 0), + X86_INTRINSIC_DATA(sse2_comieq_sd, COMI, X86ISD::COMI, ISD::SETEQ), + X86_INTRINSIC_DATA(sse2_comige_sd, COMI, X86ISD::COMI, ISD::SETGE), + X86_INTRINSIC_DATA(sse2_comigt_sd, COMI, X86ISD::COMI, ISD::SETGT), + X86_INTRINSIC_DATA(sse2_comile_sd, COMI, X86ISD::COMI, ISD::SETLE), + X86_INTRINSIC_DATA(sse2_comilt_sd, COMI, X86ISD::COMI, ISD::SETLT), + X86_INTRINSIC_DATA(sse2_comineq_sd, COMI, X86ISD::COMI, ISD::SETNE), + X86_INTRINSIC_DATA(sse2_max_pd, INTR_TYPE_2OP, X86ISD::FMAX, 0), + X86_INTRINSIC_DATA(sse2_min_pd, INTR_TYPE_2OP, X86ISD::FMIN, 0), + X86_INTRINSIC_DATA(sse2_packssdw_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0), + X86_INTRINSIC_DATA(sse2_packsswb_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0), + X86_INTRINSIC_DATA(sse2_packuswb_128, INTR_TYPE_2OP, X86ISD::PACKUS, 0), + X86_INTRINSIC_DATA(sse2_pavg_b, INTR_TYPE_2OP, X86ISD::AVG, 0), + X86_INTRINSIC_DATA(sse2_pavg_w, INTR_TYPE_2OP, X86ISD::AVG, 0), + X86_INTRINSIC_DATA(sse2_pmaxs_w, INTR_TYPE_2OP, ISD::SMAX, 0), + X86_INTRINSIC_DATA(sse2_pmaxu_b, INTR_TYPE_2OP, ISD::UMAX, 0), + X86_INTRINSIC_DATA(sse2_pmins_w, INTR_TYPE_2OP, ISD::SMIN, 0), + X86_INTRINSIC_DATA(sse2_pminu_b, INTR_TYPE_2OP, ISD::UMIN, 0), + X86_INTRINSIC_DATA(sse2_pmulh_w, INTR_TYPE_2OP, ISD::MULHS, 0), + X86_INTRINSIC_DATA(sse2_pmulhu_w, INTR_TYPE_2OP, ISD::MULHU, 0), + X86_INTRINSIC_DATA(sse2_pmulu_dq, INTR_TYPE_2OP, X86ISD::PMULUDQ, 0), + X86_INTRINSIC_DATA(sse2_psad_bw, INTR_TYPE_2OP, X86ISD::PSADBW, 0), + X86_INTRINSIC_DATA(sse2_pshuf_d, INTR_TYPE_2OP, X86ISD::PSHUFD, 0), + X86_INTRINSIC_DATA(sse2_pshufh_w, INTR_TYPE_2OP, X86ISD::PSHUFHW, 0), + X86_INTRINSIC_DATA(sse2_pshufl_w, INTR_TYPE_2OP, X86ISD::PSHUFLW, 0), + X86_INTRINSIC_DATA(sse2_psll_d, INTR_TYPE_2OP, X86ISD::VSHL, 0), + X86_INTRINSIC_DATA(sse2_psll_q, INTR_TYPE_2OP, X86ISD::VSHL, 0), + X86_INTRINSIC_DATA(sse2_psll_w, INTR_TYPE_2OP, X86ISD::VSHL, 0), + X86_INTRINSIC_DATA(sse2_pslli_d, VSHIFT, X86ISD::VSHLI, 0), + X86_INTRINSIC_DATA(sse2_pslli_q, VSHIFT, X86ISD::VSHLI, 0), + X86_INTRINSIC_DATA(sse2_pslli_w, VSHIFT, X86ISD::VSHLI, 0), + X86_INTRINSIC_DATA(sse2_psra_d, INTR_TYPE_2OP, X86ISD::VSRA, 0), + X86_INTRINSIC_DATA(sse2_psra_w, INTR_TYPE_2OP, X86ISD::VSRA, 0), + X86_INTRINSIC_DATA(sse2_psrai_d, VSHIFT, X86ISD::VSRAI, 0), + X86_INTRINSIC_DATA(sse2_psrai_w, VSHIFT, X86ISD::VSRAI, 0), + X86_INTRINSIC_DATA(sse2_psrl_d, INTR_TYPE_2OP, X86ISD::VSRL, 0), + X86_INTRINSIC_DATA(sse2_psrl_q, INTR_TYPE_2OP, X86ISD::VSRL, 0), + X86_INTRINSIC_DATA(sse2_psrl_w, INTR_TYPE_2OP, X86ISD::VSRL, 0), + X86_INTRINSIC_DATA(sse2_psrli_d, VSHIFT, X86ISD::VSRLI, 0), + X86_INTRINSIC_DATA(sse2_psrli_q, VSHIFT, X86ISD::VSRLI, 0), + X86_INTRINSIC_DATA(sse2_psrli_w, VSHIFT, X86ISD::VSRLI, 0), + X86_INTRINSIC_DATA(sse2_psubus_b, 
INTR_TYPE_2OP, X86ISD::SUBUS, 0), + X86_INTRINSIC_DATA(sse2_psubus_w, INTR_TYPE_2OP, X86ISD::SUBUS, 0), + X86_INTRINSIC_DATA(sse2_sqrt_pd, INTR_TYPE_1OP, ISD::FSQRT, 0), + X86_INTRINSIC_DATA(sse2_ucomieq_sd, COMI, X86ISD::UCOMI, ISD::SETEQ), + X86_INTRINSIC_DATA(sse2_ucomige_sd, COMI, X86ISD::UCOMI, ISD::SETGE), + X86_INTRINSIC_DATA(sse2_ucomigt_sd, COMI, X86ISD::UCOMI, ISD::SETGT), + X86_INTRINSIC_DATA(sse2_ucomile_sd, COMI, X86ISD::UCOMI, ISD::SETLE), + X86_INTRINSIC_DATA(sse2_ucomilt_sd, COMI, X86ISD::UCOMI, ISD::SETLT), + X86_INTRINSIC_DATA(sse2_ucomineq_sd, COMI, X86ISD::UCOMI, ISD::SETNE), + X86_INTRINSIC_DATA(sse3_hadd_pd, INTR_TYPE_2OP, X86ISD::FHADD, 0), + X86_INTRINSIC_DATA(sse3_hadd_ps, INTR_TYPE_2OP, X86ISD::FHADD, 0), + X86_INTRINSIC_DATA(sse3_hsub_pd, INTR_TYPE_2OP, X86ISD::FHSUB, 0), + X86_INTRINSIC_DATA(sse3_hsub_ps, INTR_TYPE_2OP, X86ISD::FHSUB, 0), + X86_INTRINSIC_DATA(sse41_insertps, INTR_TYPE_3OP, X86ISD::INSERTPS, 0), + X86_INTRINSIC_DATA(sse41_packusdw, INTR_TYPE_2OP, X86ISD::PACKUS, 0), + X86_INTRINSIC_DATA(sse41_pmaxsb, INTR_TYPE_2OP, ISD::SMAX, 0), + X86_INTRINSIC_DATA(sse41_pmaxsd, INTR_TYPE_2OP, ISD::SMAX, 0), + X86_INTRINSIC_DATA(sse41_pmaxud, INTR_TYPE_2OP, ISD::UMAX, 0), + X86_INTRINSIC_DATA(sse41_pmaxuw, INTR_TYPE_2OP, ISD::UMAX, 0), + X86_INTRINSIC_DATA(sse41_pminsb, INTR_TYPE_2OP, ISD::SMIN, 0), + X86_INTRINSIC_DATA(sse41_pminsd, INTR_TYPE_2OP, ISD::SMIN, 0), + X86_INTRINSIC_DATA(sse41_pminud, INTR_TYPE_2OP, ISD::UMIN, 0), + X86_INTRINSIC_DATA(sse41_pminuw, INTR_TYPE_2OP, ISD::UMIN, 0), + X86_INTRINSIC_DATA(sse41_pmovzxbd, INTR_TYPE_1OP, X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(sse41_pmovzxbq, INTR_TYPE_1OP, X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(sse41_pmovzxbw, INTR_TYPE_1OP, X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(sse41_pmovzxdq, INTR_TYPE_1OP, X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(sse41_pmovzxwd, INTR_TYPE_1OP, X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(sse41_pmovzxwq, INTR_TYPE_1OP, X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(sse41_pmuldq, INTR_TYPE_2OP, X86ISD::PMULDQ, 0), + X86_INTRINSIC_DATA(sse4a_extrqi, INTR_TYPE_3OP, X86ISD::EXTRQI, 0), + X86_INTRINSIC_DATA(sse4a_insertqi, INTR_TYPE_4OP, X86ISD::INSERTQI, 0), + X86_INTRINSIC_DATA(sse_comieq_ss, COMI, X86ISD::COMI, ISD::SETEQ), + X86_INTRINSIC_DATA(sse_comige_ss, COMI, X86ISD::COMI, ISD::SETGE), + X86_INTRINSIC_DATA(sse_comigt_ss, COMI, X86ISD::COMI, ISD::SETGT), + X86_INTRINSIC_DATA(sse_comile_ss, COMI, X86ISD::COMI, ISD::SETLE), + X86_INTRINSIC_DATA(sse_comilt_ss, COMI, X86ISD::COMI, ISD::SETLT), + X86_INTRINSIC_DATA(sse_comineq_ss, COMI, X86ISD::COMI, ISD::SETNE), + X86_INTRINSIC_DATA(sse_max_ps, INTR_TYPE_2OP, X86ISD::FMAX, 0), + X86_INTRINSIC_DATA(sse_min_ps, INTR_TYPE_2OP, X86ISD::FMIN, 0), + X86_INTRINSIC_DATA(sse_rcp_ps, INTR_TYPE_1OP, X86ISD::FRCP, 0), + X86_INTRINSIC_DATA(sse_rsqrt_ps, INTR_TYPE_1OP, X86ISD::FRSQRT, 0), + X86_INTRINSIC_DATA(sse_sqrt_ps, INTR_TYPE_1OP, ISD::FSQRT, 0), + X86_INTRINSIC_DATA(sse_ucomieq_ss, COMI, X86ISD::UCOMI, ISD::SETEQ), + X86_INTRINSIC_DATA(sse_ucomige_ss, COMI, X86ISD::UCOMI, ISD::SETGE), + X86_INTRINSIC_DATA(sse_ucomigt_ss, COMI, X86ISD::UCOMI, ISD::SETGT), + X86_INTRINSIC_DATA(sse_ucomile_ss, COMI, X86ISD::UCOMI, ISD::SETLE), + X86_INTRINSIC_DATA(sse_ucomilt_ss, COMI, X86ISD::UCOMI, ISD::SETLT), + X86_INTRINSIC_DATA(sse_ucomineq_ss, COMI, X86ISD::UCOMI, ISD::SETNE), + X86_INTRINSIC_DATA(ssse3_phadd_d_128, INTR_TYPE_2OP, X86ISD::HADD, 0), + X86_INTRINSIC_DATA(ssse3_phadd_w_128, INTR_TYPE_2OP, X86ISD::HADD, 0), + X86_INTRINSIC_DATA(ssse3_phsub_d_128, 
INTR_TYPE_2OP, X86ISD::HSUB, 0), + X86_INTRINSIC_DATA(ssse3_phsub_w_128, INTR_TYPE_2OP, X86ISD::HSUB, 0), + X86_INTRINSIC_DATA(ssse3_pshuf_b_128, INTR_TYPE_2OP, X86ISD::PSHUFB, 0), + X86_INTRINSIC_DATA(ssse3_psign_b_128, INTR_TYPE_2OP, X86ISD::PSIGN, 0), + X86_INTRINSIC_DATA(ssse3_psign_d_128, INTR_TYPE_2OP, X86ISD::PSIGN, 0), + X86_INTRINSIC_DATA(ssse3_psign_w_128, INTR_TYPE_2OP, X86ISD::PSIGN, 0), + X86_INTRINSIC_DATA(xop_vpcomb, INTR_TYPE_3OP, X86ISD::VPCOM, 0), + X86_INTRINSIC_DATA(xop_vpcomd, INTR_TYPE_3OP, X86ISD::VPCOM, 0), + X86_INTRINSIC_DATA(xop_vpcomq, INTR_TYPE_3OP, X86ISD::VPCOM, 0), + X86_INTRINSIC_DATA(xop_vpcomub, INTR_TYPE_3OP, X86ISD::VPCOMU, 0), + X86_INTRINSIC_DATA(xop_vpcomud, INTR_TYPE_3OP, X86ISD::VPCOMU, 0), + X86_INTRINSIC_DATA(xop_vpcomuq, INTR_TYPE_3OP, X86ISD::VPCOMU, 0), + X86_INTRINSIC_DATA(xop_vpcomuw, INTR_TYPE_3OP, X86ISD::VPCOMU, 0), + X86_INTRINSIC_DATA(xop_vpcomw, INTR_TYPE_3OP, X86ISD::VPCOM, 0), + X86_INTRINSIC_DATA(xop_vprotb, INTR_TYPE_2OP, X86ISD::VPROT, 0), + X86_INTRINSIC_DATA(xop_vprotbi, INTR_TYPE_2OP, X86ISD::VPROTI, 0), + X86_INTRINSIC_DATA(xop_vprotd, INTR_TYPE_2OP, X86ISD::VPROT, 0), + X86_INTRINSIC_DATA(xop_vprotdi, INTR_TYPE_2OP, X86ISD::VPROTI, 0), + X86_INTRINSIC_DATA(xop_vprotq, INTR_TYPE_2OP, X86ISD::VPROT, 0), + X86_INTRINSIC_DATA(xop_vprotqi, INTR_TYPE_2OP, X86ISD::VPROTI, 0), + X86_INTRINSIC_DATA(xop_vprotw, INTR_TYPE_2OP, X86ISD::VPROT, 0), + X86_INTRINSIC_DATA(xop_vprotwi, INTR_TYPE_2OP, X86ISD::VPROTI, 0), + X86_INTRINSIC_DATA(xop_vpshab, INTR_TYPE_2OP, X86ISD::VPSHA, 0), + X86_INTRINSIC_DATA(xop_vpshad, INTR_TYPE_2OP, X86ISD::VPSHA, 0), + X86_INTRINSIC_DATA(xop_vpshaq, INTR_TYPE_2OP, X86ISD::VPSHA, 0), + X86_INTRINSIC_DATA(xop_vpshaw, INTR_TYPE_2OP, X86ISD::VPSHA, 0), + X86_INTRINSIC_DATA(xop_vpshlb, INTR_TYPE_2OP, X86ISD::VPSHL, 0), + X86_INTRINSIC_DATA(xop_vpshld, INTR_TYPE_2OP, X86ISD::VPSHL, 0), + X86_INTRINSIC_DATA(xop_vpshlq, INTR_TYPE_2OP, X86ISD::VPSHL, 0), + X86_INTRINSIC_DATA(xop_vpshlw, INTR_TYPE_2OP, X86ISD::VPSHL, 0) +}; + +/* + * Retrieve data for Intrinsic without chain. + * Return nullptr if intrinsic is not defined in the table. + */ +static const IntrinsicData* getIntrinsicWithoutChain(unsigned IntNo) { + IntrinsicData IntrinsicToFind = { IntNo, INTR_NO_TYPE, 0, 0 }; + const IntrinsicData *Data = std::lower_bound(std::begin(IntrinsicsWithoutChain), + std::end(IntrinsicsWithoutChain), + IntrinsicToFind); + if (Data != std::end(IntrinsicsWithoutChain) && *Data == IntrinsicToFind) + return Data; + return nullptr; +} + +static void verifyIntrinsicTables() { + assert(std::is_sorted(std::begin(IntrinsicsWithoutChain), + std::end(IntrinsicsWithoutChain)) && + std::is_sorted(std::begin(IntrinsicsWithChain), + std::end(IntrinsicsWithChain)) && + "Intrinsic data tables should be sorted by Intrinsic ID"); + assert((std::adjacent_find(std::begin(IntrinsicsWithoutChain), + std::end(IntrinsicsWithoutChain)) == + std::end(IntrinsicsWithoutChain)) && + (std::adjacent_find(std::begin(IntrinsicsWithChain), + std::end(IntrinsicsWithChain)) == + std::end(IntrinsicsWithChain)) && + "Intrinsic data tables should have unique entries"); +} + +// X86 specific compare constants. 
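The getIntrinsicWithoutChain lookup above works only because the tables are kept sorted by intrinsic ID and free of duplicates, which is exactly what verifyIntrinsicTables asserts: std::lower_bound performs a binary search and the equality check afterwards confirms an exact hit. A minimal standalone sketch of the same pattern (the record layout, IDs, and helper name below are illustrative stand-ins, not the real LLVM definitions):

#include <algorithm>
#include <iostream>
#include <iterator>

// Stand-in for the real IntrinsicData record; ordering and equality use the ID only.
struct IntrinsicData {
  unsigned Id;
  unsigned Type;
  bool operator<(const IntrinsicData &RHS) const { return Id < RHS.Id; }
  bool operator==(const IntrinsicData &RHS) const { return Id == RHS.Id; }
};

// The table must stay sorted by Id with unique entries, or lower_bound gives wrong answers.
static const IntrinsicData Table[] = {{10, 1}, {20, 2}, {30, 3}};

static const IntrinsicData *lookup(unsigned Id) {
  IntrinsicData Key = {Id, 0};
  const IntrinsicData *I =
      std::lower_bound(std::begin(Table), std::end(Table), Key);
  return (I != std::end(Table) && *I == Key) ? I : nullptr;
}

int main() {
  std::cout << (lookup(20) ? "found" : "missing") << "\n";  // found
  std::cout << (lookup(25) ? "found" : "missing") << "\n";  // missing
}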
+// They must be kept in synch with avxintrin.h +#define _X86_CMP_EQ_OQ 0x00 /* Equal (ordered, non-signaling) */ +#define _X86_CMP_LT_OS 0x01 /* Less-than (ordered, signaling) */ +#define _X86_CMP_LE_OS 0x02 /* Less-than-or-equal (ordered, signaling) */ +#define _X86_CMP_UNORD_Q 0x03 /* Unordered (non-signaling) */ +#define _X86_CMP_NEQ_UQ 0x04 /* Not-equal (unordered, non-signaling) */ +#define _X86_CMP_NLT_US 0x05 /* Not-less-than (unordered, signaling) */ +#define _X86_CMP_NLE_US 0x06 /* Not-less-than-or-equal (unordered, signaling) */ +#define _X86_CMP_ORD_Q 0x07 /* Ordered (nonsignaling) */ +#define _X86_CMP_EQ_UQ 0x08 /* Equal (unordered, non-signaling) */ +#define _X86_CMP_NGE_US 0x09 /* Not-greater-than-or-equal (unord, signaling) */ +#define _X86_CMP_NGT_US 0x0a /* Not-greater-than (unordered, signaling) */ +#define _X86_CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling) */ +#define _X86_CMP_NEQ_OQ 0x0c /* Not-equal (ordered, non-signaling) */ +#define _X86_CMP_GE_OS 0x0d /* Greater-than-or-equal (ordered, signaling) */ +#define _X86_CMP_GT_OS 0x0e /* Greater-than (ordered, signaling) */ +#define _X86_CMP_TRUE_UQ 0x0f /* True (unordered, non-signaling) */ +#define _X86_CMP_EQ_OS 0x10 /* Equal (ordered, signaling) */ +#define _X86_CMP_LT_OQ 0x11 /* Less-than (ordered, non-signaling) */ +#define _X86_CMP_LE_OQ 0x12 /* Less-than-or-equal (ordered, non-signaling) */ +#define _X86_CMP_UNORD_S 0x13 /* Unordered (signaling) */ +#define _X86_CMP_NEQ_US 0x14 /* Not-equal (unordered, signaling) */ +#define _X86_CMP_NLT_UQ 0x15 /* Not-less-than (unordered, non-signaling) */ +#define _X86_CMP_NLE_UQ 0x16 /* Not-less-than-or-equal (unord, non-signaling) */ +#define _X86_CMP_ORD_S 0x17 /* Ordered (signaling) */ +#define _X86_CMP_EQ_US 0x18 /* Equal (unordered, signaling) */ +#define _X86_CMP_NGE_UQ 0x19 /* Not-greater-than-or-equal (unord, non-sign) */ +#define _X86_CMP_NGT_UQ 0x1a /* Not-greater-than (unordered, non-signaling) */ +#define _X86_CMP_FALSE_OS 0x1b /* False (ordered, signaling) */ +#define _X86_CMP_NEQ_OS 0x1c /* Not-equal (ordered, signaling) */ +#define _X86_CMP_GE_OQ 0x1d /* Greater-than-or-equal (ordered, non-signaling) */ +#define _X86_CMP_GT_OQ 0x1e /* Greater-than (ordered, non-signaling) */ +#define _X86_CMP_TRUE_US 0x1f /* True (unordered, signaling) */ + +/* +* Get comparison modifier from _mm_comi_round_sd/ss intrinsic +* Return tuple <isOrdered, X86 condcode> +*/ +static std::tuple<bool,unsigned> TranslateX86ConstCondToX86CC(SDValue &imm) { + ConstantSDNode *CImm = dyn_cast<ConstantSDNode>(imm); + unsigned IntImm = CImm->getZExtValue(); + // On a floating point condition, the flags are set as follows: + // ZF PF CF op + // 0 | 0 | 0 | X > Y + // 0 | 0 | 1 | X < Y + // 1 | 0 | 0 | X == Y + // 1 | 1 | 1 | unordered + switch (IntImm) { + default: llvm_unreachable("Invalid floating point compare value for Comi!"); + case _X86_CMP_EQ_OQ: // 0x00 - Equal (ordered, nonsignaling) + case _X86_CMP_EQ_OS: // 0x10 - Equal (ordered, signaling) + return std::make_tuple(true, X86::COND_E); + case _X86_CMP_EQ_UQ: // 0x08 - Equal (unordered, non-signaling) + case _X86_CMP_EQ_US: // 0x18 - Equal (unordered, signaling) + return std::make_tuple(false , X86::COND_E); + case _X86_CMP_LT_OS: // 0x01 - Less-than (ordered, signaling) + case _X86_CMP_LT_OQ: // 0x11 - Less-than (ordered, nonsignaling) + return std::make_tuple(true, X86::COND_B); + case _X86_CMP_NGE_US: // 0x09 - Not-greater-than-or-equal (unordered, signaling) + case _X86_CMP_NGE_UQ: // 0x19 - Not-greater-than-or-equal 
(unordered, nonsignaling) + return std::make_tuple(false , X86::COND_B); + case _X86_CMP_LE_OS: // 0x02 - Less-than-or-equal (ordered, signaling) + case _X86_CMP_LE_OQ: // 0x12 - Less-than-or-equal (ordered, nonsignaling) + return std::make_tuple(true, X86::COND_BE); + case _X86_CMP_NGT_US: // 0x0A - Not-greater-than (unordered, signaling) + case _X86_CMP_NGT_UQ: // 0x1A - Not-greater-than (unordered, nonsignaling) + return std::make_tuple(false, X86::COND_BE); + case _X86_CMP_GT_OS: // 0x0E - Greater-than (ordered, signaling) + case _X86_CMP_GT_OQ: // 0x1E - Greater-than (ordered, nonsignaling) + return std::make_tuple(true, X86::COND_A); + case _X86_CMP_NLE_US: // 0x06 - Not-less-than-or-equal (unordered,signaling) + case _X86_CMP_NLE_UQ: // 0x16 - Not-less-than-or-equal (unordered, nonsignaling) + return std::make_tuple(false, X86::COND_A); + case _X86_CMP_GE_OS: // 0x0D - Greater-than-or-equal (ordered, signaling) + case _X86_CMP_GE_OQ: // 0x1D - Greater-than-or-equal (ordered, nonsignaling) + return std::make_tuple(true, X86::COND_AE); + case _X86_CMP_NLT_US: // 0x05 - Not-less-than (unordered, signaling) + case _X86_CMP_NLT_UQ: // 0x15 - Not-less-than (unordered, nonsignaling) + return std::make_tuple(false, X86::COND_AE); + case _X86_CMP_NEQ_OQ: // 0x0C - Not-equal (ordered, non-signaling) + case _X86_CMP_NEQ_OS: // 0x1C - Not-equal (ordered, signaling) + return std::make_tuple(true, X86::COND_NE); + case _X86_CMP_NEQ_UQ: // 0x04 - Not-equal (unordered, nonsignaling) + case _X86_CMP_NEQ_US: // 0x14 - Not-equal (unordered, signaling) + return std::make_tuple(false, X86::COND_NE); + } +} + +} // End llvm namespace + +#endif diff --git a/contrib/llvm/lib/Target/X86/X86MCInstLower.cpp b/contrib/llvm/lib/Target/X86/X86MCInstLower.cpp new file mode 100644 index 0000000..e1ca558 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86MCInstLower.cpp @@ -0,0 +1,1459 @@ +//===-- X86MCInstLower.cpp - Convert X86 MachineInstr to an MCInst --------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains code to lower X86 MachineInstrs to their corresponding +// MCInst records. +// +//===----------------------------------------------------------------------===// + +#include "X86AsmPrinter.h" +#include "X86RegisterInfo.h" +#include "X86ShuffleDecodeConstantPool.h" +#include "InstPrinter/X86ATTInstPrinter.h" +#include "MCTargetDesc/X86BaseInfo.h" +#include "Utils/X86ShuffleDecode.h" +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineConstantPool.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/MachineModuleInfoImpls.h" +#include "llvm/CodeGen/StackMaps.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/Mangler.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCCodeEmitter.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCFixup.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstBuilder.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/Support/TargetRegistry.h" +using namespace llvm; + +namespace { + +/// X86MCInstLower - This class is used to lower an MachineInstr into an MCInst. 
+class X86MCInstLower { + MCContext &Ctx; + const MachineFunction &MF; + const TargetMachine &TM; + const MCAsmInfo &MAI; + X86AsmPrinter &AsmPrinter; +public: + X86MCInstLower(const MachineFunction &MF, X86AsmPrinter &asmprinter); + + Optional<MCOperand> LowerMachineOperand(const MachineInstr *MI, + const MachineOperand &MO) const; + void Lower(const MachineInstr *MI, MCInst &OutMI) const; + + MCSymbol *GetSymbolFromOperand(const MachineOperand &MO) const; + MCOperand LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const; + +private: + MachineModuleInfoMachO &getMachOMMI() const; + Mangler *getMang() const { + return AsmPrinter.Mang; + } +}; + +} // end anonymous namespace + +// Emit a minimal sequence of nops spanning NumBytes bytes. +static void EmitNops(MCStreamer &OS, unsigned NumBytes, bool Is64Bit, + const MCSubtargetInfo &STI); + +namespace llvm { + X86AsmPrinter::StackMapShadowTracker::StackMapShadowTracker(TargetMachine &TM) + : TM(TM), InShadow(false), RequiredShadowSize(0), CurrentShadowSize(0) {} + + X86AsmPrinter::StackMapShadowTracker::~StackMapShadowTracker() {} + + void + X86AsmPrinter::StackMapShadowTracker::startFunction(MachineFunction &F) { + MF = &F; + CodeEmitter.reset(TM.getTarget().createMCCodeEmitter( + *MF->getSubtarget().getInstrInfo(), + *MF->getSubtarget().getRegisterInfo(), MF->getContext())); + } + + void X86AsmPrinter::StackMapShadowTracker::count(MCInst &Inst, + const MCSubtargetInfo &STI) { + if (InShadow) { + SmallString<256> Code; + SmallVector<MCFixup, 4> Fixups; + raw_svector_ostream VecOS(Code); + CodeEmitter->encodeInstruction(Inst, VecOS, Fixups, STI); + CurrentShadowSize += Code.size(); + if (CurrentShadowSize >= RequiredShadowSize) + InShadow = false; // The shadow is big enough. Stop counting. + } + } + + void X86AsmPrinter::StackMapShadowTracker::emitShadowPadding( + MCStreamer &OutStreamer, const MCSubtargetInfo &STI) { + if (InShadow && CurrentShadowSize < RequiredShadowSize) { + InShadow = false; + EmitNops(OutStreamer, RequiredShadowSize - CurrentShadowSize, + MF->getSubtarget<X86Subtarget>().is64Bit(), STI); + } + } + + void X86AsmPrinter::EmitAndCountInstruction(MCInst &Inst) { + OutStreamer->EmitInstruction(Inst, getSubtargetInfo()); + SMShadowTracker.count(Inst, getSubtargetInfo()); + } +} // end llvm namespace + +X86MCInstLower::X86MCInstLower(const MachineFunction &mf, + X86AsmPrinter &asmprinter) + : Ctx(mf.getContext()), MF(mf), TM(mf.getTarget()), MAI(*TM.getMCAsmInfo()), + AsmPrinter(asmprinter) {} + +MachineModuleInfoMachO &X86MCInstLower::getMachOMMI() const { + return MF.getMMI().getObjFileInfo<MachineModuleInfoMachO>(); +} + + +/// GetSymbolFromOperand - Lower an MO_GlobalAddress or MO_ExternalSymbol +/// operand to an MCSymbol. +MCSymbol *X86MCInstLower:: +GetSymbolFromOperand(const MachineOperand &MO) const { + const DataLayout &DL = MF.getDataLayout(); + assert((MO.isGlobal() || MO.isSymbol() || MO.isMBB()) && "Isn't a symbol reference"); + + MCSymbol *Sym = nullptr; + SmallString<128> Name; + StringRef Suffix; + + switch (MO.getTargetFlags()) { + case X86II::MO_DLLIMPORT: + // Handle dllimport linkage. 
+ Name += "__imp_"; + break; + case X86II::MO_DARWIN_STUB: + Suffix = "$stub"; + break; + case X86II::MO_DARWIN_NONLAZY: + case X86II::MO_DARWIN_NONLAZY_PIC_BASE: + case X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE: + Suffix = "$non_lazy_ptr"; + break; + } + + if (!Suffix.empty()) + Name += DL.getPrivateGlobalPrefix(); + + unsigned PrefixLen = Name.size(); + + if (MO.isGlobal()) { + const GlobalValue *GV = MO.getGlobal(); + AsmPrinter.getNameWithPrefix(Name, GV); + } else if (MO.isSymbol()) { + Mangler::getNameWithPrefix(Name, MO.getSymbolName(), DL); + } else if (MO.isMBB()) { + assert(Suffix.empty()); + Sym = MO.getMBB()->getSymbol(); + } + unsigned OrigLen = Name.size() - PrefixLen; + + Name += Suffix; + if (!Sym) + Sym = Ctx.getOrCreateSymbol(Name); + + StringRef OrigName = StringRef(Name).substr(PrefixLen, OrigLen); + + // If the target flags on the operand changes the name of the symbol, do that + // before we return the symbol. + switch (MO.getTargetFlags()) { + default: break; + case X86II::MO_DARWIN_NONLAZY: + case X86II::MO_DARWIN_NONLAZY_PIC_BASE: { + MachineModuleInfoImpl::StubValueTy &StubSym = + getMachOMMI().getGVStubEntry(Sym); + if (!StubSym.getPointer()) { + assert(MO.isGlobal() && "Extern symbol not handled yet"); + StubSym = + MachineModuleInfoImpl:: + StubValueTy(AsmPrinter.getSymbol(MO.getGlobal()), + !MO.getGlobal()->hasInternalLinkage()); + } + break; + } + case X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE: { + MachineModuleInfoImpl::StubValueTy &StubSym = + getMachOMMI().getHiddenGVStubEntry(Sym); + if (!StubSym.getPointer()) { + assert(MO.isGlobal() && "Extern symbol not handled yet"); + StubSym = + MachineModuleInfoImpl:: + StubValueTy(AsmPrinter.getSymbol(MO.getGlobal()), + !MO.getGlobal()->hasInternalLinkage()); + } + break; + } + case X86II::MO_DARWIN_STUB: { + MachineModuleInfoImpl::StubValueTy &StubSym = + getMachOMMI().getFnStubEntry(Sym); + if (StubSym.getPointer()) + return Sym; + + if (MO.isGlobal()) { + StubSym = + MachineModuleInfoImpl:: + StubValueTy(AsmPrinter.getSymbol(MO.getGlobal()), + !MO.getGlobal()->hasInternalLinkage()); + } else { + StubSym = + MachineModuleInfoImpl:: + StubValueTy(Ctx.getOrCreateSymbol(OrigName), false); + } + break; + } + } + + return Sym; +} + +MCOperand X86MCInstLower::LowerSymbolOperand(const MachineOperand &MO, + MCSymbol *Sym) const { + // FIXME: We would like an efficient form for this, so we don't have to do a + // lot of extra uniquing. + const MCExpr *Expr = nullptr; + MCSymbolRefExpr::VariantKind RefKind = MCSymbolRefExpr::VK_None; + + switch (MO.getTargetFlags()) { + default: llvm_unreachable("Unknown target flag on GV operand"); + case X86II::MO_NO_FLAG: // No flag. + // These affect the name of the symbol, not any suffix. + case X86II::MO_DARWIN_NONLAZY: + case X86II::MO_DLLIMPORT: + case X86II::MO_DARWIN_STUB: + break; + + case X86II::MO_TLVP: RefKind = MCSymbolRefExpr::VK_TLVP; break; + case X86II::MO_TLVP_PIC_BASE: + Expr = MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_TLVP, Ctx); + // Subtract the pic base. 
+ Expr = MCBinaryExpr::createSub(Expr, + MCSymbolRefExpr::create(MF.getPICBaseSymbol(), + Ctx), + Ctx); + break; + case X86II::MO_SECREL: RefKind = MCSymbolRefExpr::VK_SECREL; break; + case X86II::MO_TLSGD: RefKind = MCSymbolRefExpr::VK_TLSGD; break; + case X86II::MO_TLSLD: RefKind = MCSymbolRefExpr::VK_TLSLD; break; + case X86II::MO_TLSLDM: RefKind = MCSymbolRefExpr::VK_TLSLDM; break; + case X86II::MO_GOTTPOFF: RefKind = MCSymbolRefExpr::VK_GOTTPOFF; break; + case X86II::MO_INDNTPOFF: RefKind = MCSymbolRefExpr::VK_INDNTPOFF; break; + case X86II::MO_TPOFF: RefKind = MCSymbolRefExpr::VK_TPOFF; break; + case X86II::MO_DTPOFF: RefKind = MCSymbolRefExpr::VK_DTPOFF; break; + case X86II::MO_NTPOFF: RefKind = MCSymbolRefExpr::VK_NTPOFF; break; + case X86II::MO_GOTNTPOFF: RefKind = MCSymbolRefExpr::VK_GOTNTPOFF; break; + case X86II::MO_GOTPCREL: RefKind = MCSymbolRefExpr::VK_GOTPCREL; break; + case X86II::MO_GOT: RefKind = MCSymbolRefExpr::VK_GOT; break; + case X86II::MO_GOTOFF: RefKind = MCSymbolRefExpr::VK_GOTOFF; break; + case X86II::MO_PLT: RefKind = MCSymbolRefExpr::VK_PLT; break; + case X86II::MO_PIC_BASE_OFFSET: + case X86II::MO_DARWIN_NONLAZY_PIC_BASE: + case X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE: + Expr = MCSymbolRefExpr::create(Sym, Ctx); + // Subtract the pic base. + Expr = MCBinaryExpr::createSub(Expr, + MCSymbolRefExpr::create(MF.getPICBaseSymbol(), Ctx), + Ctx); + if (MO.isJTI()) { + assert(MAI.doesSetDirectiveSuppressesReloc()); + // If .set directive is supported, use it to reduce the number of + // relocations the assembler will generate for differences between + // local labels. This is only safe when the symbols are in the same + // section so we are restricting it to jumptable references. + MCSymbol *Label = Ctx.createTempSymbol(); + AsmPrinter.OutStreamer->EmitAssignment(Label, Expr); + Expr = MCSymbolRefExpr::create(Label, Ctx); + } + break; + } + + if (!Expr) + Expr = MCSymbolRefExpr::create(Sym, RefKind, Ctx); + + if (!MO.isJTI() && !MO.isMBB() && MO.getOffset()) + Expr = MCBinaryExpr::createAdd(Expr, + MCConstantExpr::create(MO.getOffset(), Ctx), + Ctx); + return MCOperand::createExpr(Expr); +} + + +/// \brief Simplify FOO $imm, %{al,ax,eax,rax} to FOO $imm, for instruction with +/// a short fixed-register form. +static void SimplifyShortImmForm(MCInst &Inst, unsigned Opcode) { + unsigned ImmOp = Inst.getNumOperands() - 1; + assert(Inst.getOperand(0).isReg() && + (Inst.getOperand(ImmOp).isImm() || Inst.getOperand(ImmOp).isExpr()) && + ((Inst.getNumOperands() == 3 && Inst.getOperand(1).isReg() && + Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg()) || + Inst.getNumOperands() == 2) && "Unexpected instruction!"); + + // Check whether the destination register can be fixed. + unsigned Reg = Inst.getOperand(0).getReg(); + if (Reg != X86::AL && Reg != X86::AX && Reg != X86::EAX && Reg != X86::RAX) + return; + + // If so, rewrite the instruction. + MCOperand Saved = Inst.getOperand(ImmOp); + Inst = MCInst(); + Inst.setOpcode(Opcode); + Inst.addOperand(Saved); +} + +/// \brief If a movsx instruction has a shorter encoding for the used register +/// simplify the instruction to use it instead. 
+static void SimplifyMOVSX(MCInst &Inst) { + unsigned NewOpcode = 0; + unsigned Op0 = Inst.getOperand(0).getReg(), Op1 = Inst.getOperand(1).getReg(); + switch (Inst.getOpcode()) { + default: + llvm_unreachable("Unexpected instruction!"); + case X86::MOVSX16rr8: // movsbw %al, %ax --> cbtw + if (Op0 == X86::AX && Op1 == X86::AL) + NewOpcode = X86::CBW; + break; + case X86::MOVSX32rr16: // movswl %ax, %eax --> cwtl + if (Op0 == X86::EAX && Op1 == X86::AX) + NewOpcode = X86::CWDE; + break; + case X86::MOVSX64rr32: // movslq %eax, %rax --> cltq + if (Op0 == X86::RAX && Op1 == X86::EAX) + NewOpcode = X86::CDQE; + break; + } + + if (NewOpcode != 0) { + Inst = MCInst(); + Inst.setOpcode(NewOpcode); + } +} + +/// \brief Simplify things like MOV32rm to MOV32o32a. +static void SimplifyShortMoveForm(X86AsmPrinter &Printer, MCInst &Inst, + unsigned Opcode) { + // Don't make these simplifications in 64-bit mode; other assemblers don't + // perform them because they make the code larger. + if (Printer.getSubtarget().is64Bit()) + return; + + bool IsStore = Inst.getOperand(0).isReg() && Inst.getOperand(1).isReg(); + unsigned AddrBase = IsStore; + unsigned RegOp = IsStore ? 0 : 5; + unsigned AddrOp = AddrBase + 3; + assert(Inst.getNumOperands() == 6 && Inst.getOperand(RegOp).isReg() && + Inst.getOperand(AddrBase + X86::AddrBaseReg).isReg() && + Inst.getOperand(AddrBase + X86::AddrScaleAmt).isImm() && + Inst.getOperand(AddrBase + X86::AddrIndexReg).isReg() && + Inst.getOperand(AddrBase + X86::AddrSegmentReg).isReg() && + (Inst.getOperand(AddrOp).isExpr() || + Inst.getOperand(AddrOp).isImm()) && + "Unexpected instruction!"); + + // Check whether the destination register can be fixed. + unsigned Reg = Inst.getOperand(RegOp).getReg(); + if (Reg != X86::AL && Reg != X86::AX && Reg != X86::EAX && Reg != X86::RAX) + return; + + // Check whether this is an absolute address. + // FIXME: We know TLVP symbol refs aren't, but there should be a better way + // to do this here. + bool Absolute = true; + if (Inst.getOperand(AddrOp).isExpr()) { + const MCExpr *MCE = Inst.getOperand(AddrOp).getExpr(); + if (const MCSymbolRefExpr *SRE = dyn_cast<MCSymbolRefExpr>(MCE)) + if (SRE->getKind() == MCSymbolRefExpr::VK_TLVP) + Absolute = false; + } + + if (Absolute && + (Inst.getOperand(AddrBase + X86::AddrBaseReg).getReg() != 0 || + Inst.getOperand(AddrBase + X86::AddrScaleAmt).getImm() != 1 || + Inst.getOperand(AddrBase + X86::AddrIndexReg).getReg() != 0)) + return; + + // If so, rewrite the instruction. + MCOperand Saved = Inst.getOperand(AddrOp); + MCOperand Seg = Inst.getOperand(AddrBase + X86::AddrSegmentReg); + Inst = MCInst(); + Inst.setOpcode(Opcode); + Inst.addOperand(Saved); + Inst.addOperand(Seg); +} + +static unsigned getRetOpcode(const X86Subtarget &Subtarget) { + return Subtarget.is64Bit() ? X86::RETQ : X86::RETL; +} + +Optional<MCOperand> +X86MCInstLower::LowerMachineOperand(const MachineInstr *MI, + const MachineOperand &MO) const { + switch (MO.getType()) { + default: + MI->dump(); + llvm_unreachable("unknown operand type"); + case MachineOperand::MO_Register: + // Ignore all implicit register operands. 
+ if (MO.isImplicit()) + return None; + return MCOperand::createReg(MO.getReg()); + case MachineOperand::MO_Immediate: + return MCOperand::createImm(MO.getImm()); + case MachineOperand::MO_MachineBasicBlock: + case MachineOperand::MO_GlobalAddress: + case MachineOperand::MO_ExternalSymbol: + return LowerSymbolOperand(MO, GetSymbolFromOperand(MO)); + case MachineOperand::MO_MCSymbol: + return LowerSymbolOperand(MO, MO.getMCSymbol()); + case MachineOperand::MO_JumpTableIndex: + return LowerSymbolOperand(MO, AsmPrinter.GetJTISymbol(MO.getIndex())); + case MachineOperand::MO_ConstantPoolIndex: + return LowerSymbolOperand(MO, AsmPrinter.GetCPISymbol(MO.getIndex())); + case MachineOperand::MO_BlockAddress: + return LowerSymbolOperand( + MO, AsmPrinter.GetBlockAddressSymbol(MO.getBlockAddress())); + case MachineOperand::MO_RegisterMask: + // Ignore call clobbers. + return None; + } +} + +void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const { + OutMI.setOpcode(MI->getOpcode()); + + for (const MachineOperand &MO : MI->operands()) + if (auto MaybeMCOp = LowerMachineOperand(MI, MO)) + OutMI.addOperand(MaybeMCOp.getValue()); + + // Handle a few special cases to eliminate operand modifiers. +ReSimplify: + switch (OutMI.getOpcode()) { + case X86::LEA64_32r: + case X86::LEA64r: + case X86::LEA16r: + case X86::LEA32r: + // LEA should have a segment register, but it must be empty. + assert(OutMI.getNumOperands() == 1+X86::AddrNumOperands && + "Unexpected # of LEA operands"); + assert(OutMI.getOperand(1+X86::AddrSegmentReg).getReg() == 0 && + "LEA has segment specified!"); + break; + + // Commute operands to get a smaller encoding by using VEX.R instead of VEX.B + // if one of the registers is extended, but other isn't. + case X86::VMOVZPQILo2PQIrr: + case X86::VMOVAPDrr: + case X86::VMOVAPDYrr: + case X86::VMOVAPSrr: + case X86::VMOVAPSYrr: + case X86::VMOVDQArr: + case X86::VMOVDQAYrr: + case X86::VMOVDQUrr: + case X86::VMOVDQUYrr: + case X86::VMOVUPDrr: + case X86::VMOVUPDYrr: + case X86::VMOVUPSrr: + case X86::VMOVUPSYrr: { + if (!X86II::isX86_64ExtendedReg(OutMI.getOperand(0).getReg()) && + X86II::isX86_64ExtendedReg(OutMI.getOperand(1).getReg())) { + unsigned NewOpc; + switch (OutMI.getOpcode()) { + default: llvm_unreachable("Invalid opcode"); + case X86::VMOVZPQILo2PQIrr: NewOpc = X86::VMOVPQI2QIrr; break; + case X86::VMOVAPDrr: NewOpc = X86::VMOVAPDrr_REV; break; + case X86::VMOVAPDYrr: NewOpc = X86::VMOVAPDYrr_REV; break; + case X86::VMOVAPSrr: NewOpc = X86::VMOVAPSrr_REV; break; + case X86::VMOVAPSYrr: NewOpc = X86::VMOVAPSYrr_REV; break; + case X86::VMOVDQArr: NewOpc = X86::VMOVDQArr_REV; break; + case X86::VMOVDQAYrr: NewOpc = X86::VMOVDQAYrr_REV; break; + case X86::VMOVDQUrr: NewOpc = X86::VMOVDQUrr_REV; break; + case X86::VMOVDQUYrr: NewOpc = X86::VMOVDQUYrr_REV; break; + case X86::VMOVUPDrr: NewOpc = X86::VMOVUPDrr_REV; break; + case X86::VMOVUPDYrr: NewOpc = X86::VMOVUPDYrr_REV; break; + case X86::VMOVUPSrr: NewOpc = X86::VMOVUPSrr_REV; break; + case X86::VMOVUPSYrr: NewOpc = X86::VMOVUPSYrr_REV; break; + } + OutMI.setOpcode(NewOpc); + } + break; + } + case X86::VMOVSDrr: + case X86::VMOVSSrr: { + if (!X86II::isX86_64ExtendedReg(OutMI.getOperand(0).getReg()) && + X86II::isX86_64ExtendedReg(OutMI.getOperand(2).getReg())) { + unsigned NewOpc; + switch (OutMI.getOpcode()) { + default: llvm_unreachable("Invalid opcode"); + case X86::VMOVSDrr: NewOpc = X86::VMOVSDrr_REV; break; + case X86::VMOVSSrr: NewOpc = X86::VMOVSSrr_REV; break; + } + OutMI.setOpcode(NewOpc); + } + break; 
} + + // TAILJMPr64, CALL64r, CALL64pcrel32 - These instructions have register + // inputs modeled as normal uses instead of implicit uses. As such, truncate + // off all but the first operand (the callee). FIXME: Change isel. + case X86::TAILJMPr64: + case X86::TAILJMPr64_REX: + case X86::CALL64r: + case X86::CALL64pcrel32: { + unsigned Opcode = OutMI.getOpcode(); + MCOperand Saved = OutMI.getOperand(0); + OutMI = MCInst(); + OutMI.setOpcode(Opcode); + OutMI.addOperand(Saved); + break; + } + + case X86::EH_RETURN: + case X86::EH_RETURN64: { + OutMI = MCInst(); + OutMI.setOpcode(getRetOpcode(AsmPrinter.getSubtarget())); + break; + } + + case X86::CLEANUPRET: { + // Replace CLEANUPRET with the appropriate RET. + OutMI = MCInst(); + OutMI.setOpcode(getRetOpcode(AsmPrinter.getSubtarget())); + break; + } + + case X86::CATCHRET: { + // Replace CATCHRET with the appropriate RET. + const X86Subtarget &Subtarget = AsmPrinter.getSubtarget(); + unsigned ReturnReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX; + OutMI = MCInst(); + OutMI.setOpcode(getRetOpcode(Subtarget)); + OutMI.addOperand(MCOperand::createReg(ReturnReg)); + break; + } + + // TAILJMPr, TAILJMPd, TAILJMPd64 - Lower to the correct jump instructions. + case X86::TAILJMPr: + case X86::TAILJMPd: + case X86::TAILJMPd64: { + unsigned Opcode; + switch (OutMI.getOpcode()) { + default: llvm_unreachable("Invalid opcode"); + case X86::TAILJMPr: Opcode = X86::JMP32r; break; + case X86::TAILJMPd: + case X86::TAILJMPd64: Opcode = X86::JMP_1; break; + } + + MCOperand Saved = OutMI.getOperand(0); + OutMI = MCInst(); + OutMI.setOpcode(Opcode); + OutMI.addOperand(Saved); + break; + } + + case X86::DEC16r: + case X86::DEC32r: + case X86::INC16r: + case X86::INC32r: + // If we aren't in 64-bit mode we can use the 1-byte inc/dec instructions. + if (!AsmPrinter.getSubtarget().is64Bit()) { + unsigned Opcode; + switch (OutMI.getOpcode()) { + default: llvm_unreachable("Invalid opcode"); + case X86::DEC16r: Opcode = X86::DEC16r_alt; break; + case X86::DEC32r: Opcode = X86::DEC32r_alt; break; + case X86::INC16r: Opcode = X86::INC16r_alt; break; + case X86::INC32r: Opcode = X86::INC32r_alt; break; + } + OutMI.setOpcode(Opcode); + } + break; + + // These are pseudo-ops for OR to help with the OR->ADD transformation. We do + // this with an ugly goto in case the resultant OR uses EAX and needs the + // short form.
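A quick illustration of why lowering the ADD*_DB ("disjoint bits") pseudos to OR is safe: when the two operands share no set bits, the addition cannot generate a carry, so a + b and a | b produce the same value. A minimal check with hypothetical operand values (not taken from the diff):

#include <cassert>
#include <cstdint>

int main() {
  // Hypothetical operand values whose bit patterns are disjoint (a & b == 0).
  std::uint32_t a = 0xF0F0F0F0u, b = 0x01010101u;
  assert((a & b) == 0);
  // No overlapping bits means no carries, so ADD and OR agree; this is what
  // makes it legal to emit the ADD*_DB pseudos as plain ORs.
  assert((a + b) == (a | b));
  return 0;
}

The cases below then swap in the OR opcode and jump back to ReSimplify so the result can still be shrunk to the short EAX immediate form when applicable.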
+ case X86::ADD16rr_DB: OutMI.setOpcode(X86::OR16rr); goto ReSimplify; + case X86::ADD32rr_DB: OutMI.setOpcode(X86::OR32rr); goto ReSimplify; + case X86::ADD64rr_DB: OutMI.setOpcode(X86::OR64rr); goto ReSimplify; + case X86::ADD16ri_DB: OutMI.setOpcode(X86::OR16ri); goto ReSimplify; + case X86::ADD32ri_DB: OutMI.setOpcode(X86::OR32ri); goto ReSimplify; + case X86::ADD64ri32_DB: OutMI.setOpcode(X86::OR64ri32); goto ReSimplify; + case X86::ADD16ri8_DB: OutMI.setOpcode(X86::OR16ri8); goto ReSimplify; + case X86::ADD32ri8_DB: OutMI.setOpcode(X86::OR32ri8); goto ReSimplify; + case X86::ADD64ri8_DB: OutMI.setOpcode(X86::OR64ri8); goto ReSimplify; + + // Atomic load and store require a separate pseudo-inst because Acquire + // implies mayStore and Release implies mayLoad; fix these to regular MOV + // instructions here + case X86::ACQUIRE_MOV8rm: OutMI.setOpcode(X86::MOV8rm); goto ReSimplify; + case X86::ACQUIRE_MOV16rm: OutMI.setOpcode(X86::MOV16rm); goto ReSimplify; + case X86::ACQUIRE_MOV32rm: OutMI.setOpcode(X86::MOV32rm); goto ReSimplify; + case X86::ACQUIRE_MOV64rm: OutMI.setOpcode(X86::MOV64rm); goto ReSimplify; + case X86::RELEASE_MOV8mr: OutMI.setOpcode(X86::MOV8mr); goto ReSimplify; + case X86::RELEASE_MOV16mr: OutMI.setOpcode(X86::MOV16mr); goto ReSimplify; + case X86::RELEASE_MOV32mr: OutMI.setOpcode(X86::MOV32mr); goto ReSimplify; + case X86::RELEASE_MOV64mr: OutMI.setOpcode(X86::MOV64mr); goto ReSimplify; + case X86::RELEASE_MOV8mi: OutMI.setOpcode(X86::MOV8mi); goto ReSimplify; + case X86::RELEASE_MOV16mi: OutMI.setOpcode(X86::MOV16mi); goto ReSimplify; + case X86::RELEASE_MOV32mi: OutMI.setOpcode(X86::MOV32mi); goto ReSimplify; + case X86::RELEASE_MOV64mi32: OutMI.setOpcode(X86::MOV64mi32); goto ReSimplify; + case X86::RELEASE_ADD8mi: OutMI.setOpcode(X86::ADD8mi); goto ReSimplify; + case X86::RELEASE_ADD8mr: OutMI.setOpcode(X86::ADD8mr); goto ReSimplify; + case X86::RELEASE_ADD32mi: OutMI.setOpcode(X86::ADD32mi); goto ReSimplify; + case X86::RELEASE_ADD32mr: OutMI.setOpcode(X86::ADD32mr); goto ReSimplify; + case X86::RELEASE_ADD64mi32: OutMI.setOpcode(X86::ADD64mi32); goto ReSimplify; + case X86::RELEASE_ADD64mr: OutMI.setOpcode(X86::ADD64mr); goto ReSimplify; + case X86::RELEASE_AND8mi: OutMI.setOpcode(X86::AND8mi); goto ReSimplify; + case X86::RELEASE_AND8mr: OutMI.setOpcode(X86::AND8mr); goto ReSimplify; + case X86::RELEASE_AND32mi: OutMI.setOpcode(X86::AND32mi); goto ReSimplify; + case X86::RELEASE_AND32mr: OutMI.setOpcode(X86::AND32mr); goto ReSimplify; + case X86::RELEASE_AND64mi32: OutMI.setOpcode(X86::AND64mi32); goto ReSimplify; + case X86::RELEASE_AND64mr: OutMI.setOpcode(X86::AND64mr); goto ReSimplify; + case X86::RELEASE_OR8mi: OutMI.setOpcode(X86::OR8mi); goto ReSimplify; + case X86::RELEASE_OR8mr: OutMI.setOpcode(X86::OR8mr); goto ReSimplify; + case X86::RELEASE_OR32mi: OutMI.setOpcode(X86::OR32mi); goto ReSimplify; + case X86::RELEASE_OR32mr: OutMI.setOpcode(X86::OR32mr); goto ReSimplify; + case X86::RELEASE_OR64mi32: OutMI.setOpcode(X86::OR64mi32); goto ReSimplify; + case X86::RELEASE_OR64mr: OutMI.setOpcode(X86::OR64mr); goto ReSimplify; + case X86::RELEASE_XOR8mi: OutMI.setOpcode(X86::XOR8mi); goto ReSimplify; + case X86::RELEASE_XOR8mr: OutMI.setOpcode(X86::XOR8mr); goto ReSimplify; + case X86::RELEASE_XOR32mi: OutMI.setOpcode(X86::XOR32mi); goto ReSimplify; + case X86::RELEASE_XOR32mr: OutMI.setOpcode(X86::XOR32mr); goto ReSimplify; + case X86::RELEASE_XOR64mi32: OutMI.setOpcode(X86::XOR64mi32); goto ReSimplify; + case X86::RELEASE_XOR64mr: 
OutMI.setOpcode(X86::XOR64mr); goto ReSimplify; + case X86::RELEASE_INC8m: OutMI.setOpcode(X86::INC8m); goto ReSimplify; + case X86::RELEASE_INC16m: OutMI.setOpcode(X86::INC16m); goto ReSimplify; + case X86::RELEASE_INC32m: OutMI.setOpcode(X86::INC32m); goto ReSimplify; + case X86::RELEASE_INC64m: OutMI.setOpcode(X86::INC64m); goto ReSimplify; + case X86::RELEASE_DEC8m: OutMI.setOpcode(X86::DEC8m); goto ReSimplify; + case X86::RELEASE_DEC16m: OutMI.setOpcode(X86::DEC16m); goto ReSimplify; + case X86::RELEASE_DEC32m: OutMI.setOpcode(X86::DEC32m); goto ReSimplify; + case X86::RELEASE_DEC64m: OutMI.setOpcode(X86::DEC64m); goto ReSimplify; + + // We don't currently select the correct instruction form for instructions + // which have a short %eax, etc. form. Handle this by custom lowering, for + // now. + // + // Note, we are currently not handling the following instructions: + // MOV64ao8, MOV64o8a + // XCHG16ar, XCHG32ar, XCHG64ar + case X86::MOV8mr_NOREX: + case X86::MOV8mr: SimplifyShortMoveForm(AsmPrinter, OutMI, X86::MOV8o32a); break; + case X86::MOV8rm_NOREX: + case X86::MOV8rm: SimplifyShortMoveForm(AsmPrinter, OutMI, X86::MOV8ao32); break; + case X86::MOV16mr: SimplifyShortMoveForm(AsmPrinter, OutMI, X86::MOV16o32a); break; + case X86::MOV16rm: SimplifyShortMoveForm(AsmPrinter, OutMI, X86::MOV16ao32); break; + case X86::MOV32mr: SimplifyShortMoveForm(AsmPrinter, OutMI, X86::MOV32o32a); break; + case X86::MOV32rm: SimplifyShortMoveForm(AsmPrinter, OutMI, X86::MOV32ao32); break; + + case X86::ADC8ri: SimplifyShortImmForm(OutMI, X86::ADC8i8); break; + case X86::ADC16ri: SimplifyShortImmForm(OutMI, X86::ADC16i16); break; + case X86::ADC32ri: SimplifyShortImmForm(OutMI, X86::ADC32i32); break; + case X86::ADC64ri32: SimplifyShortImmForm(OutMI, X86::ADC64i32); break; + case X86::ADD8ri: SimplifyShortImmForm(OutMI, X86::ADD8i8); break; + case X86::ADD16ri: SimplifyShortImmForm(OutMI, X86::ADD16i16); break; + case X86::ADD32ri: SimplifyShortImmForm(OutMI, X86::ADD32i32); break; + case X86::ADD64ri32: SimplifyShortImmForm(OutMI, X86::ADD64i32); break; + case X86::AND8ri: SimplifyShortImmForm(OutMI, X86::AND8i8); break; + case X86::AND16ri: SimplifyShortImmForm(OutMI, X86::AND16i16); break; + case X86::AND32ri: SimplifyShortImmForm(OutMI, X86::AND32i32); break; + case X86::AND64ri32: SimplifyShortImmForm(OutMI, X86::AND64i32); break; + case X86::CMP8ri: SimplifyShortImmForm(OutMI, X86::CMP8i8); break; + case X86::CMP16ri: SimplifyShortImmForm(OutMI, X86::CMP16i16); break; + case X86::CMP32ri: SimplifyShortImmForm(OutMI, X86::CMP32i32); break; + case X86::CMP64ri32: SimplifyShortImmForm(OutMI, X86::CMP64i32); break; + case X86::OR8ri: SimplifyShortImmForm(OutMI, X86::OR8i8); break; + case X86::OR16ri: SimplifyShortImmForm(OutMI, X86::OR16i16); break; + case X86::OR32ri: SimplifyShortImmForm(OutMI, X86::OR32i32); break; + case X86::OR64ri32: SimplifyShortImmForm(OutMI, X86::OR64i32); break; + case X86::SBB8ri: SimplifyShortImmForm(OutMI, X86::SBB8i8); break; + case X86::SBB16ri: SimplifyShortImmForm(OutMI, X86::SBB16i16); break; + case X86::SBB32ri: SimplifyShortImmForm(OutMI, X86::SBB32i32); break; + case X86::SBB64ri32: SimplifyShortImmForm(OutMI, X86::SBB64i32); break; + case X86::SUB8ri: SimplifyShortImmForm(OutMI, X86::SUB8i8); break; + case X86::SUB16ri: SimplifyShortImmForm(OutMI, X86::SUB16i16); break; + case X86::SUB32ri: SimplifyShortImmForm(OutMI, X86::SUB32i32); break; + case X86::SUB64ri32: SimplifyShortImmForm(OutMI, X86::SUB64i32); break; + case X86::TEST8ri: 
SimplifyShortImmForm(OutMI, X86::TEST8i8); break; + case X86::TEST16ri: SimplifyShortImmForm(OutMI, X86::TEST16i16); break; + case X86::TEST32ri: SimplifyShortImmForm(OutMI, X86::TEST32i32); break; + case X86::TEST64ri32: SimplifyShortImmForm(OutMI, X86::TEST64i32); break; + case X86::XOR8ri: SimplifyShortImmForm(OutMI, X86::XOR8i8); break; + case X86::XOR16ri: SimplifyShortImmForm(OutMI, X86::XOR16i16); break; + case X86::XOR32ri: SimplifyShortImmForm(OutMI, X86::XOR32i32); break; + case X86::XOR64ri32: SimplifyShortImmForm(OutMI, X86::XOR64i32); break; + + // Try to shrink some forms of movsx. + case X86::MOVSX16rr8: + case X86::MOVSX32rr16: + case X86::MOVSX64rr32: + SimplifyMOVSX(OutMI); + break; + } +} + +void X86AsmPrinter::LowerTlsAddr(X86MCInstLower &MCInstLowering, + const MachineInstr &MI) { + + bool is64Bits = MI.getOpcode() == X86::TLS_addr64 || + MI.getOpcode() == X86::TLS_base_addr64; + + bool needsPadding = MI.getOpcode() == X86::TLS_addr64; + + MCContext &context = OutStreamer->getContext(); + + if (needsPadding) + EmitAndCountInstruction(MCInstBuilder(X86::DATA16_PREFIX)); + + MCSymbolRefExpr::VariantKind SRVK; + switch (MI.getOpcode()) { + case X86::TLS_addr32: + case X86::TLS_addr64: + SRVK = MCSymbolRefExpr::VK_TLSGD; + break; + case X86::TLS_base_addr32: + SRVK = MCSymbolRefExpr::VK_TLSLDM; + break; + case X86::TLS_base_addr64: + SRVK = MCSymbolRefExpr::VK_TLSLD; + break; + default: + llvm_unreachable("unexpected opcode"); + } + + MCSymbol *sym = MCInstLowering.GetSymbolFromOperand(MI.getOperand(3)); + const MCSymbolRefExpr *symRef = MCSymbolRefExpr::create(sym, SRVK, context); + + MCInst LEA; + if (is64Bits) { + LEA.setOpcode(X86::LEA64r); + LEA.addOperand(MCOperand::createReg(X86::RDI)); // dest + LEA.addOperand(MCOperand::createReg(X86::RIP)); // base + LEA.addOperand(MCOperand::createImm(1)); // scale + LEA.addOperand(MCOperand::createReg(0)); // index + LEA.addOperand(MCOperand::createExpr(symRef)); // disp + LEA.addOperand(MCOperand::createReg(0)); // seg + } else if (SRVK == MCSymbolRefExpr::VK_TLSLDM) { + LEA.setOpcode(X86::LEA32r); + LEA.addOperand(MCOperand::createReg(X86::EAX)); // dest + LEA.addOperand(MCOperand::createReg(X86::EBX)); // base + LEA.addOperand(MCOperand::createImm(1)); // scale + LEA.addOperand(MCOperand::createReg(0)); // index + LEA.addOperand(MCOperand::createExpr(symRef)); // disp + LEA.addOperand(MCOperand::createReg(0)); // seg + } else { + LEA.setOpcode(X86::LEA32r); + LEA.addOperand(MCOperand::createReg(X86::EAX)); // dest + LEA.addOperand(MCOperand::createReg(0)); // base + LEA.addOperand(MCOperand::createImm(1)); // scale + LEA.addOperand(MCOperand::createReg(X86::EBX)); // index + LEA.addOperand(MCOperand::createExpr(symRef)); // disp + LEA.addOperand(MCOperand::createReg(0)); // seg + } + EmitAndCountInstruction(LEA); + + if (needsPadding) { + EmitAndCountInstruction(MCInstBuilder(X86::DATA16_PREFIX)); + EmitAndCountInstruction(MCInstBuilder(X86::DATA16_PREFIX)); + EmitAndCountInstruction(MCInstBuilder(X86::REX64_PREFIX)); + } + + StringRef name = is64Bits ? "__tls_get_addr" : "___tls_get_addr"; + MCSymbol *tlsGetAddr = context.getOrCreateSymbol(name); + const MCSymbolRefExpr *tlsRef = + MCSymbolRefExpr::create(tlsGetAddr, + MCSymbolRefExpr::VK_PLT, + context); + + EmitAndCountInstruction(MCInstBuilder(is64Bits ? X86::CALL64pcrel32 + : X86::CALLpcrel32) + .addExpr(tlsRef)); +} + +/// \brief Emit the optimal amount of multi-byte nops on X86. 
+static void EmitNops(MCStreamer &OS, unsigned NumBytes, bool Is64Bit, const MCSubtargetInfo &STI) { + // This works only for 64bit. For 32bit we have to do additional checking if + // the CPU supports multi-byte nops. + assert(Is64Bit && "EmitNops only supports X86-64"); + while (NumBytes) { + unsigned Opc, BaseReg, ScaleVal, IndexReg, Displacement, SegmentReg; + Opc = IndexReg = Displacement = SegmentReg = 0; + BaseReg = X86::RAX; ScaleVal = 1; + switch (NumBytes) { + case 0: llvm_unreachable("Zero nops?"); break; + case 1: NumBytes -= 1; Opc = X86::NOOP; break; + case 2: NumBytes -= 2; Opc = X86::XCHG16ar; break; + case 3: NumBytes -= 3; Opc = X86::NOOPL; break; + case 4: NumBytes -= 4; Opc = X86::NOOPL; Displacement = 8; break; + case 5: NumBytes -= 5; Opc = X86::NOOPL; Displacement = 8; + IndexReg = X86::RAX; break; + case 6: NumBytes -= 6; Opc = X86::NOOPW; Displacement = 8; + IndexReg = X86::RAX; break; + case 7: NumBytes -= 7; Opc = X86::NOOPL; Displacement = 512; break; + case 8: NumBytes -= 8; Opc = X86::NOOPL; Displacement = 512; + IndexReg = X86::RAX; break; + case 9: NumBytes -= 9; Opc = X86::NOOPW; Displacement = 512; + IndexReg = X86::RAX; break; + default: NumBytes -= 10; Opc = X86::NOOPW; Displacement = 512; + IndexReg = X86::RAX; SegmentReg = X86::CS; break; + } + + unsigned NumPrefixes = std::min(NumBytes, 5U); + NumBytes -= NumPrefixes; + for (unsigned i = 0; i != NumPrefixes; ++i) + OS.EmitBytes("\x66"); + + switch (Opc) { + default: llvm_unreachable("Unexpected opcode"); break; + case X86::NOOP: + OS.EmitInstruction(MCInstBuilder(Opc), STI); + break; + case X86::XCHG16ar: + OS.EmitInstruction(MCInstBuilder(Opc).addReg(X86::AX), STI); + break; + case X86::NOOPL: + case X86::NOOPW: + OS.EmitInstruction(MCInstBuilder(Opc).addReg(BaseReg) + .addImm(ScaleVal).addReg(IndexReg) + .addImm(Displacement).addReg(SegmentReg), STI); + break; + } + } // while (NumBytes) +} + +void X86AsmPrinter::LowerSTATEPOINT(const MachineInstr &MI, + X86MCInstLower &MCIL) { + assert(Subtarget->is64Bit() && "Statepoint currently only supports X86-64"); + + StatepointOpers SOpers(&MI); + if (unsigned PatchBytes = SOpers.getNumPatchBytes()) { + EmitNops(*OutStreamer, PatchBytes, Subtarget->is64Bit(), + getSubtargetInfo()); + } else { + // Lower call target and choose correct opcode + const MachineOperand &CallTarget = SOpers.getCallTarget(); + MCOperand CallTargetMCOp; + unsigned CallOpcode; + switch (CallTarget.getType()) { + case MachineOperand::MO_GlobalAddress: + case MachineOperand::MO_ExternalSymbol: + CallTargetMCOp = MCIL.LowerSymbolOperand( + CallTarget, MCIL.GetSymbolFromOperand(CallTarget)); + CallOpcode = X86::CALL64pcrel32; + // Currently, we only support relative addressing with statepoints. + // Otherwise, we'll need a scratch register to hold the target + // address. You'll fail asserts during load & relocation if this + // symbol is to far away. (TODO: support non-relative addressing) + break; + case MachineOperand::MO_Immediate: + CallTargetMCOp = MCOperand::createImm(CallTarget.getImm()); + CallOpcode = X86::CALL64pcrel32; + // Currently, we only support relative addressing with statepoints. + // Otherwise, we'll need a scratch register to hold the target + // immediate. You'll fail asserts during load & relocation if this + // address is to far away. 
(TODO: support non-relative addressing) + break; + case MachineOperand::MO_Register: + CallTargetMCOp = MCOperand::createReg(CallTarget.getReg()); + CallOpcode = X86::CALL64r; + break; + default: + llvm_unreachable("Unsupported operand type in statepoint call target"); + break; + } + + // Emit call + MCInst CallInst; + CallInst.setOpcode(CallOpcode); + CallInst.addOperand(CallTargetMCOp); + OutStreamer->EmitInstruction(CallInst, getSubtargetInfo()); + } + + // Record our statepoint node in the same section used by STACKMAP + // and PATCHPOINT + SM.recordStatepoint(MI); +} + +void X86AsmPrinter::LowerFAULTING_LOAD_OP(const MachineInstr &MI, + X86MCInstLower &MCIL) { + // FAULTING_LOAD_OP <def>, <handler label>, <load opcode>, <load operands> + + unsigned LoadDefRegister = MI.getOperand(0).getReg(); + MCSymbol *HandlerLabel = MI.getOperand(1).getMCSymbol(); + unsigned LoadOpcode = MI.getOperand(2).getImm(); + unsigned LoadOperandsBeginIdx = 3; + + FM.recordFaultingOp(FaultMaps::FaultingLoad, HandlerLabel); + + MCInst LoadMI; + LoadMI.setOpcode(LoadOpcode); + + if (LoadDefRegister != X86::NoRegister) + LoadMI.addOperand(MCOperand::createReg(LoadDefRegister)); + + for (auto I = MI.operands_begin() + LoadOperandsBeginIdx, + E = MI.operands_end(); + I != E; ++I) + if (auto MaybeOperand = MCIL.LowerMachineOperand(&MI, *I)) + LoadMI.addOperand(MaybeOperand.getValue()); + + OutStreamer->EmitInstruction(LoadMI, getSubtargetInfo()); +} + +// Lower a stackmap of the form: +// <id>, <shadowBytes>, ... +void X86AsmPrinter::LowerSTACKMAP(const MachineInstr &MI) { + SMShadowTracker.emitShadowPadding(*OutStreamer, getSubtargetInfo()); + SM.recordStackMap(MI); + unsigned NumShadowBytes = MI.getOperand(1).getImm(); + SMShadowTracker.reset(NumShadowBytes); +} + +// Lower a patchpoint of the form: +// [<def>], <id>, <numBytes>, <target>, <numArgs>, <cc>, ... +void X86AsmPrinter::LowerPATCHPOINT(const MachineInstr &MI, + X86MCInstLower &MCIL) { + assert(Subtarget->is64Bit() && "Patchpoint currently only supports X86-64"); + + SMShadowTracker.emitShadowPadding(*OutStreamer, getSubtargetInfo()); + + SM.recordPatchPoint(MI); + + PatchPointOpers opers(&MI); + unsigned ScratchIdx = opers.getNextScratchIdx(); + unsigned EncodedBytes = 0; + const MachineOperand &CalleeMO = + opers.getMetaOper(PatchPointOpers::TargetPos); + + // Check for null target. If target is non-null (i.e. is non-zero or is + // symbolic) then emit a call. + if (!(CalleeMO.isImm() && !CalleeMO.getImm())) { + MCOperand CalleeMCOp; + switch (CalleeMO.getType()) { + default: + /// FIXME: Add a verifier check for bad callee types. + llvm_unreachable("Unrecognized callee operand type."); + case MachineOperand::MO_Immediate: + if (CalleeMO.getImm()) + CalleeMCOp = MCOperand::createImm(CalleeMO.getImm()); + break; + case MachineOperand::MO_ExternalSymbol: + case MachineOperand::MO_GlobalAddress: + CalleeMCOp = + MCIL.LowerSymbolOperand(CalleeMO, + MCIL.GetSymbolFromOperand(CalleeMO)); + break; + } + + // Emit MOV to materialize the target address and the CALL to target. + // This is encoded with 12-13 bytes, depending on which register is used. + unsigned ScratchReg = MI.getOperand(ScratchIdx).getReg(); + if (X86II::isX86_64ExtendedReg(ScratchReg)) + EncodedBytes = 13; + else + EncodedBytes = 12; + + EmitAndCountInstruction( + MCInstBuilder(X86::MOV64ri).addReg(ScratchReg).addOperand(CalleeMCOp)); + EmitAndCountInstruction(MCInstBuilder(X86::CALL64r).addReg(ScratchReg)); + } + + // Emit padding. 
+ unsigned NumBytes = opers.getMetaOper(PatchPointOpers::NBytesPos).getImm(); + assert(NumBytes >= EncodedBytes && + "Patchpoint can't request size less than the length of a call."); + + EmitNops(*OutStreamer, NumBytes - EncodedBytes, Subtarget->is64Bit(), + getSubtargetInfo()); +} + +// Returns instruction preceding MBBI in MachineFunction. +// If MBBI is the first instruction of the first basic block, returns null. +static MachineBasicBlock::const_iterator +PrevCrossBBInst(MachineBasicBlock::const_iterator MBBI) { + const MachineBasicBlock *MBB = MBBI->getParent(); + while (MBBI == MBB->begin()) { + if (MBB == MBB->getParent()->begin()) + return nullptr; + MBB = MBB->getPrevNode(); + MBBI = MBB->end(); + } + return --MBBI; +} + +static const Constant *getConstantFromPool(const MachineInstr &MI, + const MachineOperand &Op) { + if (!Op.isCPI()) + return nullptr; + + ArrayRef<MachineConstantPoolEntry> Constants = + MI.getParent()->getParent()->getConstantPool()->getConstants(); + const MachineConstantPoolEntry &ConstantEntry = + Constants[Op.getIndex()]; + + // Bail if this is a machine constant pool entry, we won't be able to dig out + // anything useful. + if (ConstantEntry.isMachineConstantPoolEntry()) + return nullptr; + + auto *C = dyn_cast<Constant>(ConstantEntry.Val.ConstVal); + assert((!C || ConstantEntry.getType() == C->getType()) && + "Expected a constant of the same type!"); + return C; +} + +static std::string getShuffleComment(const MachineOperand &DstOp, + const MachineOperand &SrcOp, + ArrayRef<int> Mask) { + std::string Comment; + + // Compute the name for a register. This is really goofy because we have + // multiple instruction printers that could (in theory) use different + // names. Fortunately most people use the ATT style (outside of Windows) + // and they actually agree on register naming here. Ultimately, this is + // a comment, and so its OK if it isn't perfect. + auto GetRegisterName = [](unsigned RegNum) -> StringRef { + return X86ATTInstPrinter::getRegisterName(RegNum); + }; + + StringRef DstName = DstOp.isReg() ? GetRegisterName(DstOp.getReg()) : "mem"; + StringRef SrcName = SrcOp.isReg() ? GetRegisterName(SrcOp.getReg()) : "mem"; + + raw_string_ostream CS(Comment); + CS << DstName << " = "; + bool NeedComma = false; + bool InSrc = false; + for (int M : Mask) { + // Wrap up any prior entry... + if (M == SM_SentinelZero && InSrc) { + InSrc = false; + CS << "]"; + } + if (NeedComma) + CS << ","; + else + NeedComma = true; + + // Print this shuffle... + if (M == SM_SentinelZero) { + CS << "zero"; + } else { + if (!InSrc) { + InSrc = true; + CS << SrcName << "["; + } + if (M == SM_SentinelUndef) + CS << "u"; + else + CS << M; + } + } + if (InSrc) + CS << "]"; + CS.flush(); + + return Comment; +} + +void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { + X86MCInstLower MCInstLowering(*MF, *this); + const X86RegisterInfo *RI = MF->getSubtarget<X86Subtarget>().getRegisterInfo(); + + switch (MI->getOpcode()) { + case TargetOpcode::DBG_VALUE: + llvm_unreachable("Should be handled target independently"); + + // Emit nothing here but a comment if we can. + case X86::Int_MemBarrier: + OutStreamer->emitRawComment("MEMBARRIER"); + return; + + + case X86::EH_RETURN: + case X86::EH_RETURN64: { + // Lower these as normal, but add some comments. 
+ unsigned Reg = MI->getOperand(0).getReg(); + OutStreamer->AddComment(StringRef("eh_return, addr: %") + + X86ATTInstPrinter::getRegisterName(Reg)); + break; + } + case X86::CLEANUPRET: { + // Lower these as normal, but add some comments. + OutStreamer->AddComment("CLEANUPRET"); + break; + } + + case X86::CATCHRET: { + // Lower these as normal, but add some comments. + OutStreamer->AddComment("CATCHRET"); + break; + } + + case X86::TAILJMPr: + case X86::TAILJMPm: + case X86::TAILJMPd: + case X86::TAILJMPr64: + case X86::TAILJMPm64: + case X86::TAILJMPd64: + case X86::TAILJMPr64_REX: + case X86::TAILJMPm64_REX: + case X86::TAILJMPd64_REX: + // Lower these as normal, but add some comments. + OutStreamer->AddComment("TAILCALL"); + break; + + case X86::TLS_addr32: + case X86::TLS_addr64: + case X86::TLS_base_addr32: + case X86::TLS_base_addr64: + return LowerTlsAddr(MCInstLowering, *MI); + + case X86::MOVPC32r: { + // This is a pseudo op for a two instruction sequence with a label, which + // looks like: + // call "L1$pb" + // "L1$pb": + // popl %esi + + // Emit the call. + MCSymbol *PICBase = MF->getPICBaseSymbol(); + // FIXME: We would like an efficient form for this, so we don't have to do a + // lot of extra uniquing. + EmitAndCountInstruction(MCInstBuilder(X86::CALLpcrel32) + .addExpr(MCSymbolRefExpr::create(PICBase, OutContext))); + + const X86FrameLowering* FrameLowering = + MF->getSubtarget<X86Subtarget>().getFrameLowering(); + bool hasFP = FrameLowering->hasFP(*MF); + + // TODO: This is needed only if we require precise CFA. + bool HasActiveDwarfFrame = OutStreamer->getNumFrameInfos() && + !OutStreamer->getDwarfFrameInfos().back().End; + + int stackGrowth = -RI->getSlotSize(); + + if (HasActiveDwarfFrame && !hasFP) { + OutStreamer->EmitCFIAdjustCfaOffset(-stackGrowth); + } + + // Emit the label. + OutStreamer->EmitLabel(PICBase); + + // popl $reg + EmitAndCountInstruction(MCInstBuilder(X86::POP32r) + .addReg(MI->getOperand(0).getReg())); + + if (HasActiveDwarfFrame && !hasFP) { + OutStreamer->EmitCFIAdjustCfaOffset(stackGrowth); + } + return; + } + + case X86::ADD32ri: { + // Lower the MO_GOT_ABSOLUTE_ADDRESS form of ADD32ri. + if (MI->getOperand(2).getTargetFlags() != X86II::MO_GOT_ABSOLUTE_ADDRESS) + break; + + // Okay, we have something like: + // EAX = ADD32ri EAX, MO_GOT_ABSOLUTE_ADDRESS(@MYGLOBAL) + + // For this, we want to print something like: + // MYGLOBAL + (. - PICBASE) + // However, we can't generate a ".", so just emit a new label here and refer + // to it. + MCSymbol *DotSym = OutContext.createTempSymbol(); + OutStreamer->EmitLabel(DotSym); + + // Now that we have emitted the label, lower the complex operand expression. 
+ MCSymbol *OpSym = MCInstLowering.GetSymbolFromOperand(MI->getOperand(2)); + + const MCExpr *DotExpr = MCSymbolRefExpr::create(DotSym, OutContext); + const MCExpr *PICBase = + MCSymbolRefExpr::create(MF->getPICBaseSymbol(), OutContext); + DotExpr = MCBinaryExpr::createSub(DotExpr, PICBase, OutContext); + + DotExpr = MCBinaryExpr::createAdd(MCSymbolRefExpr::create(OpSym,OutContext), + DotExpr, OutContext); + + EmitAndCountInstruction(MCInstBuilder(X86::ADD32ri) + .addReg(MI->getOperand(0).getReg()) + .addReg(MI->getOperand(1).getReg()) + .addExpr(DotExpr)); + return; + } + case TargetOpcode::STATEPOINT: + return LowerSTATEPOINT(*MI, MCInstLowering); + + case TargetOpcode::FAULTING_LOAD_OP: + return LowerFAULTING_LOAD_OP(*MI, MCInstLowering); + + case TargetOpcode::STACKMAP: + return LowerSTACKMAP(*MI); + + case TargetOpcode::PATCHPOINT: + return LowerPATCHPOINT(*MI, MCInstLowering); + + case X86::MORESTACK_RET: + EmitAndCountInstruction(MCInstBuilder(getRetOpcode(*Subtarget))); + return; + + case X86::MORESTACK_RET_RESTORE_R10: + // Return, then restore R10. + EmitAndCountInstruction(MCInstBuilder(getRetOpcode(*Subtarget))); + EmitAndCountInstruction(MCInstBuilder(X86::MOV64rr) + .addReg(X86::R10) + .addReg(X86::RAX)); + return; + + case X86::SEH_PushReg: + OutStreamer->EmitWinCFIPushReg(RI->getSEHRegNum(MI->getOperand(0).getImm())); + return; + + case X86::SEH_SaveReg: + OutStreamer->EmitWinCFISaveReg(RI->getSEHRegNum(MI->getOperand(0).getImm()), + MI->getOperand(1).getImm()); + return; + + case X86::SEH_SaveXMM: + OutStreamer->EmitWinCFISaveXMM(RI->getSEHRegNum(MI->getOperand(0).getImm()), + MI->getOperand(1).getImm()); + return; + + case X86::SEH_StackAlloc: + OutStreamer->EmitWinCFIAllocStack(MI->getOperand(0).getImm()); + return; + + case X86::SEH_SetFrame: + OutStreamer->EmitWinCFISetFrame(RI->getSEHRegNum(MI->getOperand(0).getImm()), + MI->getOperand(1).getImm()); + return; + + case X86::SEH_PushFrame: + OutStreamer->EmitWinCFIPushFrame(MI->getOperand(0).getImm()); + return; + + case X86::SEH_EndPrologue: + OutStreamer->EmitWinCFIEndProlog(); + return; + + case X86::SEH_Epilogue: { + MachineBasicBlock::const_iterator MBBI(MI); + // Check if preceded by a call and emit nop if so. + for (MBBI = PrevCrossBBInst(MBBI); MBBI; MBBI = PrevCrossBBInst(MBBI)) { + // Conservatively assume that pseudo instructions don't emit code and keep + // looking for a call. We may emit an unnecessary nop in some cases. + if (!MBBI->isPseudo()) { + if (MBBI->isCall()) + EmitAndCountInstruction(MCInstBuilder(X86::NOOP)); + break; + } + } + return; + } + + // Lower PSHUFB and VPERMILP normally but add a comment if we can find + // a constant shuffle mask. We won't be able to do this at the MC layer + // because the mask isn't an immediate. 
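+ // Illustrative example: for a PSHUFB whose mask is loaded from the constant
+ // pool, the verbose-asm comment produced via getShuffleComment() above might
+ // look like
+ //   xmm0 = xmm1[0,2,u,4],zero,zero
+ // where "u" marks an undef lane and "zero" marks a zeroed lane.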
+ case X86::PSHUFBrm: + case X86::VPSHUFBrm: + case X86::VPSHUFBYrm: + case X86::VPSHUFBZ128rm: + case X86::VPSHUFBZ128rmk: + case X86::VPSHUFBZ128rmkz: + case X86::VPSHUFBZ256rm: + case X86::VPSHUFBZ256rmk: + case X86::VPSHUFBZ256rmkz: + case X86::VPSHUFBZrm: + case X86::VPSHUFBZrmk: + case X86::VPSHUFBZrmkz: { + if (!OutStreamer->isVerboseAsm()) + break; + unsigned SrcIdx, MaskIdx; + switch (MI->getOpcode()) { + default: llvm_unreachable("Invalid opcode"); + case X86::PSHUFBrm: + case X86::VPSHUFBrm: + case X86::VPSHUFBYrm: + case X86::VPSHUFBZ128rm: + case X86::VPSHUFBZ256rm: + case X86::VPSHUFBZrm: + SrcIdx = 1; MaskIdx = 5; break; + case X86::VPSHUFBZ128rmkz: + case X86::VPSHUFBZ256rmkz: + case X86::VPSHUFBZrmkz: + SrcIdx = 2; MaskIdx = 6; break; + case X86::VPSHUFBZ128rmk: + case X86::VPSHUFBZ256rmk: + case X86::VPSHUFBZrmk: + SrcIdx = 3; MaskIdx = 7; break; + } + + assert(MI->getNumOperands() >= 6 && + "We should always have at least 6 operands!"); + const MachineOperand &DstOp = MI->getOperand(0); + const MachineOperand &SrcOp = MI->getOperand(SrcIdx); + const MachineOperand &MaskOp = MI->getOperand(MaskIdx); + + if (auto *C = getConstantFromPool(*MI, MaskOp)) { + SmallVector<int, 16> Mask; + DecodePSHUFBMask(C, Mask); + if (!Mask.empty()) + OutStreamer->AddComment(getShuffleComment(DstOp, SrcOp, Mask)); + } + break; + } + case X86::VPERMILPSrm: + case X86::VPERMILPDrm: + case X86::VPERMILPSYrm: + case X86::VPERMILPDYrm: { + if (!OutStreamer->isVerboseAsm()) + break; + assert(MI->getNumOperands() > 5 && + "We should always have at least 5 operands!"); + const MachineOperand &DstOp = MI->getOperand(0); + const MachineOperand &SrcOp = MI->getOperand(1); + const MachineOperand &MaskOp = MI->getOperand(5); + + unsigned ElSize; + switch (MI->getOpcode()) { + default: llvm_unreachable("Invalid opcode"); + case X86::VPERMILPSrm: case X86::VPERMILPSYrm: ElSize = 32; break; + case X86::VPERMILPDrm: case X86::VPERMILPDYrm: ElSize = 64; break; + } + + if (auto *C = getConstantFromPool(*MI, MaskOp)) { + SmallVector<int, 16> Mask; + DecodeVPERMILPMask(C, ElSize, Mask); + if (!Mask.empty()) + OutStreamer->AddComment(getShuffleComment(DstOp, SrcOp, Mask)); + } + break; + } + +#define MOV_CASE(Prefix, Suffix) \ + case X86::Prefix##MOVAPD##Suffix##rm: \ + case X86::Prefix##MOVAPS##Suffix##rm: \ + case X86::Prefix##MOVUPD##Suffix##rm: \ + case X86::Prefix##MOVUPS##Suffix##rm: \ + case X86::Prefix##MOVDQA##Suffix##rm: \ + case X86::Prefix##MOVDQU##Suffix##rm: + +#define MOV_AVX512_CASE(Suffix) \ + case X86::VMOVDQA64##Suffix##rm: \ + case X86::VMOVDQA32##Suffix##rm: \ + case X86::VMOVDQU64##Suffix##rm: \ + case X86::VMOVDQU32##Suffix##rm: \ + case X86::VMOVDQU16##Suffix##rm: \ + case X86::VMOVDQU8##Suffix##rm: \ + case X86::VMOVAPS##Suffix##rm: \ + case X86::VMOVAPD##Suffix##rm: \ + case X86::VMOVUPS##Suffix##rm: \ + case X86::VMOVUPD##Suffix##rm: + +#define CASE_ALL_MOV_RM() \ + MOV_CASE(, ) /* SSE */ \ + MOV_CASE(V, ) /* AVX-128 */ \ + MOV_CASE(V, Y) /* AVX-256 */ \ + MOV_AVX512_CASE(Z) \ + MOV_AVX512_CASE(Z256) \ + MOV_AVX512_CASE(Z128) + + // For loads from a constant pool to a vector register, print the constant + // loaded. 
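+ // Illustrative example: such a load may be annotated with the decoded
+ // constant, e.g.
+ //   xmm0 = [1,2,3,4]   (ConstantDataSequential)
+ //   ymm1 = <0,u,42>    (ConstantVector, "u" for undef elements)
+ // matching the formatting code below.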
+ CASE_ALL_MOV_RM() + if (!OutStreamer->isVerboseAsm()) + break; + if (MI->getNumOperands() > 4) + if (auto *C = getConstantFromPool(*MI, MI->getOperand(4))) { + std::string Comment; + raw_string_ostream CS(Comment); + const MachineOperand &DstOp = MI->getOperand(0); + CS << X86ATTInstPrinter::getRegisterName(DstOp.getReg()) << " = "; + if (auto *CDS = dyn_cast<ConstantDataSequential>(C)) { + CS << "["; + for (int i = 0, NumElements = CDS->getNumElements(); i < NumElements; ++i) { + if (i != 0) + CS << ","; + if (CDS->getElementType()->isIntegerTy()) + CS << CDS->getElementAsInteger(i); + else if (CDS->getElementType()->isFloatTy()) + CS << CDS->getElementAsFloat(i); + else if (CDS->getElementType()->isDoubleTy()) + CS << CDS->getElementAsDouble(i); + else + CS << "?"; + } + CS << "]"; + OutStreamer->AddComment(CS.str()); + } else if (auto *CV = dyn_cast<ConstantVector>(C)) { + CS << "<"; + for (int i = 0, NumOperands = CV->getNumOperands(); i < NumOperands; ++i) { + if (i != 0) + CS << ","; + Constant *COp = CV->getOperand(i); + if (isa<UndefValue>(COp)) { + CS << "u"; + } else if (auto *CI = dyn_cast<ConstantInt>(COp)) { + if (CI->getBitWidth() <= 64) { + CS << CI->getZExtValue(); + } else { + // print multi-word constant as (w0,w1) + auto Val = CI->getValue(); + CS << "("; + for (int i = 0, N = Val.getNumWords(); i < N; ++i) { + if (i > 0) + CS << ","; + CS << Val.getRawData()[i]; + } + CS << ")"; + } + } else if (auto *CF = dyn_cast<ConstantFP>(COp)) { + SmallString<32> Str; + CF->getValueAPF().toString(Str); + CS << Str; + } else { + CS << "?"; + } + } + CS << ">"; + OutStreamer->AddComment(CS.str()); + } + } + break; + } + + MCInst TmpInst; + MCInstLowering.Lower(MI, TmpInst); + + // Stackmap shadows cannot include branch targets, so we can count the bytes + // in a call towards the shadow, but must ensure that the no thread returns + // in to the stackmap shadow. The only way to achieve this is if the call + // is at the end of the shadow. + if (MI->isCall()) { + // Count then size of the call towards the shadow + SMShadowTracker.count(TmpInst, getSubtargetInfo()); + // Then flush the shadow so that we fill with nops before the call, not + // after it. + SMShadowTracker.emitShadowPadding(*OutStreamer, getSubtargetInfo()); + // Then emit the call + OutStreamer->EmitInstruction(TmpInst, getSubtargetInfo()); + return; + } + + EmitAndCountInstruction(TmpInst); +} diff --git a/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.cpp b/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.cpp new file mode 100644 index 0000000..c9e636f --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.cpp @@ -0,0 +1,33 @@ +//===-- X86MachineFunctionInfo.cpp - X86 machine function info ------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86MachineFunctionInfo.h"
+#include "X86RegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+using namespace llvm;
+
+void X86MachineFunctionInfo::anchor() { }
+
+void X86MachineFunctionInfo::setRestoreBasePointer(const MachineFunction *MF) {
+ if (!RestoreBasePointerOffset) {
+ const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
+ MF->getSubtarget().getRegisterInfo());
+ unsigned SlotSize = RegInfo->getSlotSize();
+ for (const MCPhysReg *CSR =
+ RegInfo->X86RegisterInfo::getCalleeSavedRegs(MF);
+ unsigned Reg = *CSR;
+ ++CSR)
+ {
+ if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg))
+ RestoreBasePointerOffset -= SlotSize;
+ }
+ }
+}
+
diff --git a/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.h b/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.h
new file mode 100644
index 0000000..3a7a98d
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.h
@@ -0,0 +1,167 @@
+//===-- X86MachineFunctionInfo.h - X86 machine function info ----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares X86-specific per-machine-function information.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_X86MACHINEFUNCTIONINFO_H
+#define LLVM_LIB_TARGET_X86_X86MACHINEFUNCTIONINFO_H
+
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineValueType.h"
+#include <vector>
+
+namespace llvm {
+
+/// X86MachineFunctionInfo - This class is derived from MachineFunctionInfo and
+/// contains private X86 target-specific information for each MachineFunction.
+class X86MachineFunctionInfo : public MachineFunctionInfo {
+ virtual void anchor();
+
+ /// ForceFramePointer - True if the function is required to use a frame
+ /// pointer for reasons other than that it contains dynamic allocations or
+ /// that FP elimination is turned off. For example, the Cygwin main function
+ /// contains stack pointer re-alignment code which requires FP.
+ bool ForceFramePointer = false;
+
+ /// RestoreBasePointerOffset - Non-zero if the function has a base pointer
+ /// and makes a call to llvm.eh.sjlj.setjmp. When non-zero, the value is a
+ /// displacement from the frame pointer to a slot where the base pointer
+ /// is stashed.
+ signed char RestoreBasePointerOffset = 0;
+
+ /// CalleeSavedFrameSize - Size of the callee-saved register portion of the
+ /// stack frame in bytes.
+ unsigned CalleeSavedFrameSize = 0;
+
+ /// BytesToPopOnReturn - Number of bytes the function pops on return (in
+ /// addition to the space used by the return address).
+ /// Used on Windows platforms for stdcall & fastcall name decoration.
+ unsigned BytesToPopOnReturn = 0;
+
+ /// ReturnAddrIndex - FrameIndex for return slot.
+ int ReturnAddrIndex = 0;
+
+ /// \brief FrameIndex for the frame address slot.
+ int FrameAddrIndex = 0;
+
+ /// TailCallReturnAddrDelta - The number of bytes by which the return address
+ /// stack slot is moved as the result of tail call optimization.
+ int TailCallReturnAddrDelta = 0;
+
+ /// SRetReturnReg - Some subtargets require that sret lowering includes
+ /// returning the value of the returned struct in a register.
This field + /// holds the virtual register into which the sret argument is passed. + unsigned SRetReturnReg = 0; + + /// GlobalBaseReg - keeps track of the virtual register initialized for + /// use as the global base register. This is used for PIC in some PIC + /// relocation models. + unsigned GlobalBaseReg = 0; + + /// VarArgsFrameIndex - FrameIndex for start of varargs area. + int VarArgsFrameIndex = 0; + /// RegSaveFrameIndex - X86-64 vararg func register save area. + int RegSaveFrameIndex = 0; + /// VarArgsGPOffset - X86-64 vararg func int reg offset. + unsigned VarArgsGPOffset = 0; + /// VarArgsFPOffset - X86-64 vararg func fp reg offset. + unsigned VarArgsFPOffset = 0; + /// ArgumentStackSize - The number of bytes on stack consumed by the arguments + /// being passed on the stack. + unsigned ArgumentStackSize = 0; + /// NumLocalDynamics - Number of local-dynamic TLS accesses. + unsigned NumLocalDynamics = 0; + /// HasPushSequences - Keeps track of whether this function uses sequences + /// of pushes to pass function parameters. + bool HasPushSequences = false; + + /// True if the function recovers from an SEH exception, and therefore needs + /// to spill and restore the frame pointer. + bool HasSEHFramePtrSave = false; + + /// The frame index of a stack object containing the original frame pointer + /// used to address arguments in a function using a base pointer. + int SEHFramePtrSaveIndex = 0; + +private: + /// ForwardedMustTailRegParms - A list of virtual and physical registers + /// that must be forwarded to every musttail call. + SmallVector<ForwardedRegister, 1> ForwardedMustTailRegParms; + +public: + X86MachineFunctionInfo() = default; + + explicit X86MachineFunctionInfo(MachineFunction &MF) {} + + bool getForceFramePointer() const { return ForceFramePointer;} + void setForceFramePointer(bool forceFP) { ForceFramePointer = forceFP; } + + bool getHasPushSequences() const { return HasPushSequences; } + void setHasPushSequences(bool HasPush) { HasPushSequences = HasPush; } + + bool getRestoreBasePointer() const { return RestoreBasePointerOffset!=0; } + void setRestoreBasePointer(const MachineFunction *MF); + int getRestoreBasePointerOffset() const {return RestoreBasePointerOffset; } + + unsigned getCalleeSavedFrameSize() const { return CalleeSavedFrameSize; } + void setCalleeSavedFrameSize(unsigned bytes) { CalleeSavedFrameSize = bytes; } + + unsigned getBytesToPopOnReturn() const { return BytesToPopOnReturn; } + void setBytesToPopOnReturn (unsigned bytes) { BytesToPopOnReturn = bytes;} + + int getRAIndex() const { return ReturnAddrIndex; } + void setRAIndex(int Index) { ReturnAddrIndex = Index; } + + int getFAIndex() const { return FrameAddrIndex; } + void setFAIndex(int Index) { FrameAddrIndex = Index; } + + int getTCReturnAddrDelta() const { return TailCallReturnAddrDelta; } + void setTCReturnAddrDelta(int delta) {TailCallReturnAddrDelta = delta;} + + unsigned getSRetReturnReg() const { return SRetReturnReg; } + void setSRetReturnReg(unsigned Reg) { SRetReturnReg = Reg; } + + unsigned getGlobalBaseReg() const { return GlobalBaseReg; } + void setGlobalBaseReg(unsigned Reg) { GlobalBaseReg = Reg; } + + int getVarArgsFrameIndex() const { return VarArgsFrameIndex; } + void setVarArgsFrameIndex(int Idx) { VarArgsFrameIndex = Idx; } + + int getRegSaveFrameIndex() const { return RegSaveFrameIndex; } + void setRegSaveFrameIndex(int Idx) { RegSaveFrameIndex = Idx; } + + unsigned getVarArgsGPOffset() const { return VarArgsGPOffset; } + void setVarArgsGPOffset(unsigned Offset) { 
VarArgsGPOffset = Offset; } + + unsigned getVarArgsFPOffset() const { return VarArgsFPOffset; } + void setVarArgsFPOffset(unsigned Offset) { VarArgsFPOffset = Offset; } + + unsigned getArgumentStackSize() const { return ArgumentStackSize; } + void setArgumentStackSize(unsigned size) { ArgumentStackSize = size; } + + unsigned getNumLocalDynamicTLSAccesses() const { return NumLocalDynamics; } + void incNumLocalDynamicTLSAccesses() { ++NumLocalDynamics; } + + bool getHasSEHFramePtrSave() const { return HasSEHFramePtrSave; } + void setHasSEHFramePtrSave(bool V) { HasSEHFramePtrSave = V; } + + int getSEHFramePtrSaveIndex() const { return SEHFramePtrSaveIndex; } + void setSEHFramePtrSaveIndex(int Index) { SEHFramePtrSaveIndex = Index; } + + SmallVectorImpl<ForwardedRegister> &getForwardedMustTailRegParms() { + return ForwardedMustTailRegParms; + } +}; + +} // End llvm namespace + +#endif diff --git a/contrib/llvm/lib/Target/X86/X86OptimizeLEAs.cpp b/contrib/llvm/lib/Target/X86/X86OptimizeLEAs.cpp new file mode 100644 index 0000000..58020d9 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86OptimizeLEAs.cpp @@ -0,0 +1,326 @@ +//===-- X86OptimizeLEAs.cpp - optimize usage of LEA instructions ----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the pass that performs some optimizations with LEA +// instructions in order to improve code size. +// Currently, it does one thing: +// 1) Address calculations in load and store instructions are replaced by +// existing LEA def registers where possible. +// +//===----------------------------------------------------------------------===// + +#include "X86.h" +#include "X86InstrInfo.h" +#include "X86Subtarget.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/LiveVariables.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/IR/Function.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetInstrInfo.h" + +using namespace llvm; + +#define DEBUG_TYPE "x86-optimize-LEAs" + +static cl::opt<bool> EnableX86LEAOpt("enable-x86-lea-opt", cl::Hidden, + cl::desc("X86: Enable LEA optimizations."), + cl::init(false)); + +STATISTIC(NumSubstLEAs, "Number of LEA instruction substitutions"); + +namespace { +class OptimizeLEAPass : public MachineFunctionPass { +public: + OptimizeLEAPass() : MachineFunctionPass(ID) {} + + const char *getPassName() const override { return "X86 LEA Optimize"; } + + /// \brief Loop over all of the basic blocks, replacing address + /// calculations in load and store instructions, if it's already + /// been calculated by LEA. Also, remove redundant LEAs. + bool runOnMachineFunction(MachineFunction &MF) override; + +private: + /// \brief Returns a distance between two instructions inside one basic block. + /// Negative result means, that instructions occur in reverse order. + int calcInstrDist(const MachineInstr &First, const MachineInstr &Last); + + /// \brief Choose the best \p LEA instruction from the \p List to replace + /// address calculation in \p MI instruction. Return the address displacement + /// and the distance between \p MI and the choosen \p LEA in \p AddrDispShift + /// and \p Dist. 
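+ /// For illustration (virtual register names are placeholders): if \p MI is
+ /// "%dst = MOV32rm %base, 1, %noreg, 16, %noreg" and the list holds
+ /// "%lea = LEA32r %base, 1, %noreg, 12, %noreg", the memory operands differ
+ /// only in displacement, so that LEA is a candidate with AddrDispShift = 4.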
+ bool chooseBestLEA(const SmallVectorImpl<MachineInstr *> &List, + const MachineInstr &MI, MachineInstr *&LEA, + int64_t &AddrDispShift, int &Dist); + + /// \brief Returns true if two machine operand are identical and they are not + /// physical registers. + bool isIdenticalOp(const MachineOperand &MO1, const MachineOperand &MO2); + + /// \brief Returns true if the instruction is LEA. + bool isLEA(const MachineInstr &MI); + + /// \brief Returns true if two instructions have memory operands that only + /// differ by displacement. The numbers of the first memory operands for both + /// instructions are specified through \p N1 and \p N2. The address + /// displacement is returned through AddrDispShift. + bool isSimilarMemOp(const MachineInstr &MI1, unsigned N1, + const MachineInstr &MI2, unsigned N2, + int64_t &AddrDispShift); + + /// \brief Find all LEA instructions in the basic block. + void findLEAs(const MachineBasicBlock &MBB, + SmallVectorImpl<MachineInstr *> &List); + + /// \brief Removes redundant address calculations. + bool removeRedundantAddrCalc(const SmallVectorImpl<MachineInstr *> &List); + + MachineRegisterInfo *MRI; + const X86InstrInfo *TII; + const X86RegisterInfo *TRI; + + static char ID; +}; +char OptimizeLEAPass::ID = 0; +} + +FunctionPass *llvm::createX86OptimizeLEAs() { return new OptimizeLEAPass(); } + +int OptimizeLEAPass::calcInstrDist(const MachineInstr &First, + const MachineInstr &Last) { + const MachineBasicBlock *MBB = First.getParent(); + + // Both instructions must be in the same basic block. + assert(Last.getParent() == MBB && + "Instructions are in different basic blocks"); + + return std::distance(MBB->begin(), MachineBasicBlock::const_iterator(&Last)) - + std::distance(MBB->begin(), MachineBasicBlock::const_iterator(&First)); +} + +// Find the best LEA instruction in the List to replace address recalculation in +// MI. Such LEA must meet these requirements: +// 1) The address calculated by the LEA differs only by the displacement from +// the address used in MI. +// 2) The register class of the definition of the LEA is compatible with the +// register class of the address base register of MI. +// 3) Displacement of the new memory operand should fit in 1 byte if possible. +// 4) The LEA should be as close to MI as possible, and prior to it if +// possible. +bool OptimizeLEAPass::chooseBestLEA(const SmallVectorImpl<MachineInstr *> &List, + const MachineInstr &MI, MachineInstr *&LEA, + int64_t &AddrDispShift, int &Dist) { + const MachineFunction *MF = MI.getParent()->getParent(); + const MCInstrDesc &Desc = MI.getDesc(); + int MemOpNo = X86II::getMemoryOperandNo(Desc.TSFlags, MI.getOpcode()) + + X86II::getOperandBias(Desc); + + LEA = nullptr; + + // Loop over all LEA instructions. + for (auto DefMI : List) { + int64_t AddrDispShiftTemp = 0; + + // Compare instructions memory operands. + if (!isSimilarMemOp(MI, MemOpNo, *DefMI, 1, AddrDispShiftTemp)) + continue; + + // Make sure address displacement fits 4 bytes. + if (!isInt<32>(AddrDispShiftTemp)) + continue; + + // Check that LEA def register can be used as MI address base. Some + // instructions can use a limited set of registers as address base, for + // example MOV8mr_NOREX. We could constrain the register class of the LEA + // def to suit MI, however since this case is very rare and hard to + // reproduce in a test it's just more reliable to skip the LEA. 
+ if (TII->getRegClass(Desc, MemOpNo + X86::AddrBaseReg, TRI, *MF) != + MRI->getRegClass(DefMI->getOperand(0).getReg())) + continue; + + // Choose the closest LEA instruction from the list, prior to MI if + // possible. Note that we took into account resulting address displacement + // as well. Also note that the list is sorted by the order in which the LEAs + // occur, so the break condition is pretty simple. + int DistTemp = calcInstrDist(*DefMI, MI); + assert(DistTemp != 0 && + "The distance between two different instructions cannot be zero"); + if (DistTemp > 0 || LEA == nullptr) { + // Do not update return LEA, if the current one provides a displacement + // which fits in 1 byte, while the new candidate does not. + if (LEA != nullptr && !isInt<8>(AddrDispShiftTemp) && + isInt<8>(AddrDispShift)) + continue; + + LEA = DefMI; + AddrDispShift = AddrDispShiftTemp; + Dist = DistTemp; + } + + // FIXME: Maybe we should not always stop at the first LEA after MI. + if (DistTemp < 0) + break; + } + + return LEA != nullptr; +} + +bool OptimizeLEAPass::isIdenticalOp(const MachineOperand &MO1, + const MachineOperand &MO2) { + return MO1.isIdenticalTo(MO2) && + (!MO1.isReg() || + !TargetRegisterInfo::isPhysicalRegister(MO1.getReg())); +} + +bool OptimizeLEAPass::isLEA(const MachineInstr &MI) { + unsigned Opcode = MI.getOpcode(); + return Opcode == X86::LEA16r || Opcode == X86::LEA32r || + Opcode == X86::LEA64r || Opcode == X86::LEA64_32r; +} + +// Check if MI1 and MI2 have memory operands which represent addresses that +// differ only by displacement. +bool OptimizeLEAPass::isSimilarMemOp(const MachineInstr &MI1, unsigned N1, + const MachineInstr &MI2, unsigned N2, + int64_t &AddrDispShift) { + // Address base, scale, index and segment operands must be identical. + static const int IdenticalOpNums[] = {X86::AddrBaseReg, X86::AddrScaleAmt, + X86::AddrIndexReg, X86::AddrSegmentReg}; + for (auto &N : IdenticalOpNums) + if (!isIdenticalOp(MI1.getOperand(N1 + N), MI2.getOperand(N2 + N))) + return false; + + // Address displacement operands may differ by a constant. + const MachineOperand *Op1 = &MI1.getOperand(N1 + X86::AddrDisp); + const MachineOperand *Op2 = &MI2.getOperand(N2 + X86::AddrDisp); + if (!isIdenticalOp(*Op1, *Op2)) { + if (Op1->isImm() && Op2->isImm()) + AddrDispShift = Op1->getImm() - Op2->getImm(); + else if (Op1->isGlobal() && Op2->isGlobal() && + Op1->getGlobal() == Op2->getGlobal()) + AddrDispShift = Op1->getOffset() - Op2->getOffset(); + else + return false; + } + + return true; +} + +void OptimizeLEAPass::findLEAs(const MachineBasicBlock &MBB, + SmallVectorImpl<MachineInstr *> &List) { + for (auto &MI : MBB) { + if (isLEA(MI)) + List.push_back(const_cast<MachineInstr *>(&MI)); + } +} + +// Try to find load and store instructions which recalculate addresses already +// calculated by some LEA and replace their memory operands with its def +// register. +bool OptimizeLEAPass::removeRedundantAddrCalc( + const SmallVectorImpl<MachineInstr *> &List) { + bool Changed = false; + + assert(List.size() > 0); + MachineBasicBlock *MBB = List[0]->getParent(); + + // Process all instructions in basic block. + for (auto I = MBB->begin(), E = MBB->end(); I != E;) { + MachineInstr &MI = *I++; + unsigned Opcode = MI.getOpcode(); + + // Instruction must be load or store. + if (!MI.mayLoadOrStore()) + continue; + + // Get the number of the first memory operand. 
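+ // (An X86 memory reference spans five consecutive operands: base register,
+ // scale, index register, displacement and segment register; these are the
+ // X86::AddrBaseReg ... X86::AddrSegmentReg offsets used below.)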
+ const MCInstrDesc &Desc = MI.getDesc(); + int MemOpNo = X86II::getMemoryOperandNo(Desc.TSFlags, Opcode); + + // If instruction has no memory operand - skip it. + if (MemOpNo < 0) + continue; + + MemOpNo += X86II::getOperandBias(Desc); + + // Get the best LEA instruction to replace address calculation. + MachineInstr *DefMI; + int64_t AddrDispShift; + int Dist; + if (!chooseBestLEA(List, MI, DefMI, AddrDispShift, Dist)) + continue; + + // If LEA occurs before current instruction, we can freely replace + // the instruction. If LEA occurs after, we can lift LEA above the + // instruction and this way to be able to replace it. Since LEA and the + // instruction have similar memory operands (thus, the same def + // instructions for these operands), we can always do that, without + // worries of using registers before their defs. + if (Dist < 0) { + DefMI->removeFromParent(); + MBB->insert(MachineBasicBlock::iterator(&MI), DefMI); + } + + // Since we can possibly extend register lifetime, clear kill flags. + MRI->clearKillFlags(DefMI->getOperand(0).getReg()); + + ++NumSubstLEAs; + DEBUG(dbgs() << "OptimizeLEAs: Candidate to replace: "; MI.dump();); + + // Change instruction operands. + MI.getOperand(MemOpNo + X86::AddrBaseReg) + .ChangeToRegister(DefMI->getOperand(0).getReg(), false); + MI.getOperand(MemOpNo + X86::AddrScaleAmt).ChangeToImmediate(1); + MI.getOperand(MemOpNo + X86::AddrIndexReg) + .ChangeToRegister(X86::NoRegister, false); + MI.getOperand(MemOpNo + X86::AddrDisp).ChangeToImmediate(AddrDispShift); + MI.getOperand(MemOpNo + X86::AddrSegmentReg) + .ChangeToRegister(X86::NoRegister, false); + + DEBUG(dbgs() << "OptimizeLEAs: Replaced by: "; MI.dump();); + + Changed = true; + } + + return Changed; +} + +bool OptimizeLEAPass::runOnMachineFunction(MachineFunction &MF) { + bool Changed = false; + + // Perform this optimization only if we care about code size. + if (!EnableX86LEAOpt || !MF.getFunction()->optForSize()) + return false; + + MRI = &MF.getRegInfo(); + TII = MF.getSubtarget<X86Subtarget>().getInstrInfo(); + TRI = MF.getSubtarget<X86Subtarget>().getRegisterInfo(); + + // Process all basic blocks. + for (auto &MBB : MF) { + SmallVector<MachineInstr *, 16> LEAs; + + // Find all LEA instructions in basic block. + findLEAs(MBB, LEAs); + + // If current basic block has no LEAs, move on to the next one. + if (LEAs.empty()) + continue; + + // Remove redundant address calculations. + Changed |= removeRedundantAddrCalc(LEAs); + } + + return Changed; +} diff --git a/contrib/llvm/lib/Target/X86/X86PadShortFunction.cpp b/contrib/llvm/lib/Target/X86/X86PadShortFunction.cpp new file mode 100644 index 0000000..0f425e2 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86PadShortFunction.cpp @@ -0,0 +1,213 @@ +//===-------- X86PadShortFunction.cpp - pad short functions -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the pass which will pad short functions to prevent +// a stall if a function returns before the return address is ready. This +// is needed for some Intel Atom processors. 
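+// For example, with the Threshold of 4 cycles used below, a block that
+// reaches its RET in 2 cycles receives Threshold - 2 = 2 rounds of NOOP
+// padding immediately before the RET (see addPadding).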
+// +//===----------------------------------------------------------------------===// + +#include <algorithm> + +#include "X86.h" +#include "X86InstrInfo.h" +#include "X86Subtarget.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/IR/Function.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetInstrInfo.h" + +using namespace llvm; + +#define DEBUG_TYPE "x86-pad-short-functions" + +STATISTIC(NumBBsPadded, "Number of basic blocks padded"); + +namespace { + struct VisitedBBInfo { + // HasReturn - Whether the BB contains a return instruction + bool HasReturn; + + // Cycles - Number of cycles until return if HasReturn is true, otherwise + // number of cycles until end of the BB + unsigned int Cycles; + + VisitedBBInfo() : HasReturn(false), Cycles(0) {} + VisitedBBInfo(bool HasReturn, unsigned int Cycles) + : HasReturn(HasReturn), Cycles(Cycles) {} + }; + + struct PadShortFunc : public MachineFunctionPass { + static char ID; + PadShortFunc() : MachineFunctionPass(ID) + , Threshold(4), STI(nullptr), TII(nullptr) {} + + bool runOnMachineFunction(MachineFunction &MF) override; + + const char *getPassName() const override { + return "X86 Atom pad short functions"; + } + + private: + void findReturns(MachineBasicBlock *MBB, + unsigned int Cycles = 0); + + bool cyclesUntilReturn(MachineBasicBlock *MBB, + unsigned int &Cycles); + + void addPadding(MachineBasicBlock *MBB, + MachineBasicBlock::iterator &MBBI, + unsigned int NOOPsToAdd); + + const unsigned int Threshold; + + // ReturnBBs - Maps basic blocks that return to the minimum number of + // cycles until the return, starting from the entry block. + DenseMap<MachineBasicBlock*, unsigned int> ReturnBBs; + + // VisitedBBs - Cache of previously visited BBs. + DenseMap<MachineBasicBlock*, VisitedBBInfo> VisitedBBs; + + const X86Subtarget *STI; + const TargetInstrInfo *TII; + }; + + char PadShortFunc::ID = 0; +} + +FunctionPass *llvm::createX86PadShortFunctions() { + return new PadShortFunc(); +} + +/// runOnMachineFunction - Loop over all of the basic blocks, inserting +/// NOOP instructions before early exits. +bool PadShortFunc::runOnMachineFunction(MachineFunction &MF) { + if (MF.getFunction()->optForSize()) { + return false; + } + + STI = &MF.getSubtarget<X86Subtarget>(); + if (!STI->padShortFunctions()) + return false; + + TII = STI->getInstrInfo(); + + // Search through basic blocks and mark the ones that have early returns + ReturnBBs.clear(); + VisitedBBs.clear(); + findReturns(&MF.front()); + + bool MadeChange = false; + + MachineBasicBlock *MBB; + unsigned int Cycles = 0; + + // Pad the identified basic blocks with NOOPs + for (DenseMap<MachineBasicBlock*, unsigned int>::iterator I = ReturnBBs.begin(); + I != ReturnBBs.end(); ++I) { + MBB = I->first; + Cycles = I->second; + + if (Cycles < Threshold) { + // BB ends in a return. Skip over any DBG_VALUE instructions + // trailing the terminator. 
+ assert(MBB->size() > 0 && + "Basic block should contain at least a RET but is empty"); + MachineBasicBlock::iterator ReturnLoc = --MBB->end(); + + while (ReturnLoc->isDebugValue()) + --ReturnLoc; + assert(ReturnLoc->isReturn() && !ReturnLoc->isCall() && + "Basic block does not end with RET"); + + addPadding(MBB, ReturnLoc, Threshold - Cycles); + NumBBsPadded++; + MadeChange = true; + } + } + + return MadeChange; +} + +/// findReturn - Starting at MBB, follow control flow and add all +/// basic blocks that contain a return to ReturnBBs. +void PadShortFunc::findReturns(MachineBasicBlock *MBB, unsigned int Cycles) { + // If this BB has a return, note how many cycles it takes to get there. + bool hasReturn = cyclesUntilReturn(MBB, Cycles); + if (Cycles >= Threshold) + return; + + if (hasReturn) { + ReturnBBs[MBB] = std::max(ReturnBBs[MBB], Cycles); + return; + } + + // Follow branches in BB and look for returns + for (MachineBasicBlock::succ_iterator I = MBB->succ_begin(); + I != MBB->succ_end(); ++I) { + if (*I == MBB) + continue; + findReturns(*I, Cycles); + } +} + +/// cyclesUntilReturn - return true if the MBB has a return instruction, +/// and return false otherwise. +/// Cycles will be incremented by the number of cycles taken to reach the +/// return or the end of the BB, whichever occurs first. +bool PadShortFunc::cyclesUntilReturn(MachineBasicBlock *MBB, + unsigned int &Cycles) { + // Return cached result if BB was previously visited + DenseMap<MachineBasicBlock*, VisitedBBInfo>::iterator it + = VisitedBBs.find(MBB); + if (it != VisitedBBs.end()) { + VisitedBBInfo BBInfo = it->second; + Cycles += BBInfo.Cycles; + return BBInfo.HasReturn; + } + + unsigned int CyclesToEnd = 0; + + for (MachineBasicBlock::iterator MBBI = MBB->begin(); + MBBI != MBB->end(); ++MBBI) { + MachineInstr *MI = MBBI; + // Mark basic blocks with a return instruction. Calls to other + // functions do not count because the called function will be padded, + // if necessary. + if (MI->isReturn() && !MI->isCall()) { + VisitedBBs[MBB] = VisitedBBInfo(true, CyclesToEnd); + Cycles += CyclesToEnd; + return true; + } + + CyclesToEnd += TII->getInstrLatency(STI->getInstrItineraryData(), MI); + } + + VisitedBBs[MBB] = VisitedBBInfo(false, CyclesToEnd); + Cycles += CyclesToEnd; + return false; +} + +/// addPadding - Add the given number of NOOP instructions to the function +/// just prior to the return at MBBI +void PadShortFunc::addPadding(MachineBasicBlock *MBB, + MachineBasicBlock::iterator &MBBI, + unsigned int NOOPsToAdd) { + DebugLoc DL = MBBI->getDebugLoc(); + + while (NOOPsToAdd-- > 0) { + BuildMI(*MBB, MBBI, DL, TII->get(X86::NOOP)); + BuildMI(*MBB, MBBI, DL, TII->get(X86::NOOP)); + } +} diff --git a/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp b/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp new file mode 100644 index 0000000..5840443 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp @@ -0,0 +1,639 @@ +//===-- X86RegisterInfo.cpp - X86 Register Information --------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the X86 implementation of the TargetRegisterInfo class. +// This file is responsible for the frame pointer elimination optimization +// on X86. 
+// +//===----------------------------------------------------------------------===// + +#include "X86RegisterInfo.h" +#include "X86FrameLowering.h" +#include "X86InstrBuilder.h" +#include "X86MachineFunctionInfo.h" +#include "X86Subtarget.h" +#include "X86TargetMachine.h" +#include "llvm/ADT/BitVector.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Type.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetOptions.h" + +using namespace llvm; + +#define GET_REGINFO_TARGET_DESC +#include "X86GenRegisterInfo.inc" + +static cl::opt<bool> +EnableBasePointer("x86-use-base-pointer", cl::Hidden, cl::init(true), + cl::desc("Enable use of a base pointer for complex stack frames")); + +X86RegisterInfo::X86RegisterInfo(const Triple &TT) + : X86GenRegisterInfo((TT.isArch64Bit() ? X86::RIP : X86::EIP), + X86_MC::getDwarfRegFlavour(TT, false), + X86_MC::getDwarfRegFlavour(TT, true), + (TT.isArch64Bit() ? X86::RIP : X86::EIP)) { + X86_MC::InitLLVM2SEHRegisterMapping(this); + + // Cache some information. + Is64Bit = TT.isArch64Bit(); + IsWin64 = Is64Bit && TT.isOSWindows(); + + // Use a callee-saved register as the base pointer. These registers must + // not conflict with any ABI requirements. For example, in 32-bit mode PIC + // requires GOT in the EBX register before function calls via PLT GOT pointer. + if (Is64Bit) { + SlotSize = 8; + // This matches the simplified 32-bit pointer code in the data layout + // computation. + // FIXME: Should use the data layout? + bool Use64BitReg = TT.getEnvironment() != Triple::GNUX32; + StackPtr = Use64BitReg ? X86::RSP : X86::ESP; + FramePtr = Use64BitReg ? X86::RBP : X86::EBP; + BasePtr = Use64BitReg ? X86::RBX : X86::EBX; + } else { + SlotSize = 4; + StackPtr = X86::ESP; + FramePtr = X86::EBP; + BasePtr = X86::ESI; + } +} + +bool +X86RegisterInfo::trackLivenessAfterRegAlloc(const MachineFunction &MF) const { + // ExeDepsFixer and PostRAScheduler require liveness. + return true; +} + +int +X86RegisterInfo::getSEHRegNum(unsigned i) const { + return getEncodingValue(i); +} + +const TargetRegisterClass * +X86RegisterInfo::getSubClassWithSubReg(const TargetRegisterClass *RC, + unsigned Idx) const { + // The sub_8bit sub-register index is more constrained in 32-bit mode. + // It behaves just like the sub_8bit_hi index. + if (!Is64Bit && Idx == X86::sub_8bit) + Idx = X86::sub_8bit_hi; + + // Forward to TableGen's default version. + return X86GenRegisterInfo::getSubClassWithSubReg(RC, Idx); +} + +const TargetRegisterClass * +X86RegisterInfo::getMatchingSuperRegClass(const TargetRegisterClass *A, + const TargetRegisterClass *B, + unsigned SubIdx) const { + // The sub_8bit sub-register index is more constrained in 32-bit mode. 
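+ // (Without a REX prefix, which exists only in 64-bit mode, just AL/BL/CL/DL
+ // are addressable as low 8-bit sub-registers, so the usable register set
+ // matches the one for sub_8bit_hi.)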
+ if (!Is64Bit && SubIdx == X86::sub_8bit) { + A = X86GenRegisterInfo::getSubClassWithSubReg(A, X86::sub_8bit_hi); + if (!A) + return nullptr; + } + return X86GenRegisterInfo::getMatchingSuperRegClass(A, B, SubIdx); +} + +const TargetRegisterClass * +X86RegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC, + const MachineFunction &MF) const { + // Don't allow super-classes of GR8_NOREX. This class is only used after + // extracting sub_8bit_hi sub-registers. The H sub-registers cannot be copied + // to the full GR8 register class in 64-bit mode, so we cannot allow the + // reigster class inflation. + // + // The GR8_NOREX class is always used in a way that won't be constrained to a + // sub-class, so sub-classes like GR8_ABCD_L are allowed to expand to the + // full GR8 class. + if (RC == &X86::GR8_NOREXRegClass) + return RC; + + const TargetRegisterClass *Super = RC; + TargetRegisterClass::sc_iterator I = RC->getSuperClasses(); + do { + switch (Super->getID()) { + case X86::GR8RegClassID: + case X86::GR16RegClassID: + case X86::GR32RegClassID: + case X86::GR64RegClassID: + case X86::FR32RegClassID: + case X86::FR64RegClassID: + case X86::RFP32RegClassID: + case X86::RFP64RegClassID: + case X86::RFP80RegClassID: + case X86::VR128RegClassID: + case X86::VR256RegClassID: + // Don't return a super-class that would shrink the spill size. + // That can happen with the vector and float classes. + if (Super->getSize() == RC->getSize()) + return Super; + } + Super = *I++; + } while (Super); + return RC; +} + +const TargetRegisterClass * +X86RegisterInfo::getPointerRegClass(const MachineFunction &MF, + unsigned Kind) const { + const X86Subtarget &Subtarget = MF.getSubtarget<X86Subtarget>(); + switch (Kind) { + default: llvm_unreachable("Unexpected Kind in getPointerRegClass!"); + case 0: // Normal GPRs. + if (Subtarget.isTarget64BitLP64()) + return &X86::GR64RegClass; + return &X86::GR32RegClass; + case 1: // Normal GPRs except the stack pointer (for encoding reasons). + if (Subtarget.isTarget64BitLP64()) + return &X86::GR64_NOSPRegClass; + return &X86::GR32_NOSPRegClass; + case 2: // NOREX GPRs. + if (Subtarget.isTarget64BitLP64()) + return &X86::GR64_NOREXRegClass; + return &X86::GR32_NOREXRegClass; + case 3: // NOREX GPRs except the stack pointer (for encoding reasons). + if (Subtarget.isTarget64BitLP64()) + return &X86::GR64_NOREX_NOSPRegClass; + return &X86::GR32_NOREX_NOSPRegClass; + case 4: // Available for tailcall (not callee-saved GPRs). + return getGPRsForTailCall(MF); + } +} + +const TargetRegisterClass * +X86RegisterInfo::getGPRsForTailCall(const MachineFunction &MF) const { + const Function *F = MF.getFunction(); + if (IsWin64 || (F && F->getCallingConv() == CallingConv::X86_64_Win64)) + return &X86::GR64_TCW64RegClass; + else if (Is64Bit) + return &X86::GR64_TCRegClass; + + bool hasHipeCC = (F ? F->getCallingConv() == CallingConv::HiPE : false); + if (hasHipeCC) + return &X86::GR32RegClass; + return &X86::GR32_TCRegClass; +} + +const TargetRegisterClass * +X86RegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const { + if (RC == &X86::CCRRegClass) { + if (Is64Bit) + return &X86::GR64RegClass; + else + return &X86::GR32RegClass; + } + return RC; +} + +unsigned +X86RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, + MachineFunction &MF) const { + const X86FrameLowering *TFI = getFrameLowering(MF); + + unsigned FPDiff = TFI->hasFP(MF) ? 
1 : 0; + switch (RC->getID()) { + default: + return 0; + case X86::GR32RegClassID: + return 4 - FPDiff; + case X86::GR64RegClassID: + return 12 - FPDiff; + case X86::VR128RegClassID: + return Is64Bit ? 10 : 4; + case X86::VR64RegClassID: + return 4; + } +} + +const MCPhysReg * +X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { + const X86Subtarget &Subtarget = MF->getSubtarget<X86Subtarget>(); + bool HasSSE = Subtarget.hasSSE1(); + bool HasAVX = Subtarget.hasAVX(); + bool HasAVX512 = Subtarget.hasAVX512(); + bool CallsEHReturn = MF->getMMI().callsEHReturn(); + + assert(MF && "MachineFunction required"); + switch (MF->getFunction()->getCallingConv()) { + case CallingConv::GHC: + case CallingConv::HiPE: + return CSR_NoRegs_SaveList; + case CallingConv::AnyReg: + if (HasAVX) + return CSR_64_AllRegs_AVX_SaveList; + return CSR_64_AllRegs_SaveList; + case CallingConv::PreserveMost: + return CSR_64_RT_MostRegs_SaveList; + case CallingConv::PreserveAll: + if (HasAVX) + return CSR_64_RT_AllRegs_AVX_SaveList; + return CSR_64_RT_AllRegs_SaveList; + case CallingConv::CXX_FAST_TLS: + if (Is64Bit) + return CSR_64_TLS_Darwin_SaveList; + break; + case CallingConv::Intel_OCL_BI: { + if (HasAVX512 && IsWin64) + return CSR_Win64_Intel_OCL_BI_AVX512_SaveList; + if (HasAVX512 && Is64Bit) + return CSR_64_Intel_OCL_BI_AVX512_SaveList; + if (HasAVX && IsWin64) + return CSR_Win64_Intel_OCL_BI_AVX_SaveList; + if (HasAVX && Is64Bit) + return CSR_64_Intel_OCL_BI_AVX_SaveList; + if (!HasAVX && !IsWin64 && Is64Bit) + return CSR_64_Intel_OCL_BI_SaveList; + break; + } + case CallingConv::HHVM: + return CSR_64_HHVM_SaveList; + case CallingConv::Cold: + if (Is64Bit) + return CSR_64_MostRegs_SaveList; + break; + case CallingConv::X86_64_Win64: + return CSR_Win64_SaveList; + case CallingConv::X86_64_SysV: + if (CallsEHReturn) + return CSR_64EHRet_SaveList; + return CSR_64_SaveList; + case CallingConv::X86_INTR: + if (Is64Bit) { + if (HasAVX) + return CSR_64_AllRegs_AVX_SaveList; + else + return CSR_64_AllRegs_SaveList; + } else { + if (HasSSE) + return CSR_32_AllRegs_SSE_SaveList; + else + return CSR_32_AllRegs_SaveList; + } + default: + break; + } + + if (Is64Bit) { + if (IsWin64) + return CSR_Win64_SaveList; + if (CallsEHReturn) + return CSR_64EHRet_SaveList; + return CSR_64_SaveList; + } + if (CallsEHReturn) + return CSR_32EHRet_SaveList; + return CSR_32_SaveList; +} + +const uint32_t * +X86RegisterInfo::getCallPreservedMask(const MachineFunction &MF, + CallingConv::ID CC) const { + const X86Subtarget &Subtarget = MF.getSubtarget<X86Subtarget>(); + bool HasSSE = Subtarget.hasSSE1(); + bool HasAVX = Subtarget.hasAVX(); + bool HasAVX512 = Subtarget.hasAVX512(); + + switch (CC) { + case CallingConv::GHC: + case CallingConv::HiPE: + return CSR_NoRegs_RegMask; + case CallingConv::AnyReg: + if (HasAVX) + return CSR_64_AllRegs_AVX_RegMask; + return CSR_64_AllRegs_RegMask; + case CallingConv::PreserveMost: + return CSR_64_RT_MostRegs_RegMask; + case CallingConv::PreserveAll: + if (HasAVX) + return CSR_64_RT_AllRegs_AVX_RegMask; + return CSR_64_RT_AllRegs_RegMask; + case CallingConv::CXX_FAST_TLS: + if (Is64Bit) + return CSR_64_TLS_Darwin_RegMask; + break; + case CallingConv::Intel_OCL_BI: { + if (HasAVX512 && IsWin64) + return CSR_Win64_Intel_OCL_BI_AVX512_RegMask; + if (HasAVX512 && Is64Bit) + return CSR_64_Intel_OCL_BI_AVX512_RegMask; + if (HasAVX && IsWin64) + return CSR_Win64_Intel_OCL_BI_AVX_RegMask; + if (HasAVX && Is64Bit) + return CSR_64_Intel_OCL_BI_AVX_RegMask; + if (!HasAVX && !IsWin64 && Is64Bit) 
+ return CSR_64_Intel_OCL_BI_RegMask;
+ break;
+ }
+ case CallingConv::HHVM:
+ return CSR_64_HHVM_RegMask;
+ case CallingConv::Cold:
+ if (Is64Bit)
+ return CSR_64_MostRegs_RegMask;
+ break;
+ case CallingConv::X86_64_Win64:
+ return CSR_Win64_RegMask;
+ case CallingConv::X86_64_SysV:
+ return CSR_64_RegMask;
+ case CallingConv::X86_INTR:
+ if (Is64Bit) {
+ if (HasAVX)
+ return CSR_64_AllRegs_AVX_RegMask;
+ else
+ return CSR_64_AllRegs_RegMask;
+ } else {
+ if (HasSSE)
+ return CSR_32_AllRegs_SSE_RegMask;
+ else
+ return CSR_32_AllRegs_RegMask;
+ }
+ default:
+ break;
+ }
+
+ // Unlike getCalleeSavedRegs(), we don't have MMI so we can't check
+ // callsEHReturn().
+ if (Is64Bit) {
+ if (IsWin64)
+ return CSR_Win64_RegMask;
+ return CSR_64_RegMask;
+ }
+ return CSR_32_RegMask;
+}
+
+const uint32_t*
+X86RegisterInfo::getNoPreservedMask() const {
+ return CSR_NoRegs_RegMask;
+}
+
+const uint32_t *X86RegisterInfo::getDarwinTLSCallPreservedMask() const {
+ return CSR_64_TLS_Darwin_RegMask;
+}
+
+BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+ BitVector Reserved(getNumRegs());
+ const X86FrameLowering *TFI = getFrameLowering(MF);
+
+ // Set the stack-pointer register and its aliases as reserved.
+ for (MCSubRegIterator I(X86::RSP, this, /*IncludeSelf=*/true); I.isValid();
+ ++I)
+ Reserved.set(*I);
+
+ // Set the instruction pointer register and its aliases as reserved.
+ for (MCSubRegIterator I(X86::RIP, this, /*IncludeSelf=*/true); I.isValid();
+ ++I)
+ Reserved.set(*I);
+
+ // Set the frame-pointer register and its aliases as reserved if needed.
+ if (TFI->hasFP(MF)) {
+ for (MCSubRegIterator I(X86::RBP, this, /*IncludeSelf=*/true); I.isValid();
+ ++I)
+ Reserved.set(*I);
+ }
+
+ // Set the base-pointer register and its aliases as reserved if needed.
+ if (hasBasePointer(MF)) {
+ CallingConv::ID CC = MF.getFunction()->getCallingConv();
+ const uint32_t *RegMask = getCallPreservedMask(MF, CC);
+ if (MachineOperand::clobbersPhysReg(RegMask, getBaseRegister()))
+ report_fatal_error(
+ "Stack realignment in presence of dynamic allocas is not supported with "
+ "this calling convention.");
+
+ unsigned BasePtr = getX86SubSuperRegister(getBaseRegister(), 64);
+ for (MCSubRegIterator I(BasePtr, this, /*IncludeSelf=*/true);
+ I.isValid(); ++I)
+ Reserved.set(*I);
+ }
+
+ // Mark the segment registers as reserved.
+ Reserved.set(X86::CS);
+ Reserved.set(X86::SS);
+ Reserved.set(X86::DS);
+ Reserved.set(X86::ES);
+ Reserved.set(X86::FS);
+ Reserved.set(X86::GS);
+
+ // Mark the floating point stack registers as reserved.
+ for (unsigned n = 0; n != 8; ++n)
+ Reserved.set(X86::ST0 + n);
+
+ // Reserve the registers that only exist in 64-bit mode.
+ if (!Is64Bit) {
+ // These 8-bit registers are part of the x86-64 extension even though their
+ // super-registers are the old 32-bit registers.
+ Reserved.set(X86::SIL);
+ Reserved.set(X86::DIL);
+ Reserved.set(X86::BPL);
+ Reserved.set(X86::SPL);
+
+ for (unsigned n = 0; n != 8; ++n) {
+ // R8, R9, ...
+ for (MCRegAliasIterator AI(X86::R8 + n, this, true); AI.isValid(); ++AI)
+ Reserved.set(*AI);
+
+ // XMM8, XMM9, ...
+ for (MCRegAliasIterator AI(X86::XMM8 + n, this, true); AI.isValid(); ++AI) + Reserved.set(*AI); + } + } + if (!Is64Bit || !MF.getSubtarget<X86Subtarget>().hasAVX512()) { + for (unsigned n = 16; n != 32; ++n) { + for (MCRegAliasIterator AI(X86::XMM0 + n, this, true); AI.isValid(); ++AI) + Reserved.set(*AI); + } + } + + return Reserved; +} + +void X86RegisterInfo::adjustStackMapLiveOutMask(uint32_t *Mask) const { + // Check if the EFLAGS register is marked as live-out. This shouldn't happen, + // because the calling convention defines the EFLAGS register as NOT + // preserved. + // + // Unfortunatelly the EFLAGS show up as live-out after branch folding. Adding + // an assert to track this and clear the register afterwards to avoid + // unnecessary crashes during release builds. + assert(!(Mask[X86::EFLAGS / 32] & (1U << (X86::EFLAGS % 32))) && + "EFLAGS are not live-out from a patchpoint."); + + // Also clean other registers that don't need preserving (IP). + for (auto Reg : {X86::EFLAGS, X86::RIP, X86::EIP, X86::IP}) + Mask[Reg / 32] &= ~(1U << (Reg % 32)); +} + +//===----------------------------------------------------------------------===// +// Stack Frame Processing methods +//===----------------------------------------------------------------------===// + +static bool CantUseSP(const MachineFrameInfo *MFI) { + return MFI->hasVarSizedObjects() || MFI->hasOpaqueSPAdjustment(); +} + +bool X86RegisterInfo::hasBasePointer(const MachineFunction &MF) const { + const MachineFrameInfo *MFI = MF.getFrameInfo(); + + if (!EnableBasePointer) + return false; + + // When we need stack realignment, we can't address the stack from the frame + // pointer. When we have dynamic allocas or stack-adjusting inline asm, we + // can't address variables from the stack pointer. MS inline asm can + // reference locals while also adjusting the stack pointer. When we can't + // use both the SP and the FP, we need a separate base pointer register. + bool CantUseFP = needsStackRealignment(MF); + return CantUseFP && CantUseSP(MFI); +} + +bool X86RegisterInfo::canRealignStack(const MachineFunction &MF) const { + if (!TargetRegisterInfo::canRealignStack(MF)) + return false; + + const MachineFrameInfo *MFI = MF.getFrameInfo(); + const MachineRegisterInfo *MRI = &MF.getRegInfo(); + + // Stack realignment requires a frame pointer. If we already started + // register allocation with frame pointer elimination, it is too late now. + if (!MRI->canReserveReg(FramePtr)) + return false; + + // If a base pointer is necessary. Check that it isn't too late to reserve + // it. + if (CantUseSP(MFI)) + return MRI->canReserveReg(BasePtr); + return true; +} + +bool X86RegisterInfo::hasReservedSpillSlot(const MachineFunction &MF, + unsigned Reg, int &FrameIdx) const { + // Since X86 defines assignCalleeSavedSpillSlots which always return true + // this function neither used nor tested. + llvm_unreachable("Unused function on X86. Otherwise need a test case."); +} + +void +X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, + int SPAdj, unsigned FIOperandNum, + RegScavenger *RS) const { + MachineInstr &MI = *II; + MachineFunction &MF = *MI.getParent()->getParent(); + const X86FrameLowering *TFI = getFrameLowering(MF); + int FrameIndex = MI.getOperand(FIOperandNum).getIndex(); + unsigned BasePtr; + + unsigned Opc = MI.getOpcode(); + bool AfterFPPop = Opc == X86::TAILJMPm64 || Opc == X86::TAILJMPm || + Opc == X86::TCRETURNmi || Opc == X86::TCRETURNmi64; + + if (hasBasePointer(MF)) + BasePtr = (FrameIndex < 0 ? 
FramePtr : getBaseRegister()); + else if (needsStackRealignment(MF)) + BasePtr = (FrameIndex < 0 ? FramePtr : StackPtr); + else if (AfterFPPop) + BasePtr = StackPtr; + else + BasePtr = (TFI->hasFP(MF) ? FramePtr : StackPtr); + + // LOCAL_ESCAPE uses a single offset, with no register. It only works in the + // simple FP case, and doesn't work with stack realignment. On 32-bit, the + // offset is from the traditional base pointer location. On 64-bit, the + // offset is from the SP at the end of the prologue, not the FP location. This + // matches the behavior of llvm.frameaddress. + unsigned IgnoredFrameReg; + if (Opc == TargetOpcode::LOCAL_ESCAPE) { + MachineOperand &FI = MI.getOperand(FIOperandNum); + int Offset; + Offset = TFI->getFrameIndexReference(MF, FrameIndex, IgnoredFrameReg); + FI.ChangeToImmediate(Offset); + return; + } + + // For LEA64_32r when BasePtr is 32-bits (X32) we can use full-size 64-bit + // register as source operand, semantic is the same and destination is + // 32-bits. It saves one byte per lea in code since 0x67 prefix is avoided. + if (Opc == X86::LEA64_32r && X86::GR32RegClass.contains(BasePtr)) + BasePtr = getX86SubSuperRegister(BasePtr, 64); + + // This must be part of a four operand memory reference. Replace the + // FrameIndex with base register with EBP. Add an offset to the offset. + MI.getOperand(FIOperandNum).ChangeToRegister(BasePtr, false); + + // Now add the frame object offset to the offset from EBP. + int FIOffset; + if (AfterFPPop) { + // Tail call jmp happens after FP is popped. + const MachineFrameInfo *MFI = MF.getFrameInfo(); + FIOffset = MFI->getObjectOffset(FrameIndex) - TFI->getOffsetOfLocalArea(); + } else + FIOffset = TFI->getFrameIndexReference(MF, FrameIndex, IgnoredFrameReg); + + if (BasePtr == StackPtr) + FIOffset += SPAdj; + + // The frame index format for stackmaps and patchpoints is different from the + // X86 format. It only has a FI and an offset. + if (Opc == TargetOpcode::STACKMAP || Opc == TargetOpcode::PATCHPOINT) { + assert(BasePtr == FramePtr && "Expected the FP as base register"); + int64_t Offset = MI.getOperand(FIOperandNum + 1).getImm() + FIOffset; + MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset); + return; + } + + if (MI.getOperand(FIOperandNum+3).isImm()) { + // Offset is a 32-bit integer. + int Imm = (int)(MI.getOperand(FIOperandNum + 3).getImm()); + int Offset = FIOffset + Imm; + assert((!Is64Bit || isInt<32>((long long)FIOffset + Imm)) && + "Requesting 64-bit offset in 32-bit immediate!"); + MI.getOperand(FIOperandNum + 3).ChangeToImmediate(Offset); + } else { + // Offset is symbolic. This is extremely rare. + uint64_t Offset = FIOffset + + (uint64_t)MI.getOperand(FIOperandNum+3).getOffset(); + MI.getOperand(FIOperandNum + 3).setOffset(Offset); + } +} + +unsigned X86RegisterInfo::getFrameRegister(const MachineFunction &MF) const { + const X86FrameLowering *TFI = getFrameLowering(MF); + return TFI->hasFP(MF) ? 
FramePtr : StackPtr; +} + +unsigned +X86RegisterInfo::getPtrSizedFrameRegister(const MachineFunction &MF) const { + const X86Subtarget &Subtarget = MF.getSubtarget<X86Subtarget>(); + unsigned FrameReg = getFrameRegister(MF); + if (Subtarget.isTarget64BitILP32()) + FrameReg = getX86SubSuperRegister(FrameReg, 32); + return FrameReg; +} + +unsigned llvm::get512BitSuperRegister(unsigned Reg) { + if (Reg >= X86::XMM0 && Reg <= X86::XMM31) + return X86::ZMM0 + (Reg - X86::XMM0); + if (Reg >= X86::YMM0 && Reg <= X86::YMM31) + return X86::ZMM0 + (Reg - X86::YMM0); + if (Reg >= X86::ZMM0 && Reg <= X86::ZMM31) + return Reg; + llvm_unreachable("Unexpected SIMD register"); +} diff --git a/contrib/llvm/lib/Target/X86/X86RegisterInfo.h b/contrib/llvm/lib/Target/X86/X86RegisterInfo.h new file mode 100644 index 0000000..f014c8f --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86RegisterInfo.h @@ -0,0 +1,143 @@ +//===-- X86RegisterInfo.h - X86 Register Information Impl -------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the X86 implementation of the TargetRegisterInfo class. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_X86_X86REGISTERINFO_H +#define LLVM_LIB_TARGET_X86_X86REGISTERINFO_H + +#include "llvm/Target/TargetRegisterInfo.h" + +#define GET_REGINFO_HEADER +#include "X86GenRegisterInfo.inc" + +namespace llvm { + class Triple; + +class X86RegisterInfo final : public X86GenRegisterInfo { +private: + /// Is64Bit - Is the target 64-bits. + /// + bool Is64Bit; + + /// IsWin64 - Is the target on of win64 flavours + /// + bool IsWin64; + + /// SlotSize - Stack slot size in bytes. + /// + unsigned SlotSize; + + /// StackPtr - X86 physical register used as stack ptr. + /// + unsigned StackPtr; + + /// FramePtr - X86 physical register used as frame ptr. + /// + unsigned FramePtr; + + /// BasePtr - X86 physical register used as a base ptr in complex stack + /// frames. I.e., when we need a 3rd base, not just SP and FP, due to + /// variable size stack objects. + unsigned BasePtr; + +public: + X86RegisterInfo(const Triple &TT); + + // FIXME: This should be tablegen'd like getDwarfRegNum is + int getSEHRegNum(unsigned i) const; + + /// Code Generation virtual methods... + /// + bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const override; + + /// getMatchingSuperRegClass - Return a subclass of the specified register + /// class A so that each register in it has a sub-register of the + /// specified sub-register index which is in the specified register class B. + const TargetRegisterClass * + getMatchingSuperRegClass(const TargetRegisterClass *A, + const TargetRegisterClass *B, + unsigned Idx) const override; + + const TargetRegisterClass * + getSubClassWithSubReg(const TargetRegisterClass *RC, + unsigned Idx) const override; + + const TargetRegisterClass * + getLargestLegalSuperClass(const TargetRegisterClass *RC, + const MachineFunction &MF) const override; + + /// getPointerRegClass - Returns a TargetRegisterClass used for pointer + /// values. + const TargetRegisterClass * + getPointerRegClass(const MachineFunction &MF, + unsigned Kind = 0) const override; + + /// getCrossCopyRegClass - Returns a legal register class to copy a register + /// in the specified class to or from. 
Returns NULL if it is not possible to copy + /// between two registers of the specified class. + const TargetRegisterClass * + getCrossCopyRegClass(const TargetRegisterClass *RC) const override; + + /// getGPRsForTailCall - Returns a register class with registers that can be + /// used in forming tail calls. + const TargetRegisterClass * + getGPRsForTailCall(const MachineFunction &MF) const; + + unsigned getRegPressureLimit(const TargetRegisterClass *RC, + MachineFunction &MF) const override; + + /// getCalleeSavedRegs - Return a null-terminated list of all of the + /// callee-save registers on this target. + const MCPhysReg * + getCalleeSavedRegs(const MachineFunction* MF) const override; + const uint32_t *getCallPreservedMask(const MachineFunction &MF, + CallingConv::ID) const override; + const uint32_t *getNoPreservedMask() const override; + + // Calls involved in thread-local variable lookup save more registers than + // normal calls, so they need a different mask to represent this. + const uint32_t *getDarwinTLSCallPreservedMask() const; + + /// getReservedRegs - Returns a bitset indexed by physical register number + /// indicating if a register is a special register that has particular uses and + /// should be considered unavailable at all times, e.g. SP, RA. This is used by + /// the register scavenger to determine what registers are free. + BitVector getReservedRegs(const MachineFunction &MF) const override; + + void adjustStackMapLiveOutMask(uint32_t *Mask) const override; + + bool hasBasePointer(const MachineFunction &MF) const; + + bool canRealignStack(const MachineFunction &MF) const override; + + bool hasReservedSpillSlot(const MachineFunction &MF, unsigned Reg, + int &FrameIdx) const override; + + void eliminateFrameIndex(MachineBasicBlock::iterator MI, + int SPAdj, unsigned FIOperandNum, + RegScavenger *RS = nullptr) const override; + + // Debug information queries. + unsigned getFrameRegister(const MachineFunction &MF) const override; + unsigned getPtrSizedFrameRegister(const MachineFunction &MF) const; + unsigned getStackRegister() const { return StackPtr; } + unsigned getBaseRegister() const { return BasePtr; } + // FIXME: Move to FrameInfo + unsigned getSlotSize() const { return SlotSize; } +}; + +// get512BitSuperRegister - X86 utility - returns 512-bit super register +unsigned get512BitSuperRegister(unsigned Reg); + +} // End llvm namespace + +#endif diff --git a/contrib/llvm/lib/Target/X86/X86RegisterInfo.td b/contrib/llvm/lib/Target/X86/X86RegisterInfo.td new file mode 100644 index 0000000..56f0d93 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86RegisterInfo.td @@ -0,0 +1,497 @@ +//===- X86RegisterInfo.td - Describe the X86 Register File --*- tablegen -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the X86 Register file, defining the registers themselves, +// aliases between the registers, and the register classes built out of the +// registers. +// +//===----------------------------------------------------------------------===// + +class X86Reg<string n, bits<16> Enc, list<Register> subregs = []> : Register<n> { + let Namespace = "X86"; + let HWEncoding = Enc; + let SubRegs = subregs; +} + +// Subregister indices.
+let Namespace = "X86" in { + def sub_8bit : SubRegIndex<8>; + def sub_8bit_hi : SubRegIndex<8, 8>; + def sub_16bit : SubRegIndex<16>; + def sub_32bit : SubRegIndex<32>; + def sub_xmm : SubRegIndex<128>; + def sub_ymm : SubRegIndex<256>; +} + +//===----------------------------------------------------------------------===// +// Register definitions... +// + +// In the register alias definitions below, we define which registers alias +// which others. We only specify which registers the small registers alias, +// because the register file generator is smart enough to figure out that +// AL aliases AX if we tell it that AX aliased AL (for example). + +// Dwarf numbering is different for 32-bit and 64-bit, and there are +// variations by target as well. Currently the first entry is for X86-64, +// second - for EH on X86-32/Darwin and third is 'generic' one (X86-32/Linux +// and debug information on X86-32/Darwin) + +// 8-bit registers +// Low registers +def AL : X86Reg<"al", 0>; +def DL : X86Reg<"dl", 2>; +def CL : X86Reg<"cl", 1>; +def BL : X86Reg<"bl", 3>; + +// High registers. On x86-64, these cannot be used in any instruction +// with a REX prefix. +def AH : X86Reg<"ah", 4>; +def DH : X86Reg<"dh", 6>; +def CH : X86Reg<"ch", 5>; +def BH : X86Reg<"bh", 7>; + +// X86-64 only, requires REX. +let CostPerUse = 1 in { +def SIL : X86Reg<"sil", 6>; +def DIL : X86Reg<"dil", 7>; +def BPL : X86Reg<"bpl", 5>; +def SPL : X86Reg<"spl", 4>; +def R8B : X86Reg<"r8b", 8>; +def R9B : X86Reg<"r9b", 9>; +def R10B : X86Reg<"r10b", 10>; +def R11B : X86Reg<"r11b", 11>; +def R12B : X86Reg<"r12b", 12>; +def R13B : X86Reg<"r13b", 13>; +def R14B : X86Reg<"r14b", 14>; +def R15B : X86Reg<"r15b", 15>; +} + +// 16-bit registers +let SubRegIndices = [sub_8bit, sub_8bit_hi], CoveredBySubRegs = 1 in { +def AX : X86Reg<"ax", 0, [AL,AH]>; +def DX : X86Reg<"dx", 2, [DL,DH]>; +def CX : X86Reg<"cx", 1, [CL,CH]>; +def BX : X86Reg<"bx", 3, [BL,BH]>; +} +let SubRegIndices = [sub_8bit] in { +def SI : X86Reg<"si", 6, [SIL]>; +def DI : X86Reg<"di", 7, [DIL]>; +def BP : X86Reg<"bp", 5, [BPL]>; +def SP : X86Reg<"sp", 4, [SPL]>; +} +def IP : X86Reg<"ip", 0>; + +// X86-64 only, requires REX. 
+let SubRegIndices = [sub_8bit], CostPerUse = 1 in { +def R8W : X86Reg<"r8w", 8, [R8B]>; +def R9W : X86Reg<"r9w", 9, [R9B]>; +def R10W : X86Reg<"r10w", 10, [R10B]>; +def R11W : X86Reg<"r11w", 11, [R11B]>; +def R12W : X86Reg<"r12w", 12, [R12B]>; +def R13W : X86Reg<"r13w", 13, [R13B]>; +def R14W : X86Reg<"r14w", 14, [R14B]>; +def R15W : X86Reg<"r15w", 15, [R15B]>; +} + +// 32-bit registers +let SubRegIndices = [sub_16bit] in { +def EAX : X86Reg<"eax", 0, [AX]>, DwarfRegNum<[-2, 0, 0]>; +def EDX : X86Reg<"edx", 2, [DX]>, DwarfRegNum<[-2, 2, 2]>; +def ECX : X86Reg<"ecx", 1, [CX]>, DwarfRegNum<[-2, 1, 1]>; +def EBX : X86Reg<"ebx", 3, [BX]>, DwarfRegNum<[-2, 3, 3]>; +def ESI : X86Reg<"esi", 6, [SI]>, DwarfRegNum<[-2, 6, 6]>; +def EDI : X86Reg<"edi", 7, [DI]>, DwarfRegNum<[-2, 7, 7]>; +def EBP : X86Reg<"ebp", 5, [BP]>, DwarfRegNum<[-2, 4, 5]>; +def ESP : X86Reg<"esp", 4, [SP]>, DwarfRegNum<[-2, 5, 4]>; +def EIP : X86Reg<"eip", 0, [IP]>, DwarfRegNum<[-2, 8, 8]>; + +// X86-64 only, requires REX +let CostPerUse = 1 in { +def R8D : X86Reg<"r8d", 8, [R8W]>; +def R9D : X86Reg<"r9d", 9, [R9W]>; +def R10D : X86Reg<"r10d", 10, [R10W]>; +def R11D : X86Reg<"r11d", 11, [R11W]>; +def R12D : X86Reg<"r12d", 12, [R12W]>; +def R13D : X86Reg<"r13d", 13, [R13W]>; +def R14D : X86Reg<"r14d", 14, [R14W]>; +def R15D : X86Reg<"r15d", 15, [R15W]>; +}} + +// 64-bit registers, X86-64 only +let SubRegIndices = [sub_32bit] in { +def RAX : X86Reg<"rax", 0, [EAX]>, DwarfRegNum<[0, -2, -2]>; +def RDX : X86Reg<"rdx", 2, [EDX]>, DwarfRegNum<[1, -2, -2]>; +def RCX : X86Reg<"rcx", 1, [ECX]>, DwarfRegNum<[2, -2, -2]>; +def RBX : X86Reg<"rbx", 3, [EBX]>, DwarfRegNum<[3, -2, -2]>; +def RSI : X86Reg<"rsi", 6, [ESI]>, DwarfRegNum<[4, -2, -2]>; +def RDI : X86Reg<"rdi", 7, [EDI]>, DwarfRegNum<[5, -2, -2]>; +def RBP : X86Reg<"rbp", 5, [EBP]>, DwarfRegNum<[6, -2, -2]>; +def RSP : X86Reg<"rsp", 4, [ESP]>, DwarfRegNum<[7, -2, -2]>; + +// These also require REX. +let CostPerUse = 1 in { +def R8 : X86Reg<"r8", 8, [R8D]>, DwarfRegNum<[ 8, -2, -2]>; +def R9 : X86Reg<"r9", 9, [R9D]>, DwarfRegNum<[ 9, -2, -2]>; +def R10 : X86Reg<"r10", 10, [R10D]>, DwarfRegNum<[10, -2, -2]>; +def R11 : X86Reg<"r11", 11, [R11D]>, DwarfRegNum<[11, -2, -2]>; +def R12 : X86Reg<"r12", 12, [R12D]>, DwarfRegNum<[12, -2, -2]>; +def R13 : X86Reg<"r13", 13, [R13D]>, DwarfRegNum<[13, -2, -2]>; +def R14 : X86Reg<"r14", 14, [R14D]>, DwarfRegNum<[14, -2, -2]>; +def R15 : X86Reg<"r15", 15, [R15D]>, DwarfRegNum<[15, -2, -2]>; +def RIP : X86Reg<"rip", 0, [EIP]>, DwarfRegNum<[16, -2, -2]>; +}} + +// MMX Registers. These are actually aliased to ST0 .. ST7 +def MM0 : X86Reg<"mm0", 0>, DwarfRegNum<[41, 29, 29]>; +def MM1 : X86Reg<"mm1", 1>, DwarfRegNum<[42, 30, 30]>; +def MM2 : X86Reg<"mm2", 2>, DwarfRegNum<[43, 31, 31]>; +def MM3 : X86Reg<"mm3", 3>, DwarfRegNum<[44, 32, 32]>; +def MM4 : X86Reg<"mm4", 4>, DwarfRegNum<[45, 33, 33]>; +def MM5 : X86Reg<"mm5", 5>, DwarfRegNum<[46, 34, 34]>; +def MM6 : X86Reg<"mm6", 6>, DwarfRegNum<[47, 35, 35]>; +def MM7 : X86Reg<"mm7", 7>, DwarfRegNum<[48, 36, 36]>; + +// Pseudo Floating Point registers +def FP0 : X86Reg<"fp0", 0>; +def FP1 : X86Reg<"fp1", 0>; +def FP2 : X86Reg<"fp2", 0>; +def FP3 : X86Reg<"fp3", 0>; +def FP4 : X86Reg<"fp4", 0>; +def FP5 : X86Reg<"fp5", 0>; +def FP6 : X86Reg<"fp6", 0>; +def FP7 : X86Reg<"fp7", 0>; + +// XMM Registers, used by the various SSE instruction set extensions. 
+def XMM0: X86Reg<"xmm0", 0>, DwarfRegNum<[17, 21, 21]>; +def XMM1: X86Reg<"xmm1", 1>, DwarfRegNum<[18, 22, 22]>; +def XMM2: X86Reg<"xmm2", 2>, DwarfRegNum<[19, 23, 23]>; +def XMM3: X86Reg<"xmm3", 3>, DwarfRegNum<[20, 24, 24]>; +def XMM4: X86Reg<"xmm4", 4>, DwarfRegNum<[21, 25, 25]>; +def XMM5: X86Reg<"xmm5", 5>, DwarfRegNum<[22, 26, 26]>; +def XMM6: X86Reg<"xmm6", 6>, DwarfRegNum<[23, 27, 27]>; +def XMM7: X86Reg<"xmm7", 7>, DwarfRegNum<[24, 28, 28]>; + +// X86-64 only +let CostPerUse = 1 in { +def XMM8: X86Reg<"xmm8", 8>, DwarfRegNum<[25, -2, -2]>; +def XMM9: X86Reg<"xmm9", 9>, DwarfRegNum<[26, -2, -2]>; +def XMM10: X86Reg<"xmm10", 10>, DwarfRegNum<[27, -2, -2]>; +def XMM11: X86Reg<"xmm11", 11>, DwarfRegNum<[28, -2, -2]>; +def XMM12: X86Reg<"xmm12", 12>, DwarfRegNum<[29, -2, -2]>; +def XMM13: X86Reg<"xmm13", 13>, DwarfRegNum<[30, -2, -2]>; +def XMM14: X86Reg<"xmm14", 14>, DwarfRegNum<[31, -2, -2]>; +def XMM15: X86Reg<"xmm15", 15>, DwarfRegNum<[32, -2, -2]>; + +def XMM16: X86Reg<"xmm16", 16>, DwarfRegNum<[60, -2, -2]>; +def XMM17: X86Reg<"xmm17", 17>, DwarfRegNum<[61, -2, -2]>; +def XMM18: X86Reg<"xmm18", 18>, DwarfRegNum<[62, -2, -2]>; +def XMM19: X86Reg<"xmm19", 19>, DwarfRegNum<[63, -2, -2]>; +def XMM20: X86Reg<"xmm20", 20>, DwarfRegNum<[64, -2, -2]>; +def XMM21: X86Reg<"xmm21", 21>, DwarfRegNum<[65, -2, -2]>; +def XMM22: X86Reg<"xmm22", 22>, DwarfRegNum<[66, -2, -2]>; +def XMM23: X86Reg<"xmm23", 23>, DwarfRegNum<[67, -2, -2]>; +def XMM24: X86Reg<"xmm24", 24>, DwarfRegNum<[68, -2, -2]>; +def XMM25: X86Reg<"xmm25", 25>, DwarfRegNum<[69, -2, -2]>; +def XMM26: X86Reg<"xmm26", 26>, DwarfRegNum<[70, -2, -2]>; +def XMM27: X86Reg<"xmm27", 27>, DwarfRegNum<[71, -2, -2]>; +def XMM28: X86Reg<"xmm28", 28>, DwarfRegNum<[72, -2, -2]>; +def XMM29: X86Reg<"xmm29", 29>, DwarfRegNum<[73, -2, -2]>; +def XMM30: X86Reg<"xmm30", 30>, DwarfRegNum<[74, -2, -2]>; +def XMM31: X86Reg<"xmm31", 31>, DwarfRegNum<[75, -2, -2]>; + +} // CostPerUse + +// YMM0-15 registers, used by AVX instructions and +// YMM16-31 registers, used by AVX-512 instructions. +let SubRegIndices = [sub_xmm] in { + foreach Index = 0-31 in { + def YMM#Index : X86Reg<"ymm"#Index, Index, [!cast<X86Reg>("XMM"#Index)]>, + DwarfRegAlias<!cast<X86Reg>("XMM"#Index)>; + } +} + +// ZMM Registers, used by AVX-512 instructions. +let SubRegIndices = [sub_ymm] in { + foreach Index = 0-31 in { + def ZMM#Index : X86Reg<"zmm"#Index, Index, [!cast<X86Reg>("YMM"#Index)]>, + DwarfRegAlias<!cast<X86Reg>("XMM"#Index)>; + } +} + +// Mask Registers, used by AVX-512 instructions. +def K0 : X86Reg<"k0", 0>, DwarfRegNum<[118, -2, -2]>; +def K1 : X86Reg<"k1", 1>, DwarfRegNum<[119, -2, -2]>; +def K2 : X86Reg<"k2", 2>, DwarfRegNum<[120, -2, -2]>; +def K3 : X86Reg<"k3", 3>, DwarfRegNum<[121, -2, -2]>; +def K4 : X86Reg<"k4", 4>, DwarfRegNum<[122, -2, -2]>; +def K5 : X86Reg<"k5", 5>, DwarfRegNum<[123, -2, -2]>; +def K6 : X86Reg<"k6", 6>, DwarfRegNum<[124, -2, -2]>; +def K7 : X86Reg<"k7", 7>, DwarfRegNum<[125, -2, -2]>; + +// Floating point stack registers. These don't map one-to-one to the FP +// pseudo registers, but we still mark them as aliasing FP registers. That +// way both kinds can be live without exceeding the stack depth. ST registers +// are only live around inline assembly. 
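The comment above and the vector definitions earlier in this file lean on the same idea: one physical register exposed through several views. ST(i) is marked as aliasing the FP pseudo registers, and each zmm register contains a ymm which in turn contains an xmm via the sub_ymm/sub_xmm indices. A minimal standalone C++ sketch of the vector case (names are illustrative and it is not part of the patch); the ST register definitions continue below:

#include <cassert>
#include <string>

// Vector register N is one physical register seen at three widths: xmmN is
// the low 128 bits of ymmN, which is the low 256 bits of zmmN. This mirrors
// what the sub_xmm/sub_ymm indices and get512BitSuperRegister express.
std::string vectorRegName(unsigned Index, unsigned Bits) {
  assert(Index < 32 && "AVX-512 exposes 32 vector registers");
  switch (Bits) {
  case 128: return "xmm" + std::to_string(Index);
  case 256: return "ymm" + std::to_string(Index);
  case 512: return "zmm" + std::to_string(Index);
  default:  assert(false && "vector views are 128, 256 or 512 bits wide");
  }
  return "";
}

int main() {
  // xmm7, ymm7 and zmm7 are all views of vector register number 7.
  return vectorRegName(7, 512) == "zmm7" ? 0 : 1;
}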
+def ST0 : X86Reg<"st(0)", 0>, DwarfRegNum<[33, 12, 11]>; +def ST1 : X86Reg<"st(1)", 1>, DwarfRegNum<[34, 13, 12]>; +def ST2 : X86Reg<"st(2)", 2>, DwarfRegNum<[35, 14, 13]>; +def ST3 : X86Reg<"st(3)", 3>, DwarfRegNum<[36, 15, 14]>; +def ST4 : X86Reg<"st(4)", 4>, DwarfRegNum<[37, 16, 15]>; +def ST5 : X86Reg<"st(5)", 5>, DwarfRegNum<[38, 17, 16]>; +def ST6 : X86Reg<"st(6)", 6>, DwarfRegNum<[39, 18, 17]>; +def ST7 : X86Reg<"st(7)", 7>, DwarfRegNum<[40, 19, 18]>; + +// Floating-point status word +def FPSW : X86Reg<"fpsw", 0>; + +// Status flags register +def EFLAGS : X86Reg<"flags", 0>; + +// Segment registers +def CS : X86Reg<"cs", 1>; +def DS : X86Reg<"ds", 3>; +def SS : X86Reg<"ss", 2>; +def ES : X86Reg<"es", 0>; +def FS : X86Reg<"fs", 4>; +def GS : X86Reg<"gs", 5>; + +// Debug registers +def DR0 : X86Reg<"dr0", 0>; +def DR1 : X86Reg<"dr1", 1>; +def DR2 : X86Reg<"dr2", 2>; +def DR3 : X86Reg<"dr3", 3>; +def DR4 : X86Reg<"dr4", 4>; +def DR5 : X86Reg<"dr5", 5>; +def DR6 : X86Reg<"dr6", 6>; +def DR7 : X86Reg<"dr7", 7>; +def DR8 : X86Reg<"dr8", 8>; +def DR9 : X86Reg<"dr9", 9>; +def DR10 : X86Reg<"dr10", 10>; +def DR11 : X86Reg<"dr11", 11>; +def DR12 : X86Reg<"dr12", 12>; +def DR13 : X86Reg<"dr13", 13>; +def DR14 : X86Reg<"dr14", 14>; +def DR15 : X86Reg<"dr15", 15>; + +// Control registers +def CR0 : X86Reg<"cr0", 0>; +def CR1 : X86Reg<"cr1", 1>; +def CR2 : X86Reg<"cr2", 2>; +def CR3 : X86Reg<"cr3", 3>; +def CR4 : X86Reg<"cr4", 4>; +def CR5 : X86Reg<"cr5", 5>; +def CR6 : X86Reg<"cr6", 6>; +def CR7 : X86Reg<"cr7", 7>; +def CR8 : X86Reg<"cr8", 8>; +def CR9 : X86Reg<"cr9", 9>; +def CR10 : X86Reg<"cr10", 10>; +def CR11 : X86Reg<"cr11", 11>; +def CR12 : X86Reg<"cr12", 12>; +def CR13 : X86Reg<"cr13", 13>; +def CR14 : X86Reg<"cr14", 14>; +def CR15 : X86Reg<"cr15", 15>; + +// Pseudo index registers +def EIZ : X86Reg<"eiz", 4>; +def RIZ : X86Reg<"riz", 4>; + +// Bound registers, used in MPX instructions +def BND0 : X86Reg<"bnd0", 0>; +def BND1 : X86Reg<"bnd1", 1>; +def BND2 : X86Reg<"bnd2", 2>; +def BND3 : X86Reg<"bnd3", 3>; + +//===----------------------------------------------------------------------===// +// Register Class Definitions... now that we have all of the pieces, define the +// top-level register classes. The order specified in the register list is +// implicitly defined to be the register allocation order. +// + +// List call-clobbered registers before callee-save registers. RBX, RBP, (and +// R12, R13, R14, and R15 for X86-64) are callee-save registers. +// In 64-mode, there are 12 additional i8 registers, SIL, DIL, BPL, SPL, and +// R8B, ... R15B. +// Allocate R12 and R13 last, as these require an extra byte when +// encoded in x86_64 instructions. +// FIXME: Allow AH, CH, DH, BH to be used as general-purpose registers in +// 64-bit mode. The main complication is that they cannot be encoded in an +// instruction requiring a REX prefix, while SIL, DIL, BPL, R8D, etc. +// require a REX prefix. For example, "addb %ah, %dil" and "movzbl %ah, %r8d" +// cannot be encoded. 
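The encoding restriction called out in the FIXME above amounts to a simple predicate: reaching SIL/DIL/BPL/SPL/R8B-R15B needs a REX prefix, while a REX prefix makes AH/BH/CH/DH unencodable, so the two groups can never meet in one instruction. A small standalone C++ sketch of that rule (illustrative only, not part of the patch); the GR8 class definition follows right after it:

#include <cassert>
#include <set>
#include <string>

// High-byte registers are only addressable *without* a REX prefix, while the
// byte registers introduced by x86-64 are only addressable *with* one.
static const std::set<std::string> HighByteRegs = {"ah", "bh", "ch", "dh"};
static const std::set<std::string> RexOnlyRegs = {
    "sil", "dil", "bpl", "spl", "r8b",  "r9b",  "r10b", "r11b",
    "r12b", "r13b", "r14b", "r15b"};

// True if the two 8-bit operands can appear together in one x86-64
// instruction; e.g. "addb %ah, %dil" mixes the groups and is rejected.
bool encodableTogether(const std::string &A, const std::string &B) {
  bool UsesHighByte = HighByteRegs.count(A) || HighByteRegs.count(B);
  bool NeedsRex = RexOnlyRegs.count(A) || RexOnlyRegs.count(B);
  return !(UsesHighByte && NeedsRex);
}

int main() {
  assert(encodableTogether("al", "dil"));   // fine: no high-byte register
  assert(!encodableTogether("ah", "dil"));  // the case from the comment above
  return 0;
}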
+def GR8 : RegisterClass<"X86", [i8], 8, + (add AL, CL, DL, AH, CH, DH, BL, BH, SIL, DIL, BPL, SPL, + R8B, R9B, R10B, R11B, R14B, R15B, R12B, R13B)> { + let AltOrders = [(sub GR8, AH, BH, CH, DH)]; + let AltOrderSelect = [{ + return MF.getSubtarget<X86Subtarget>().is64Bit(); + }]; +} + +def GR16 : RegisterClass<"X86", [i16], 16, + (add AX, CX, DX, SI, DI, BX, BP, SP, + R8W, R9W, R10W, R11W, R14W, R15W, R12W, R13W)>; + +def GR32 : RegisterClass<"X86", [i32], 32, + (add EAX, ECX, EDX, ESI, EDI, EBX, EBP, ESP, + R8D, R9D, R10D, R11D, R14D, R15D, R12D, R13D)>; + +// GR64 - 64-bit GPRs. This oddly includes RIP, which isn't accurate, since +// RIP isn't really a register and it can't be used anywhere except in an +// address, but it doesn't cause trouble. +def GR64 : RegisterClass<"X86", [i64], 64, + (add RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11, + RBX, R14, R15, R12, R13, RBP, RSP, RIP)>; + +// Segment registers for use by MOV instructions (and others) that have a +// segment register as one operand. Always contain a 16-bit segment +// descriptor. +def SEGMENT_REG : RegisterClass<"X86", [i16], 16, (add CS, DS, SS, ES, FS, GS)>; + +// Debug registers. +def DEBUG_REG : RegisterClass<"X86", [i32], 32, (sequence "DR%u", 0, 7)>; + +// Control registers. +def CONTROL_REG : RegisterClass<"X86", [i64], 64, (sequence "CR%u", 0, 15)>; + +// GR8_ABCD_L, GR8_ABCD_H, GR16_ABCD, GR32_ABCD, GR64_ABCD - Subclasses of +// GR8, GR16, GR32, and GR64 which contain just the "a" "b", "c", and "d" +// registers. On x86-32, GR16_ABCD and GR32_ABCD are classes for registers +// that support 8-bit subreg operations. On x86-64, GR16_ABCD, GR32_ABCD, +// and GR64_ABCD are classes for registers that support 8-bit h-register +// operations. +def GR8_ABCD_L : RegisterClass<"X86", [i8], 8, (add AL, CL, DL, BL)>; +def GR8_ABCD_H : RegisterClass<"X86", [i8], 8, (add AH, CH, DH, BH)>; +def GR16_ABCD : RegisterClass<"X86", [i16], 16, (add AX, CX, DX, BX)>; +def GR32_ABCD : RegisterClass<"X86", [i32], 32, (add EAX, ECX, EDX, EBX)>; +def GR64_ABCD : RegisterClass<"X86", [i64], 64, (add RAX, RCX, RDX, RBX)>; +def GR32_TC : RegisterClass<"X86", [i32], 32, (add EAX, ECX, EDX)>; +def GR64_TC : RegisterClass<"X86", [i64], 64, (add RAX, RCX, RDX, RSI, RDI, + R8, R9, R11, RIP)>; +def GR64_TCW64 : RegisterClass<"X86", [i64], 64, (add RAX, RCX, RDX, + R8, R9, R10, R11, RIP)>; + +// GR8_NOREX - GR8 registers which do not require a REX prefix. +def GR8_NOREX : RegisterClass<"X86", [i8], 8, + (add AL, CL, DL, AH, CH, DH, BL, BH)> { + let AltOrders = [(sub GR8_NOREX, AH, BH, CH, DH)]; + let AltOrderSelect = [{ + return MF.getSubtarget<X86Subtarget>().is64Bit(); + }]; +} +// GR16_NOREX - GR16 registers which do not require a REX prefix. +def GR16_NOREX : RegisterClass<"X86", [i16], 16, + (add AX, CX, DX, SI, DI, BX, BP, SP)>; +// GR32_NOREX - GR32 registers which do not require a REX prefix. +def GR32_NOREX : RegisterClass<"X86", [i32], 32, + (add EAX, ECX, EDX, ESI, EDI, EBX, EBP, ESP)>; +// GR64_NOREX - GR64 registers which do not require a REX prefix. +def GR64_NOREX : RegisterClass<"X86", [i64], 64, + (add RAX, RCX, RDX, RSI, RDI, RBX, RBP, RSP, RIP)>; + +// GR32_NOAX - GR32 registers except EAX. Used by AddRegFrm of XCHG32 in 64-bit +// mode to prevent encoding using the 0x90 NOP encoding. xchg %eax, %eax needs +// to clear upper 32-bits of RAX so is not a NOP. +def GR32_NOAX : RegisterClass<"X86", [i32], 32, (sub GR32, EAX)>; + +// GR32_NOSP - GR32 registers except ESP. 
+def GR32_NOSP : RegisterClass<"X86", [i32], 32, (sub GR32, ESP)>; + +// GR64_NOSP - GR64 registers except RSP (and RIP). +def GR64_NOSP : RegisterClass<"X86", [i64], 64, (sub GR64, RSP, RIP)>; + +// GR32_NOREX_NOSP - GR32 registers which do not require a REX prefix except +// ESP. +def GR32_NOREX_NOSP : RegisterClass<"X86", [i32], 32, + (and GR32_NOREX, GR32_NOSP)>; + +// GR64_NOREX_NOSP - GR64_NOREX registers except RSP. +def GR64_NOREX_NOSP : RegisterClass<"X86", [i64], 64, + (and GR64_NOREX, GR64_NOSP)>; + +// A class to support the 'A' assembler constraint: EAX then EDX. +def GR32_AD : RegisterClass<"X86", [i32], 32, (add EAX, EDX)>; + +// Scalar SSE2 floating point registers. +def FR32 : RegisterClass<"X86", [f32], 32, (sequence "XMM%u", 0, 15)>; + +def FR64 : RegisterClass<"X86", [f64], 64, (add FR32)>; + +def FR128 : RegisterClass<"X86", [i128, f128], 128, (add FR32)>; + + +// FIXME: This sets up the floating point register files as though they are f64 +// values, though they really are f80 values. This will cause us to spill +// values as 64-bit quantities instead of 80-bit quantities, which is much much +// faster on common hardware. In reality, this should be controlled by a +// command line option or something. + +def RFP32 : RegisterClass<"X86",[f32], 32, (sequence "FP%u", 0, 6)>; +def RFP64 : RegisterClass<"X86",[f64], 32, (add RFP32)>; +def RFP80 : RegisterClass<"X86",[f80], 32, (add RFP32)>; + +// Floating point stack registers (these are not allocatable by the +// register allocator - the floating point stackifier is responsible +// for transforming FPn allocations to STn registers) +def RST : RegisterClass<"X86", [f80, f64, f32], 32, (sequence "ST%u", 0, 7)> { + let isAllocatable = 0; +} + +// Generic vector registers: VR64 and VR128. +// Ensure that float types are declared first - only float is legal on SSE1. +def VR64: RegisterClass<"X86", [x86mmx], 64, (sequence "MM%u", 0, 7)>; +def VR128 : RegisterClass<"X86", [v4f32, v2f64, v16i8, v8i16, v4i32, v2i64], + 128, (add FR32)>; +def VR256 : RegisterClass<"X86", [v8f32, v4f64, v32i8, v16i16, v8i32, v4i64], + 256, (sequence "YMM%u", 0, 15)>; + +// Status flags registers. +def CCR : RegisterClass<"X86", [i32], 32, (add EFLAGS)> { + let CopyCost = -1; // Don't allow copying of status registers. + let isAllocatable = 0; +} +def FPCCR : RegisterClass<"X86", [i16], 16, (add FPSW)> { + let CopyCost = -1; // Don't allow copying of status registers. + let isAllocatable = 0; +} + +// AVX-512 vector/mask registers. +def VR512 : RegisterClass<"X86", [v16f32, v8f64, v64i8, v32i16, v16i32, v8i64], + 512, (sequence "ZMM%u", 0, 31)>; + +// Scalar AVX-512 floating point registers. 
+def FR32X : RegisterClass<"X86", [f32], 32, (sequence "XMM%u", 0, 31)>; + +def FR64X : RegisterClass<"X86", [f64], 64, (add FR32X)>; + +// Extended VR128 and VR256 for AVX-512 instructions +def VR128X : RegisterClass<"X86", [v4f32, v2f64, v16i8, v8i16, v4i32, v2i64], + 128, (add FR32X)>; +def VR256X : RegisterClass<"X86", [v8f32, v4f64, v32i8, v16i16, v8i32, v4i64], + 256, (sequence "YMM%u", 0, 31)>; + +// Mask registers +def VK1 : RegisterClass<"X86", [i1], 8, (sequence "K%u", 0, 7)> {let Size = 8;} +def VK2 : RegisterClass<"X86", [v2i1], 8, (add VK1)> {let Size = 8;} +def VK4 : RegisterClass<"X86", [v4i1], 8, (add VK2)> {let Size = 8;} +def VK8 : RegisterClass<"X86", [v8i1], 8, (add VK4)> {let Size = 8;} +def VK16 : RegisterClass<"X86", [v16i1], 16, (add VK8)> {let Size = 16;} +def VK32 : RegisterClass<"X86", [v32i1], 32, (add VK16)> {let Size = 32;} +def VK64 : RegisterClass<"X86", [v64i1], 64, (add VK32)> {let Size = 64;} + +def VK1WM : RegisterClass<"X86", [i1], 8, (sub VK1, K0)> {let Size = 8;} +def VK2WM : RegisterClass<"X86", [v2i1], 8, (sub VK2, K0)> {let Size = 8;} +def VK4WM : RegisterClass<"X86", [v4i1], 8, (sub VK4, K0)> {let Size = 8;} +def VK8WM : RegisterClass<"X86", [v8i1], 8, (sub VK8, K0)> {let Size = 8;} +def VK16WM : RegisterClass<"X86", [v16i1], 16, (add VK8WM)> {let Size = 16;} +def VK32WM : RegisterClass<"X86", [v32i1], 32, (add VK16WM)> {let Size = 32;} +def VK64WM : RegisterClass<"X86", [v64i1], 64, (add VK32WM)> {let Size = 64;} + +// Bound registers +def BNDR : RegisterClass<"X86", [v2i64], 128, (sequence "BND%u", 0, 3)>; diff --git a/contrib/llvm/lib/Target/X86/X86SchedHaswell.td b/contrib/llvm/lib/Target/X86/X86SchedHaswell.td new file mode 100644 index 0000000..677e824 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86SchedHaswell.td @@ -0,0 +1,2147 @@ +//=- X86SchedHaswell.td - X86 Haswell Scheduling -------------*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the machine model for Haswell to support instruction +// scheduling and other instruction cost heuristics. +// +//===----------------------------------------------------------------------===// + +def HaswellModel : SchedMachineModel { + // All x86 instructions are modeled as a single micro-op, and HW can decode 4 + // instructions per cycle. + let IssueWidth = 4; + let MicroOpBufferSize = 192; // Based on the reorder buffer. + let LoadLatency = 4; + let MispredictPenalty = 16; + + // Based on the LSD (loop-stream detector) queue size and benchmarking data. + let LoopMicroOpBufferSize = 50; + + // FIXME: SSE4 and AVX are unimplemented. This flag is set to allow + // the scheduler to assign a default model to unrecognized opcodes. + let CompleteModel = 0; +} + +let SchedModel = HaswellModel in { + +// Haswell can issue micro-ops to 8 different ports in one cycle. + +// Ports 0, 1, 5, and 6 handle all computation. +// Port 4 gets the data half of stores. Store data can be available later than +// the store address, but since we don't model the latency of stores, we can +// ignore that. +// Ports 2 and 3 are identical. They handle loads and the address half of +// stores. Port 7 can handle address calculations. 
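Before the port definitions below: the grouping described in that comment is essentially a per-micro-op bitmask, where a ProcResGroup such as HWPort06 means "either port 0 or port 6". A small standalone C++ sketch of that idea (illustrative only, not part of the patch):

#include <bitset>
#include <initializer_list>

// One bit per Haswell dispatch port; a port group is the OR of its members,
// so a micro-op tagged HWPort06 may issue on either port 0 or port 6.
using PortMask = std::bitset<8>;

PortMask makeGroup(std::initializer_list<unsigned> Ports) {
  PortMask M;
  for (unsigned P : Ports)
    M.set(P);
  return M;
}

// A micro-op can issue this cycle if at least one of its candidate ports is
// still free.
bool canIssue(PortMask Candidates, PortMask FreePorts) {
  return (Candidates & FreePorts).any();
}

int main() {
  PortMask HWPort06 = makeGroup({0, 6});   // e.g. simple shifts and jumps
  PortMask Free = makeGroup({1, 6, 7});    // ports left after earlier issues
  return canIssue(HWPort06, Free) ? 0 : 1; // issues on port 6
}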
+def HWPort0 : ProcResource<1>; +def HWPort1 : ProcResource<1>; +def HWPort2 : ProcResource<1>; +def HWPort3 : ProcResource<1>; +def HWPort4 : ProcResource<1>; +def HWPort5 : ProcResource<1>; +def HWPort6 : ProcResource<1>; +def HWPort7 : ProcResource<1>; + +// Many micro-ops are capable of issuing on multiple ports. +def HWPort01 : ProcResGroup<[HWPort0, HWPort1]>; +def HWPort23 : ProcResGroup<[HWPort2, HWPort3]>; +def HWPort237 : ProcResGroup<[HWPort2, HWPort3, HWPort7]>; +def HWPort04 : ProcResGroup<[HWPort0, HWPort4]>; +def HWPort05 : ProcResGroup<[HWPort0, HWPort5]>; +def HWPort06 : ProcResGroup<[HWPort0, HWPort6]>; +def HWPort15 : ProcResGroup<[HWPort1, HWPort5]>; +def HWPort16 : ProcResGroup<[HWPort1, HWPort6]>; +def HWPort56 : ProcResGroup<[HWPort5, HWPort6]>; +def HWPort015 : ProcResGroup<[HWPort0, HWPort1, HWPort5]>; +def HWPort056 : ProcResGroup<[HWPort0, HWPort5, HWPort6]>; +def HWPort0156: ProcResGroup<[HWPort0, HWPort1, HWPort5, HWPort6]>; + +// 60 Entry Unified Scheduler +def HWPortAny : ProcResGroup<[HWPort0, HWPort1, HWPort2, HWPort3, HWPort4, + HWPort5, HWPort6, HWPort7]> { + let BufferSize=60; +} + +// Integer division issued on port 0. +def HWDivider : ProcResource<1>; + +// Loads are 4 cycles, so ReadAfterLd registers needn't be available until 4 +// cycles after the memory operand. +def : ReadAdvance<ReadAfterLd, 4>; + +// Many SchedWrites are defined in pairs with and without a folded load. +// Instructions with folded loads are usually micro-fused, so they only appear +// as two micro-ops when queued in the reservation station. +// This multiclass defines the resource usage for variants with and without +// folded loads. +multiclass HWWriteResPair<X86FoldableSchedWrite SchedRW, + ProcResourceKind ExePort, + int Lat> { + // Register variant is using a single cycle on ExePort. + def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; } + + // Memory variant also uses a cycle on port 2/3 and adds 4 cycles to the + // latency. + def : WriteRes<SchedRW.Folded, [HWPort23, ExePort]> { + let Latency = !add(Lat, 4); + } +} + +// A folded store needs a cycle on port 4 for the store data, but it does not +// need an extra port 2/3 cycle to recompute the address. +def : WriteRes<WriteRMW, [HWPort4]>; + +// Store_addr on 237. +// Store_data on 4. +def : WriteRes<WriteStore, [HWPort237, HWPort4]>; +def : WriteRes<WriteLoad, [HWPort23]> { let Latency = 4; } +def : WriteRes<WriteMove, [HWPort0156]>; +def : WriteRes<WriteZero, []>; + +defm : HWWriteResPair<WriteALU, HWPort0156, 1>; +defm : HWWriteResPair<WriteIMul, HWPort1, 3>; +def : WriteRes<WriteIMulH, []> { let Latency = 3; } +defm : HWWriteResPair<WriteShift, HWPort06, 1>; +defm : HWWriteResPair<WriteJump, HWPort06, 1>; + +// This is for simple LEAs with one or two input operands. +// The complex ones can only execute on port 1, and they require two cycles on +// the port to read all inputs. We don't model that. +def : WriteRes<WriteLEA, [HWPort15]>; + +// This is quite rough, latency depends on the dividend. +def : WriteRes<WriteIDiv, [HWPort0, HWDivider]> { + let Latency = 25; + let ResourceCycles = [1, 10]; +} +def : WriteRes<WriteIDivLd, [HWPort23, HWPort0, HWDivider]> { + let Latency = 29; + let ResourceCycles = [1, 1, 10]; +} + +// Scalar and vector floating point. +defm : HWWriteResPair<WriteFAdd, HWPort1, 3>; +defm : HWWriteResPair<WriteFMul, HWPort0, 5>; +defm : HWWriteResPair<WriteFDiv, HWPort0, 12>; // 10-14 cycles. 
+defm : HWWriteResPair<WriteFRcp, HWPort0, 5>; +defm : HWWriteResPair<WriteFRsqrt, HWPort0, 5>; +defm : HWWriteResPair<WriteFSqrt, HWPort0, 15>; +defm : HWWriteResPair<WriteCvtF2I, HWPort1, 3>; +defm : HWWriteResPair<WriteCvtI2F, HWPort1, 4>; +defm : HWWriteResPair<WriteCvtF2F, HWPort1, 3>; +defm : HWWriteResPair<WriteFShuffle, HWPort5, 1>; +defm : HWWriteResPair<WriteFBlend, HWPort015, 1>; +defm : HWWriteResPair<WriteFShuffle256, HWPort5, 3>; + +def : WriteRes<WriteFVarBlend, [HWPort5]> { + let Latency = 2; + let ResourceCycles = [2]; +} +def : WriteRes<WriteFVarBlendLd, [HWPort5, HWPort23]> { + let Latency = 6; + let ResourceCycles = [2, 1]; +} + +// Vector integer operations. +defm : HWWriteResPair<WriteVecShift, HWPort0, 1>; +defm : HWWriteResPair<WriteVecLogic, HWPort015, 1>; +defm : HWWriteResPair<WriteVecALU, HWPort15, 1>; +defm : HWWriteResPair<WriteVecIMul, HWPort0, 5>; +defm : HWWriteResPair<WriteShuffle, HWPort5, 1>; +defm : HWWriteResPair<WriteBlend, HWPort15, 1>; +defm : HWWriteResPair<WriteShuffle256, HWPort5, 3>; + +def : WriteRes<WriteVarBlend, [HWPort5]> { + let Latency = 2; + let ResourceCycles = [2]; +} +def : WriteRes<WriteVarBlendLd, [HWPort5, HWPort23]> { + let Latency = 6; + let ResourceCycles = [2, 1]; +} + +def : WriteRes<WriteVarVecShift, [HWPort0, HWPort5]> { + let Latency = 2; + let ResourceCycles = [2, 1]; +} +def : WriteRes<WriteVarVecShiftLd, [HWPort0, HWPort5, HWPort23]> { + let Latency = 6; + let ResourceCycles = [2, 1, 1]; +} + +def : WriteRes<WriteMPSAD, [HWPort0, HWPort5]> { + let Latency = 6; + let ResourceCycles = [1, 2]; +} +def : WriteRes<WriteMPSADLd, [HWPort23, HWPort0, HWPort5]> { + let Latency = 6; + let ResourceCycles = [1, 1, 2]; +} + +// String instructions. +// Packed Compare Implicit Length Strings, Return Mask +def : WriteRes<WritePCmpIStrM, [HWPort0]> { + let Latency = 10; + let ResourceCycles = [3]; +} +def : WriteRes<WritePCmpIStrMLd, [HWPort0, HWPort23]> { + let Latency = 10; + let ResourceCycles = [3, 1]; +} + +// Packed Compare Explicit Length Strings, Return Mask +def : WriteRes<WritePCmpEStrM, [HWPort0, HWPort16, HWPort5]> { + let Latency = 10; + let ResourceCycles = [3, 2, 4]; +} +def : WriteRes<WritePCmpEStrMLd, [HWPort05, HWPort16, HWPort23]> { + let Latency = 10; + let ResourceCycles = [6, 2, 1]; +} + +// Packed Compare Implicit Length Strings, Return Index +def : WriteRes<WritePCmpIStrI, [HWPort0]> { + let Latency = 11; + let ResourceCycles = [3]; +} +def : WriteRes<WritePCmpIStrILd, [HWPort0, HWPort23]> { + let Latency = 11; + let ResourceCycles = [3, 1]; +} + +// Packed Compare Explicit Length Strings, Return Index +def : WriteRes<WritePCmpEStrI, [HWPort05, HWPort16]> { + let Latency = 11; + let ResourceCycles = [6, 2]; +} +def : WriteRes<WritePCmpEStrILd, [HWPort0, HWPort16, HWPort5, HWPort23]> { + let Latency = 11; + let ResourceCycles = [3, 2, 2, 1]; +} + +// AES Instructions. 
+def : WriteRes<WriteAESDecEnc, [HWPort5]> { + let Latency = 7; + let ResourceCycles = [1]; +} +def : WriteRes<WriteAESDecEncLd, [HWPort5, HWPort23]> { + let Latency = 7; + let ResourceCycles = [1, 1]; +} + +def : WriteRes<WriteAESIMC, [HWPort5]> { + let Latency = 14; + let ResourceCycles = [2]; +} +def : WriteRes<WriteAESIMCLd, [HWPort5, HWPort23]> { + let Latency = 14; + let ResourceCycles = [2, 1]; +} + +def : WriteRes<WriteAESKeyGen, [HWPort0, HWPort5]> { + let Latency = 10; + let ResourceCycles = [2, 8]; +} +def : WriteRes<WriteAESKeyGenLd, [HWPort0, HWPort5, HWPort23]> { + let Latency = 10; + let ResourceCycles = [2, 7, 1]; +} + +// Carry-less multiplication instructions. +def : WriteRes<WriteCLMul, [HWPort0, HWPort5]> { + let Latency = 7; + let ResourceCycles = [2, 1]; +} +def : WriteRes<WriteCLMulLd, [HWPort0, HWPort5, HWPort23]> { + let Latency = 7; + let ResourceCycles = [2, 1, 1]; +} + +def : WriteRes<WriteSystem, [HWPort0156]> { let Latency = 100; } +def : WriteRes<WriteMicrocoded, [HWPort0156]> { let Latency = 100; } +def : WriteRes<WriteFence, [HWPort23, HWPort4]>; +def : WriteRes<WriteNop, []>; + +//================ Exceptions ================// + +//-- Specific Scheduling Models --// + +// Starting with P0. +def WriteP0 : SchedWriteRes<[HWPort0]>; + +def WriteP0_P1_Lat4 : SchedWriteRes<[HWPort0, HWPort1]> { + let Latency = 4; + let NumMicroOps = 2; + let ResourceCycles = [1, 1]; +} + +def WriteP0_P1_Lat4Ld : SchedWriteRes<[HWPort0, HWPort1, HWPort23]> { + let Latency = 8; + let NumMicroOps = 3; + let ResourceCycles = [1, 1, 1]; +} + +def WriteP01 : SchedWriteRes<[HWPort01]>; + +def Write2P01 : SchedWriteRes<[HWPort01]> { + let NumMicroOps = 2; +} +def Write3P01 : SchedWriteRes<[HWPort01]> { + let NumMicroOps = 3; +} + +def WriteP015 : SchedWriteRes<[HWPort015]>; + +def WriteP01_P5 : SchedWriteRes<[HWPort01, HWPort5]> { + let NumMicroOps = 2; +} +def WriteP06 : SchedWriteRes<[HWPort06]>; + +def Write2P06 : SchedWriteRes<[HWPort06]> { + let Latency = 1; + let NumMicroOps = 2; + let ResourceCycles = [2]; +} + +def Write3P06_Lat2 : SchedWriteRes<[HWPort06]> { + let Latency = 2; + let NumMicroOps = 3; + let ResourceCycles = [3]; +} + +def WriteP0156_P23 : SchedWriteRes<[HWPort0156, HWPort23]> { + let NumMicroOps = 2; +} + +def Write2P0156_P23 : SchedWriteRes<[HWPort0156, HWPort23]> { + let NumMicroOps = 3; + let ResourceCycles = [2, 1]; +} + +def Write2P0156_Lat2 : SchedWriteRes<[HWPort0156]> { + let Latency = 2; + let ResourceCycles = [2]; +} +def Write2P0156_Lat2Ld : SchedWriteRes<[HWPort0156, HWPort23]> { + let Latency = 6; + let ResourceCycles = [2, 1]; +} + +def Write5P0156 : SchedWriteRes<[HWPort0156]> { + let NumMicroOps = 5; + let ResourceCycles = [5]; +} + +def WriteP0156_2P237_P4 : SchedWriteRes<[HWPort0156, HWPort237, HWPort4]> { + let Latency = 1; + let ResourceCycles = [1, 2, 1]; +} + +def Write2P0156_2P237_P4 : SchedWriteRes<[HWPort0156, HWPort237, HWPort4]> { + let Latency = 1; + let ResourceCycles = [2, 2, 1]; +} + +def Write3P0156_2P237_P4 : SchedWriteRes<[HWPort0156, HWPort237, HWPort4]> { + let Latency = 1; + let ResourceCycles = [3, 2, 1]; +} + +// Starting with P1. 
+def WriteP1 : SchedWriteRes<[HWPort1]>; + +def WriteP1_P23 : SchedWriteRes<[HWPort1, HWPort23]> { + let NumMicroOps = 2; +} +def WriteP1_Lat3 : SchedWriteRes<[HWPort1]> { + let Latency = 3; +} +def WriteP1_Lat3Ld : SchedWriteRes<[HWPort1, HWPort23]> { + let Latency = 7; +} + +def Write2P1 : SchedWriteRes<[HWPort1]> { + let NumMicroOps = 2; + let ResourceCycles = [2]; +} +def Write2P1_P23 : SchedWriteRes<[HWPort1, HWPort23]> { + let NumMicroOps = 3; + let ResourceCycles = [2, 1]; +} +def WriteP15 : SchedWriteRes<[HWPort15]>; +def WriteP15Ld : SchedWriteRes<[HWPort15, HWPort23]> { + let Latency = 4; +} + +def WriteP1_P5_Lat4 : SchedWriteRes<[HWPort1, HWPort5]> { + let Latency = 4; + let NumMicroOps = 2; + let ResourceCycles = [1, 1]; +} + +def WriteP1_P5_Lat4Ld : SchedWriteRes<[HWPort1, HWPort5, HWPort23]> { + let Latency = 8; + let NumMicroOps = 3; + let ResourceCycles = [1, 1, 1]; +} + +def WriteP1_P5_Lat6 : SchedWriteRes<[HWPort1, HWPort5]> { + let Latency = 6; + let NumMicroOps = 2; + let ResourceCycles = [1, 1]; +} + +def WriteP1_P5_Lat6Ld : SchedWriteRes<[HWPort1, HWPort5, HWPort23]> { + let Latency = 10; + let NumMicroOps = 3; + let ResourceCycles = [1, 1, 1]; +} + +// Starting with P2. +def Write2P237_P4 : SchedWriteRes<[HWPort237, HWPort4]> { + let Latency = 1; + let ResourceCycles = [2, 1]; +} + +// Starting with P5. +def WriteP5 : SchedWriteRes<[HWPort5]>; +def WriteP5Ld : SchedWriteRes<[HWPort5, HWPort23]> { + let Latency = 5; + let NumMicroOps = 2; + let ResourceCycles = [1, 1]; +} + +// Notation: +// - r: register. +// - mm: 64 bit mmx register. +// - x = 128 bit xmm register. +// - (x)mm = mmx or xmm register. +// - y = 256 bit ymm register. +// - v = any vector register. +// - m = memory. + +//=== Integer Instructions ===// +//-- Move instructions --// + +// MOV. +// r16,m. +def : InstRW<[WriteALULd], (instregex "MOV16rm")>; + +// MOVSX, MOVZX. +// r,m. +def : InstRW<[WriteLoad], (instregex "MOV(S|Z)X32rm(8|16)")>; + +// CMOVcc. +// r,r. +def : InstRW<[Write2P0156_Lat2], + (instregex "CMOV(O|NO|B|AE|E|NE|BE|A|S|NS|P|NP|L|GE|LE|G)(16|32|64)rr")>; +// r,m. +def : InstRW<[Write2P0156_Lat2Ld, ReadAfterLd], + (instregex "CMOV(O|NO|B|AE|E|NE|BE|A|S|NS|P|NP|L|GE|LE|G)(16|32|64)rm")>; + +// XCHG. +// r,r. +def WriteXCHG : SchedWriteRes<[HWPort0156]> { + let Latency = 2; + let ResourceCycles = [3]; +} + +def : InstRW<[WriteXCHG], (instregex "XCHG(8|16|32|64)rr", "XCHG(16|32|64)ar")>; + +// r,m. +def WriteXCHGrm : SchedWriteRes<[]> { + let Latency = 21; + let NumMicroOps = 8; +} +def : InstRW<[WriteXCHGrm], (instregex "XCHG(8|16|32|64)rm")>; + +// XLAT. +def WriteXLAT : SchedWriteRes<[]> { + let Latency = 7; + let NumMicroOps = 3; +} +def : InstRW<[WriteXLAT], (instregex "XLAT")>; + +// PUSH. +// m. +def : InstRW<[Write2P237_P4], (instregex "PUSH(16|32)rmm")>; + +// PUSHF. +def WritePushF : SchedWriteRes<[HWPort1, HWPort4, HWPort237, HWPort06]> { + let NumMicroOps = 4; +} +def : InstRW<[WritePushF], (instregex "PUSHF(16|32)")>; + +// PUSHA. +def WritePushA : SchedWriteRes<[]> { + let NumMicroOps = 19; +} +def : InstRW<[WritePushA], (instregex "PUSHA(16|32)")>; + +// POP. +// m. +def : InstRW<[Write2P237_P4], (instregex "POP(16|32)rmm")>; + +// POPF. +def WritePopF : SchedWriteRes<[]> { + let NumMicroOps = 9; +} +def : InstRW<[WritePopF], (instregex "POPF(16|32)")>; + +// POPA. +def WritePopA : SchedWriteRes<[]> { + let NumMicroOps = 18; +} +def : InstRW<[WritePopA], (instregex "POPA(16|32)")>; + +// LAHF SAHF. +def : InstRW<[WriteP06], (instregex "(S|L)AHF")>; + +// BSWAP. +// r32. 
+def WriteBSwap32 : SchedWriteRes<[HWPort15]>; +def : InstRW<[WriteBSwap32], (instregex "BSWAP32r")>; + +// r64. +def WriteBSwap64 : SchedWriteRes<[HWPort06, HWPort15]> { + let NumMicroOps = 2; +} +def : InstRW<[WriteBSwap64], (instregex "BSWAP64r")>; + +// MOVBE. +// r16,m16 / r64,m64. +def : InstRW<[Write2P0156_Lat2Ld], (instregex "MOVBE(16|64)rm")>; + +// r32, m32. +def WriteMoveBE32rm : SchedWriteRes<[HWPort15, HWPort23]> { + let NumMicroOps = 2; +} +def : InstRW<[WriteMoveBE32rm], (instregex "MOVBE32rm")>; + +// m16,r16. +def WriteMoveBE16mr : SchedWriteRes<[HWPort06, HWPort237, HWPort4]> { + let NumMicroOps = 3; +} +def : InstRW<[WriteMoveBE16mr], (instregex "MOVBE16mr")>; + +// m32,r32. +def WriteMoveBE32mr : SchedWriteRes<[HWPort15, HWPort237, HWPort4]> { + let NumMicroOps = 3; +} +def : InstRW<[WriteMoveBE32mr], (instregex "MOVBE32mr")>; + +// m64,r64. +def WriteMoveBE64mr : SchedWriteRes<[HWPort06, HWPort15, HWPort237, HWPort4]> { + let NumMicroOps = 4; +} +def : InstRW<[WriteMoveBE64mr], (instregex "MOVBE64mr")>; + +//-- Arithmetic instructions --// + +// ADD SUB. +// m,r/i. +def : InstRW<[Write2P0156_2P237_P4], + (instregex "(ADD|SUB)(8|16|32|64)m(r|i)", + "(ADD|SUB)(8|16|32|64)mi8", "(ADD|SUB)64mi32")>; + +// ADC SBB. +// r,r/i. +def : InstRW<[Write2P0156_Lat2], (instregex "(ADC|SBB)(8|16|32|64)r(r|i)", + "(ADC|SBB)(16|32|64)ri8", + "(ADC|SBB)64ri32", + "(ADC|SBB)(8|16|32|64)rr_REV")>; + +// r,m. +def : InstRW<[Write2P0156_Lat2Ld, ReadAfterLd], (instregex "(ADC|SBB)(8|16|32|64)rm")>; + +// m,r/i. +def : InstRW<[Write3P0156_2P237_P4], + (instregex "(ADC|SBB)(8|16|32|64)m(r|i)", + "(ADC|SBB)(16|32|64)mi8", + "(ADC|SBB)64mi32")>; + +// INC DEC NOT NEG. +// m. +def : InstRW<[WriteP0156_2P237_P4], + (instregex "(INC|DEC|NOT|NEG)(8|16|32|64)m", + "(INC|DEC)64(16|32)m")>; + +// MUL IMUL. +// r16. +def WriteMul16 : SchedWriteRes<[HWPort1, HWPort0156]> { + let Latency = 4; + let NumMicroOps = 4; +} +def : InstRW<[WriteMul16], (instregex "IMUL16r", "MUL16r")>; + +// m16. +def WriteMul16Ld : SchedWriteRes<[HWPort1, HWPort0156, HWPort23]> { + let Latency = 8; + let NumMicroOps = 5; +} +def : InstRW<[WriteMul16Ld], (instregex "IMUL16m", "MUL16m")>; + +// r32. +def WriteMul32 : SchedWriteRes<[HWPort1, HWPort0156]> { + let Latency = 4; + let NumMicroOps = 3; +} +def : InstRW<[WriteMul32], (instregex "IMUL32r", "MUL32r")>; + +// m32. +def WriteMul32Ld : SchedWriteRes<[HWPort1, HWPort0156, HWPort23]> { + let Latency = 8; + let NumMicroOps = 4; +} +def : InstRW<[WriteMul32Ld], (instregex "IMUL32m", "MUL32m")>; + +// r64. +def WriteMul64 : SchedWriteRes<[HWPort1, HWPort6]> { + let Latency = 3; + let NumMicroOps = 2; +} +def : InstRW<[WriteMul64], (instregex "IMUL64r", "MUL64r")>; + +// m64. +def WriteMul64Ld : SchedWriteRes<[HWPort1, HWPort6, HWPort23]> { + let Latency = 7; + let NumMicroOps = 3; +} +def : InstRW<[WriteMul64Ld], (instregex "IMUL64m", "MUL64m")>; + +// r16,r16. +def WriteMul16rri : SchedWriteRes<[HWPort1, HWPort0156]> { + let Latency = 4; + let NumMicroOps = 2; +} +def : InstRW<[WriteMul16rri], (instregex "IMUL16rri", "IMUL16rri8")>; + +// r16,m16. +def WriteMul16rmi : SchedWriteRes<[HWPort1, HWPort0156, HWPort23]> { + let Latency = 8; + let NumMicroOps = 3; +} +def : InstRW<[WriteMul16rmi], (instregex "IMUL16rmi", "IMUL16rmi8")>; + +// MULX. +// r32,r32,r32. +def WriteMulX32 : SchedWriteRes<[HWPort1, HWPort056]> { + let Latency = 4; + let NumMicroOps = 3; + let ResourceCycles = [1, 2]; +} +def : InstRW<[WriteMulX32], (instregex "MULX32rr")>; + +// r32,r32,m32. 
+def WriteMulX32Ld : SchedWriteRes<[HWPort1, HWPort056, HWPort23]> { + let Latency = 8; + let NumMicroOps = 4; + let ResourceCycles = [1, 2, 1]; +} +def : InstRW<[WriteMulX32Ld], (instregex "MULX32rm")>; + +// r64,r64,r64. +def WriteMulX64 : SchedWriteRes<[HWPort1, HWPort6]> { + let Latency = 4; + let NumMicroOps = 2; +} +def : InstRW<[WriteMulX64], (instregex "MULX64rr")>; + +// r64,r64,m64. +def WriteMulX64Ld : SchedWriteRes<[HWPort1, HWPort6, HWPort23]> { + let Latency = 8; + let NumMicroOps = 3; +} +def : InstRW<[WriteMulX64Ld], (instregex "MULX64rm")>; + +// DIV. +// r8. +def WriteDiv8 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> { + let Latency = 22; + let NumMicroOps = 9; +} +def : InstRW<[WriteDiv8], (instregex "DIV8r")>; + +// r16. +def WriteDiv16 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> { + let Latency = 23; + let NumMicroOps = 10; +} +def : InstRW<[WriteDiv16], (instregex "DIV16r")>; + +// r32. +def WriteDiv32 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> { + let Latency = 22; + let NumMicroOps = 10; +} +def : InstRW<[WriteDiv32], (instregex "DIV32r")>; + +// r64. +def WriteDiv64 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> { + let Latency = 32; + let NumMicroOps = 36; +} +def : InstRW<[WriteDiv64], (instregex "DIV64r")>; + +// IDIV. +// r8. +def WriteIDiv8 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> { + let Latency = 23; + let NumMicroOps = 9; +} +def : InstRW<[WriteIDiv8], (instregex "IDIV8r")>; + +// r16. +def WriteIDiv16 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> { + let Latency = 23; + let NumMicroOps = 10; +} +def : InstRW<[WriteIDiv16], (instregex "IDIV16r")>; + +// r32. +def WriteIDiv32 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> { + let Latency = 22; + let NumMicroOps = 9; +} +def : InstRW<[WriteIDiv32], (instregex "IDIV32r")>; + +// r64. +def WriteIDiv64 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> { + let Latency = 39; + let NumMicroOps = 59; +} +def : InstRW<[WriteIDiv64], (instregex "IDIV64r")>; + +//-- Logic instructions --// + +// AND OR XOR. +// m,r/i. +def : InstRW<[Write2P0156_2P237_P4], + (instregex "(AND|OR|XOR)(8|16|32|64)m(r|i)", + "(AND|OR|XOR)(8|16|32|64)mi8", "(AND|OR|XOR)64mi32")>; + +// SHR SHL SAR. +// m,i. +def WriteShiftRMW : SchedWriteRes<[HWPort06, HWPort237, HWPort4]> { + let NumMicroOps = 4; + let ResourceCycles = [2, 1, 1]; +} +def : InstRW<[WriteShiftRMW], (instregex "S(A|H)(R|L)(8|16|32|64)m(i|1)")>; + +// r,cl. +def : InstRW<[Write3P06_Lat2], (instregex "S(A|H)(R|L)(8|16|32|64)rCL")>; + +// m,cl. +def WriteShiftClLdRMW : SchedWriteRes<[HWPort06, HWPort23, HWPort4]> { + let NumMicroOps = 6; + let ResourceCycles = [3, 2, 1]; +} +def : InstRW<[WriteShiftClLdRMW], (instregex "S(A|H)(R|L)(8|16|32|64)mCL")>; + +// ROR ROL. +// r,1. +def : InstRW<[Write2P06], (instregex "RO(R|L)(8|16|32|64)r1")>; + +// m,i. +def WriteRotateRMW : SchedWriteRes<[HWPort06, HWPort237, HWPort4]> { + let NumMicroOps = 5; + let ResourceCycles = [2, 2, 1]; +} +def : InstRW<[WriteRotateRMW], (instregex "RO(R|L)(8|16|32|64)mi")>; + +// r,cl. +def : InstRW<[Write3P06_Lat2], (instregex "RO(R|L)(8|16|32|64)rCL")>; + +// m,cl. +def WriteRotateRMWCL : SchedWriteRes<[]> { + let NumMicroOps = 6; +} +def : InstRW<[WriteRotateRMWCL], (instregex "RO(R|L)(8|16|32|64)mCL")>; + +// RCR RCL. +// r,1. +def WriteRCr1 : SchedWriteRes<[HWPort06, HWPort0156]> { + let Latency = 2; + let NumMicroOps = 3; + let ResourceCycles = [2, 1]; +} +def : InstRW<[WriteRCr1], (instregex "RC(R|L)(8|16|32|64)r1")>; + +// m,1. 
+def WriteRCm1 : SchedWriteRes<[]> { + let NumMicroOps = 6; +} +def : InstRW<[WriteRCm1], (instregex "RC(R|L)(8|16|32|64)m1")>; + +// r,i. +def WriteRCri : SchedWriteRes<[HWPort0156]> { + let Latency = 6; + let NumMicroOps = 8; +} +def : InstRW<[WriteRCri], (instregex "RC(R|L)(8|16|32|64)r(i|CL)")>; + +// m,i. +def WriteRCmi : SchedWriteRes<[]> { + let NumMicroOps = 11; +} +def : InstRW<[WriteRCmi], (instregex "RC(R|L)(8|16|32|64)m(i|CL)")>; + +// SHRD SHLD. +// r,r,i. +def WriteShDrr : SchedWriteRes<[HWPort1]> { + let Latency = 3; +} +def : InstRW<[WriteShDrr], (instregex "SH(R|L)D(16|32|64)rri8")>; + +// m,r,i. +def WriteShDmr : SchedWriteRes<[]> { + let NumMicroOps = 5; +} +def : InstRW<[WriteShDmr], (instregex "SH(R|L)D(16|32|64)mri8")>; + +// r,r,cl. +def WriteShlDCL : SchedWriteRes<[HWPort0156]> { + let Latency = 3; + let NumMicroOps = 4; +} +def : InstRW<[WriteShlDCL], (instregex "SHLD(16|32|64)rrCL")>; + +// r,r,cl. +def WriteShrDCL : SchedWriteRes<[HWPort0156]> { + let Latency = 4; + let NumMicroOps = 4; +} +def : InstRW<[WriteShrDCL], (instregex "SHRD(16|32|64)rrCL")>; + +// m,r,cl. +def WriteShDmrCL : SchedWriteRes<[]> { + let NumMicroOps = 7; +} +def : InstRW<[WriteShDmrCL], (instregex "SH(R|L)D(16|32|64)mrCL")>; + +// BT. +// r,r/i. +def : InstRW<[WriteShift], (instregex "BT(16|32|64)r(r|i8)")>; + +// m,r. +def WriteBTmr : SchedWriteRes<[]> { + let NumMicroOps = 10; +} +def : InstRW<[WriteBTmr], (instregex "BT(16|32|64)mr")>; + +// m,i. +def : InstRW<[WriteShiftLd], (instregex "BT(16|32|64)mi8")>; + +// BTR BTS BTC. +// r,r,i. +def : InstRW<[WriteShift], (instregex "BT(R|S|C)(16|32|64)r(r|i8)")>; + +// m,r. +def WriteBTRSCmr : SchedWriteRes<[]> { + let NumMicroOps = 11; +} +def : InstRW<[WriteBTRSCmr], (instregex "BT(R|S|C)(16|32|64)mr")>; + +// m,i. +def : InstRW<[WriteShiftLd], (instregex "BT(R|S|C)(16|32|64)mi8")>; + +// BSF BSR. +// r,r. +def : InstRW<[WriteP1_Lat3], (instregex "BS(R|F)(16|32|64)rr")>; +// r,m. +def : InstRW<[WriteP1_Lat3Ld], (instregex "BS(R|F)(16|32|64)rm")>; + +// SETcc. +// r. +def : InstRW<[WriteShift], + (instregex "SET(O|NO|B|AE|E|NE|BE|A|S|NS|P|NP|L|GE|LE|G)r")>; +// m. +def WriteSetCCm : SchedWriteRes<[HWPort06, HWPort237, HWPort4]> { + let NumMicroOps = 3; +} +def : InstRW<[WriteSetCCm], + (instregex "SET(O|NO|B|AE|E|NE|BE|A|S|NS|P|NP|L|GE|LE|G)m")>; + +// CLD STD. +def WriteCldStd : SchedWriteRes<[HWPort15, HWPort6]> { + let NumMicroOps = 3; +} +def : InstRW<[WriteCldStd], (instregex "STD", "CLD")>; + +// LZCNT TZCNT. +// r,r. +def : InstRW<[WriteP1_Lat3], (instregex "(L|TZCNT)(16|32|64)rr")>; +// r,m. +def : InstRW<[WriteP1_Lat3Ld], (instregex "(L|TZCNT)(16|32|64)rm")>; + +// ANDN. +// r,r. +def : InstRW<[WriteP15], (instregex "ANDN(32|64)rr")>; +// r,m. +def : InstRW<[WriteP15Ld], (instregex "ANDN(32|64)rm")>; + +// BLSI BLSMSK BLSR. +// r,r. +def : InstRW<[WriteP15], (instregex "BLS(I|MSK|R)(32|64)rr")>; +// r,m. +def : InstRW<[WriteP15Ld], (instregex "BLS(I|MSK|R)(32|64)rm")>; + +// BEXTR. +// r,r,r. +def : InstRW<[Write2P0156_Lat2], (instregex "BEXTR(32|64)rr")>; +// r,m,r. +def : InstRW<[Write2P0156_Lat2Ld], (instregex "BEXTR(32|64)rm")>; + +// BZHI. +// r,r,r. +def : InstRW<[WriteP15], (instregex "BZHI(32|64)rr")>; +// r,m,r. +def : InstRW<[WriteP15Ld], (instregex "BZHI(32|64)rm")>; + +// PDEP PEXT. +// r,r,r. +def : InstRW<[WriteP1_Lat3], (instregex "PDEP(32|64)rr", "PEXT(32|64)rr")>; +// r,m,r. +def : InstRW<[WriteP1_Lat3Ld], (instregex "PDEP(32|64)rm", "PEXT(32|64)rm")>; + +//-- Control transfer instructions --// + +// J(E|R)CXZ. 
+def WriteJCXZ : SchedWriteRes<[HWPort0156, HWPort6]> { + let NumMicroOps = 2; +} +def : InstRW<[WriteJCXZ], (instregex "JCXZ", "JECXZ_(32|64)", "JRCXZ")>; + +// LOOP. +def WriteLOOP : SchedWriteRes<[]> { + let NumMicroOps = 7; +} +def : InstRW<[WriteLOOP], (instregex "LOOP")>; + +// LOOP(N)E +def WriteLOOPE : SchedWriteRes<[]> { + let NumMicroOps = 11; +} +def : InstRW<[WriteLOOPE], (instregex "LOOPE", "LOOPNE")>; + +// CALL. +// r. +def WriteCALLr : SchedWriteRes<[HWPort237, HWPort4, HWPort6]> { + let NumMicroOps = 3; +} +def : InstRW<[WriteCALLr], (instregex "CALL(16|32)r")>; + +// m. +def WriteCALLm : SchedWriteRes<[HWPort237, HWPort4, HWPort6]> { + let NumMicroOps = 4; + let ResourceCycles = [2, 1, 1]; +} +def : InstRW<[WriteCALLm], (instregex "CALL(16|32)m")>; + +// RET. +def WriteRET : SchedWriteRes<[HWPort237, HWPort6]> { + let NumMicroOps = 2; +} +def : InstRW<[WriteRET], (instregex "RET(L|Q|W)", "LRET(L|Q|W)")>; + +// i. +def WriteRETI : SchedWriteRes<[HWPort23, HWPort6, HWPort015]> { + let NumMicroOps = 4; + let ResourceCycles = [1, 2, 1]; +} +def : InstRW<[WriteRETI], (instregex "RETI(L|Q|W)", "LRETI(L|Q|W)")>; + +// BOUND. +// r,m. +def WriteBOUND : SchedWriteRes<[]> { + let NumMicroOps = 15; +} +def : InstRW<[WriteBOUND], (instregex "BOUNDS(16|32)rm")>; + +// INTO. +def WriteINTO : SchedWriteRes<[]> { + let NumMicroOps = 4; +} +def : InstRW<[WriteINTO], (instregex "INTO")>; + +//-- String instructions --// + +// LODSB/W. +def : InstRW<[Write2P0156_P23], (instregex "LODS(B|W)")>; + +// LODSD/Q. +def : InstRW<[WriteP0156_P23], (instregex "LODS(L|Q)")>; + +// STOS. +def WriteSTOS : SchedWriteRes<[HWPort23, HWPort0156, HWPort4]> { + let NumMicroOps = 3; +} +def : InstRW<[WriteSTOS], (instregex "STOS(B|L|Q|W)")>; + +// MOVS. +def WriteMOVS : SchedWriteRes<[HWPort23, HWPort4, HWPort0156]> { + let Latency = 4; + let NumMicroOps = 5; + let ResourceCycles = [2, 1, 2]; +} +def : InstRW<[WriteMOVS], (instregex "MOVS(B|L|Q|W)")>; + +// SCAS. +def : InstRW<[Write2P0156_P23], (instregex "SCAS(B|W|L|Q)")>; + +// CMPS. +def WriteCMPS : SchedWriteRes<[HWPort23, HWPort0156]> { + let Latency = 4; + let NumMicroOps = 5; + let ResourceCycles = [2, 3]; +} +def : InstRW<[WriteCMPS], (instregex "CMPS(B|L|Q|W)")>; + +//-- Synchronization instructions --// + +// XADD. +def WriteXADD : SchedWriteRes<[]> { + let NumMicroOps = 5; +} +def : InstRW<[WriteXADD], (instregex "XADD(8|16|32|64)rm")>; + +// CMPXCHG. +def WriteCMPXCHG : SchedWriteRes<[]> { + let NumMicroOps = 6; +} +def : InstRW<[WriteCMPXCHG], (instregex "CMPXCHG(8|16|32|64)rm")>; + +// CMPXCHG8B. +def WriteCMPXCHG8B : SchedWriteRes<[]> { + let NumMicroOps = 15; +} +def : InstRW<[WriteCMPXCHG8B], (instregex "CMPXCHG8B")>; + +// CMPXCHG16B. +def WriteCMPXCHG16B : SchedWriteRes<[]> { + let NumMicroOps = 22; +} +def : InstRW<[WriteCMPXCHG16B], (instregex "CMPXCHG16B")>; + +//-- Other --// + +// PAUSE. +def WritePAUSE : SchedWriteRes<[HWPort05, HWPort6]> { + let NumMicroOps = 5; + let ResourceCycles = [1, 3]; +} +def : InstRW<[WritePAUSE], (instregex "PAUSE")>; + +// LEAVE. +def : InstRW<[Write2P0156_P23], (instregex "LEAVE")>; + +// XGETBV. +def WriteXGETBV : SchedWriteRes<[]> { + let NumMicroOps = 8; +} +def : InstRW<[WriteXGETBV], (instregex "XGETBV")>; + +// RDTSC. +def WriteRDTSC : SchedWriteRes<[]> { + let NumMicroOps = 15; +} +def : InstRW<[WriteRDTSC], (instregex "RDTSC")>; + +// RDPMC. +def WriteRDPMC : SchedWriteRes<[]> { + let NumMicroOps = 34; +} +def : InstRW<[WriteRDPMC], (instregex "RDPMC")>; + +// RDRAND. 
+def WriteRDRAND : SchedWriteRes<[HWPort23, HWPort015]> { + let NumMicroOps = 17; + let ResourceCycles = [1, 16]; +} +def : InstRW<[WriteRDRAND], (instregex "RDRAND(16|32|64)r")>; + +//=== Floating Point x87 Instructions ===// +//-- Move instructions --// + +// FLD. +// m80. +def : InstRW<[WriteP01], (instregex "LD_Frr")>; + +def WriteLD_F80m : SchedWriteRes<[HWPort01, HWPort23]> { + let Latency = 4; + let NumMicroOps = 4; + let ResourceCycles = [2, 2]; +} +def : InstRW<[WriteLD_F80m], (instregex "LD_F80m")>; + +// FBLD. +// m80. +def WriteFBLD : SchedWriteRes<[]> { + let Latency = 47; + let NumMicroOps = 43; +} +def : InstRW<[WriteFBLD], (instregex "FBLDm")>; + +// FST(P). +// r. +def : InstRW<[WriteP01], (instregex "ST_(F|FP)rr")>; + +// m80. +def WriteST_FP80m : SchedWriteRes<[HWPort0156, HWPort23, HWPort4]> { + let NumMicroOps = 7; + let ResourceCycles = [3, 2, 2]; +} +def : InstRW<[WriteST_FP80m], (instregex "ST_FP80m")>; + +// FBSTP. +// m80. +def WriteFBSTP : SchedWriteRes<[]> { + let NumMicroOps = 226; +} +def : InstRW<[WriteFBSTP], (instregex "FBSTPm")>; + +// FXCHG. +def : InstRW<[WriteNop], (instregex "XCH_F")>; + +// FILD. +def WriteFILD : SchedWriteRes<[HWPort01, HWPort23]> { + let Latency = 6; + let NumMicroOps = 2; +} +def : InstRW<[WriteFILD], (instregex "ILD_F(16|32|64)m")>; + +// FIST(P) FISTTP. +def WriteFIST : SchedWriteRes<[HWPort1, HWPort23, HWPort4]> { + let Latency = 7; + let NumMicroOps = 3; +} +def : InstRW<[WriteFIST], (instregex "IST_(F|FP)(16|32)m")>; + +// FLDZ. +def : InstRW<[WriteP01], (instregex "LD_F0")>; + +// FLD1. +def : InstRW<[Write2P01], (instregex "LD_F1")>; + +// FLDPI FLDL2E etc. +def : InstRW<[Write2P01], (instregex "FLDPI", "FLDL2(T|E)" "FLDL(G|N)2")>; + +// FCMOVcc. +def WriteFCMOVcc : SchedWriteRes<[HWPort0, HWPort5]> { + let Latency = 2; + let NumMicroOps = 3; + let ResourceCycles = [2, 1]; +} +def : InstRW<[WriteFCMOVcc], (instregex "CMOV(B|BE|P|NB|NBE|NE|NP)_F")>; + +// FNSTSW. +// AX. +def WriteFNSTSW : SchedWriteRes<[HWPort0, HWPort0156]> { + let NumMicroOps = 2; +} +def : InstRW<[WriteFNSTSW], (instregex "FNSTSW16r")>; + +// m16. +def WriteFNSTSWm : SchedWriteRes<[HWPort0, HWPort4, HWPort237]> { + let Latency = 6; + let NumMicroOps = 3; +} +def : InstRW<[WriteFNSTSWm], (instregex "FNSTSWm")>; + +// FLDCW. +def WriteFLDCW : SchedWriteRes<[HWPort01, HWPort23, HWPort6]> { + let Latency = 7; + let NumMicroOps = 3; +} +def : InstRW<[WriteFLDCW], (instregex "FLDCW16m")>; + +// FNSTCW. +def WriteFNSTCW : SchedWriteRes<[HWPort237, HWPort4, HWPort6]> { + let NumMicroOps = 3; +} +def : InstRW<[WriteFNSTCW], (instregex "FNSTCW16m")>; + +// FINCSTP FDECSTP. +def : InstRW<[WriteP01], (instregex "FINCSTP", "FDECSTP")>; + +// FFREE. +def : InstRW<[WriteP01], (instregex "FFREE")>; + +// FNSAVE. +def WriteFNSAVE : SchedWriteRes<[]> { + let NumMicroOps = 147; +} +def : InstRW<[WriteFNSAVE], (instregex "FSAVEm")>; + +// FRSTOR. +def WriteFRSTOR : SchedWriteRes<[]> { + let NumMicroOps = 90; +} +def : InstRW<[WriteFRSTOR], (instregex "FRSTORm")>; + +//-- Arithmetic instructions --// + +// FABS. +def : InstRW<[WriteP0], (instregex "ABS_F")>; + +// FCHS. +def : InstRW<[WriteP0], (instregex "CHS_F")>; + +// FCOM(P) FUCOM(P). +// r. +def : InstRW<[WriteP1], (instregex "COM_FST0r", "COMP_FST0r", "UCOM_Fr", + "UCOM_FPr")>; +// m. +def : InstRW<[WriteP1_P23], (instregex "FCOM(32|64)m", "FCOMP(32|64)m")>; + +// FCOMPP FUCOMPP. +// r. +def : InstRW<[Write2P01], (instregex "FCOMPP", "UCOM_FPPr")>; + +// FCOMI(P) FUCOMI(P). +// m. 
+def : InstRW<[Write3P01], (instregex "COM_FIr", "COM_FIPr", "UCOM_FIr", + "UCOM_FIPr")>; + +// FICOM(P). +def : InstRW<[Write2P1_P23], (instregex "FICOM(16|32)m", "FICOMP(16|32)m")>; + +// FTST. +def : InstRW<[WriteP1], (instregex "TST_F")>; + +// FXAM. +def : InstRW<[Write2P1], (instregex "FXAM")>; + +// FPREM. +def WriteFPREM : SchedWriteRes<[]> { + let Latency = 19; + let NumMicroOps = 28; +} +def : InstRW<[WriteFPREM], (instregex "FPREM")>; + +// FPREM1. +def WriteFPREM1 : SchedWriteRes<[]> { + let Latency = 27; + let NumMicroOps = 41; +} +def : InstRW<[WriteFPREM1], (instregex "FPREM1")>; + +// FRNDINT. +def WriteFRNDINT : SchedWriteRes<[]> { + let Latency = 11; + let NumMicroOps = 17; +} +def : InstRW<[WriteFRNDINT], (instregex "FRNDINT")>; + +//-- Math instructions --// + +// FSCALE. +def WriteFSCALE : SchedWriteRes<[]> { + let Latency = 75; // 49-125 + let NumMicroOps = 50; // 25-75 +} +def : InstRW<[WriteFSCALE], (instregex "FSCALE")>; + +// FXTRACT. +def WriteFXTRACT : SchedWriteRes<[]> { + let Latency = 15; + let NumMicroOps = 17; +} +def : InstRW<[WriteFXTRACT], (instregex "FXTRACT")>; + +//-- Other instructions --// + +// FNOP. +def : InstRW<[WriteP01], (instregex "FNOP")>; + +// WAIT. +def : InstRW<[Write2P01], (instregex "WAIT")>; + +// FNCLEX. +def : InstRW<[Write5P0156], (instregex "FNCLEX")>; + +// FNINIT. +def WriteFNINIT : SchedWriteRes<[]> { + let NumMicroOps = 26; +} +def : InstRW<[WriteFNINIT], (instregex "FNINIT")>; + +//=== Integer MMX and XMM Instructions ===// +//-- Move instructions --// + +// MOVD. +// r32/64 <- (x)mm. +def : InstRW<[WriteP0], (instregex "MMX_MOVD64grr", "MMX_MOVD64from64rr", + "VMOVPDI2DIrr", "MOVPDI2DIrr")>; + +// (x)mm <- r32/64. +def : InstRW<[WriteP5], (instregex "MMX_MOVD64rr", "MMX_MOVD64to64rr", + "VMOVDI2PDIrr", "MOVDI2PDIrr")>; + +// MOVQ. +// r64 <- (x)mm. +def : InstRW<[WriteP0], (instregex "VMOVPQIto64rr")>; + +// (x)mm <- r64. +def : InstRW<[WriteP5], (instregex "VMOV64toPQIrr", "VMOVZQI2PQIrr")>; + +// (x)mm <- (x)mm. +def : InstRW<[WriteP015], (instregex "MMX_MOVQ64rr")>; + +// (V)MOVDQA/U. +// x <- x. +def : InstRW<[WriteP015], (instregex "MOVDQ(A|U)rr", "VMOVDQ(A|U)rr", + "MOVDQ(A|U)rr_REV", "VMOVDQ(A|U)rr_REV", + "VMOVDQ(A|U)Yrr", "VMOVDQ(A|U)Yrr_REV")>; + +// MOVDQ2Q. +def : InstRW<[WriteP01_P5], (instregex "MMX_MOVDQ2Qrr")>; + +// MOVQ2DQ. +def : InstRW<[WriteP015], (instregex "MMX_MOVQ2DQrr")>; + + +// PACKSSWB/DW. +// mm <- mm. +def WriteMMXPACKSSrr : SchedWriteRes<[HWPort5]> { + let Latency = 2; + let NumMicroOps = 3; + let ResourceCycles = [3]; +} +def : InstRW<[WriteMMXPACKSSrr], (instregex "MMX_PACKSSDWirr", + "MMX_PACKSSWBirr", "MMX_PACKUSWBirr")>; + +// mm <- m64. +def WriteMMXPACKSSrm : SchedWriteRes<[HWPort23, HWPort5]> { + let Latency = 4; + let NumMicroOps = 3; + let ResourceCycles = [1, 3]; +} +def : InstRW<[WriteMMXPACKSSrm], (instregex "MMX_PACKSSDWirm", + "MMX_PACKSSWBirm", "MMX_PACKUSWBirm")>; + +// VPMOVSX/ZX BW BD BQ DW DQ. +// y <- x. +def WriteVPMOVSX : SchedWriteRes<[HWPort5]> { + let Latency = 3; + let NumMicroOps = 1; +} +def : InstRW<[WriteVPMOVSX], (instregex "VPMOV(SX|ZX)(BW|BQ|DW|DQ)Yrr")>; + +// PBLENDW. +// x,x,i / v,v,v,i +def WritePBLENDWr : SchedWriteRes<[HWPort5]>; +def : InstRW<[WritePBLENDWr], (instregex "(V?)PBLENDW(Y?)rri")>; + +// x,m,i / v,v,m,i +def WritePBLENDWm : SchedWriteRes<[HWPort5, HWPort23]> { + let NumMicroOps = 2; + let Latency = 4; + let ResourceCycles = [1, 1]; +} +def : InstRW<[WritePBLENDWm, ReadAfterLd], (instregex "(V?)PBLENDW(Y?)rmi")>; + +// VPBLENDD. +// v,v,v,i. 
+def WriteVPBLENDDr : SchedWriteRes<[HWPort015]>; +def : InstRW<[WriteVPBLENDDr], (instregex "VPBLENDD(Y?)rri")>; + +// v,v,m,i +def WriteVPBLENDDm : SchedWriteRes<[HWPort015, HWPort23]> { + let NumMicroOps = 2; + let Latency = 4; + let ResourceCycles = [1, 1]; +} +def : InstRW<[WriteVPBLENDDm, ReadAfterLd], (instregex "VPBLENDD(Y?)rmi")>; + +// MASKMOVQ. +def WriteMASKMOVQ : SchedWriteRes<[HWPort0, HWPort4, HWPort23]> { + let Latency = 13; + let NumMicroOps = 4; + let ResourceCycles = [1, 1, 2]; +} +def : InstRW<[WriteMASKMOVQ], (instregex "MMX_MASKMOVQ(64)?")>; + +// MASKMOVDQU. +def WriteMASKMOVDQU : SchedWriteRes<[HWPort04, HWPort56, HWPort23]> { + let Latency = 14; + let NumMicroOps = 10; + let ResourceCycles = [4, 2, 4]; +} +def : InstRW<[WriteMASKMOVDQU], (instregex "(V?)MASKMOVDQU(64)?")>; + +// VPMASKMOV D/Q. +// v,v,m. +def WriteVPMASKMOVr : SchedWriteRes<[HWPort5, HWPort23]> { + let Latency = 4; + let NumMicroOps = 3; + let ResourceCycles = [2, 1]; +} +def : InstRW<[WriteVPMASKMOVr, ReadAfterLd], + (instregex "VPMASKMOV(D|Q)(Y?)rm")>; + +// m, v,v. +def WriteVPMASKMOVm : SchedWriteRes<[HWPort0, HWPort1, HWPort4, HWPort23]> { + let Latency = 13; + let NumMicroOps = 4; + let ResourceCycles = [1, 1, 1, 1]; +} +def : InstRW<[WriteVPMASKMOVm], (instregex "VPMASKMOV(D|Q)(Y?)mr")>; + +// PMOVMSKB. +def WritePMOVMSKB : SchedWriteRes<[HWPort0]> { + let Latency = 3; +} +def : InstRW<[WritePMOVMSKB], (instregex "(V|MMX_)?PMOVMSKB(Y?)rr")>; + +// PEXTR B/W/D/Q. +// r32,x,i. +def WritePEXTRr : SchedWriteRes<[HWPort0, HWPort5]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [1, 1]; +} +def : InstRW<[WritePEXTRr], (instregex "PEXTR(B|W|D|Q)rr", "MMX_PEXTRWirri")>; + +// m8,x,i. +def WritePEXTRm : SchedWriteRes<[HWPort23, HWPort4, HWPort5]> { + let NumMicroOps = 3; + let ResourceCycles = [1, 1, 1]; +} +def : InstRW<[WritePEXTRm], (instregex "PEXTR(B|W|D|Q)mr")>; + +// VPBROADCAST B/W. +// x, m8/16. +def WriteVPBROADCAST128Ld : SchedWriteRes<[HWPort01, HWPort23, HWPort5]> { + let Latency = 5; + let NumMicroOps = 3; + let ResourceCycles = [1, 1, 1]; +} +def : InstRW<[WriteVPBROADCAST128Ld, ReadAfterLd], + (instregex "VPBROADCAST(B|W)rm")>; + +// y, m8/16 +def WriteVPBROADCAST256Ld : SchedWriteRes<[HWPort01, HWPort23, HWPort5]> { + let Latency = 7; + let NumMicroOps = 3; + let ResourceCycles = [1, 1, 1]; +} +def : InstRW<[WriteVPBROADCAST256Ld, ReadAfterLd], + (instregex "VPBROADCAST(B|W)Yrm")>; + +// VPGATHERDD. +// x. +def WriteVPGATHERDD128 : SchedWriteRes<[]> { + let NumMicroOps = 20; +} +def : InstRW<[WriteVPGATHERDD128, ReadAfterLd], (instregex "VPGATHERDDrm")>; + +// y. +def WriteVPGATHERDD256 : SchedWriteRes<[]> { + let NumMicroOps = 34; +} +def : InstRW<[WriteVPGATHERDD256, ReadAfterLd], (instregex "VPGATHERDDYrm")>; + +// VPGATHERQD. +// x. +def WriteVPGATHERQD128 : SchedWriteRes<[]> { + let NumMicroOps = 15; +} +def : InstRW<[WriteVPGATHERQD128, ReadAfterLd], (instregex "VPGATHERQDrm")>; + +// y. +def WriteVPGATHERQD256 : SchedWriteRes<[]> { + let NumMicroOps = 22; +} +def : InstRW<[WriteVPGATHERQD256, ReadAfterLd], (instregex "VPGATHERQDYrm")>; + +// VPGATHERDQ. +// x. +def WriteVPGATHERDQ128 : SchedWriteRes<[]> { + let NumMicroOps = 12; +} +def : InstRW<[WriteVPGATHERDQ128, ReadAfterLd], (instregex "VPGATHERDQrm")>; + +// y. +def WriteVPGATHERDQ256 : SchedWriteRes<[]> { + let NumMicroOps = 20; +} +def : InstRW<[WriteVPGATHERDQ256, ReadAfterLd], (instregex "VPGATHERDQYrm")>; + +// VPGATHERQQ. +// x. 
+def WriteVPGATHERQQ128 : SchedWriteRes<[]> { + let NumMicroOps = 14; +} +def : InstRW<[WriteVPGATHERQQ128, ReadAfterLd], (instregex "VPGATHERQQrm")>; + +// y. +def WriteVPGATHERQQ256 : SchedWriteRes<[]> { + let NumMicroOps = 22; +} +def : InstRW<[WriteVPGATHERQQ256, ReadAfterLd], (instregex "VPGATHERQQYrm")>; + +//-- Arithmetic instructions --// + +// PHADD|PHSUB (S) W/D. +// v <- v,v. +def WritePHADDSUBr : SchedWriteRes<[HWPort1, HWPort5]> { + let Latency = 3; + let NumMicroOps = 3; + let ResourceCycles = [1, 2]; +} +def : InstRW<[WritePHADDSUBr], (instregex "MMX_PHADD(W?)rr64", + "MMX_PHADDSWrr64", + "MMX_PHSUB(W|D)rr64", + "MMX_PHSUBSWrr64", + "(V?)PH(ADD|SUB)(W|D)(Y?)rr", + "(V?)PH(ADD|SUB)SWrr(256)?")>; + +// v <- v,m. +def WritePHADDSUBm : SchedWriteRes<[HWPort1, HWPort5, HWPort23]> { + let Latency = 6; + let NumMicroOps = 3; + let ResourceCycles = [1, 2, 1]; +} +def : InstRW<[WritePHADDSUBm, ReadAfterLd], + (instregex "MMX_PHADD(W?)rm64", + "MMX_PHADDSWrm64", + "MMX_PHSUB(W|D)rm64", + "MMX_PHSUBSWrm64", + "(V?)PH(ADD|SUB)(W|D)(Y?)rm", + "(V?)PH(ADD|SUB)SWrm(128|256)?")>; + +// PCMPGTQ. +// v <- v,v. +def WritePCMPGTQr : SchedWriteRes<[HWPort0]> { + let Latency = 5; + let NumMicroOps = 1; +} +def : InstRW<[WritePCMPGTQr], (instregex "(V?)PCMPGTQ(Y?)rr")>; + +// v <- v,m. +def WritePCMPGTQm : SchedWriteRes<[HWPort0, HWPort23]> { + let Latency = 5; + let NumMicroOps = 2; + let ResourceCycles = [1, 1]; +} +def : InstRW<[WritePCMPGTQm, ReadAfterLd], (instregex "(V?)PCMPGTQ(Y?)rm")>; + +// PMULLD. +// x,x / y,y,y. +def WritePMULLDr : SchedWriteRes<[HWPort0]> { + let Latency = 10; + let NumMicroOps = 2; + let ResourceCycles = [2]; +} +def : InstRW<[WritePMULLDr], (instregex "(V?)PMULLD(Y?)rr")>; + +// x,m / y,y,m. +def WritePMULLDm : SchedWriteRes<[HWPort0, HWPort23]> { + let Latency = 10; + let NumMicroOps = 3; + let ResourceCycles = [2, 1]; +} +def : InstRW<[WritePMULLDm, ReadAfterLd], (instregex "(V?)PMULLD(Y?)rm")>; + +//-- Logic instructions --// + +// PTEST. +// v,v. +def WritePTESTr : SchedWriteRes<[HWPort0, HWPort5]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [1, 1]; +} +def : InstRW<[WritePTESTr], (instregex "(V?)PTEST(Y?)rr")>; + +// v,m. +def WritePTESTm : SchedWriteRes<[HWPort0, HWPort5, HWPort23]> { + let Latency = 6; + let NumMicroOps = 3; + let ResourceCycles = [1, 1, 1]; +} +def : InstRW<[WritePTESTr], (instregex "(V?)PTEST(Y?)rm")>; + +// PSLL,PSRL,PSRA W/D/Q. +// x,x / v,v,x. +def WritePShift : SchedWriteRes<[HWPort0, HWPort5]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [1, 1]; +} +def : InstRW<[WritePShift], (instregex "(V?)PS(LL|RL|RA)(W|D|Q)(Y?)rr")>; + +// PSLL,PSRL DQ. +def : InstRW<[WriteP5], (instregex "(V?)PS(R|L)LDQ(Y?)ri")>; + +//-- Other --// + +// EMMS. +def WriteEMMS : SchedWriteRes<[]> { + let Latency = 13; + let NumMicroOps = 31; +} +def : InstRW<[WriteEMMS], (instregex "MMX_EMMS")>; + +//=== Floating Point XMM and YMM Instructions ===// +//-- Move instructions --// + +// MOVMSKP S/D. +// r32 <- x. +def WriteMOVMSKPr : SchedWriteRes<[HWPort0]> { + let Latency = 3; +} +def : InstRW<[WriteMOVMSKPr], (instregex "(V?)MOVMSKP(S|D)rr")>; + +// r32 <- y. +def WriteVMOVMSKPYr : SchedWriteRes<[HWPort0]> { + let Latency = 2; +} +def : InstRW<[WriteVMOVMSKPYr], (instregex "VMOVMSKP(S|D)Yrr")>; + +// VPERM2F128. +def : InstRW<[WriteFShuffle256], (instregex "VPERM2F128rr")>; +def : InstRW<[WriteFShuffle256Ld, ReadAfterLd], (instregex "VPERM2F128rm")>; + +// BLENDVP S/D. 
+def : InstRW<[WriteFVarBlend], (instregex "BLENDVP(S|D)rr0")>; +def : InstRW<[WriteFVarBlendLd, ReadAfterLd], (instregex "BLENDVP(S|D)rm0")>; + +// VBROADCASTF128. +def : InstRW<[WriteLoad], (instregex "VBROADCASTF128")>; + +// EXTRACTPS. +// r32,x,i. +def WriteEXTRACTPSr : SchedWriteRes<[HWPort0, HWPort5]> { + let NumMicroOps = 2; + let ResourceCycles = [1, 1]; +} +def : InstRW<[WriteEXTRACTPSr], (instregex "(V?)EXTRACTPSrr")>; + +// m32,x,i. +def WriteEXTRACTPSm : SchedWriteRes<[HWPort0, HWPort5, HWPort23]> { + let Latency = 4; + let NumMicroOps = 3; + let ResourceCycles = [1, 1, 1]; +} +def : InstRW<[WriteEXTRACTPSm], (instregex "(V?)EXTRACTPSmr")>; + +// VEXTRACTF128. +// x,y,i. +def : InstRW<[WriteFShuffle256], (instregex "VEXTRACTF128rr")>; + +// m128,y,i. +def WriteVEXTRACTF128m : SchedWriteRes<[HWPort23, HWPort4]> { + let Latency = 4; + let NumMicroOps = 2; + let ResourceCycles = [1, 1]; +} +def : InstRW<[WriteVEXTRACTF128m], (instregex "VEXTRACTF128mr")>; + +// VINSERTF128. +// y,y,x,i. +def : InstRW<[WriteFShuffle256], (instregex "VINSERTF128rr")>; + +// y,y,m128,i. +def WriteVINSERTF128m : SchedWriteRes<[HWPort015, HWPort23]> { + let Latency = 4; + let NumMicroOps = 2; + let ResourceCycles = [1, 1]; +} +def : InstRW<[WriteFShuffle256, ReadAfterLd], (instregex "VINSERTF128rm")>; + +// VMASKMOVP S/D. +// v,v,m. +def WriteVMASKMOVPrm : SchedWriteRes<[HWPort5, HWPort23]> { + let Latency = 4; + let NumMicroOps = 3; + let ResourceCycles = [2, 1]; +} +def : InstRW<[WriteVMASKMOVPrm], (instregex "VMASKMOVP(S|D)(Y?)rm")>; + +// m128,x,x. +def WriteVMASKMOVPmr : SchedWriteRes<[HWPort0, HWPort1, HWPort4, HWPort23]> { + let Latency = 13; + let NumMicroOps = 4; + let ResourceCycles = [1, 1, 1, 1]; +} +def : InstRW<[WriteVMASKMOVPmr], (instregex "VMASKMOVP(S|D)mr")>; + +// m256,y,y. +def WriteVMASKMOVPYmr : SchedWriteRes<[HWPort0, HWPort1, HWPort4, HWPort23]> { + let Latency = 14; + let NumMicroOps = 4; + let ResourceCycles = [1, 1, 1, 1]; +} +def : InstRW<[WriteVMASKMOVPYmr], (instregex "VMASKMOVP(S|D)Ymr")>; + +// VGATHERDPS. +// x. +def WriteVGATHERDPS128 : SchedWriteRes<[]> { + let NumMicroOps = 20; +} +def : InstRW<[WriteVGATHERDPS128, ReadAfterLd], (instregex "VGATHERDPSrm")>; + +// y. +def WriteVGATHERDPS256 : SchedWriteRes<[]> { + let NumMicroOps = 34; +} +def : InstRW<[WriteVGATHERDPS256, ReadAfterLd], (instregex "VGATHERDPSYrm")>; + +// VGATHERQPS. +// x. +def WriteVGATHERQPS128 : SchedWriteRes<[]> { + let NumMicroOps = 15; +} +def : InstRW<[WriteVGATHERQPS128, ReadAfterLd], (instregex "VGATHERQPSrm")>; + +// y. +def WriteVGATHERQPS256 : SchedWriteRes<[]> { + let NumMicroOps = 22; +} +def : InstRW<[WriteVGATHERQPS256, ReadAfterLd], (instregex "VGATHERQPSYrm")>; + +// VGATHERDPD. +// x. +def WriteVGATHERDPD128 : SchedWriteRes<[]> { + let NumMicroOps = 12; +} +def : InstRW<[WriteVGATHERDPD128, ReadAfterLd], (instregex "VGATHERDPDrm")>; + +// y. +def WriteVGATHERDPD256 : SchedWriteRes<[]> { + let NumMicroOps = 20; +} +def : InstRW<[WriteVGATHERDPD256, ReadAfterLd], (instregex "VGATHERDPDYrm")>; + +// VGATHERQPD. +// x. +def WriteVGATHERQPD128 : SchedWriteRes<[]> { + let NumMicroOps = 14; +} +def : InstRW<[WriteVGATHERQPD128, ReadAfterLd], (instregex "VGATHERQPDrm")>; + +// y. +def WriteVGATHERQPD256 : SchedWriteRes<[]> { + let NumMicroOps = 22; +} +def : InstRW<[WriteVGATHERQPD256, ReadAfterLd], (instregex "VGATHERQPDYrm")>; + +//-- Conversion instructions --// + +// CVTPD2PS. +// x,x. +def : InstRW<[WriteP1_P5_Lat4], (instregex "(V?)CVTPD2PSrr")>; + +// x,m128. 
+def : InstRW<[WriteP1_P5_Lat4Ld], (instregex "(V?)CVTPD2PS(X?)rm")>;
+
+// x,y.
+def WriteCVTPD2PSYrr : SchedWriteRes<[HWPort1, HWPort5]> {
+  let Latency = 5;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1, 1];
+}
+def : InstRW<[WriteCVTPD2PSYrr], (instregex "(V?)CVTPD2PSYrr")>;
+
+// x,m256.
+def WriteCVTPD2PSYrm : SchedWriteRes<[HWPort1, HWPort5, HWPort23]> {
+  let Latency = 9;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1, 1, 1];
+}
+def : InstRW<[WriteCVTPD2PSYrm], (instregex "(V?)CVTPD2PSYrm")>;
+
+// CVTSD2SS.
+// x,x.
+def : InstRW<[WriteP1_P5_Lat4], (instregex "(Int_)?(V)?CVTSD2SSrr")>;
+
+// x,m64.
+def : InstRW<[WriteP1_P5_Lat4Ld], (instregex "(Int_)?(V)?CVTSD2SSrm")>;
+
+// CVTPS2PD.
+// x,x.
+def WriteCVTPS2PDrr : SchedWriteRes<[HWPort0, HWPort5]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1, 1];
+}
+def : InstRW<[WriteCVTPS2PDrr], (instregex "(V?)CVTPS2PDrr")>;
+
+// x,m64.
+// y,m128.
+def WriteCVTPS2PDrm : SchedWriteRes<[HWPort0, HWPort23]> {
+  let Latency = 5;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1, 1];
+}
+def : InstRW<[WriteCVTPS2PDrm], (instregex "(V?)CVTPS2PD(Y?)rm")>;
+
+// y,x.
+def WriteVCVTPS2PDYrr : SchedWriteRes<[HWPort0, HWPort5]> {
+  let Latency = 5;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1, 1];
+}
+def : InstRW<[WriteVCVTPS2PDYrr], (instregex "VCVTPS2PDYrr")>;
+
+// CVTSS2SD.
+// x,x.
+def WriteCVTSS2SDrr : SchedWriteRes<[HWPort0, HWPort5]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1, 1];
+}
+def : InstRW<[WriteCVTSS2SDrr], (instregex "(Int_)?(V?)CVTSS2SDrr")>;
+
+// x,m32.
+def WriteCVTSS2SDrm : SchedWriteRes<[HWPort0, HWPort23]> {
+  let Latency = 5;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1, 1];
+}
+def : InstRW<[WriteCVTSS2SDrm], (instregex "(Int_)?(V?)CVTSS2SDrm")>;
+
+// CVTDQ2PD.
+// x,x.
+def : InstRW<[WriteP1_P5_Lat4], (instregex "(V)?CVTDQ2PDrr")>;
+
+// y,x.
+def : InstRW<[WriteP1_P5_Lat6], (instregex "VCVTDQ2PDYrr")>;
+
+// CVT(T)PD2DQ.
+// x,x.
+def : InstRW<[WriteP1_P5_Lat4], (instregex "(V?)CVT(T?)PD2DQrr")>;
+// x,m128.
+def : InstRW<[WriteP1_P5_Lat4Ld], (instregex "(V?)CVT(T?)PD2DQrm")>;
+// x,y.
+def : InstRW<[WriteP1_P5_Lat6], (instregex "VCVT(T?)PD2DQYrr")>;
+// x,m256.
+def : InstRW<[WriteP1_P5_Lat6Ld], (instregex "VCVT(T?)PD2DQYrm")>;
+
+// CVT(T)PS2PI.
+// mm,x.
+def : InstRW<[WriteP1_P5_Lat4], (instregex "MMX_CVT(T?)PS2PIirr")>;
+
+// CVTPI2PD.
+// x,mm.
+def : InstRW<[WriteP1_P5_Lat4], (instregex "MMX_CVT(T?)PI2PDirr")>;
+
+// CVT(T)PD2PI.
+// mm,x.
+def : InstRW<[WriteP1_P5_Lat4], (instregex "MMX_CVT(T?)PD2PIirr")>;
+
+// CVTSI2SS.
+// x,r32.
+def : InstRW<[WriteP1_P5_Lat4], (instregex "(Int_)?(V?)CVT(T?)SI2SS(64)?rr")>;
+
+// CVT(T)SS2SI.
+// r32,x.
+def : InstRW<[WriteP0_P1_Lat4], (instregex "(Int_)?(V?)CVT(T?)SS2SI(64)?rr")>;
+// r32,m32.
+def : InstRW<[WriteP0_P1_Lat4Ld], (instregex "(Int_)?(V?)CVT(T?)SS2SI(64)?rm")>;
+
+// CVTSI2SD.
+// x,r32/64.
+def : InstRW<[WriteP0_P1_Lat4], (instregex "(Int_)?(V?)CVTSI2SD(64)?rr")>;
+
+// CVT(T)SD2SI.
+// r32/64,x.
+def : InstRW<[WriteP0_P1_Lat4], (instregex "(Int_)?(V?)CVT(T?)SD2SI(64)?rr")>;
+// r32/64,m64.
+def : InstRW<[WriteP0_P1_Lat4Ld], (instregex "(Int_)?(V?)CVT(T?)SD2SI(64)?rm")>;
+
+// VCVTPS2PH.
+// x,v,i.
+def : InstRW<[WriteP1_P5_Lat4], (instregex "VCVTPS2PH(Y?)rr")>;
+// m,v,i.
+def : InstRW<[WriteP1_P5_Lat4Ld, WriteRMW], (instregex "VCVTPS2PH(Y?)mr")>;
+
+// VCVTPH2PS.
+// v,x.
+def : InstRW<[WriteP1_P5_Lat4], (instregex "VCVTPH2PS(Y?)rr")>;
+
+//-- Arithmetic instructions --//
+
+// HADD, HSUB PS/PD
+// x,x / v,v,v.
+def WriteHADDSUBPr : SchedWriteRes<[HWPort1, HWPort5]> {
+  let Latency = 5;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1, 2];
+}
+def : InstRW<[WriteHADDSUBPr], (instregex "(V?)H(ADD|SUB)P(S|D)(Y?)rr")>;
+
+// x,m / v,v,m.
+def WriteHADDSUBPm : SchedWriteRes<[HWPort1, HWPort5, HWPort23]> {
+  let Latency = 9;
+  let NumMicroOps = 4;
+  let ResourceCycles = [1, 2, 1];
+}
+def : InstRW<[WriteHADDSUBPm], (instregex "(V?)H(ADD|SUB)P(S|D)(Y?)rm")>;
+
+// MUL SS/SD PS/PD.
+// x,x / v,v,v.
+def WriteMULr : SchedWriteRes<[HWPort01]> {
+  let Latency = 5;
+}
+def : InstRW<[WriteMULr], (instregex "(V?)MUL(P|S)(S|D)rr")>;
+
+// x,m / v,v,m.
+def WriteMULm : SchedWriteRes<[HWPort01, HWPort23]> {
+  let Latency = 9;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1, 1];
+}
+def : InstRW<[WriteMULm], (instregex "(V?)MUL(P|S)(S|D)rm")>;
+
+// VDIVPS.
+// y,y,y.
+def WriteVDIVPSYrr : SchedWriteRes<[HWPort0, HWPort15]> {
+  let Latency = 19; // 18-21 cycles.
+  let NumMicroOps = 3;
+  let ResourceCycles = [2, 1];
+}
+def : InstRW<[WriteVDIVPSYrr], (instregex "VDIVPSYrr")>;
+
+// y,y,m256.
+def WriteVDIVPSYrm : SchedWriteRes<[HWPort0, HWPort15, HWPort23]> {
+  let Latency = 23; // 18-21 + 4 cycles.
+  let NumMicroOps = 4;
+  let ResourceCycles = [2, 1, 1];
+}
+def : InstRW<[WriteVDIVPSYrm, ReadAfterLd], (instregex "VDIVPSYrm")>;
+
+// VDIVPD.
+// y,y,y.
+def WriteVDIVPDYrr : SchedWriteRes<[HWPort0, HWPort15]> {
+  let Latency = 27; // 19-35 cycles.
+  let NumMicroOps = 3;
+  let ResourceCycles = [2, 1];
+}
+def : InstRW<[WriteVDIVPDYrr], (instregex "VDIVPDYrr")>;
+
+// y,y,m256.
+def WriteVDIVPDYrm : SchedWriteRes<[HWPort0, HWPort15, HWPort23]> {
+  let Latency = 31; // 19-35 + 4 cycles.
+  let NumMicroOps = 4;
+  let ResourceCycles = [2, 1, 1];
+}
+def : InstRW<[WriteVDIVPDYrm, ReadAfterLd], (instregex "VDIVPDYrm")>;
+
+// VRCPPS.
+// y,y.
+def WriteVRCPPSr : SchedWriteRes<[HWPort0, HWPort15]> {
+  let Latency = 7;
+  let NumMicroOps = 3;
+  let ResourceCycles = [2, 1];
+}
+def : InstRW<[WriteVRCPPSr], (instregex "VRCPPSYr(_Int)?")>;
+
+// y,m256.
+def WriteVRCPPSm : SchedWriteRes<[HWPort0, HWPort15, HWPort23]> {
+  let Latency = 11;
+  let NumMicroOps = 4;
+  let ResourceCycles = [2, 1, 1];
+}
+def : InstRW<[WriteVRCPPSm], (instregex "VRCPPSYm(_Int)?")>;
+
+// ROUND SS/SD PS/PD.
+// v,v,i.
+def WriteROUNDr : SchedWriteRes<[HWPort1]> {
+  let Latency = 6;
+  let NumMicroOps = 2;
+  let ResourceCycles = [2];
+}
+def : InstRW<[WriteROUNDr], (instregex "(V?)ROUND(Y?)(S|P)(S|D)r(_Int)?")>;
+
+// v,m,i.
+def WriteROUNDm : SchedWriteRes<[HWPort1, HWPort23]> {
+  let Latency = 10;
+  let NumMicroOps = 3;
+  let ResourceCycles = [2, 1];
+}
+def : InstRW<[WriteROUNDm], (instregex "(V?)ROUND(Y?)(S|P)(S|D)m(_Int)?")>;
+
+// DPPS.
+// x,x,i / v,v,v,i.
+def WriteDPPSr : SchedWriteRes<[HWPort0, HWPort1, HWPort5]> {
+  let Latency = 14;
+  let NumMicroOps = 4;
+  let ResourceCycles = [2, 1, 1];
+}
+def : InstRW<[WriteDPPSr], (instregex "(V?)DPPS(Y?)rri")>;
+
+// x,m,i / v,v,m,i.
+def WriteDPPSm : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort23, HWPort6]> {
+  let Latency = 18;
+  let NumMicroOps = 6;
+  let ResourceCycles = [2, 1, 1, 1, 1];
+}
+def : InstRW<[WriteDPPSm, ReadAfterLd], (instregex "(V?)DPPS(Y?)rmi")>;
+
+// DPPD.
+// x,x,i.
+def WriteDPPDr : SchedWriteRes<[HWPort0, HWPort1, HWPort5]> { + let Latency = 9; + let NumMicroOps = 3; + let ResourceCycles = [1, 1, 1]; +} +def : InstRW<[WriteDPPDr], (instregex "(V?)DPPDrri")>; + +// x,m,i. +def WriteDPPDm : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort23]> { + let Latency = 13; + let NumMicroOps = 4; + let ResourceCycles = [1, 1, 1, 1]; +} +def : InstRW<[WriteDPPDm], (instregex "(V?)DPPDrmi")>; + +// VFMADD. +// v,v,v. +def WriteFMADDr : SchedWriteRes<[HWPort01]> { + let Latency = 5; + let NumMicroOps = 1; +} +def : InstRW<[WriteFMADDr], + (instregex + // 3p forms. + "VF(N?)M(ADD|SUB|ADDSUB|SUBADD)P(S|D)(r213|r132|r231)r(Y)?", + // 3s forms. + "VF(N?)M(ADD|SUB)S(S|D)(r132|r231|r213)r", + // 4s/4s_int forms. + "VF(N?)M(ADD|SUB)S(S|D)4rr(_REV|_Int)?", + // 4p forms. + "VF(N?)M(ADD|SUB)P(S|D)4rr(Y)?(_REV)?")>; + +// v,v,m. +def WriteFMADDm : SchedWriteRes<[HWPort01, HWPort23]> { + let Latency = 9; + let NumMicroOps = 2; + let ResourceCycles = [1, 1]; +} +def : InstRW<[WriteFMADDm], + (instregex + // 3p forms. + "VF(N?)M(ADD|SUB|ADDSUB|SUBADD)P(S|D)(r213|r132|r231)m(Y)?", + // 3s forms. + "VF(N?)M(ADD|SUB)S(S|D)(r132|r231|r213)m", + // 4s/4s_int forms. + "VF(N?)M(ADD|SUB)S(S|D)4(rm|mr)(_Int)?", + // 4p forms. + "VF(N?)M(ADD|SUB)P(S|D)4(rm|mr)(Y)?")>; + +//-- Math instructions --// + +// VSQRTPS. +// y,y. +def WriteVSQRTPSYr : SchedWriteRes<[HWPort0, HWPort15]> { + let Latency = 19; + let NumMicroOps = 3; + let ResourceCycles = [2, 1]; +} +def : InstRW<[WriteVSQRTPSYr], (instregex "VSQRTPSYr")>; + +// y,m256. +def WriteVSQRTPSYm : SchedWriteRes<[HWPort0, HWPort15, HWPort23]> { + let Latency = 23; + let NumMicroOps = 4; + let ResourceCycles = [2, 1, 1]; +} +def : InstRW<[WriteVSQRTPSYm], (instregex "VSQRTPSYm")>; + +// VSQRTPD. +// y,y. +def WriteVSQRTPDYr : SchedWriteRes<[HWPort0, HWPort15]> { + let Latency = 28; + let NumMicroOps = 3; + let ResourceCycles = [2, 1]; +} +def : InstRW<[WriteVSQRTPDYr], (instregex "VSQRTPDYr")>; + +// y,m256. +def WriteVSQRTPDYm : SchedWriteRes<[HWPort0, HWPort15, HWPort23]> { + let Latency = 32; + let NumMicroOps = 4; + let ResourceCycles = [2, 1, 1]; +} +def : InstRW<[WriteVSQRTPDYm], (instregex "VSQRTPDYm")>; + +// RSQRT SS/PS. +// x,x. +def WriteRSQRTr : SchedWriteRes<[HWPort0]> { + let Latency = 5; +} +def : InstRW<[WriteRSQRTr], (instregex "(V?)RSQRT(SS|PS)r(_Int)?")>; + +// x,m128. +def WriteRSQRTm : SchedWriteRes<[HWPort0, HWPort23]> { + let Latency = 9; + let NumMicroOps = 2; + let ResourceCycles = [1, 1]; +} +def : InstRW<[WriteRSQRTm], (instregex "(V?)RSQRT(SS|PS)m(_Int)?")>; + +// RSQRTPS 256. +// y,y. +def WriteRSQRTPSYr : SchedWriteRes<[HWPort0, HWPort15]> { + let Latency = 7; + let NumMicroOps = 3; + let ResourceCycles = [2, 1]; +} +def : InstRW<[WriteRSQRTPSYr], (instregex "VRSQRTPSYr(_Int)?")>; + +// y,m256. +def WriteRSQRTPSYm : SchedWriteRes<[HWPort0, HWPort15, HWPort23]> { + let Latency = 11; + let NumMicroOps = 4; + let ResourceCycles = [2, 1, 1]; +} +def : InstRW<[WriteRSQRTPSYm], (instregex "VRSQRTPSYm(_Int)?")>; + +//-- Logic instructions --// + +// AND, ANDN, OR, XOR PS/PD. +// x,x / v,v,v. +def : InstRW<[WriteP5], (instregex "(V?)(AND|ANDN|OR|XOR)P(S|D)(Y?)rr")>; +// x,m / v,v,m. +def : InstRW<[WriteP5Ld, ReadAfterLd], + (instregex "(V?)(AND|ANDN|OR|XOR)P(S|D)(Y?)rm")>; + +//-- Other instructions --// + +// VZEROUPPER. +def WriteVZEROUPPER : SchedWriteRes<[]> { + let NumMicroOps = 4; +} +def : InstRW<[WriteVZEROUPPER], (instregex "VZEROUPPER")>; + +// VZEROALL. 
+def WriteVZEROALL : SchedWriteRes<[]> { + let NumMicroOps = 12; +} +def : InstRW<[WriteVZEROALL], (instregex "VZEROALL")>; + +// LDMXCSR. +def WriteLDMXCSR : SchedWriteRes<[HWPort0, HWPort6, HWPort23]> { + let Latency = 6; + let NumMicroOps = 3; + let ResourceCycles = [1, 1, 1]; +} +def : InstRW<[WriteLDMXCSR], (instregex "(V)?LDMXCSR")>; + +// STMXCSR. +def WriteSTMXCSR : SchedWriteRes<[HWPort0, HWPort4, HWPort6, HWPort237]> { + let Latency = 7; + let NumMicroOps = 4; + let ResourceCycles = [1, 1, 1, 1]; +} +def : InstRW<[WriteSTMXCSR], (instregex "(V)?STMXCSR")>; + +} // SchedModel diff --git a/contrib/llvm/lib/Target/X86/X86SchedSandyBridge.td b/contrib/llvm/lib/Target/X86/X86SchedSandyBridge.td new file mode 100644 index 0000000..eca65c2 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86SchedSandyBridge.td @@ -0,0 +1,250 @@ +//=- X86SchedSandyBridge.td - X86 Sandy Bridge Scheduling ----*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the machine model for Sandy Bridge to support instruction +// scheduling and other instruction cost heuristics. +// +//===----------------------------------------------------------------------===// + +def SandyBridgeModel : SchedMachineModel { + // All x86 instructions are modeled as a single micro-op, and SB can decode 4 + // instructions per cycle. + // FIXME: Identify instructions that aren't a single fused micro-op. + let IssueWidth = 4; + let MicroOpBufferSize = 168; // Based on the reorder buffer. + let LoadLatency = 4; + let MispredictPenalty = 16; + + // Based on the LSD (loop-stream detector) queue size. + let LoopMicroOpBufferSize = 28; + + // FIXME: SSE4 and AVX are unimplemented. This flag is set to allow + // the scheduler to assign a default model to unrecognized opcodes. + let CompleteModel = 0; +} + +let SchedModel = SandyBridgeModel in { + +// Sandy Bridge can issue micro-ops to 6 different ports in one cycle. + +// Ports 0, 1, and 5 handle all computation. +def SBPort0 : ProcResource<1>; +def SBPort1 : ProcResource<1>; +def SBPort5 : ProcResource<1>; + +// Ports 2 and 3 are identical. They handle loads and the address half of +// stores. +def SBPort23 : ProcResource<2>; + +// Port 4 gets the data half of stores. Store data can be available later than +// the store address, but since we don't model the latency of stores, we can +// ignore that. +def SBPort4 : ProcResource<1>; + +// Many micro-ops are capable of issuing on multiple ports. +def SBPort05 : ProcResGroup<[SBPort0, SBPort5]>; +def SBPort15 : ProcResGroup<[SBPort1, SBPort5]>; +def SBPort015 : ProcResGroup<[SBPort0, SBPort1, SBPort5]>; + +// 54 Entry Unified Scheduler +def SBPortAny : ProcResGroup<[SBPort0, SBPort1, SBPort23, SBPort4, SBPort5]> { + let BufferSize=54; +} + +// Integer division issued on port 0. +def SBDivider : ProcResource<1>; + +// Loads are 4 cycles, so ReadAfterLd registers needn't be available until 4 +// cycles after the memory operand. +def : ReadAdvance<ReadAfterLd, 4>; + +// Many SchedWrites are defined in pairs with and without a folded load. +// Instructions with folded loads are usually micro-fused, so they only appear +// as two micro-ops when queued in the reservation station. +// This multiclass defines the resource usage for variants with and without +// folded loads. 
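+//
+// For illustration only: the "defm : SBWriteResPair<WriteALU, SBPort015, 1>"
+// further below expands to a 1-cycle WriteRes<WriteALU, [SBPort015]> for the
+// register form and, through WriteALU.Folded (i.e. WriteALULd), a 5-cycle
+// WriteRes<WriteALULd, [SBPort23, SBPort015]> for the load-folded form.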
+multiclass SBWriteResPair<X86FoldableSchedWrite SchedRW, + ProcResourceKind ExePort, + int Lat> { + // Register variant is using a single cycle on ExePort. + def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; } + + // Memory variant also uses a cycle on port 2/3 and adds 4 cycles to the + // latency. + def : WriteRes<SchedRW.Folded, [SBPort23, ExePort]> { + let Latency = !add(Lat, 4); + } +} + +// A folded store needs a cycle on port 4 for the store data, but it does not +// need an extra port 2/3 cycle to recompute the address. +def : WriteRes<WriteRMW, [SBPort4]>; + +def : WriteRes<WriteStore, [SBPort23, SBPort4]>; +def : WriteRes<WriteLoad, [SBPort23]> { let Latency = 4; } +def : WriteRes<WriteMove, [SBPort015]>; +def : WriteRes<WriteZero, []>; + +defm : SBWriteResPair<WriteALU, SBPort015, 1>; +defm : SBWriteResPair<WriteIMul, SBPort1, 3>; +def : WriteRes<WriteIMulH, []> { let Latency = 3; } +defm : SBWriteResPair<WriteShift, SBPort05, 1>; +defm : SBWriteResPair<WriteJump, SBPort5, 1>; + +// This is for simple LEAs with one or two input operands. +// The complex ones can only execute on port 1, and they require two cycles on +// the port to read all inputs. We don't model that. +def : WriteRes<WriteLEA, [SBPort15]>; + +// This is quite rough, latency depends on the dividend. +def : WriteRes<WriteIDiv, [SBPort0, SBDivider]> { + let Latency = 25; + let ResourceCycles = [1, 10]; +} +def : WriteRes<WriteIDivLd, [SBPort23, SBPort0, SBDivider]> { + let Latency = 29; + let ResourceCycles = [1, 1, 10]; +} + +// Scalar and vector floating point. +defm : SBWriteResPair<WriteFAdd, SBPort1, 3>; +defm : SBWriteResPair<WriteFMul, SBPort0, 5>; +defm : SBWriteResPair<WriteFDiv, SBPort0, 12>; // 10-14 cycles. +defm : SBWriteResPair<WriteFRcp, SBPort0, 5>; +defm : SBWriteResPair<WriteFRsqrt, SBPort0, 5>; +defm : SBWriteResPair<WriteFSqrt, SBPort0, 15>; +defm : SBWriteResPair<WriteCvtF2I, SBPort1, 3>; +defm : SBWriteResPair<WriteCvtI2F, SBPort1, 4>; +defm : SBWriteResPair<WriteCvtF2F, SBPort1, 3>; +defm : SBWriteResPair<WriteFShuffle, SBPort5, 1>; +defm : SBWriteResPair<WriteFBlend, SBPort05, 1>; +def : WriteRes<WriteFVarBlend, [SBPort0, SBPort5]> { + let Latency = 2; + let ResourceCycles = [1, 1]; +} +def : WriteRes<WriteFVarBlendLd, [SBPort0, SBPort5, SBPort23]> { + let Latency = 6; + let ResourceCycles = [1, 1, 1]; +} + +// Vector integer operations. +defm : SBWriteResPair<WriteVecShift, SBPort05, 1>; +defm : SBWriteResPair<WriteVecLogic, SBPort015, 1>; +defm : SBWriteResPair<WriteVecALU, SBPort15, 1>; +defm : SBWriteResPair<WriteVecIMul, SBPort0, 5>; +defm : SBWriteResPair<WriteShuffle, SBPort15, 1>; +defm : SBWriteResPair<WriteBlend, SBPort15, 1>; +def : WriteRes<WriteVarBlend, [SBPort1, SBPort5]> { + let Latency = 2; + let ResourceCycles = [1, 1]; +} +def : WriteRes<WriteVarBlendLd, [SBPort1, SBPort5, SBPort23]> { + let Latency = 6; + let ResourceCycles = [1, 1, 1]; +} +def : WriteRes<WriteMPSAD, [SBPort0, SBPort1, SBPort5]> { + let Latency = 6; + let ResourceCycles = [1, 1, 1]; +} +def : WriteRes<WriteMPSADLd, [SBPort0, SBPort1, SBPort5, SBPort23]> { + let Latency = 6; + let ResourceCycles = [1, 1, 1, 1]; +} + +// String instructions. 
+// Packed Compare Implicit Length Strings, Return Mask +def : WriteRes<WritePCmpIStrM, [SBPort015]> { + let Latency = 11; + let ResourceCycles = [3]; +} +def : WriteRes<WritePCmpIStrMLd, [SBPort015, SBPort23]> { + let Latency = 11; + let ResourceCycles = [3, 1]; +} + +// Packed Compare Explicit Length Strings, Return Mask +def : WriteRes<WritePCmpEStrM, [SBPort015]> { + let Latency = 11; + let ResourceCycles = [8]; +} +def : WriteRes<WritePCmpEStrMLd, [SBPort015, SBPort23]> { + let Latency = 11; + let ResourceCycles = [7, 1]; +} + +// Packed Compare Implicit Length Strings, Return Index +def : WriteRes<WritePCmpIStrI, [SBPort015]> { + let Latency = 3; + let ResourceCycles = [3]; +} +def : WriteRes<WritePCmpIStrILd, [SBPort015, SBPort23]> { + let Latency = 3; + let ResourceCycles = [3, 1]; +} + +// Packed Compare Explicit Length Strings, Return Index +def : WriteRes<WritePCmpEStrI, [SBPort015]> { + let Latency = 4; + let ResourceCycles = [8]; +} +def : WriteRes<WritePCmpEStrILd, [SBPort015, SBPort23]> { + let Latency = 4; + let ResourceCycles = [7, 1]; +} + +// AES Instructions. +def : WriteRes<WriteAESDecEnc, [SBPort015]> { + let Latency = 8; + let ResourceCycles = [2]; +} +def : WriteRes<WriteAESDecEncLd, [SBPort015, SBPort23]> { + let Latency = 8; + let ResourceCycles = [2, 1]; +} + +def : WriteRes<WriteAESIMC, [SBPort015]> { + let Latency = 8; + let ResourceCycles = [2]; +} +def : WriteRes<WriteAESIMCLd, [SBPort015, SBPort23]> { + let Latency = 8; + let ResourceCycles = [2, 1]; +} + +def : WriteRes<WriteAESKeyGen, [SBPort015]> { + let Latency = 8; + let ResourceCycles = [11]; +} +def : WriteRes<WriteAESKeyGenLd, [SBPort015, SBPort23]> { + let Latency = 8; + let ResourceCycles = [10, 1]; +} + +// Carry-less multiplication instructions. +def : WriteRes<WriteCLMul, [SBPort015]> { + let Latency = 14; + let ResourceCycles = [18]; +} +def : WriteRes<WriteCLMulLd, [SBPort015, SBPort23]> { + let Latency = 14; + let ResourceCycles = [17, 1]; +} + + +def : WriteRes<WriteSystem, [SBPort015]> { let Latency = 100; } +def : WriteRes<WriteMicrocoded, [SBPort015]> { let Latency = 100; } +def : WriteRes<WriteFence, [SBPort23, SBPort4]>; +def : WriteRes<WriteNop, []>; + +// AVX2 is not supported on that architecture, but we should define the basic +// scheduling resources anyway. +defm : SBWriteResPair<WriteFShuffle256, SBPort0, 1>; +defm : SBWriteResPair<WriteShuffle256, SBPort0, 1>; +defm : SBWriteResPair<WriteVarVecShift, SBPort0, 1>; +} // SchedModel diff --git a/contrib/llvm/lib/Target/X86/X86Schedule.td b/contrib/llvm/lib/Target/X86/X86Schedule.td new file mode 100644 index 0000000..a261356 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86Schedule.td @@ -0,0 +1,650 @@ +//===-- X86Schedule.td - X86 Scheduling Definitions --------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +// InstrSchedModel annotations for out-of-order CPUs. +// +// These annotations are independent of the itinerary classes defined below. + +// Instructions with folded loads need to read the memory operand immediately, +// but other register operands don't have to be read until the load is ready. +// These operands are marked with ReadAfterLd. +def ReadAfterLd : SchedRead; + +// Instructions with both a load and a store folded are modeled as a folded +// load + WriteRMW. 
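+// For example (illustrative), a memory-destination ALU op such as
+// "addl %eax, (%rdi)" would be modeled as the load-folded ALU write
+// (WriteALULd) for the load and the add, plus WriteRMW for the store of the
+// result.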
+def WriteRMW : SchedWrite;
+
+// Most instructions can fold loads, so almost every SchedWrite comes in two
+// variants: with and without a folded load.
+// An X86FoldableSchedWrite holds a reference to the corresponding SchedWrite
+// with a folded load.
+class X86FoldableSchedWrite : SchedWrite {
+  // The SchedWrite to use when a load is folded into the instruction.
+  SchedWrite Folded;
+}
+
+// Multiclass that produces a linked pair of SchedWrites.
+multiclass X86SchedWritePair {
+  // Register-Memory operation.
+  def Ld : SchedWrite;
+  // Register-Register operation.
+  def NAME : X86FoldableSchedWrite {
+    let Folded = !cast<SchedWrite>(NAME#"Ld");
+  }
+}
+
+// Arithmetic.
+defm WriteALU : X86SchedWritePair; // Simple integer ALU op.
+defm WriteIMul : X86SchedWritePair; // Integer multiplication.
+def WriteIMulH : SchedWrite; // Integer multiplication, high part.
+defm WriteIDiv : X86SchedWritePair; // Integer division.
+def WriteLEA : SchedWrite; // LEA instructions can't fold loads.
+
+// Integer shifts and rotates.
+defm WriteShift : X86SchedWritePair;
+
+// Loads, stores, and moves, not folded with other operations.
+def WriteLoad : SchedWrite;
+def WriteStore : SchedWrite;
+def WriteMove : SchedWrite;
+
+// Idioms that clear a register, like xorps %xmm0, %xmm0.
+// These can often bypass execution ports completely.
+def WriteZero : SchedWrite;
+
+// Branches don't produce values, so they have no latency, but they still
+// consume resources. Indirect branches can fold loads.
+defm WriteJump : X86SchedWritePair;
+
+// Floating point. This covers both scalar and vector operations.
+defm WriteFAdd : X86SchedWritePair; // Floating point add/sub/compare.
+defm WriteFMul : X86SchedWritePair; // Floating point multiplication.
+defm WriteFDiv : X86SchedWritePair; // Floating point division.
+defm WriteFSqrt : X86SchedWritePair; // Floating point square root.
+defm WriteFRcp : X86SchedWritePair; // Floating point reciprocal estimate.
+defm WriteFRsqrt : X86SchedWritePair; // Floating point reciprocal square root estimate.
+defm WriteFMA : X86SchedWritePair; // Fused Multiply Add.
+defm WriteFShuffle : X86SchedWritePair; // Floating point vector shuffles.
+defm WriteFBlend : X86SchedWritePair; // Floating point vector blends.
+defm WriteFVarBlend : X86SchedWritePair; // Fp vector variable blends.
+
+// FMA Scheduling helper class.
+class FMASC { X86FoldableSchedWrite Sched = WriteFAdd; }
+
+// Vector integer operations.
+defm WriteVecALU : X86SchedWritePair; // Vector integer ALU op, no logicals.
+defm WriteVecShift : X86SchedWritePair; // Vector integer shifts.
+defm WriteVecIMul : X86SchedWritePair; // Vector integer multiply.
+defm WriteShuffle : X86SchedWritePair; // Vector shuffles.
+defm WriteBlend : X86SchedWritePair; // Vector blends.
+defm WriteVarBlend : X86SchedWritePair; // Vector variable blends.
+defm WriteMPSAD : X86SchedWritePair; // Vector MPSAD.
+
+// Vector bitwise operations.
+// These are often used on both floating point and integer vectors.
+defm WriteVecLogic : X86SchedWritePair; // Vector and/or/xor.
+
+// Conversion between integer and float.
+defm WriteCvtF2I : X86SchedWritePair; // Float -> Integer.
+defm WriteCvtI2F : X86SchedWritePair; // Integer -> Float.
+defm WriteCvtF2F : X86SchedWritePair; // Float -> Float size conversion.
+
+// String instructions.
+// Packed Compare Implicit Length Strings, Return Mask
+defm WritePCmpIStrM : X86SchedWritePair;
+// Packed Compare Explicit Length Strings, Return Mask
+defm WritePCmpEStrM : X86SchedWritePair;
+// Packed Compare Implicit Length Strings, Return Index
+defm WritePCmpIStrI : X86SchedWritePair;
+// Packed Compare Explicit Length Strings, Return Index
+defm WritePCmpEStrI : X86SchedWritePair;
+
+// AES instructions.
+defm WriteAESDecEnc : X86SchedWritePair; // Decryption, encryption.
+defm WriteAESIMC : X86SchedWritePair; // InvMixColumn.
+defm WriteAESKeyGen : X86SchedWritePair; // Key Generation.
+
+// Carry-less multiplication instructions.
+defm WriteCLMul : X86SchedWritePair;
+
+// Catch-all for expensive system instructions.
+def WriteSystem : SchedWrite;
+
+// AVX2.
+defm WriteFShuffle256 : X86SchedWritePair; // Fp 256-bit width vector shuffles.
+defm WriteShuffle256 : X86SchedWritePair; // 256-bit width vector shuffles.
+defm WriteVarVecShift : X86SchedWritePair; // Variable vector shifts.
+
+// Old microcoded instructions that nobody uses.
+def WriteMicrocoded : SchedWrite;
+
+// Fence instructions.
+def WriteFence : SchedWrite;
+
+// Nop, not very useful except that it provides a model for nops!
+def WriteNop : SchedWrite;
+
+//===----------------------------------------------------------------------===//
+// Instruction Itinerary classes used for X86
+def IIC_ALU_MEM : InstrItinClass;
+def IIC_ALU_NONMEM : InstrItinClass;
+def IIC_LEA : InstrItinClass;
+def IIC_LEA_16 : InstrItinClass;
+def IIC_MUL8 : InstrItinClass;
+def IIC_MUL16_MEM : InstrItinClass;
+def IIC_MUL16_REG : InstrItinClass;
+def IIC_MUL32_MEM : InstrItinClass;
+def IIC_MUL32_REG : InstrItinClass;
+def IIC_MUL64 : InstrItinClass;
+// imul by al, ax, eax, rax
+def IIC_IMUL8 : InstrItinClass;
+def IIC_IMUL16_MEM : InstrItinClass;
+def IIC_IMUL16_REG : InstrItinClass;
+def IIC_IMUL32_MEM : InstrItinClass;
+def IIC_IMUL32_REG : InstrItinClass;
+def IIC_IMUL64 : InstrItinClass;
+// imul reg by reg|mem
+def IIC_IMUL16_RM : InstrItinClass;
+def IIC_IMUL16_RR : InstrItinClass;
+def IIC_IMUL32_RM : InstrItinClass;
+def IIC_IMUL32_RR : InstrItinClass;
+def IIC_IMUL64_RM : InstrItinClass;
+def IIC_IMUL64_RR : InstrItinClass;
+// imul reg = reg/mem * imm
+def IIC_IMUL16_RMI : InstrItinClass;
+def IIC_IMUL16_RRI : InstrItinClass;
+def IIC_IMUL32_RMI : InstrItinClass;
+def IIC_IMUL32_RRI : InstrItinClass;
+def IIC_IMUL64_RMI : InstrItinClass;
+def IIC_IMUL64_RRI : InstrItinClass;
+// div
+def IIC_DIV8_MEM : InstrItinClass;
+def IIC_DIV8_REG : InstrItinClass;
+def IIC_DIV16 : InstrItinClass;
+def IIC_DIV32 : InstrItinClass;
+def IIC_DIV64 : InstrItinClass;
+// idiv
+def IIC_IDIV8 : InstrItinClass;
+def IIC_IDIV16 : InstrItinClass;
+def IIC_IDIV32 : InstrItinClass;
+def IIC_IDIV64 : InstrItinClass;
+// neg/not/inc/dec
+def IIC_UNARY_REG : InstrItinClass;
+def IIC_UNARY_MEM : InstrItinClass;
+// add/sub/and/or/xor/sbb/cmp/test
+def IIC_BIN_MEM : InstrItinClass;
+def IIC_BIN_NONMEM : InstrItinClass;
+// adc/sbb
+def IIC_BIN_CARRY_MEM : InstrItinClass;
+def IIC_BIN_CARRY_NONMEM : InstrItinClass;
+// shift/rotate
+def IIC_SR : InstrItinClass;
+// shift double
+def IIC_SHD16_REG_IM : InstrItinClass;
+def IIC_SHD16_REG_CL : InstrItinClass;
+def IIC_SHD16_MEM_IM : InstrItinClass;
+def IIC_SHD16_MEM_CL : InstrItinClass;
+def IIC_SHD32_REG_IM : InstrItinClass;
+def IIC_SHD32_REG_CL : InstrItinClass;
+def IIC_SHD32_MEM_IM : InstrItinClass;
+def IIC_SHD32_MEM_CL : InstrItinClass;
+def IIC_SHD64_REG_IM : InstrItinClass;
+def IIC_SHD64_REG_CL : 
InstrItinClass; +def IIC_SHD64_MEM_IM : InstrItinClass; +def IIC_SHD64_MEM_CL : InstrItinClass; +// cmov +def IIC_CMOV16_RM : InstrItinClass; +def IIC_CMOV16_RR : InstrItinClass; +def IIC_CMOV32_RM : InstrItinClass; +def IIC_CMOV32_RR : InstrItinClass; +def IIC_CMOV64_RM : InstrItinClass; +def IIC_CMOV64_RR : InstrItinClass; +// set +def IIC_SET_R : InstrItinClass; +def IIC_SET_M : InstrItinClass; +// jmp/jcc/jcxz +def IIC_Jcc : InstrItinClass; +def IIC_JCXZ : InstrItinClass; +def IIC_JMP_REL : InstrItinClass; +def IIC_JMP_REG : InstrItinClass; +def IIC_JMP_MEM : InstrItinClass; +def IIC_JMP_FAR_MEM : InstrItinClass; +def IIC_JMP_FAR_PTR : InstrItinClass; +// loop +def IIC_LOOP : InstrItinClass; +def IIC_LOOPE : InstrItinClass; +def IIC_LOOPNE : InstrItinClass; +// call +def IIC_CALL_RI : InstrItinClass; +def IIC_CALL_MEM : InstrItinClass; +def IIC_CALL_FAR_MEM : InstrItinClass; +def IIC_CALL_FAR_PTR : InstrItinClass; +// ret +def IIC_RET : InstrItinClass; +def IIC_RET_IMM : InstrItinClass; +//sign extension movs +def IIC_MOVSX : InstrItinClass; +def IIC_MOVSX_R16_R8 : InstrItinClass; +def IIC_MOVSX_R16_M8 : InstrItinClass; +def IIC_MOVSX_R16_R16 : InstrItinClass; +def IIC_MOVSX_R32_R32 : InstrItinClass; +//zero extension movs +def IIC_MOVZX : InstrItinClass; +def IIC_MOVZX_R16_R8 : InstrItinClass; +def IIC_MOVZX_R16_M8 : InstrItinClass; + +def IIC_REP_MOVS : InstrItinClass; +def IIC_REP_STOS : InstrItinClass; + +// SSE scalar/parallel binary operations +def IIC_SSE_ALU_F32S_RR : InstrItinClass; +def IIC_SSE_ALU_F32S_RM : InstrItinClass; +def IIC_SSE_ALU_F64S_RR : InstrItinClass; +def IIC_SSE_ALU_F64S_RM : InstrItinClass; +def IIC_SSE_MUL_F32S_RR : InstrItinClass; +def IIC_SSE_MUL_F32S_RM : InstrItinClass; +def IIC_SSE_MUL_F64S_RR : InstrItinClass; +def IIC_SSE_MUL_F64S_RM : InstrItinClass; +def IIC_SSE_DIV_F32S_RR : InstrItinClass; +def IIC_SSE_DIV_F32S_RM : InstrItinClass; +def IIC_SSE_DIV_F64S_RR : InstrItinClass; +def IIC_SSE_DIV_F64S_RM : InstrItinClass; +def IIC_SSE_ALU_F32P_RR : InstrItinClass; +def IIC_SSE_ALU_F32P_RM : InstrItinClass; +def IIC_SSE_ALU_F64P_RR : InstrItinClass; +def IIC_SSE_ALU_F64P_RM : InstrItinClass; +def IIC_SSE_MUL_F32P_RR : InstrItinClass; +def IIC_SSE_MUL_F32P_RM : InstrItinClass; +def IIC_SSE_MUL_F64P_RR : InstrItinClass; +def IIC_SSE_MUL_F64P_RM : InstrItinClass; +def IIC_SSE_DIV_F32P_RR : InstrItinClass; +def IIC_SSE_DIV_F32P_RM : InstrItinClass; +def IIC_SSE_DIV_F64P_RR : InstrItinClass; +def IIC_SSE_DIV_F64P_RM : InstrItinClass; + +def IIC_SSE_COMIS_RR : InstrItinClass; +def IIC_SSE_COMIS_RM : InstrItinClass; + +def IIC_SSE_HADDSUB_RR : InstrItinClass; +def IIC_SSE_HADDSUB_RM : InstrItinClass; + +def IIC_SSE_BIT_P_RR : InstrItinClass; +def IIC_SSE_BIT_P_RM : InstrItinClass; + +def IIC_SSE_INTALU_P_RR : InstrItinClass; +def IIC_SSE_INTALU_P_RM : InstrItinClass; +def IIC_SSE_INTALUQ_P_RR : InstrItinClass; +def IIC_SSE_INTALUQ_P_RM : InstrItinClass; + +def IIC_SSE_INTMUL_P_RR : InstrItinClass; +def IIC_SSE_INTMUL_P_RM : InstrItinClass; + +def IIC_SSE_INTSH_P_RR : InstrItinClass; +def IIC_SSE_INTSH_P_RM : InstrItinClass; +def IIC_SSE_INTSH_P_RI : InstrItinClass; + +def IIC_SSE_INTSHDQ_P_RI : InstrItinClass; + +def IIC_SSE_SHUFP : InstrItinClass; +def IIC_SSE_PSHUF_RI : InstrItinClass; +def IIC_SSE_PSHUF_MI : InstrItinClass; + +def IIC_SSE_UNPCK : InstrItinClass; + +def IIC_SSE_MOVMSK : InstrItinClass; +def IIC_SSE_MASKMOV : InstrItinClass; + +def IIC_SSE_PEXTRW : InstrItinClass; +def IIC_SSE_PINSRW : InstrItinClass; + +def IIC_SSE_PABS_RR : InstrItinClass; 
+def IIC_SSE_PABS_RM : InstrItinClass; + +def IIC_SSE_SQRTPS_RR : InstrItinClass; +def IIC_SSE_SQRTPS_RM : InstrItinClass; +def IIC_SSE_SQRTSS_RR : InstrItinClass; +def IIC_SSE_SQRTSS_RM : InstrItinClass; +def IIC_SSE_SQRTPD_RR : InstrItinClass; +def IIC_SSE_SQRTPD_RM : InstrItinClass; +def IIC_SSE_SQRTSD_RR : InstrItinClass; +def IIC_SSE_SQRTSD_RM : InstrItinClass; + +def IIC_SSE_RSQRTPS_RR : InstrItinClass; +def IIC_SSE_RSQRTPS_RM : InstrItinClass; +def IIC_SSE_RSQRTSS_RR : InstrItinClass; +def IIC_SSE_RSQRTSS_RM : InstrItinClass; + +def IIC_SSE_RCPP_RR : InstrItinClass; +def IIC_SSE_RCPP_RM : InstrItinClass; +def IIC_SSE_RCPS_RR : InstrItinClass; +def IIC_SSE_RCPS_RM : InstrItinClass; + +def IIC_SSE_MOV_S_RR : InstrItinClass; +def IIC_SSE_MOV_S_RM : InstrItinClass; +def IIC_SSE_MOV_S_MR : InstrItinClass; + +def IIC_SSE_MOVA_P_RR : InstrItinClass; +def IIC_SSE_MOVA_P_RM : InstrItinClass; +def IIC_SSE_MOVA_P_MR : InstrItinClass; + +def IIC_SSE_MOVU_P_RR : InstrItinClass; +def IIC_SSE_MOVU_P_RM : InstrItinClass; +def IIC_SSE_MOVU_P_MR : InstrItinClass; + +def IIC_SSE_MOVDQ : InstrItinClass; +def IIC_SSE_MOVD_ToGP : InstrItinClass; +def IIC_SSE_MOVQ_RR : InstrItinClass; + +def IIC_SSE_MOV_LH : InstrItinClass; + +def IIC_SSE_LDDQU : InstrItinClass; + +def IIC_SSE_MOVNT : InstrItinClass; + +def IIC_SSE_PHADDSUBD_RR : InstrItinClass; +def IIC_SSE_PHADDSUBD_RM : InstrItinClass; +def IIC_SSE_PHADDSUBSW_RR : InstrItinClass; +def IIC_SSE_PHADDSUBSW_RM : InstrItinClass; +def IIC_SSE_PHADDSUBW_RR : InstrItinClass; +def IIC_SSE_PHADDSUBW_RM : InstrItinClass; +def IIC_SSE_PSHUFB_RR : InstrItinClass; +def IIC_SSE_PSHUFB_RM : InstrItinClass; +def IIC_SSE_PSIGN_RR : InstrItinClass; +def IIC_SSE_PSIGN_RM : InstrItinClass; + +def IIC_SSE_PMADD : InstrItinClass; +def IIC_SSE_PMULHRSW : InstrItinClass; +def IIC_SSE_PALIGNRR : InstrItinClass; +def IIC_SSE_PALIGNRM : InstrItinClass; +def IIC_SSE_MWAIT : InstrItinClass; +def IIC_SSE_MONITOR : InstrItinClass; + +def IIC_SSE_PREFETCH : InstrItinClass; +def IIC_SSE_PAUSE : InstrItinClass; +def IIC_SSE_LFENCE : InstrItinClass; +def IIC_SSE_MFENCE : InstrItinClass; +def IIC_SSE_SFENCE : InstrItinClass; +def IIC_SSE_LDMXCSR : InstrItinClass; +def IIC_SSE_STMXCSR : InstrItinClass; + +def IIC_SSE_CVT_PD_RR : InstrItinClass; +def IIC_SSE_CVT_PD_RM : InstrItinClass; +def IIC_SSE_CVT_PS_RR : InstrItinClass; +def IIC_SSE_CVT_PS_RM : InstrItinClass; +def IIC_SSE_CVT_PI2PS_RR : InstrItinClass; +def IIC_SSE_CVT_PI2PS_RM : InstrItinClass; +def IIC_SSE_CVT_Scalar_RR : InstrItinClass; +def IIC_SSE_CVT_Scalar_RM : InstrItinClass; +def IIC_SSE_CVT_SS2SI32_RM : InstrItinClass; +def IIC_SSE_CVT_SS2SI32_RR : InstrItinClass; +def IIC_SSE_CVT_SS2SI64_RM : InstrItinClass; +def IIC_SSE_CVT_SS2SI64_RR : InstrItinClass; +def IIC_SSE_CVT_SD2SI_RM : InstrItinClass; +def IIC_SSE_CVT_SD2SI_RR : InstrItinClass; + +// MMX +def IIC_MMX_MOV_MM_RM : InstrItinClass; +def IIC_MMX_MOV_REG_MM : InstrItinClass; +def IIC_MMX_MOVQ_RM : InstrItinClass; +def IIC_MMX_MOVQ_RR : InstrItinClass; + +def IIC_MMX_ALU_RM : InstrItinClass; +def IIC_MMX_ALU_RR : InstrItinClass; +def IIC_MMX_ALUQ_RM : InstrItinClass; +def IIC_MMX_ALUQ_RR : InstrItinClass; +def IIC_MMX_PHADDSUBW_RM : InstrItinClass; +def IIC_MMX_PHADDSUBW_RR : InstrItinClass; +def IIC_MMX_PHADDSUBD_RM : InstrItinClass; +def IIC_MMX_PHADDSUBD_RR : InstrItinClass; +def IIC_MMX_PMUL : InstrItinClass; +def IIC_MMX_MISC_FUNC_MEM : InstrItinClass; +def IIC_MMX_MISC_FUNC_REG : InstrItinClass; +def IIC_MMX_PSADBW : InstrItinClass; +def IIC_MMX_SHIFT_RI : 
InstrItinClass; +def IIC_MMX_SHIFT_RM : InstrItinClass; +def IIC_MMX_SHIFT_RR : InstrItinClass; +def IIC_MMX_UNPCK_H_RM : InstrItinClass; +def IIC_MMX_UNPCK_H_RR : InstrItinClass; +def IIC_MMX_UNPCK_L : InstrItinClass; +def IIC_MMX_PCK_RM : InstrItinClass; +def IIC_MMX_PCK_RR : InstrItinClass; +def IIC_MMX_PSHUF : InstrItinClass; +def IIC_MMX_PEXTR : InstrItinClass; +def IIC_MMX_PINSRW : InstrItinClass; +def IIC_MMX_MASKMOV : InstrItinClass; + +def IIC_MMX_CVT_PD_RR : InstrItinClass; +def IIC_MMX_CVT_PD_RM : InstrItinClass; +def IIC_MMX_CVT_PS_RR : InstrItinClass; +def IIC_MMX_CVT_PS_RM : InstrItinClass; + +def IIC_CMPX_LOCK : InstrItinClass; +def IIC_CMPX_LOCK_8 : InstrItinClass; +def IIC_CMPX_LOCK_8B : InstrItinClass; +def IIC_CMPX_LOCK_16B : InstrItinClass; + +def IIC_XADD_LOCK_MEM : InstrItinClass; +def IIC_XADD_LOCK_MEM8 : InstrItinClass; + +def IIC_FILD : InstrItinClass; +def IIC_FLD : InstrItinClass; +def IIC_FLD80 : InstrItinClass; +def IIC_FST : InstrItinClass; +def IIC_FST80 : InstrItinClass; +def IIC_FIST : InstrItinClass; +def IIC_FLDZ : InstrItinClass; +def IIC_FUCOM : InstrItinClass; +def IIC_FUCOMI : InstrItinClass; +def IIC_FCOMI : InstrItinClass; +def IIC_FNSTSW : InstrItinClass; +def IIC_FNSTCW : InstrItinClass; +def IIC_FLDCW : InstrItinClass; +def IIC_FNINIT : InstrItinClass; +def IIC_FFREE : InstrItinClass; +def IIC_FNCLEX : InstrItinClass; +def IIC_WAIT : InstrItinClass; +def IIC_FXAM : InstrItinClass; +def IIC_FNOP : InstrItinClass; +def IIC_FLDL : InstrItinClass; +def IIC_F2XM1 : InstrItinClass; +def IIC_FYL2X : InstrItinClass; +def IIC_FPTAN : InstrItinClass; +def IIC_FPATAN : InstrItinClass; +def IIC_FXTRACT : InstrItinClass; +def IIC_FPREM1 : InstrItinClass; +def IIC_FPSTP : InstrItinClass; +def IIC_FPREM : InstrItinClass; +def IIC_FYL2XP1 : InstrItinClass; +def IIC_FSINCOS : InstrItinClass; +def IIC_FRNDINT : InstrItinClass; +def IIC_FSCALE : InstrItinClass; +def IIC_FCOMPP : InstrItinClass; +def IIC_FXSAVE : InstrItinClass; +def IIC_FXRSTOR : InstrItinClass; + +def IIC_FXCH : InstrItinClass; + +// System instructions +def IIC_CPUID : InstrItinClass; +def IIC_INT : InstrItinClass; +def IIC_INT3 : InstrItinClass; +def IIC_INVD : InstrItinClass; +def IIC_INVLPG : InstrItinClass; +def IIC_IRET : InstrItinClass; +def IIC_HLT : InstrItinClass; +def IIC_LXS : InstrItinClass; +def IIC_LTR : InstrItinClass; +def IIC_RDTSC : InstrItinClass; +def IIC_RSM : InstrItinClass; +def IIC_SIDT : InstrItinClass; +def IIC_SGDT : InstrItinClass; +def IIC_SLDT : InstrItinClass; +def IIC_STR : InstrItinClass; +def IIC_SWAPGS : InstrItinClass; +def IIC_SYSCALL : InstrItinClass; +def IIC_SYS_ENTER_EXIT : InstrItinClass; +def IIC_IN_RR : InstrItinClass; +def IIC_IN_RI : InstrItinClass; +def IIC_OUT_RR : InstrItinClass; +def IIC_OUT_IR : InstrItinClass; +def IIC_INS : InstrItinClass; +def IIC_MOV_REG_DR : InstrItinClass; +def IIC_MOV_DR_REG : InstrItinClass; +def IIC_MOV_REG_CR : InstrItinClass; +def IIC_MOV_CR_REG : InstrItinClass; +def IIC_MOV_REG_SR : InstrItinClass; +def IIC_MOV_MEM_SR : InstrItinClass; +def IIC_MOV_SR_REG : InstrItinClass; +def IIC_MOV_SR_MEM : InstrItinClass; +def IIC_LAR_RM : InstrItinClass; +def IIC_LAR_RR : InstrItinClass; +def IIC_LSL_RM : InstrItinClass; +def IIC_LSL_RR : InstrItinClass; +def IIC_LGDT : InstrItinClass; +def IIC_LIDT : InstrItinClass; +def IIC_LLDT_REG : InstrItinClass; +def IIC_LLDT_MEM : InstrItinClass; +def IIC_PUSH_CS : InstrItinClass; +def IIC_PUSH_SR : InstrItinClass; +def IIC_POP_SR : InstrItinClass; +def IIC_POP_SR_SS : InstrItinClass; 
+def IIC_VERR : InstrItinClass; +def IIC_VERW_REG : InstrItinClass; +def IIC_VERW_MEM : InstrItinClass; +def IIC_WRMSR : InstrItinClass; +def IIC_RDMSR : InstrItinClass; +def IIC_RDPMC : InstrItinClass; +def IIC_SMSW : InstrItinClass; +def IIC_LMSW_REG : InstrItinClass; +def IIC_LMSW_MEM : InstrItinClass; +def IIC_ENTER : InstrItinClass; +def IIC_LEAVE : InstrItinClass; +def IIC_POP_MEM : InstrItinClass; +def IIC_POP_REG16 : InstrItinClass; +def IIC_POP_REG : InstrItinClass; +def IIC_POP_F : InstrItinClass; +def IIC_POP_FD : InstrItinClass; +def IIC_POP_A : InstrItinClass; +def IIC_PUSH_IMM : InstrItinClass; +def IIC_PUSH_MEM : InstrItinClass; +def IIC_PUSH_REG : InstrItinClass; +def IIC_PUSH_F : InstrItinClass; +def IIC_PUSH_A : InstrItinClass; +def IIC_BSWAP : InstrItinClass; +def IIC_BIT_SCAN_MEM : InstrItinClass; +def IIC_BIT_SCAN_REG : InstrItinClass; +def IIC_MOVS : InstrItinClass; +def IIC_STOS : InstrItinClass; +def IIC_SCAS : InstrItinClass; +def IIC_CMPS : InstrItinClass; +def IIC_MOV : InstrItinClass; +def IIC_MOV_MEM : InstrItinClass; +def IIC_AHF : InstrItinClass; +def IIC_BT_MI : InstrItinClass; +def IIC_BT_MR : InstrItinClass; +def IIC_BT_RI : InstrItinClass; +def IIC_BT_RR : InstrItinClass; +def IIC_BTX_MI : InstrItinClass; +def IIC_BTX_MR : InstrItinClass; +def IIC_BTX_RI : InstrItinClass; +def IIC_BTX_RR : InstrItinClass; +def IIC_XCHG_REG : InstrItinClass; +def IIC_XCHG_MEM : InstrItinClass; +def IIC_XADD_REG : InstrItinClass; +def IIC_XADD_MEM : InstrItinClass; +def IIC_CMPXCHG_MEM : InstrItinClass; +def IIC_CMPXCHG_REG : InstrItinClass; +def IIC_CMPXCHG_MEM8 : InstrItinClass; +def IIC_CMPXCHG_REG8 : InstrItinClass; +def IIC_CMPXCHG_8B : InstrItinClass; +def IIC_CMPXCHG_16B : InstrItinClass; +def IIC_LODS : InstrItinClass; +def IIC_OUTS : InstrItinClass; +def IIC_CLC : InstrItinClass; +def IIC_CLD : InstrItinClass; +def IIC_CLI : InstrItinClass; +def IIC_CMC : InstrItinClass; +def IIC_CLTS : InstrItinClass; +def IIC_STC : InstrItinClass; +def IIC_STI : InstrItinClass; +def IIC_STD : InstrItinClass; +def IIC_XLAT : InstrItinClass; +def IIC_AAA : InstrItinClass; +def IIC_AAD : InstrItinClass; +def IIC_AAM : InstrItinClass; +def IIC_AAS : InstrItinClass; +def IIC_DAA : InstrItinClass; +def IIC_DAS : InstrItinClass; +def IIC_BOUND : InstrItinClass; +def IIC_ARPL_REG : InstrItinClass; +def IIC_ARPL_MEM : InstrItinClass; +def IIC_MOVBE : InstrItinClass; +def IIC_AES : InstrItinClass; +def IIC_BLEND_MEM : InstrItinClass; +def IIC_BLEND_NOMEM : InstrItinClass; +def IIC_CBW : InstrItinClass; +def IIC_CRC32_REG : InstrItinClass; +def IIC_CRC32_MEM : InstrItinClass; +def IIC_SSE_DPPD_RR : InstrItinClass; +def IIC_SSE_DPPD_RM : InstrItinClass; +def IIC_SSE_DPPS_RR : InstrItinClass; +def IIC_SSE_DPPS_RM : InstrItinClass; +def IIC_MMX_EMMS : InstrItinClass; +def IIC_SSE_EXTRACTPS_RR : InstrItinClass; +def IIC_SSE_EXTRACTPS_RM : InstrItinClass; +def IIC_SSE_INSERTPS_RR : InstrItinClass; +def IIC_SSE_INSERTPS_RM : InstrItinClass; +def IIC_SSE_MPSADBW_RR : InstrItinClass; +def IIC_SSE_MPSADBW_RM : InstrItinClass; +def IIC_SSE_PMULLD_RR : InstrItinClass; +def IIC_SSE_PMULLD_RM : InstrItinClass; +def IIC_SSE_ROUNDPS_REG : InstrItinClass; +def IIC_SSE_ROUNDPS_MEM : InstrItinClass; +def IIC_SSE_ROUNDPD_REG : InstrItinClass; +def IIC_SSE_ROUNDPD_MEM : InstrItinClass; +def IIC_SSE_POPCNT_RR : InstrItinClass; +def IIC_SSE_POPCNT_RM : InstrItinClass; +def IIC_SSE_PCLMULQDQ_RR : InstrItinClass; +def IIC_SSE_PCLMULQDQ_RM : InstrItinClass; + +def IIC_NOP : InstrItinClass; + 
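+// Illustrative note: an InstrItinClass is only a name. An in-order model binds
+// each class to functional-unit stages through a ProcessorItineraries, e.g.
+// the Atom model below contains entries such as
+//   InstrItinData<IIC_ALU_NONMEM, [InstrStage<1, [Port0, Port1]>] >,
+// while the out-of-order SchedMachineModels rely on the SchedWrite resources
+// defined above instead of itineraries.
+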
+//===----------------------------------------------------------------------===//
+// Processor instruction itineraries.
+
+// IssueWidth is analogous to the number of decode units. Core and its
+// descendants, including Nehalem and SandyBridge, have 4 decoders.
+// Resources beyond the decoder operate on micro-ops and are buffered
+// so adjacent micro-ops don't directly compete.
+//
+// MicroOpBufferSize > 1 indicates that RAW dependencies can be
+// decoded in the same cycle. The value 32 is a reasonably arbitrary
+// number of in-flight instructions.
+//
+// HighLatency=10 is optimistic. X86InstrInfo::isHighLatencyDef
+// indicates high latency opcodes. Alternatively, InstrItinData
+// entries may be included here to define specific operand
+// latencies. Since these latencies are not used for pipeline hazards,
+// they do not need to be exact.
+//
+// The GenericModel contains no instruction itineraries.
+def GenericModel : SchedMachineModel {
+  let IssueWidth = 4;
+  let MicroOpBufferSize = 32;
+  let LoadLatency = 4;
+  let HighLatency = 10;
+  let PostRAScheduler = 0;
+}
+
+include "X86ScheduleAtom.td"
+include "X86SchedSandyBridge.td"
+include "X86SchedHaswell.td"
+include "X86ScheduleSLM.td"
+include "X86ScheduleBtVer2.td"
+
diff --git a/contrib/llvm/lib/Target/X86/X86ScheduleAtom.td b/contrib/llvm/lib/Target/X86/X86ScheduleAtom.td
new file mode 100644
index 0000000..4c559c9
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86ScheduleAtom.td
@@ -0,0 +1,549 @@
+//===- X86ScheduleAtom.td - X86 Atom Scheduling Definitions -*- tablegen -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the itinerary class data for the Intel Atom
+// in-order (Saltwell-32nm/Bonnell-45nm) processors.
+//
+//===----------------------------------------------------------------------===//
+
+//
+// Scheduling information derived from the "Intel 64 and IA32 Architectures
+// Optimization Reference Manual", Chapter 13, Section 4.
+// Functional Units +// Port 0 +def Port0 : FuncUnit; // ALU: ALU0, shift/rotate, load/store + // SIMD/FP: SIMD ALU, Shuffle,SIMD/FP multiply, divide +def Port1 : FuncUnit; // ALU: ALU1, bit processing, jump, and LEA + // SIMD/FP: SIMD ALU, FP Adder + +def AtomItineraries : ProcessorItineraries< + [ Port0, Port1 ], + [], [ + // P0 only + // InstrItinData<class, [InstrStage<N, [P0]>] >, + // P0 or P1 + // InstrItinData<class, [InstrStage<N, [P0, P1]>] >, + // P0 and P1 + // InstrItinData<class, [InstrStage<N, [P0], 0>, InstrStage<N, [P1]>] >, + // + // Default is 1 cycle, port0 or port1 + InstrItinData<IIC_ALU_MEM, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_ALU_NONMEM, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_LEA, [InstrStage<1, [Port1]>] >, + InstrItinData<IIC_LEA_16, [InstrStage<2, [Port0, Port1]>] >, + // mul + InstrItinData<IIC_MUL8, [InstrStage<7, [Port0, Port1]>] >, + InstrItinData<IIC_MUL16_MEM, [InstrStage<8, [Port0, Port1]>] >, + InstrItinData<IIC_MUL16_REG, [InstrStage<7, [Port0, Port1]>] >, + InstrItinData<IIC_MUL32_MEM, [InstrStage<7, [Port0, Port1]>] >, + InstrItinData<IIC_MUL32_REG, [InstrStage<6, [Port0, Port1]>] >, + InstrItinData<IIC_MUL64, [InstrStage<12, [Port0, Port1]>] >, + // imul by al, ax, eax, rax + InstrItinData<IIC_IMUL8, [InstrStage<7, [Port0, Port1]>] >, + InstrItinData<IIC_IMUL16_MEM, [InstrStage<8, [Port0, Port1]>] >, + InstrItinData<IIC_IMUL16_REG, [InstrStage<7, [Port0, Port1]>] >, + InstrItinData<IIC_IMUL32_MEM, [InstrStage<7, [Port0, Port1]>] >, + InstrItinData<IIC_IMUL32_REG, [InstrStage<6, [Port0, Port1]>] >, + InstrItinData<IIC_IMUL64, [InstrStage<12, [Port0, Port1]>] >, + // imul reg by reg|mem + InstrItinData<IIC_IMUL16_RM, [InstrStage<7, [Port0, Port1]>] >, + InstrItinData<IIC_IMUL16_RR, [InstrStage<6, [Port0, Port1]>] >, + InstrItinData<IIC_IMUL32_RM, [InstrStage<5, [Port0]>] >, + InstrItinData<IIC_IMUL32_RR, [InstrStage<5, [Port0]>] >, + InstrItinData<IIC_IMUL64_RM, [InstrStage<12, [Port0, Port1]>] >, + InstrItinData<IIC_IMUL64_RR, [InstrStage<12, [Port0, Port1]>] >, + // imul reg = reg/mem * imm + InstrItinData<IIC_IMUL16_RRI, [InstrStage<6, [Port0, Port1]>] >, + InstrItinData<IIC_IMUL32_RRI, [InstrStage<5, [Port0]>] >, + InstrItinData<IIC_IMUL64_RRI, [InstrStage<14, [Port0, Port1]>] >, + InstrItinData<IIC_IMUL16_RMI, [InstrStage<7, [Port0, Port1]>] >, + InstrItinData<IIC_IMUL32_RMI, [InstrStage<5, [Port0]>] >, + InstrItinData<IIC_IMUL64_RMI, [InstrStage<14, [Port0, Port1]>] >, + // idiv + InstrItinData<IIC_IDIV8, [InstrStage<62, [Port0, Port1]>] >, + InstrItinData<IIC_IDIV16, [InstrStage<62, [Port0, Port1]>] >, + InstrItinData<IIC_IDIV32, [InstrStage<62, [Port0, Port1]>] >, + InstrItinData<IIC_IDIV64, [InstrStage<130, [Port0, Port1]>] >, + // div + InstrItinData<IIC_DIV8_REG, [InstrStage<50, [Port0, Port1]>] >, + InstrItinData<IIC_DIV8_MEM, [InstrStage<68, [Port0, Port1]>] >, + InstrItinData<IIC_DIV16, [InstrStage<50, [Port0, Port1]>] >, + InstrItinData<IIC_DIV32, [InstrStage<50, [Port0, Port1]>] >, + InstrItinData<IIC_DIV64, [InstrStage<130, [Port0, Port1]>] >, + // neg/not/inc/dec + InstrItinData<IIC_UNARY_REG, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_UNARY_MEM, [InstrStage<1, [Port0]>] >, + // add/sub/and/or/xor/cmp/test + InstrItinData<IIC_BIN_NONMEM, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_BIN_MEM, [InstrStage<1, [Port0]>] >, + // adc/sbc + InstrItinData<IIC_BIN_CARRY_NONMEM, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_BIN_CARRY_MEM, [InstrStage<1, [Port0]>] >, + // shift/rotate + 
InstrItinData<IIC_SR, [InstrStage<1, [Port0]>] >, + // shift double + InstrItinData<IIC_SHD16_REG_IM, [InstrStage<6, [Port0, Port1]>] >, + InstrItinData<IIC_SHD16_REG_CL, [InstrStage<6, [Port0, Port1]>] >, + InstrItinData<IIC_SHD16_MEM_IM, [InstrStage<6, [Port0, Port1]>] >, + InstrItinData<IIC_SHD16_MEM_CL, [InstrStage<6, [Port0, Port1]>] >, + InstrItinData<IIC_SHD32_REG_IM, [InstrStage<2, [Port0, Port1]>] >, + InstrItinData<IIC_SHD32_REG_CL, [InstrStage<2, [Port0, Port1]>] >, + InstrItinData<IIC_SHD32_MEM_IM, [InstrStage<4, [Port0, Port1]>] >, + InstrItinData<IIC_SHD32_MEM_CL, [InstrStage<4, [Port0, Port1]>] >, + InstrItinData<IIC_SHD64_REG_IM, [InstrStage<9, [Port0, Port1]>] >, + InstrItinData<IIC_SHD64_REG_CL, [InstrStage<8, [Port0, Port1]>] >, + InstrItinData<IIC_SHD64_MEM_IM, [InstrStage<9, [Port0, Port1]>] >, + InstrItinData<IIC_SHD64_MEM_CL, [InstrStage<9, [Port0, Port1]>] >, + // cmov + InstrItinData<IIC_CMOV16_RM, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_CMOV16_RR, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_CMOV32_RM, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_CMOV32_RR, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_CMOV64_RM, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_CMOV64_RR, [InstrStage<1, [Port0, Port1]>] >, + // set + InstrItinData<IIC_SET_M, [InstrStage<2, [Port0, Port1]>] >, + InstrItinData<IIC_SET_R, [InstrStage<1, [Port0, Port1]>] >, + // jcc + InstrItinData<IIC_Jcc, [InstrStage<1, [Port1]>] >, + // jcxz/jecxz/jrcxz + InstrItinData<IIC_JCXZ, [InstrStage<4, [Port0, Port1]>] >, + // jmp rel + InstrItinData<IIC_JMP_REL, [InstrStage<1, [Port1]>] >, + // jmp indirect + InstrItinData<IIC_JMP_REG, [InstrStage<1, [Port1]>] >, + InstrItinData<IIC_JMP_MEM, [InstrStage<2, [Port0, Port1]>] >, + // jmp far + InstrItinData<IIC_JMP_FAR_MEM, [InstrStage<32, [Port0, Port1]>] >, + InstrItinData<IIC_JMP_FAR_PTR, [InstrStage<31, [Port0, Port1]>] >, + // loop/loope/loopne + InstrItinData<IIC_LOOP, [InstrStage<18, [Port0, Port1]>] >, + InstrItinData<IIC_LOOPE, [InstrStage<8, [Port0, Port1]>] >, + InstrItinData<IIC_LOOPNE, [InstrStage<17, [Port0, Port1]>] >, + // call - all but reg/imm + InstrItinData<IIC_CALL_RI, [InstrStage<1, [Port0], 0>, + InstrStage<1, [Port1]>] >, + InstrItinData<IIC_CALL_MEM, [InstrStage<15, [Port0, Port1]>] >, + InstrItinData<IIC_CALL_FAR_MEM, [InstrStage<40, [Port0, Port1]>] >, + InstrItinData<IIC_CALL_FAR_PTR, [InstrStage<39, [Port0, Port1]>] >, + //ret + InstrItinData<IIC_RET, [InstrStage<79, [Port0, Port1]>] >, + InstrItinData<IIC_RET_IMM, [InstrStage<1, [Port0], 0>, InstrStage<1, [Port1]>] >, + //sign extension movs + InstrItinData<IIC_MOVSX,[InstrStage<1, [Port0] >] >, + InstrItinData<IIC_MOVSX_R16_R8, [InstrStage<2, [Port0, Port1]>] >, + InstrItinData<IIC_MOVSX_R16_M8, [InstrStage<3, [Port0, Port1]>] >, + InstrItinData<IIC_MOVSX_R16_R16, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_MOVSX_R32_R32, [InstrStage<1, [Port0, Port1]>] >, + //zero extension movs + InstrItinData<IIC_MOVZX,[InstrStage<1, [Port0]>] >, + InstrItinData<IIC_MOVZX_R16_R8, [InstrStage<2, [Port0, Port1]>] >, + InstrItinData<IIC_MOVZX_R16_M8, [InstrStage<3, [Port0, Port1]>] >, + + InstrItinData<IIC_REP_MOVS, [InstrStage<75, [Port0, Port1]>] >, + InstrItinData<IIC_REP_STOS, [InstrStage<74, [Port0, Port1]>] >, + + // SSE binary operations + // arithmetic fp scalar + InstrItinData<IIC_SSE_ALU_F32S_RR, [InstrStage<5, [Port1]>] >, + InstrItinData<IIC_SSE_ALU_F32S_RM, [InstrStage<5, [Port0], 0>, + InstrStage<5, [Port1]>] >, + 
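Each entry in this table (which continues below) binds an itinerary class to a cycle count and the ports that may execute it; entries with a zero-cycle first stage, like IIC_SSE_ALU_F32S_RM just above, reserve both ports at once. The following standalone sketch is a toy in-order port-assignment model, not the LLVM itinerary machinery: the greedy policy and all names other than the itinerary classes are invented for the example, and operand dependences, decode restrictions, and the 2-wide issue limit are ignored.

#include <algorithm>
#include <cstdio>
#include <string>
#include <vector>

enum class PortUse { P0, P1, Either, Both };

struct Entry {
  std::string Name;
  unsigned Cycles;
  PortUse Use;
};

int main() {
  // Hand-picked subset; cycle counts mirror the entries in the table above.
  std::vector<Entry> Program = {
    {"IIC_ALU_NONMEM",      1, PortUse::Either}, // 1 cycle, Port0 or Port1
    {"IIC_LEA",             1, PortUse::P1},     // 1 cycle, Port1 only
    {"IIC_ALU_MEM",         1, PortUse::P0},     // 1 cycle, Port0 only
    {"IIC_SSE_ALU_F32S_RM", 5, PortUse::Both},   // 5 cycles on both ports
    {"IIC_ALU_NONMEM",      1, PortUse::Either},
  };

  unsigned Free0 = 0, Free1 = 0; // cycle at which each port becomes free
  for (const Entry &E : Program) {
    unsigned Start = 0;
    switch (E.Use) {
    case PortUse::P0:     Start = Free0; Free0 = Start + E.Cycles; break;
    case PortUse::P1:     Start = Free1; Free1 = Start + E.Cycles; break;
    case PortUse::Either: // greedy: take whichever port frees up first
      if (Free0 <= Free1) { Start = Free0; Free0 = Start + E.Cycles; }
      else                { Start = Free1; Free1 = Start + E.Cycles; }
      break;
    case PortUse::Both:   // zero-cycle first stage: both ports occupied
      Start = std::max(Free0, Free1);
      Free0 = Free1 = Start + E.Cycles;
      break;
    }
    std::printf("%-20s issues at cycle %u\n", E.Name.c_str(), Start);
  }
  return 0;
}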
InstrItinData<IIC_SSE_ALU_F64S_RR, [InstrStage<5, [Port1]>] >, + InstrItinData<IIC_SSE_ALU_F64S_RM, [InstrStage<5, [Port0], 0>, + InstrStage<5, [Port1]>] >, + InstrItinData<IIC_SSE_MUL_F32S_RR, [InstrStage<4, [Port0]>] >, + InstrItinData<IIC_SSE_MUL_F32S_RM, [InstrStage<4, [Port0]>] >, + InstrItinData<IIC_SSE_MUL_F64S_RR, [InstrStage<5, [Port0]>] >, + InstrItinData<IIC_SSE_MUL_F64S_RM, [InstrStage<5, [Port0]>] >, + InstrItinData<IIC_SSE_DIV_F32S_RR, [InstrStage<34, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_DIV_F32S_RM, [InstrStage<34, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_DIV_F64S_RR, [InstrStage<62, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_DIV_F64S_RM, [InstrStage<62, [Port0, Port1]>] >, + + InstrItinData<IIC_SSE_COMIS_RR, [InstrStage<9, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_COMIS_RM, [InstrStage<10, [Port0, Port1]>] >, + + InstrItinData<IIC_SSE_HADDSUB_RR, [InstrStage<8, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_HADDSUB_RM, [InstrStage<9, [Port0, Port1]>] >, + + // arithmetic fp parallel + InstrItinData<IIC_SSE_ALU_F32P_RR, [InstrStage<5, [Port1]>] >, + InstrItinData<IIC_SSE_ALU_F32P_RM, [InstrStage<5, [Port0], 0>, + InstrStage<5, [Port1]>] >, + InstrItinData<IIC_SSE_ALU_F64P_RR, [InstrStage<6, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_ALU_F64P_RM, [InstrStage<7, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_MUL_F32P_RR, [InstrStage<5, [Port0]>] >, + InstrItinData<IIC_SSE_MUL_F32P_RM, [InstrStage<5, [Port0]>] >, + InstrItinData<IIC_SSE_MUL_F64P_RR, [InstrStage<9, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_MUL_F64P_RM, [InstrStage<10, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_DIV_F32P_RR, [InstrStage<70, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_DIV_F32P_RM, [InstrStage<70, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_DIV_F64P_RR, [InstrStage<125, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_DIV_F64P_RM, [InstrStage<125, [Port0, Port1]>] >, + + // bitwise parallel + InstrItinData<IIC_SSE_BIT_P_RR, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_BIT_P_RM, [InstrStage<1, [Port0]>] >, + + // arithmetic int parallel + InstrItinData<IIC_SSE_INTALU_P_RR, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_INTALU_P_RM, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_SSE_INTALUQ_P_RR, [InstrStage<2, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_INTALUQ_P_RM, [InstrStage<3, [Port0, Port1]>] >, + + // multiply int parallel + InstrItinData<IIC_SSE_INTMUL_P_RR, [InstrStage<5, [Port0]>] >, + InstrItinData<IIC_SSE_INTMUL_P_RM, [InstrStage<5, [Port0]>] >, + + // shift parallel + InstrItinData<IIC_SSE_INTSH_P_RR, [InstrStage<2, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_INTSH_P_RM, [InstrStage<3, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_INTSH_P_RI, [InstrStage<1, [Port0, Port1]>] >, + + InstrItinData<IIC_SSE_INTSHDQ_P_RI, [InstrStage<1, [Port0, Port1]>] >, + + InstrItinData<IIC_SSE_SHUFP, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_SSE_PSHUF_RI, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_SSE_PSHUF_MI, [InstrStage<1, [Port0]>] >, + + InstrItinData<IIC_SSE_UNPCK, [InstrStage<1, [Port0]>] >, + + InstrItinData<IIC_SSE_SQRTPS_RR, [InstrStage<70, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_SQRTPS_RM, [InstrStage<70, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_SQRTSS_RR, [InstrStage<34, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_SQRTSS_RM, [InstrStage<34, [Port0, Port1]>] >, + + InstrItinData<IIC_SSE_SQRTPD_RR, [InstrStage<125, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_SQRTPD_RM, [InstrStage<125, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_SQRTSD_RR, 
[InstrStage<62, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_SQRTSD_RM, [InstrStage<62, [Port0, Port1]>] >, + + InstrItinData<IIC_SSE_RSQRTPS_RR, [InstrStage<9, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_RSQRTPS_RM, [InstrStage<10, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_RSQRTSS_RR, [InstrStage<4, [Port0]>] >, + InstrItinData<IIC_SSE_RSQRTSS_RM, [InstrStage<4, [Port0]>] >, + + InstrItinData<IIC_SSE_RCPP_RR, [InstrStage<9, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_RCPP_RM, [InstrStage<10, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_RCPS_RR, [InstrStage<4, [Port0]>] >, + InstrItinData<IIC_SSE_RCPS_RM, [InstrStage<4, [Port0]>] >, + + InstrItinData<IIC_SSE_MOVMSK, [InstrStage<3, [Port0]>] >, + InstrItinData<IIC_SSE_MASKMOV, [InstrStage<2, [Port0, Port1]>] >, + + InstrItinData<IIC_SSE_PEXTRW, [InstrStage<4, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_PINSRW, [InstrStage<1, [Port0]>] >, + + InstrItinData<IIC_SSE_PABS_RR, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_PABS_RM, [InstrStage<1, [Port0]>] >, + + InstrItinData<IIC_SSE_MOV_S_RR, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_MOV_S_RM, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_SSE_MOV_S_MR, [InstrStage<1, [Port0]>] >, + + InstrItinData<IIC_SSE_MOVA_P_RR, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_MOVA_P_RM, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_SSE_MOVA_P_MR, [InstrStage<1, [Port0]>] >, + + InstrItinData<IIC_SSE_MOVU_P_RR, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_MOVU_P_RM, [InstrStage<3, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_MOVU_P_MR, [InstrStage<2, [Port0, Port1]>] >, + + InstrItinData<IIC_SSE_MOV_LH, [InstrStage<1, [Port0]>] >, + + InstrItinData<IIC_SSE_LDDQU, [InstrStage<3, [Port0, Port1]>] >, + + InstrItinData<IIC_SSE_MOVDQ, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_SSE_MOVD_ToGP, [InstrStage<3, [Port0]>] >, + InstrItinData<IIC_SSE_MOVQ_RR, [InstrStage<1, [Port0, Port1]>] >, + + InstrItinData<IIC_SSE_MOVNT, [InstrStage<1, [Port0]>] >, + + InstrItinData<IIC_SSE_PREFETCH, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_SSE_PAUSE, [InstrStage<17, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_LFENCE, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_MFENCE, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_SSE_SFENCE, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_SSE_LDMXCSR, [InstrStage<5, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_STMXCSR, [InstrStage<15, [Port0, Port1]>] >, + + InstrItinData<IIC_SSE_PHADDSUBD_RR, [InstrStage<3, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_PHADDSUBD_RM, [InstrStage<4, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_PHADDSUBSW_RR, [InstrStage<7, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_PHADDSUBSW_RM, [InstrStage<8, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_PHADDSUBW_RR, [InstrStage<7, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_PHADDSUBW_RM, [InstrStage<8, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_PSHUFB_RR, [InstrStage<4, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_PSHUFB_RM, [InstrStage<5, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_PSIGN_RR, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_PSIGN_RM, [InstrStage<1, [Port0]>] >, + + InstrItinData<IIC_SSE_PMADD, [InstrStage<5, [Port0]>] >, + InstrItinData<IIC_SSE_PMULHRSW, [InstrStage<5, [Port0]>] >, + InstrItinData<IIC_SSE_PALIGNRR, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_SSE_PALIGNRM, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_SSE_MWAIT, [InstrStage<46, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_MONITOR, [InstrStage<45, 
[Port0, Port1]>] >, + + // conversions + // to/from PD ... + InstrItinData<IIC_SSE_CVT_PD_RR, [InstrStage<7, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_CVT_PD_RM, [InstrStage<8, [Port0, Port1]>] >, + // to/from PS except to/from PD and PS2PI + InstrItinData<IIC_SSE_CVT_PS_RR, [InstrStage<6, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_CVT_PS_RM, [InstrStage<7, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_CVT_Scalar_RR, [InstrStage<6, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_CVT_Scalar_RM, [InstrStage<7, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_CVT_SS2SI32_RR, [InstrStage<8, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_CVT_SS2SI32_RM, [InstrStage<9, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_CVT_SS2SI64_RR, [InstrStage<9, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_CVT_SS2SI64_RM, [InstrStage<10, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_CVT_SD2SI_RR, [InstrStage<8, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_CVT_SD2SI_RM, [InstrStage<9, [Port0, Port1]>] >, + + // MMX MOVs + InstrItinData<IIC_MMX_MOV_MM_RM, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_MMX_MOV_REG_MM, [InstrStage<3, [Port0]>] >, + InstrItinData<IIC_MMX_MOVQ_RM, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_MMX_MOVQ_RR, [InstrStage<1, [Port0, Port1]>] >, + // other MMX + InstrItinData<IIC_MMX_ALU_RM, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_MMX_ALU_RR, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_MMX_ALUQ_RM, [InstrStage<3, [Port0, Port1]>] >, + InstrItinData<IIC_MMX_ALUQ_RR, [InstrStage<2, [Port0, Port1]>] >, + InstrItinData<IIC_MMX_PHADDSUBW_RM, [InstrStage<6, [Port0, Port1]>] >, + InstrItinData<IIC_MMX_PHADDSUBW_RR, [InstrStage<5, [Port0, Port1]>] >, + InstrItinData<IIC_MMX_PHADDSUBD_RM, [InstrStage<4, [Port0, Port1]>] >, + InstrItinData<IIC_MMX_PHADDSUBD_RR, [InstrStage<3, [Port0, Port1]>] >, + InstrItinData<IIC_MMX_PMUL, [InstrStage<4, [Port0]>] >, + InstrItinData<IIC_MMX_MISC_FUNC_MEM, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_MMX_MISC_FUNC_REG, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_MMX_PSADBW, [InstrStage<4, [Port0, Port1]>] >, + InstrItinData<IIC_MMX_SHIFT_RI, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_MMX_SHIFT_RM, [InstrStage<3, [Port0, Port1]>] >, + InstrItinData<IIC_MMX_SHIFT_RR, [InstrStage<2, [Port0, Port1]>] >, + InstrItinData<IIC_MMX_UNPCK_H_RM, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_MMX_UNPCK_H_RR, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_MMX_UNPCK_L, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_MMX_PCK_RM, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_MMX_PCK_RR, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_MMX_PSHUF, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_MMX_PEXTR, [InstrStage<4, [Port0, Port1]>] >, + InstrItinData<IIC_MMX_PINSRW, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_MMX_MASKMOV, [InstrStage<1, [Port0]>] >, + // conversions + // from/to PD + InstrItinData<IIC_MMX_CVT_PD_RR, [InstrStage<7, [Port0, Port1]>] >, + InstrItinData<IIC_MMX_CVT_PD_RM, [InstrStage<8, [Port0, Port1]>] >, + // from/to PI + InstrItinData<IIC_MMX_CVT_PS_RR, [InstrStage<5, [Port1]>] >, + InstrItinData<IIC_MMX_CVT_PS_RM, [InstrStage<5, [Port0], 0>, + InstrStage<5, [Port1]>]>, + + InstrItinData<IIC_CMPX_LOCK, [InstrStage<14, [Port0, Port1]>] >, + InstrItinData<IIC_CMPX_LOCK_8, [InstrStage<6, [Port0, Port1]>] >, + InstrItinData<IIC_CMPX_LOCK_8B, [InstrStage<18, [Port0, Port1]>] >, + InstrItinData<IIC_CMPX_LOCK_16B, [InstrStage<22, [Port0, Port1]>] >, + + InstrItinData<IIC_XADD_LOCK_MEM, [InstrStage<2, [Port0, Port1]>] >, 
+  InstrItinData<IIC_XADD_LOCK_MEM8, [InstrStage<3, [Port0, Port1]>] >,
+
+  InstrItinData<IIC_FILD, [InstrStage<5, [Port0], 0>, InstrStage<5, [Port1]>] >,
+  InstrItinData<IIC_FLD, [InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_FLD80, [InstrStage<4, [Port0, Port1]>] >,
+
+  InstrItinData<IIC_FST, [InstrStage<2, [Port0, Port1]>] >,
+  InstrItinData<IIC_FST80, [InstrStage<5, [Port0, Port1]>] >,
+  InstrItinData<IIC_FIST, [InstrStage<6, [Port0, Port1]>] >,
+
+  InstrItinData<IIC_FLDZ, [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_FUCOM, [InstrStage<1, [Port1]>] >,
+  InstrItinData<IIC_FUCOMI, [InstrStage<9, [Port0, Port1]>] >,
+  InstrItinData<IIC_FCOMI, [InstrStage<9, [Port0, Port1]>] >,
+  InstrItinData<IIC_FNSTSW, [InstrStage<10, [Port0, Port1]>] >,
+  InstrItinData<IIC_FNSTCW, [InstrStage<8, [Port0, Port1]>] >,
+  InstrItinData<IIC_FLDCW, [InstrStage<5, [Port0, Port1]>] >,
+  InstrItinData<IIC_FNINIT, [InstrStage<63, [Port0, Port1]>] >,
+  InstrItinData<IIC_FFREE, [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_FNCLEX, [InstrStage<25, [Port0, Port1]>] >,
+  InstrItinData<IIC_WAIT, [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_FXAM, [InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_FNOP, [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_FLDL, [InstrStage<10, [Port0, Port1]>] >,
+  InstrItinData<IIC_F2XM1, [InstrStage<99, [Port0, Port1]>] >,
+  InstrItinData<IIC_FYL2X, [InstrStage<146, [Port0, Port1]>] >,
+  InstrItinData<IIC_FPTAN, [InstrStage<168, [Port0, Port1]>] >,
+  InstrItinData<IIC_FPATAN, [InstrStage<183, [Port0, Port1]>] >,
+  InstrItinData<IIC_FXTRACT, [InstrStage<25, [Port0, Port1]>] >,
+  InstrItinData<IIC_FPREM1, [InstrStage<71, [Port0, Port1]>] >,
+  InstrItinData<IIC_FPSTP, [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_FPREM, [InstrStage<55, [Port0, Port1]>] >,
+  InstrItinData<IIC_FYL2XP1, [InstrStage<147, [Port0, Port1]>] >,
+  InstrItinData<IIC_FSINCOS, [InstrStage<174, [Port0, Port1]>] >,
+  InstrItinData<IIC_FRNDINT, [InstrStage<46, [Port0, Port1]>] >,
+  InstrItinData<IIC_FSCALE, [InstrStage<77, [Port0, Port1]>] >,
+  InstrItinData<IIC_FCOMPP, [InstrStage<1, [Port1]>] >,
+  InstrItinData<IIC_FXSAVE, [InstrStage<140, [Port0, Port1]>] >,
+  InstrItinData<IIC_FXRSTOR, [InstrStage<141, [Port0, Port1]>] >,
+  InstrItinData<IIC_FXCH, [InstrStage<1, [Port0], 0>, InstrStage<1, [Port1]>] >,
+
+  // System instructions
+  InstrItinData<IIC_CPUID, [InstrStage<121, [Port0, Port1]>] >,
+  InstrItinData<IIC_INT, [InstrStage<127, [Port0, Port1]>] >,
+  InstrItinData<IIC_INT3, [InstrStage<130, [Port0, Port1]>] >,
+  InstrItinData<IIC_INVD, [InstrStage<1003, [Port0, Port1]>] >,
+  InstrItinData<IIC_INVLPG, [InstrStage<71, [Port0, Port1]>] >,
+  InstrItinData<IIC_IRET, [InstrStage<109, [Port0, Port1]>] >,
+  InstrItinData<IIC_HLT, [InstrStage<121, [Port0, Port1]>] >,
+  InstrItinData<IIC_LXS, [InstrStage<10, [Port0, Port1]>] >,
+  InstrItinData<IIC_LTR, [InstrStage<83, [Port0, Port1]>] >,
+  InstrItinData<IIC_RDTSC, [InstrStage<30, [Port0, Port1]>] >,
+  InstrItinData<IIC_RSM, [InstrStage<741, [Port0, Port1]>] >,
+  InstrItinData<IIC_SIDT, [InstrStage<4, [Port0, Port1]>] >,
+  InstrItinData<IIC_SGDT, [InstrStage<4, [Port0, Port1]>] >,
+  InstrItinData<IIC_SLDT, [InstrStage<3, [Port0, Port1]>] >,
+  InstrItinData<IIC_STR, [InstrStage<3, [Port0, Port1]>] >,
+  InstrItinData<IIC_SWAPGS, [InstrStage<22, [Port0, Port1]>] >,
+  InstrItinData<IIC_SYSCALL, [InstrStage<96, [Port0, Port1]>] >,
+  InstrItinData<IIC_SYS_ENTER_EXIT, [InstrStage<88, [Port0, Port1]>] >,
+
+  InstrItinData<IIC_IN_RR,
[InstrStage<94, [Port0, Port1]>] >, + InstrItinData<IIC_IN_RI, [InstrStage<92, [Port0, Port1]>] >, + InstrItinData<IIC_OUT_RR, [InstrStage<68, [Port0, Port1]>] >, + InstrItinData<IIC_OUT_IR, [InstrStage<72, [Port0, Port1]>] >, + InstrItinData<IIC_INS, [InstrStage<59, [Port0, Port1]>] >, + + InstrItinData<IIC_MOV_REG_DR, [InstrStage<88, [Port0, Port1]>] >, + InstrItinData<IIC_MOV_DR_REG, [InstrStage<123, [Port0, Port1]>] >, + // worst case for mov REG_CRx + InstrItinData<IIC_MOV_REG_CR, [InstrStage<12, [Port0, Port1]>] >, + InstrItinData<IIC_MOV_CR_REG, [InstrStage<136, [Port0, Port1]>] >, + + InstrItinData<IIC_MOV_REG_SR, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_MOV_MEM_SR, [InstrStage<2, [Port0, Port1]>] >, + InstrItinData<IIC_MOV_SR_REG, [InstrStage<21, [Port0, Port1]>] >, + InstrItinData<IIC_MOV_SR_MEM, [InstrStage<26, [Port0, Port1]>] >, + // LAR + InstrItinData<IIC_LAR_RM, [InstrStage<50, [Port0, Port1]>] >, + InstrItinData<IIC_LAR_RR, [InstrStage<54, [Port0, Port1]>] >, + // LSL + InstrItinData<IIC_LSL_RM, [InstrStage<46, [Port0, Port1]>] >, + InstrItinData<IIC_LSL_RR, [InstrStage<49, [Port0, Port1]>] >, + + InstrItinData<IIC_LGDT, [InstrStage<44, [Port0, Port1]>] >, + InstrItinData<IIC_LIDT, [InstrStage<44, [Port0, Port1]>] >, + InstrItinData<IIC_LLDT_REG, [InstrStage<60, [Port0, Port1]>] >, + InstrItinData<IIC_LLDT_MEM, [InstrStage<64, [Port0, Port1]>] >, + // push control register, segment registers + InstrItinData<IIC_PUSH_CS, [InstrStage<2, [Port0, Port1]>] >, + InstrItinData<IIC_PUSH_SR, [InstrStage<2, [Port0, Port1]>] >, + // pop control register, segment registers + InstrItinData<IIC_POP_SR, [InstrStage<29, [Port0, Port1]>] >, + InstrItinData<IIC_POP_SR_SS, [InstrStage<48, [Port0, Port1]>] >, + // VERR, VERW + InstrItinData<IIC_VERR, [InstrStage<41, [Port0, Port1]>] >, + InstrItinData<IIC_VERW_REG, [InstrStage<51, [Port0, Port1]>] >, + InstrItinData<IIC_VERW_MEM, [InstrStage<50, [Port0, Port1]>] >, + // WRMSR, RDMSR + InstrItinData<IIC_WRMSR, [InstrStage<202, [Port0, Port1]>] >, + InstrItinData<IIC_RDMSR, [InstrStage<78, [Port0, Port1]>] >, + InstrItinData<IIC_RDPMC, [InstrStage<46, [Port0, Port1]>] >, + // SMSW, LMSW + InstrItinData<IIC_SMSW, [InstrStage<9, [Port0, Port1]>] >, + InstrItinData<IIC_LMSW_REG, [InstrStage<69, [Port0, Port1]>] >, + InstrItinData<IIC_LMSW_MEM, [InstrStage<67, [Port0, Port1]>] >, + + InstrItinData<IIC_ENTER, [InstrStage<32, [Port0, Port1]>] >, + InstrItinData<IIC_LEAVE, [InstrStage<2, [Port0, Port1]>] >, + + InstrItinData<IIC_POP_MEM, [InstrStage<3, [Port0, Port1]>] >, + InstrItinData<IIC_POP_REG16, [InstrStage<2, [Port0, Port1]>] >, + InstrItinData<IIC_POP_REG, [InstrStage<1, [Port0], 0>, + InstrStage<1, [Port1]>] >, + InstrItinData<IIC_POP_F, [InstrStage<32, [Port0, Port1]>] >, + InstrItinData<IIC_POP_FD, [InstrStage<26, [Port0, Port1]>] >, + InstrItinData<IIC_POP_A, [InstrStage<9, [Port0, Port1]>] >, + + InstrItinData<IIC_PUSH_IMM, [InstrStage<1, [Port0], 0>, + InstrStage<1, [Port1]>] >, + InstrItinData<IIC_PUSH_MEM, [InstrStage<2, [Port0, Port1]>] >, + InstrItinData<IIC_PUSH_REG, [InstrStage<1, [Port0], 0>, + InstrStage<1, [Port1]>] >, + InstrItinData<IIC_PUSH_F, [InstrStage<9, [Port0, Port1]>] >, + InstrItinData<IIC_PUSH_A, [InstrStage<8, [Port0, Port1]>] >, + + InstrItinData<IIC_BSWAP, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_BIT_SCAN_MEM, [InstrStage<16, [Port0, Port1]>] >, + InstrItinData<IIC_BIT_SCAN_REG, [InstrStage<16, [Port0, Port1]>] >, + InstrItinData<IIC_MOVS, [InstrStage<3, [Port0, Port1]>] >, + InstrItinData<IIC_STOS, 
[InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_SCAS, [InstrStage<2, [Port0, Port1]>] >, + InstrItinData<IIC_CMPS, [InstrStage<3, [Port0, Port1]>] >, + InstrItinData<IIC_MOV, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_MOV_MEM, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_AHF, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_BT_MI, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_BT_MR, [InstrStage<9, [Port0, Port1]>] >, + InstrItinData<IIC_BT_RI, [InstrStage<1, [Port1]>] >, + InstrItinData<IIC_BT_RR, [InstrStage<1, [Port1]>] >, + InstrItinData<IIC_BTX_MI, [InstrStage<2, [Port0, Port1]>] >, + InstrItinData<IIC_BTX_MR, [InstrStage<11, [Port0, Port1]>] >, + InstrItinData<IIC_BTX_RI, [InstrStage<1, [Port1]>] >, + InstrItinData<IIC_BTX_RR, [InstrStage<1, [Port1]>] >, + InstrItinData<IIC_XCHG_REG, [InstrStage<2, [Port0, Port1]>] >, + InstrItinData<IIC_XCHG_MEM, [InstrStage<3, [Port0, Port1]>] >, + InstrItinData<IIC_XADD_REG, [InstrStage<2, [Port0, Port1]>] >, + InstrItinData<IIC_XADD_MEM, [InstrStage<3, [Port0, Port1]>] >, + InstrItinData<IIC_CMPXCHG_MEM, [InstrStage<14, [Port0, Port1]>] >, + InstrItinData<IIC_CMPXCHG_REG, [InstrStage<15, [Port0, Port1]>] >, + InstrItinData<IIC_CMPXCHG_MEM8, [InstrStage<6, [Port0, Port1]>] >, + InstrItinData<IIC_CMPXCHG_REG8, [InstrStage<9, [Port0, Port1]>] >, + InstrItinData<IIC_CMPXCHG_8B, [InstrStage<18, [Port0, Port1]>] >, + InstrItinData<IIC_CMPXCHG_16B, [InstrStage<22, [Port0, Port1]>] >, + InstrItinData<IIC_LODS, [InstrStage<2, [Port0, Port1]>] >, + InstrItinData<IIC_OUTS, [InstrStage<74, [Port0, Port1]>] >, + InstrItinData<IIC_CLC, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_CLD, [InstrStage<3, [Port0, Port1]>] >, + InstrItinData<IIC_CLI, [InstrStage<14, [Port0, Port1]>] >, + InstrItinData<IIC_CMC, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_CLTS, [InstrStage<33, [Port0, Port1]>] >, + InstrItinData<IIC_STC, [InstrStage<1, [Port0, Port1]>] >, + InstrItinData<IIC_STI, [InstrStage<17, [Port0, Port1]>] >, + InstrItinData<IIC_STD, [InstrStage<21, [Port0, Port1]>] >, + InstrItinData<IIC_XLAT, [InstrStage<6, [Port0, Port1]>] >, + InstrItinData<IIC_AAA, [InstrStage<13, [Port0, Port1]>] >, + InstrItinData<IIC_AAD, [InstrStage<7, [Port0, Port1]>] >, + InstrItinData<IIC_AAM, [InstrStage<21, [Port0, Port1]>] >, + InstrItinData<IIC_AAS, [InstrStage<13, [Port0, Port1]>] >, + InstrItinData<IIC_DAA, [InstrStage<18, [Port0, Port1]>] >, + InstrItinData<IIC_DAS, [InstrStage<20, [Port0, Port1]>] >, + InstrItinData<IIC_BOUND, [InstrStage<11, [Port0, Port1]>] >, + InstrItinData<IIC_ARPL_REG, [InstrStage<24, [Port0, Port1]>] >, + InstrItinData<IIC_ARPL_MEM, [InstrStage<23, [Port0, Port1]>] >, + InstrItinData<IIC_MOVBE, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_CBW, [InstrStage<4, [Port0, Port1]>] >, + InstrItinData<IIC_MMX_EMMS, [InstrStage<5, [Port0, Port1]>] >, + + InstrItinData<IIC_NOP, [InstrStage<1, [Port0, Port1]>] > + ]>; + +// Atom machine model. +def AtomModel : SchedMachineModel { + let IssueWidth = 2; // Allows 2 instructions per scheduling group. + let MicroOpBufferSize = 0; // In-order execution, always hide latency. + let LoadLatency = 3; // Expected cycles, may be overriden by OperandCycles. + let HighLatency = 30;// Expected, may be overriden by OperandCycles. + + // On the Atom, the throughput for taken branches is 2 cycles. For small + // simple loops, expand by a small factor to hide the backedge cost. 
+ let LoopMicroOpBufferSize = 10; + let PostRAScheduler = 1; + + let Itineraries = AtomItineraries; +} diff --git a/contrib/llvm/lib/Target/X86/X86ScheduleBtVer2.td b/contrib/llvm/lib/Target/X86/X86ScheduleBtVer2.td new file mode 100644 index 0000000..ce1ece3 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86ScheduleBtVer2.td @@ -0,0 +1,341 @@ +//=- X86ScheduleBtVer2.td - X86 BtVer2 (Jaguar) Scheduling ---*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the machine model for AMD btver2 (Jaguar) to support +// instruction scheduling and other instruction cost heuristics. Based off AMD Software +// Optimization Guide for AMD Family 16h Processors & Instruction Latency appendix. +// +//===----------------------------------------------------------------------===// + +def BtVer2Model : SchedMachineModel { + // All x86 instructions are modeled as a single micro-op, and btver2 can + // decode 2 instructions per cycle. + let IssueWidth = 2; + let MicroOpBufferSize = 64; // Retire Control Unit + let LoadLatency = 5; // FPU latency (worse case cf Integer 3 cycle latency) + let HighLatency = 25; + let MispredictPenalty = 14; // Minimum branch misdirection penalty + let PostRAScheduler = 1; + + // FIXME: SSE4/AVX is unimplemented. This flag is set to allow + // the scheduler to assign a default model to unrecognized opcodes. + let CompleteModel = 0; +} + +let SchedModel = BtVer2Model in { + +// Jaguar can issue up to 6 micro-ops in one cycle +def JALU0 : ProcResource<1>; // Integer Pipe0: integer ALU0 (also handle FP->INT jam) +def JALU1 : ProcResource<1>; // Integer Pipe1: integer ALU1/MUL/DIV +def JLAGU : ProcResource<1>; // Integer Pipe2: LAGU +def JSAGU : ProcResource<1>; // Integer Pipe3: SAGU (also handles 3-operand LEA) +def JFPU0 : ProcResource<1>; // Vector/FPU Pipe0: VALU0/VIMUL/FPA +def JFPU1 : ProcResource<1>; // Vector/FPU Pipe1: VALU1/STC/FPM + +// Any pipe - FIXME we need this until we can discriminate between int/fpu load/store/moves properly +def JAny : ProcResGroup<[JALU0, JALU1, JLAGU, JSAGU, JFPU0, JFPU1]>; + +// Integer Pipe Scheduler +def JALU01 : ProcResGroup<[JALU0, JALU1]> { + let BufferSize=20; +} + +// AGU Pipe Scheduler +def JLSAGU : ProcResGroup<[JLAGU, JSAGU]> { + let BufferSize=12; +} + +// Fpu Pipe Scheduler +def JFPU01 : ProcResGroup<[JFPU0, JFPU1]> { + let BufferSize=18; +} + +def JDiv : ProcResource<1>; // integer division +def JMul : ProcResource<1>; // integer multiplication +def JVALU0 : ProcResource<1>; // vector integer +def JVALU1 : ProcResource<1>; // vector integer +def JVIMUL : ProcResource<1>; // vector integer multiplication +def JSTC : ProcResource<1>; // vector store/convert +def JFPM : ProcResource<1>; // FP multiplication +def JFPA : ProcResource<1>; // FP addition + +// Integer loads are 3 cycles, so ReadAfterLd registers needn't be available until 3 +// cycles after the memory operand. +def : ReadAdvance<ReadAfterLd, 3>; + +// Many SchedWrites are defined in pairs with and without a folded load. +// Instructions with folded loads are usually micro-fused, so they only appear +// as two micro-ops when dispatched by the schedulers. +// This multiclass defines the resource usage for variants with and without +// folded loads. 
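The multiclass defined just below pairs each register-form SchedWrite with its load-folded variant, adding the load latency on top of the execution latency (3 cycles on the integer side, 5 on the FP side for this model). A standalone sketch of that pairing (hypothetical names; not TableGen or the LLVM API), using the WriteALU and WriteFAdd latencies that appear later in this file:

// Register vs. folded-load latency pairing, as encoded by the multiclasses
// below for this model. Not the LLVM API; names are hypothetical.
#include <iostream>

struct WritePair {
  unsigned RegLatency; // latency of the register-only form
  unsigned LoadAdjust; // extra cycles when a load is folded into the op
};

unsigned latency(const WritePair &W, bool Folded) {
  return Folded ? W.RegLatency + W.LoadAdjust : W.RegLatency;
}

int main() {
  WritePair IntALU{1, 3}; // e.g. JWriteResIntPair<WriteALU, JALU01, 1>
  WritePair FpAdd {3, 5}; // e.g. JWriteResFpuPair<WriteFAdd, JFPU0, 3>

  std::cout << "integer ALU, reg form   : " << latency(IntALU, false) << " cycles\n"; // 1
  std::cout << "integer ALU, folded load: " << latency(IntALU, true)  << " cycles\n"; // 4
  std::cout << "FP add, reg form        : " << latency(FpAdd, false)  << " cycles\n"; // 3
  std::cout << "FP add, folded load     : " << latency(FpAdd, true)   << " cycles\n"; // 8
  return 0;
}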
+multiclass JWriteResIntPair<X86FoldableSchedWrite SchedRW, + ProcResourceKind ExePort, + int Lat> { + // Register variant is using a single cycle on ExePort. + def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; } + + // Memory variant also uses a cycle on JLAGU and adds 3 cycles to the + // latency. + def : WriteRes<SchedRW.Folded, [JLAGU, ExePort]> { + let Latency = !add(Lat, 3); + } +} + +multiclass JWriteResFpuPair<X86FoldableSchedWrite SchedRW, + ProcResourceKind ExePort, + int Lat> { + // Register variant is using a single cycle on ExePort. + def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; } + + // Memory variant also uses a cycle on JLAGU and adds 5 cycles to the + // latency. + def : WriteRes<SchedRW.Folded, [JLAGU, ExePort]> { + let Latency = !add(Lat, 5); + } +} + +// A folded store needs a cycle on the SAGU for the store data. +def : WriteRes<WriteRMW, [JSAGU]>; + +//////////////////////////////////////////////////////////////////////////////// +// Arithmetic. +//////////////////////////////////////////////////////////////////////////////// + +defm : JWriteResIntPair<WriteALU, JALU01, 1>; +defm : JWriteResIntPair<WriteIMul, JALU1, 3>; + +def : WriteRes<WriteIMulH, [JALU1]> { + let Latency = 6; + let ResourceCycles = [4]; +} + +// FIXME 8/16 bit divisions +def : WriteRes<WriteIDiv, [JALU1, JDiv]> { + let Latency = 25; + let ResourceCycles = [1, 25]; +} +def : WriteRes<WriteIDivLd, [JALU1, JLAGU, JDiv]> { + let Latency = 41; + let ResourceCycles = [1, 1, 25]; +} + +// This is for simple LEAs with one or two input operands. +// FIXME: SAGU 3-operand LEA +def : WriteRes<WriteLEA, [JALU01]>; + +//////////////////////////////////////////////////////////////////////////////// +// Integer shifts and rotates. +//////////////////////////////////////////////////////////////////////////////// + +defm : JWriteResIntPair<WriteShift, JALU01, 1>; + +//////////////////////////////////////////////////////////////////////////////// +// Loads, stores, and moves, not folded with other operations. +// FIXME: Split x86 and SSE load/store/moves +//////////////////////////////////////////////////////////////////////////////// + +def : WriteRes<WriteLoad, [JLAGU]> { let Latency = 5; } +def : WriteRes<WriteStore, [JSAGU]>; +def : WriteRes<WriteMove, [JAny]>; + +//////////////////////////////////////////////////////////////////////////////// +// Idioms that clear a register, like xorps %xmm0, %xmm0. +// These can often bypass execution ports completely. +//////////////////////////////////////////////////////////////////////////////// + +def : WriteRes<WriteZero, []>; + +//////////////////////////////////////////////////////////////////////////////// +// Branches don't produce values, so they have no latency, but they still +// consume resources. Indirect branches can fold loads. +//////////////////////////////////////////////////////////////////////////////// + +defm : JWriteResIntPair<WriteJump, JALU01, 1>; + +//////////////////////////////////////////////////////////////////////////////// +// Floating point. This covers both scalar and vector operations. +// FIXME: should we bother splitting JFPU pipe + unit stages for fast instructions? 
+// FIXME: Double precision latencies +// FIXME: SS vs PS latencies +// FIXME: ymm latencies +//////////////////////////////////////////////////////////////////////////////// + +defm : JWriteResFpuPair<WriteFAdd, JFPU0, 3>; +defm : JWriteResFpuPair<WriteFMul, JFPU1, 2>; +defm : JWriteResFpuPair<WriteFRcp, JFPU1, 2>; +defm : JWriteResFpuPair<WriteFRsqrt, JFPU1, 2>; +defm : JWriteResFpuPair<WriteFShuffle, JFPU01, 1>; +defm : JWriteResFpuPair<WriteFBlend, JFPU01, 1>; +defm : JWriteResFpuPair<WriteFShuffle256, JFPU01, 1>; + +def : WriteRes<WriteFSqrt, [JFPU1, JLAGU, JFPM]> { + let Latency = 21; + let ResourceCycles = [1, 1, 21]; +} +def : WriteRes<WriteFSqrtLd, [JFPU1, JLAGU, JFPM]> { + let Latency = 26; + let ResourceCycles = [1, 1, 21]; +} + +def : WriteRes<WriteFDiv, [JFPU1, JLAGU, JFPM]> { + let Latency = 19; + let ResourceCycles = [1, 1, 19]; +} +def : WriteRes<WriteFDivLd, [JFPU1, JLAGU, JFPM]> { + let Latency = 24; + let ResourceCycles = [1, 1, 19]; +} + +// FIXME: integer pipes +defm : JWriteResFpuPair<WriteCvtF2I, JFPU1, 3>; // Float -> Integer. +defm : JWriteResFpuPair<WriteCvtI2F, JFPU1, 3>; // Integer -> Float. +defm : JWriteResFpuPair<WriteCvtF2F, JFPU1, 3>; // Float -> Float size conversion. + +def : WriteRes<WriteFVarBlend, [JFPU01]> { + let Latency = 2; + let ResourceCycles = [2]; +} +def : WriteRes<WriteFVarBlendLd, [JLAGU, JFPU01]> { + let Latency = 7; + let ResourceCycles = [1, 2]; +} + +// Vector integer operations. +defm : JWriteResFpuPair<WriteVecALU, JFPU01, 1>; +defm : JWriteResFpuPair<WriteVecShift, JFPU01, 1>; +defm : JWriteResFpuPair<WriteVecIMul, JFPU0, 2>; +defm : JWriteResFpuPair<WriteShuffle, JFPU01, 1>; +defm : JWriteResFpuPair<WriteBlend, JFPU01, 1>; +defm : JWriteResFpuPair<WriteVecLogic, JFPU01, 1>; +defm : JWriteResFpuPair<WriteShuffle256, JFPU01, 1>; + +def : WriteRes<WriteVarBlend, [JFPU01]> { + let Latency = 2; + let ResourceCycles = [2]; +} +def : WriteRes<WriteVarBlendLd, [JLAGU, JFPU01]> { + let Latency = 7; + let ResourceCycles = [1, 2]; +} + +// FIXME: why do we need to define AVX2 resource on CPU that doesn't have AVX2? +def : WriteRes<WriteVarVecShift, [JFPU01]> { + let Latency = 1; + let ResourceCycles = [1]; +} +def : WriteRes<WriteVarVecShiftLd, [JLAGU, JFPU01]> { + let Latency = 6; + let ResourceCycles = [1, 1]; +} + +def : WriteRes<WriteMPSAD, [JFPU0]> { + let Latency = 3; + let ResourceCycles = [2]; +} +def : WriteRes<WriteMPSADLd, [JLAGU, JFPU0]> { + let Latency = 8; + let ResourceCycles = [1, 2]; +} + +//////////////////////////////////////////////////////////////////////////////// +// String instructions. 
+// Packed Compare Implicit Length Strings, Return Mask +// FIXME: approximate latencies + pipe dependencies +//////////////////////////////////////////////////////////////////////////////// + +def : WriteRes<WritePCmpIStrM, [JFPU01]> { + let Latency = 7; + let ResourceCycles = [2]; +} +def : WriteRes<WritePCmpIStrMLd, [JLAGU, JFPU01]> { + let Latency = 12; + let ResourceCycles = [1, 2]; +} + +// Packed Compare Explicit Length Strings, Return Mask +def : WriteRes<WritePCmpEStrM, [JFPU01]> { + let Latency = 13; + let ResourceCycles = [5]; +} +def : WriteRes<WritePCmpEStrMLd, [JLAGU, JFPU01]> { + let Latency = 18; + let ResourceCycles = [1, 5]; +} + +// Packed Compare Implicit Length Strings, Return Index +def : WriteRes<WritePCmpIStrI, [JFPU01]> { + let Latency = 6; + let ResourceCycles = [2]; +} +def : WriteRes<WritePCmpIStrILd, [JLAGU, JFPU01]> { + let Latency = 11; + let ResourceCycles = [1, 2]; +} + +// Packed Compare Explicit Length Strings, Return Index +def : WriteRes<WritePCmpEStrI, [JFPU01]> { + let Latency = 13; + let ResourceCycles = [5]; +} +def : WriteRes<WritePCmpEStrILd, [JLAGU, JFPU01]> { + let Latency = 18; + let ResourceCycles = [1, 5]; +} + +//////////////////////////////////////////////////////////////////////////////// +// AES Instructions. +//////////////////////////////////////////////////////////////////////////////// + +def : WriteRes<WriteAESDecEnc, [JFPU01, JVIMUL]> { + let Latency = 3; + let ResourceCycles = [1, 1]; +} +def : WriteRes<WriteAESDecEncLd, [JFPU01, JLAGU, JVIMUL]> { + let Latency = 8; + let ResourceCycles = [1, 1, 1]; +} + +def : WriteRes<WriteAESIMC, [JVIMUL]> { + let Latency = 2; + let ResourceCycles = [1]; +} +def : WriteRes<WriteAESIMCLd, [JLAGU, JVIMUL]> { + let Latency = 7; + let ResourceCycles = [1, 1]; +} + +def : WriteRes<WriteAESKeyGen, [JVIMUL]> { + let Latency = 2; + let ResourceCycles = [1]; +} +def : WriteRes<WriteAESKeyGenLd, [JLAGU, JVIMUL]> { + let Latency = 7; + let ResourceCycles = [1, 1]; +} + +//////////////////////////////////////////////////////////////////////////////// +// Carry-less multiplication instructions. +//////////////////////////////////////////////////////////////////////////////// + +def : WriteRes<WriteCLMul, [JVIMUL]> { + let Latency = 2; + let ResourceCycles = [1]; +} +def : WriteRes<WriteCLMulLd, [JLAGU, JVIMUL]> { + let Latency = 7; + let ResourceCycles = [1, 1]; +} + +// FIXME: pipe for system/microcode? +def : WriteRes<WriteSystem, [JAny]> { let Latency = 100; } +def : WriteRes<WriteMicrocoded, [JAny]> { let Latency = 100; } +def : WriteRes<WriteFence, [JSAGU]>; +def : WriteRes<WriteNop, []>; +} // SchedModel + diff --git a/contrib/llvm/lib/Target/X86/X86ScheduleSLM.td b/contrib/llvm/lib/Target/X86/X86ScheduleSLM.td new file mode 100644 index 0000000..f95d4fa --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86ScheduleSLM.td @@ -0,0 +1,233 @@ +//=- X86ScheduleSLM.td - X86 Silvermont Scheduling -----------*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the machine model for Intel Silvermont to support +// instruction scheduling and other instruction cost heuristics. 
+// +//===----------------------------------------------------------------------===// + +def SLMModel : SchedMachineModel { + // All x86 instructions are modeled as a single micro-op, and SLM can decode 2 + // instructions per cycle. + let IssueWidth = 2; + let MicroOpBufferSize = 32; // Based on the reorder buffer. + let LoadLatency = 3; + let MispredictPenalty = 10; + let PostRAScheduler = 1; + + // For small loops, expand by a small factor to hide the backedge cost. + let LoopMicroOpBufferSize = 10; + + // FIXME: SSE4 is unimplemented. This flag is set to allow + // the scheduler to assign a default model to unrecognized opcodes. + let CompleteModel = 0; +} + +let SchedModel = SLMModel in { + +// Silvermont has 5 reservation stations for micro-ops + +def IEC_RSV0 : ProcResource<1>; +def IEC_RSV1 : ProcResource<1>; +def FPC_RSV0 : ProcResource<1> { let BufferSize = 1; } +def FPC_RSV1 : ProcResource<1> { let BufferSize = 1; } +def MEC_RSV : ProcResource<1>; + +// Many micro-ops are capable of issuing on multiple ports. +def IEC_RSV01 : ProcResGroup<[IEC_RSV0, IEC_RSV1]>; +def FPC_RSV01 : ProcResGroup<[FPC_RSV0, FPC_RSV1]>; + +def SMDivider : ProcResource<1>; +def SMFPMultiplier : ProcResource<1>; +def SMFPDivider : ProcResource<1>; + +// Loads are 3 cycles, so ReadAfterLd registers needn't be available until 3 +// cycles after the memory operand. +def : ReadAdvance<ReadAfterLd, 3>; + +// Many SchedWrites are defined in pairs with and without a folded load. +// Instructions with folded loads are usually micro-fused, so they only appear +// as two micro-ops when queued in the reservation station. +// This multiclass defines the resource usage for variants with and without +// folded loads. +multiclass SMWriteResPair<X86FoldableSchedWrite SchedRW, + ProcResourceKind ExePort, + int Lat> { + // Register variant is using a single cycle on ExePort. + def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; } + + // Memory variant also uses a cycle on MEC_RSV and adds 3 cycles to the + // latency. + def : WriteRes<SchedRW.Folded, [MEC_RSV, ExePort]> { + let Latency = !add(Lat, 3); + } +} + +// A folded store needs a cycle on MEC_RSV for the store data, but it does not +// need an extra port cycle to recompute the address. +def : WriteRes<WriteRMW, [MEC_RSV]>; + +def : WriteRes<WriteStore, [IEC_RSV01, MEC_RSV]>; +def : WriteRes<WriteLoad, [MEC_RSV]> { let Latency = 3; } +def : WriteRes<WriteMove, [IEC_RSV01]>; +def : WriteRes<WriteZero, []>; + +defm : SMWriteResPair<WriteALU, IEC_RSV01, 1>; +defm : SMWriteResPair<WriteIMul, IEC_RSV1, 3>; +defm : SMWriteResPair<WriteShift, IEC_RSV0, 1>; +defm : SMWriteResPair<WriteJump, IEC_RSV1, 1>; + +// This is for simple LEAs with one or two input operands. +// The complex ones can only execute on port 1, and they require two cycles on +// the port to read all inputs. We don't model that. +def : WriteRes<WriteLEA, [IEC_RSV1]>; + +// This is quite rough, latency depends on the dividend. +def : WriteRes<WriteIDiv, [IEC_RSV01, SMDivider]> { + let Latency = 25; + let ResourceCycles = [1, 25]; +} +def : WriteRes<WriteIDivLd, [MEC_RSV, IEC_RSV01, SMDivider]> { + let Latency = 29; + let ResourceCycles = [1, 1, 25]; +} + +// Scalar and vector floating point. 
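Both the integer divider modeled just above (WriteIDiv occupying SMDivider for 25 cycles via ResourceCycles) and the floating-point multiplier and divider defined below follow the same pattern: a separate, effectively non-pipelined ProcResource that stays busy for the listed ResourceCycles. A standalone sketch of what that occupancy means for back-to-back throughput (hypothetical names; not the LLVM machine model):

// Occupancy of a non-pipelined unit: a new divide cannot start until the
// divider frees up, so back-to-back divides are spaced by the ResourceCycles
// value even though unrelated micro-ops can still issue in between.
#include <iostream>

int main() {
  const unsigned DivResourceCycles = 25; // divider occupancy per divide
  unsigned DividerFree = 0;              // cycle at which the divider frees up

  for (int i = 0; i < 3; ++i) {
    unsigned Start = DividerFree;        // wait for the divider to free up
    DividerFree = Start + DivResourceCycles;
    std::cout << "div #" << i << " starts at cycle " << Start << "\n";
  }
  // Prints 0, 25, 50: throughput is limited by the occupancy, not just by
  // the latency of each individual divide.
  return 0;
}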
+defm : SMWriteResPair<WriteFAdd, FPC_RSV1, 3>; +defm : SMWriteResPair<WriteFRcp, FPC_RSV0, 5>; +defm : SMWriteResPair<WriteFRsqrt, FPC_RSV0, 5>; +defm : SMWriteResPair<WriteFSqrt, FPC_RSV0, 15>; +defm : SMWriteResPair<WriteCvtF2I, FPC_RSV01, 4>; +defm : SMWriteResPair<WriteCvtI2F, FPC_RSV01, 4>; +defm : SMWriteResPair<WriteCvtF2F, FPC_RSV01, 4>; +defm : SMWriteResPair<WriteFShuffle, FPC_RSV0, 1>; +defm : SMWriteResPair<WriteFBlend, FPC_RSV0, 1>; + +// This is quite rough, latency depends on precision +def : WriteRes<WriteFMul, [FPC_RSV0, SMFPMultiplier]> { + let Latency = 5; + let ResourceCycles = [1, 2]; +} +def : WriteRes<WriteFMulLd, [MEC_RSV, FPC_RSV0, SMFPMultiplier]> { + let Latency = 8; + let ResourceCycles = [1, 1, 2]; +} + +def : WriteRes<WriteFDiv, [FPC_RSV0, SMFPDivider]> { + let Latency = 34; + let ResourceCycles = [1, 34]; +} +def : WriteRes<WriteFDivLd, [MEC_RSV, FPC_RSV0, SMFPDivider]> { + let Latency = 37; + let ResourceCycles = [1, 1, 34]; +} + +// Vector integer operations. +defm : SMWriteResPair<WriteVecShift, FPC_RSV0, 1>; +defm : SMWriteResPair<WriteVecLogic, FPC_RSV01, 1>; +defm : SMWriteResPair<WriteVecALU, FPC_RSV01, 1>; +defm : SMWriteResPair<WriteVecIMul, FPC_RSV0, 4>; +defm : SMWriteResPair<WriteShuffle, FPC_RSV0, 1>; +defm : SMWriteResPair<WriteBlend, FPC_RSV0, 1>; +defm : SMWriteResPair<WriteMPSAD, FPC_RSV0, 7>; + +// String instructions. +// Packed Compare Implicit Length Strings, Return Mask +def : WriteRes<WritePCmpIStrM, [FPC_RSV0]> { + let Latency = 13; + let ResourceCycles = [13]; +} +def : WriteRes<WritePCmpIStrMLd, [FPC_RSV0, MEC_RSV]> { + let Latency = 13; + let ResourceCycles = [13, 1]; +} + +// Packed Compare Explicit Length Strings, Return Mask +def : WriteRes<WritePCmpEStrM, [FPC_RSV0]> { + let Latency = 17; + let ResourceCycles = [17]; +} +def : WriteRes<WritePCmpEStrMLd, [FPC_RSV0, MEC_RSV]> { + let Latency = 17; + let ResourceCycles = [17, 1]; +} + +// Packed Compare Implicit Length Strings, Return Index +def : WriteRes<WritePCmpIStrI, [FPC_RSV0]> { + let Latency = 17; + let ResourceCycles = [17]; +} +def : WriteRes<WritePCmpIStrILd, [FPC_RSV0, MEC_RSV]> { + let Latency = 17; + let ResourceCycles = [17, 1]; +} + +// Packed Compare Explicit Length Strings, Return Index +def : WriteRes<WritePCmpEStrI, [FPC_RSV0]> { + let Latency = 21; + let ResourceCycles = [21]; +} +def : WriteRes<WritePCmpEStrILd, [FPC_RSV0, MEC_RSV]> { + let Latency = 21; + let ResourceCycles = [21, 1]; +} + +// AES Instructions. +def : WriteRes<WriteAESDecEnc, [FPC_RSV0]> { + let Latency = 8; + let ResourceCycles = [5]; +} +def : WriteRes<WriteAESDecEncLd, [FPC_RSV0, MEC_RSV]> { + let Latency = 8; + let ResourceCycles = [5, 1]; +} + +def : WriteRes<WriteAESIMC, [FPC_RSV0]> { + let Latency = 8; + let ResourceCycles = [5]; +} +def : WriteRes<WriteAESIMCLd, [FPC_RSV0, MEC_RSV]> { + let Latency = 8; + let ResourceCycles = [5, 1]; +} + +def : WriteRes<WriteAESKeyGen, [FPC_RSV0]> { + let Latency = 8; + let ResourceCycles = [5]; +} +def : WriteRes<WriteAESKeyGenLd, [FPC_RSV0, MEC_RSV]> { + let Latency = 8; + let ResourceCycles = [5, 1]; +} + +// Carry-less multiplication instructions. 
+def : WriteRes<WriteCLMul, [FPC_RSV0]> { + let Latency = 10; + let ResourceCycles = [10]; +} +def : WriteRes<WriteCLMulLd, [FPC_RSV0, MEC_RSV]> { + let Latency = 10; + let ResourceCycles = [10, 1]; +} + + +def : WriteRes<WriteSystem, [FPC_RSV0]> { let Latency = 100; } +def : WriteRes<WriteMicrocoded, [FPC_RSV0]> { let Latency = 100; } +def : WriteRes<WriteFence, [MEC_RSV]>; +def : WriteRes<WriteNop, []>; + +// AVX is not supported on that architecture, but we should define the basic +// scheduling resources anyway. +def : WriteRes<WriteIMulH, [FPC_RSV0]>; +defm : SMWriteResPair<WriteVarBlend, FPC_RSV0, 1>; +defm : SMWriteResPair<WriteFVarBlend, FPC_RSV0, 1>; +defm : SMWriteResPair<WriteFShuffle256, FPC_RSV0, 1>; +defm : SMWriteResPair<WriteShuffle256, FPC_RSV0, 1>; +defm : SMWriteResPair<WriteVarVecShift, FPC_RSV0, 1>; +} // SchedModel diff --git a/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp b/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp new file mode 100644 index 0000000..b1a0161 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp @@ -0,0 +1,284 @@ +//===-- X86SelectionDAGInfo.cpp - X86 SelectionDAG Info -------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the X86SelectionDAGInfo class. +// +//===----------------------------------------------------------------------===// + +#include "X86InstrInfo.h" +#include "X86ISelLowering.h" +#include "X86RegisterInfo.h" +#include "X86Subtarget.h" +#include "X86SelectionDAGInfo.h" +#include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/Target/TargetLowering.h" + +using namespace llvm; + +#define DEBUG_TYPE "x86-selectiondag-info" + +bool X86SelectionDAGInfo::isBaseRegConflictPossible( + SelectionDAG &DAG, ArrayRef<unsigned> ClobberSet) const { + // We cannot use TRI->hasBasePointer() until *after* we select all basic + // blocks. Legalization may introduce new stack temporaries with large + // alignment requirements. Fall back to generic code if there are any + // dynamic stack adjustments (hopefully rare) and the base pointer would + // conflict if we had to use it. + MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); + if (!MFI->hasVarSizedObjects() && !MFI->hasOpaqueSPAdjustment()) + return false; + + const X86RegisterInfo *TRI = static_cast<const X86RegisterInfo *>( + DAG.getSubtarget().getRegisterInfo()); + unsigned BaseReg = TRI->getBaseRegister(); + for (unsigned R : ClobberSet) + if (BaseReg == R) + return true; + return false; +} + +SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset( + SelectionDAG &DAG, SDLoc dl, SDValue Chain, SDValue Dst, SDValue Src, + SDValue Size, unsigned Align, bool isVolatile, + MachinePointerInfo DstPtrInfo) const { + ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size); + const X86Subtarget &Subtarget = + DAG.getMachineFunction().getSubtarget<X86Subtarget>(); + +#ifndef NDEBUG + // If the base register might conflict with our physical registers, bail out. + const unsigned ClobberSet[] = {X86::RCX, X86::RAX, X86::RDI, + X86::ECX, X86::EAX, X86::EDI}; + assert(!isBaseRegConflictPossible(DAG, ClobberSet)); +#endif + + // If to a segment-relative address space, use the default lowering. 
+ if (DstPtrInfo.getAddrSpace() >= 256) + return SDValue(); + + // If not DWORD aligned or size is more than the threshold, call the library. + // The libc version is likely to be faster for these cases. It can use the + // address value and run time information about the CPU. + if ((Align & 3) != 0 || !ConstantSize || + ConstantSize->getZExtValue() > Subtarget.getMaxInlineSizeThreshold()) { + // Check to see if there is a specialized entry-point for memory zeroing. + ConstantSDNode *V = dyn_cast<ConstantSDNode>(Src); + + if (const char *bzeroEntry = V && + V->isNullValue() ? Subtarget.getBZeroEntry() : nullptr) { + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + EVT IntPtr = TLI.getPointerTy(DAG.getDataLayout()); + Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(*DAG.getContext()); + TargetLowering::ArgListTy Args; + TargetLowering::ArgListEntry Entry; + Entry.Node = Dst; + Entry.Ty = IntPtrTy; + Args.push_back(Entry); + Entry.Node = Size; + Args.push_back(Entry); + + TargetLowering::CallLoweringInfo CLI(DAG); + CLI.setDebugLoc(dl).setChain(Chain) + .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()), + DAG.getExternalSymbol(bzeroEntry, IntPtr), std::move(Args), + 0) + .setDiscardResult(); + + std::pair<SDValue,SDValue> CallResult = TLI.LowerCallTo(CLI); + return CallResult.second; + } + + // Otherwise have the target-independent code call memset. + return SDValue(); + } + + uint64_t SizeVal = ConstantSize->getZExtValue(); + SDValue InFlag; + EVT AVT; + SDValue Count; + ConstantSDNode *ValC = dyn_cast<ConstantSDNode>(Src); + unsigned BytesLeft = 0; + bool TwoRepStos = false; + if (ValC) { + unsigned ValReg; + uint64_t Val = ValC->getZExtValue() & 255; + + // If the value is a constant, then we can potentially use larger sets. + switch (Align & 3) { + case 2: // WORD aligned + AVT = MVT::i16; + ValReg = X86::AX; + Val = (Val << 8) | Val; + break; + case 0: // DWORD aligned + AVT = MVT::i32; + ValReg = X86::EAX; + Val = (Val << 8) | Val; + Val = (Val << 16) | Val; + if (Subtarget.is64Bit() && ((Align & 0x7) == 0)) { // QWORD aligned + AVT = MVT::i64; + ValReg = X86::RAX; + Val = (Val << 32) | Val; + } + break; + default: // Byte aligned + AVT = MVT::i8; + ValReg = X86::AL; + Count = DAG.getIntPtrConstant(SizeVal, dl); + break; + } + + if (AVT.bitsGT(MVT::i8)) { + unsigned UBytes = AVT.getSizeInBits() / 8; + Count = DAG.getIntPtrConstant(SizeVal / UBytes, dl); + BytesLeft = SizeVal % UBytes; + } + + Chain = DAG.getCopyToReg(Chain, dl, ValReg, DAG.getConstant(Val, dl, AVT), + InFlag); + InFlag = Chain.getValue(1); + } else { + AVT = MVT::i8; + Count = DAG.getIntPtrConstant(SizeVal, dl); + Chain = DAG.getCopyToReg(Chain, dl, X86::AL, Src, InFlag); + InFlag = Chain.getValue(1); + } + + Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RCX : X86::ECX, + Count, InFlag); + InFlag = Chain.getValue(1); + Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RDI : X86::EDI, + Dst, InFlag); + InFlag = Chain.getValue(1); + + SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue Ops[] = { Chain, DAG.getValueType(AVT), InFlag }; + Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops); + + if (TwoRepStos) { + InFlag = Chain.getValue(1); + Count = Size; + EVT CVT = Count.getValueType(); + SDValue Left = DAG.getNode(ISD::AND, dl, CVT, Count, + DAG.getConstant((AVT == MVT::i64) ? 7 : 3, dl, + CVT)); + Chain = DAG.getCopyToReg(Chain, dl, (CVT == MVT::i64) ? 
X86::RCX : X86::ECX, + Left, InFlag); + InFlag = Chain.getValue(1); + Tys = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue Ops[] = { Chain, DAG.getValueType(MVT::i8), InFlag }; + Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops); + } else if (BytesLeft) { + // Handle the last 1 - 7 bytes. + unsigned Offset = SizeVal - BytesLeft; + EVT AddrVT = Dst.getValueType(); + EVT SizeVT = Size.getValueType(); + + Chain = DAG.getMemset(Chain, dl, + DAG.getNode(ISD::ADD, dl, AddrVT, Dst, + DAG.getConstant(Offset, dl, AddrVT)), + Src, + DAG.getConstant(BytesLeft, dl, SizeVT), + Align, isVolatile, false, + DstPtrInfo.getWithOffset(Offset)); + } + + // TODO: Use a Tokenfactor, as in memcpy, instead of a single chain. + return Chain; +} + +SDValue X86SelectionDAGInfo::EmitTargetCodeForMemcpy( + SelectionDAG &DAG, SDLoc dl, SDValue Chain, SDValue Dst, SDValue Src, + SDValue Size, unsigned Align, bool isVolatile, bool AlwaysInline, + MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const { + // This requires the copy size to be a constant, preferably + // within a subtarget-specific limit. + ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size); + const X86Subtarget &Subtarget = + DAG.getMachineFunction().getSubtarget<X86Subtarget>(); + if (!ConstantSize) + return SDValue(); + uint64_t SizeVal = ConstantSize->getZExtValue(); + if (!AlwaysInline && SizeVal > Subtarget.getMaxInlineSizeThreshold()) + return SDValue(); + + /// If not DWORD aligned, it is more efficient to call the library. However + /// if calling the library is not allowed (AlwaysInline), then soldier on as + /// the code generated here is better than the long load-store sequence we + /// would otherwise get. + if (!AlwaysInline && (Align & 3) != 0) + return SDValue(); + + // If to a segment-relative address space, use the default lowering. + if (DstPtrInfo.getAddrSpace() >= 256 || + SrcPtrInfo.getAddrSpace() >= 256) + return SDValue(); + + // If the base register might conflict with our physical registers, bail out. + const unsigned ClobberSet[] = {X86::RCX, X86::RSI, X86::RDI, + X86::ECX, X86::ESI, X86::EDI}; + if (isBaseRegConflictPossible(DAG, ClobberSet)) + return SDValue(); + + MVT AVT; + if (Align & 1) + AVT = MVT::i8; + else if (Align & 2) + AVT = MVT::i16; + else if (Align & 4) + // DWORD aligned + AVT = MVT::i32; + else + // QWORD aligned + AVT = Subtarget.is64Bit() ? MVT::i64 : MVT::i32; + + unsigned UBytes = AVT.getSizeInBits() / 8; + unsigned CountVal = SizeVal / UBytes; + SDValue Count = DAG.getIntPtrConstant(CountVal, dl); + unsigned BytesLeft = SizeVal % UBytes; + + SDValue InFlag; + Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RCX : X86::ECX, + Count, InFlag); + InFlag = Chain.getValue(1); + Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RDI : X86::EDI, + Dst, InFlag); + InFlag = Chain.getValue(1); + Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RSI : X86::ESI, + Src, InFlag); + InFlag = Chain.getValue(1); + + SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue Ops[] = { Chain, DAG.getValueType(AVT), InFlag }; + SDValue RepMovs = DAG.getNode(X86ISD::REP_MOVS, dl, Tys, Ops); + + SmallVector<SDValue, 4> Results; + Results.push_back(RepMovs); + if (BytesLeft) { + // Handle the last 1 - 7 bytes. 
+ unsigned Offset = SizeVal - BytesLeft; + EVT DstVT = Dst.getValueType(); + EVT SrcVT = Src.getValueType(); + EVT SizeVT = Size.getValueType(); + Results.push_back(DAG.getMemcpy(Chain, dl, + DAG.getNode(ISD::ADD, dl, DstVT, Dst, + DAG.getConstant(Offset, dl, + DstVT)), + DAG.getNode(ISD::ADD, dl, SrcVT, Src, + DAG.getConstant(Offset, dl, + SrcVT)), + DAG.getConstant(BytesLeft, dl, SizeVT), + Align, isVolatile, AlwaysInline, false, + DstPtrInfo.getWithOffset(Offset), + SrcPtrInfo.getWithOffset(Offset))); + } + + return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Results); +} diff --git a/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.h b/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.h new file mode 100644 index 0000000..961bd8c --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.h @@ -0,0 +1,52 @@ +//===-- X86SelectionDAGInfo.h - X86 SelectionDAG Info -----------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the X86 subclass for TargetSelectionDAGInfo. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_X86_X86SELECTIONDAGINFO_H +#define LLVM_LIB_TARGET_X86_X86SELECTIONDAGINFO_H + +#include "llvm/Target/TargetSelectionDAGInfo.h" + +namespace llvm { + +class X86TargetLowering; +class X86TargetMachine; +class X86Subtarget; + +class X86SelectionDAGInfo : public TargetSelectionDAGInfo { + /// Returns true if it is possible for the base register to conflict with the + /// given set of clobbers for a memory intrinsic. + bool isBaseRegConflictPossible(SelectionDAG &DAG, + ArrayRef<unsigned> ClobberSet) const; + +public: + explicit X86SelectionDAGInfo() = default; + + SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, + SDValue Chain, + SDValue Dst, SDValue Src, + SDValue Size, unsigned Align, + bool isVolatile, + MachinePointerInfo DstPtrInfo) const override; + + SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl, + SDValue Chain, + SDValue Dst, SDValue Src, + SDValue Size, unsigned Align, + bool isVolatile, bool AlwaysInline, + MachinePointerInfo DstPtrInfo, + MachinePointerInfo SrcPtrInfo) const override; +}; + +} + +#endif diff --git a/contrib/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp b/contrib/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp new file mode 100644 index 0000000..ef16c5b --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp @@ -0,0 +1,190 @@ +//===-- X86ShuffleDecodeConstantPool.cpp - X86 shuffle decode -------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Define several functions to decode x86 specific shuffle semantics using +// constants from the constant pool. 
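Before the shuffle-decode helpers that the header above introduces, a brief recap of the two memory-intrinsic lowerings in X86SelectionDAGInfo.cpp above: both EmitTargetCodeForMemset and EmitTargetCodeForMemcpy reduce to the same arithmetic, picking the widest store unit the alignment allows, emitting a rep-prefixed string instruction for Size / UnitBytes units, and cleaning up the Size % UnitBytes tail with a narrower follow-up operation. A standalone sketch of that arithmetic (hypothetical names; not the SelectionDAG API):

// Chunking arithmetic used by the memset/memcpy lowerings above: choose the
// widest unit the alignment allows, replicate the fill byte across it, and
// split Size into a rep-count plus a tail. Hypothetical names only.
#include <cstdint>
#include <cstdio>

struct RepPlan {
  unsigned UnitBytes;   // 1, 2, 4 or 8, driven by alignment
  uint64_t RepCount;    // units covered by the rep-prefixed instruction
  uint64_t BytesLeft;   // tail handled by a follow-up memset/memcpy
  uint64_t FillPattern; // fill byte replicated across the unit (memset only)
};

RepPlan planRepStos(uint64_t Size, unsigned Align, uint8_t Byte, bool Is64Bit) {
  RepPlan P;
  if (Align % 8 == 0 && Is64Bit) P.UnitBytes = 8;
  else if (Align % 4 == 0)       P.UnitBytes = 4;
  else if (Align % 2 == 0)       P.UnitBytes = 2;
  else                           P.UnitBytes = 1;

  // Replicate the byte across the unit, as the lowering does with
  // Val = (Val << 8) | Val, (Val << 16) | Val, and so on.
  P.FillPattern = Byte;
  for (unsigned Shift = 8; Shift < P.UnitBytes * 8; Shift *= 2)
    P.FillPattern |= P.FillPattern << Shift;

  P.RepCount = Size / P.UnitBytes;
  P.BytesLeft = Size % P.UnitBytes;
  return P;
}

int main() {
  RepPlan P = planRepStos(/*Size=*/1000, /*Align=*/8, /*Byte=*/0xAB, /*Is64Bit=*/true);
  std::printf("unit=%u count=%llu tail=%llu pattern=%#llx\n",
              P.UnitBytes, (unsigned long long)P.RepCount,
              (unsigned long long)P.BytesLeft,
              (unsigned long long)P.FillPattern);
  // unit=8 count=125 tail=0 pattern=0xabababababababab
  return 0;
}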
+// +//===----------------------------------------------------------------------===// + +#include "X86ShuffleDecodeConstantPool.h" +#include "Utils/X86ShuffleDecode.h" +#include "llvm/CodeGen/MachineValueType.h" +#include "llvm/IR/Constants.h" + +//===----------------------------------------------------------------------===// +// Vector Mask Decoding +//===----------------------------------------------------------------------===// + +namespace llvm { + +void DecodePSHUFBMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask) { + Type *MaskTy = C->getType(); + // It is not an error for the PSHUFB mask to not be a vector of i8 because the + // constant pool uniques constants by their bit representation. + // e.g. the following take up the same space in the constant pool: + // i128 -170141183420855150465331762880109871104 + // + // <2 x i64> <i64 -9223372034707292160, i64 -9223372034707292160> + // + // <4 x i32> <i32 -2147483648, i32 -2147483648, + // i32 -2147483648, i32 -2147483648> + +#ifndef NDEBUG + unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits(); + assert(MaskTySize == 128 || MaskTySize == 256 || MaskTySize == 512); +#endif + + // This is a straightforward byte vector. + if (MaskTy->isVectorTy() && MaskTy->getVectorElementType()->isIntegerTy(8)) { + int NumElements = MaskTy->getVectorNumElements(); + ShuffleMask.reserve(NumElements); + + for (int i = 0; i < NumElements; ++i) { + // For AVX vectors with 32 bytes the base of the shuffle is the 16-byte + // lane of the vector we're inside. + int Base = i & ~0xf; + Constant *COp = C->getAggregateElement(i); + if (!COp) { + ShuffleMask.clear(); + return; + } else if (isa<UndefValue>(COp)) { + ShuffleMask.push_back(SM_SentinelUndef); + continue; + } + uint64_t Element = cast<ConstantInt>(COp)->getZExtValue(); + // If the high bit (7) of the byte is set, the element is zeroed. + if (Element & (1 << 7)) + ShuffleMask.push_back(SM_SentinelZero); + else { + // Only the least significant 4 bits of the byte are used. + int Index = Base + (Element & 0xf); + ShuffleMask.push_back(Index); + } + } + } + // TODO: Handle funny-looking vectors too. +} + +void DecodeVPERMILPMask(const Constant *C, unsigned ElSize, + SmallVectorImpl<int> &ShuffleMask) { + Type *MaskTy = C->getType(); + // It is not an error for the PSHUFB mask to not be a vector of i8 because the + // constant pool uniques constants by their bit representation. + // e.g. the following take up the same space in the constant pool: + // i128 -170141183420855150465331762880109871104 + // + // <2 x i64> <i64 -9223372034707292160, i64 -9223372034707292160> + // + // <4 x i32> <i32 -2147483648, i32 -2147483648, + // i32 -2147483648, i32 -2147483648> + + unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits(); + + if (MaskTySize != 128 && MaskTySize != 256) // FIXME: Add support for AVX-512. + return; + + // Only support vector types. + if (!MaskTy->isVectorTy()) + return; + + // Make sure its an integer type. + Type *VecEltTy = MaskTy->getVectorElementType(); + if (!VecEltTy->isIntegerTy()) + return; + + // Support any element type from byte up to element size. + // This is necesary primarily because 64-bit elements get split to 32-bit + // in the constant pool on 32-bit target. 
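+  // Illustrative sketch (editor's addition, not in the original source): a
+  // v4f64 VPERMILPD mask built as <4 x i64> <i64 1, i64 0, i64 1, i64 0> may
+  // be stored in the constant pool of a 32-bit target as an <8 x i32>
+  // constant. With ElSize == 64 and EltTySize == 32 below, Factor becomes 2,
+  // so only every second constant-pool element (the low half of each
+  // original i64 on this little-endian target) is read when rebuilding the
+  // shuffle mask.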
+ unsigned EltTySize = VecEltTy->getIntegerBitWidth(); + if (EltTySize < 8 || EltTySize > ElSize) + return; + + unsigned NumElements = MaskTySize / ElSize; + assert((NumElements == 2 || NumElements == 4 || NumElements == 8) && + "Unexpected number of vector elements."); + ShuffleMask.reserve(NumElements); + unsigned NumElementsPerLane = 128 / ElSize; + unsigned Factor = ElSize / EltTySize; + + for (unsigned i = 0; i < NumElements; ++i) { + Constant *COp = C->getAggregateElement(i * Factor); + if (!COp) { + ShuffleMask.clear(); + return; + } else if (isa<UndefValue>(COp)) { + ShuffleMask.push_back(SM_SentinelUndef); + continue; + } + int Index = i & ~(NumElementsPerLane - 1); + uint64_t Element = cast<ConstantInt>(COp)->getZExtValue(); + if (ElSize == 64) + Index += (Element >> 1) & 0x1; + else + Index += Element & 0x3; + ShuffleMask.push_back(Index); + } + + // TODO: Handle funny-looking vectors too. +} + +void DecodeVPERMVMask(const Constant *C, MVT VT, + SmallVectorImpl<int> &ShuffleMask) { + Type *MaskTy = C->getType(); + if (MaskTy->isVectorTy()) { + unsigned NumElements = MaskTy->getVectorNumElements(); + if (NumElements == VT.getVectorNumElements()) { + for (unsigned i = 0; i < NumElements; ++i) { + Constant *COp = C->getAggregateElement(i); + if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp))) { + ShuffleMask.clear(); + return; + } + if (isa<UndefValue>(COp)) + ShuffleMask.push_back(SM_SentinelUndef); + else { + uint64_t Element = cast<ConstantInt>(COp)->getZExtValue(); + Element &= (1 << NumElements) - 1; + ShuffleMask.push_back(Element); + } + } + } + return; + } + // Scalar value; just broadcast it + if (!isa<ConstantInt>(C)) + return; + uint64_t Element = cast<ConstantInt>(C)->getZExtValue(); + int NumElements = VT.getVectorNumElements(); + Element &= (1 << NumElements) - 1; + for (int i = 0; i < NumElements; ++i) + ShuffleMask.push_back(Element); +} + +void DecodeVPERMV3Mask(const Constant *C, MVT VT, + SmallVectorImpl<int> &ShuffleMask) { + Type *MaskTy = C->getType(); + unsigned NumElements = MaskTy->getVectorNumElements(); + if (NumElements == VT.getVectorNumElements()) { + for (unsigned i = 0; i < NumElements; ++i) { + Constant *COp = C->getAggregateElement(i); + if (!COp) { + ShuffleMask.clear(); + return; + } + if (isa<UndefValue>(COp)) + ShuffleMask.push_back(SM_SentinelUndef); + else { + uint64_t Element = cast<ConstantInt>(COp)->getZExtValue(); + Element &= (1 << NumElements*2) - 1; + ShuffleMask.push_back(Element); + } + } + } +} +} // llvm namespace diff --git a/contrib/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.h b/contrib/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.h new file mode 100644 index 0000000..bcf4632 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.h @@ -0,0 +1,45 @@ +//===-- X86ShuffleDecodeConstantPool.h - X86 shuffle decode -----*-C++-*---===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Define several functions to decode x86 specific shuffle semantics using +// constants from the constant pool. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_X86_X86SHUFFLEDECODECONSTANTPOOL_H +#define LLVM_LIB_TARGET_X86_X86SHUFFLEDECODECONSTANTPOOL_H + +#include "llvm/ADT/SmallVector.h" + +//===----------------------------------------------------------------------===// +// Vector Mask Decoding +//===----------------------------------------------------------------------===// + +namespace llvm { +class Constant; +class MVT; + +/// \brief Decode a PSHUFB mask from an IR-level vector constant. +void DecodePSHUFBMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask); + +/// \brief Decode a VPERMILP variable mask from an IR-level vector constant. +void DecodeVPERMILPMask(const Constant *C, unsigned ElSize, + SmallVectorImpl<int> &ShuffleMask); + +/// \brief Decode a VPERM W/D/Q/PS/PD mask from an IR-level vector constant. +void DecodeVPERMVMask(const Constant *C, MVT VT, + SmallVectorImpl<int> &ShuffleMask); + +/// \brief Decode a VPERMT2 W/D/Q/PS/PD mask from an IR-level vector constant. +void DecodeVPERMV3Mask(const Constant *C, MVT VT, + SmallVectorImpl<int> &ShuffleMask); + +} // llvm namespace + +#endif diff --git a/contrib/llvm/lib/Target/X86/X86Subtarget.cpp b/contrib/llvm/lib/Target/X86/X86Subtarget.cpp new file mode 100644 index 0000000..8ef08c9 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86Subtarget.cpp @@ -0,0 +1,343 @@ +//===-- X86Subtarget.cpp - X86 Subtarget Information ----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the X86 specific subclass of TargetSubtargetInfo. +// +//===----------------------------------------------------------------------===// + +#include "X86Subtarget.h" +#include "X86InstrInfo.h" +#include "X86TargetMachine.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/Host.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetOptions.h" + +#if defined(_MSC_VER) +#include <intrin.h> +#endif + +using namespace llvm; + +#define DEBUG_TYPE "subtarget" + +#define GET_SUBTARGETINFO_TARGET_DESC +#define GET_SUBTARGETINFO_CTOR +#include "X86GenSubtargetInfo.inc" + +// Temporary option to control early if-conversion for x86 while adding machine +// models. +static cl::opt<bool> +X86EarlyIfConv("x86-early-ifcvt", cl::Hidden, + cl::desc("Enable early if-conversion on X86")); + + +/// Classify a blockaddress reference for the current subtarget according to how +/// we should reference it in a non-pcrel context. +unsigned char X86Subtarget::ClassifyBlockAddressReference() const { + if (isPICStyleGOT()) // 32-bit ELF targets. + return X86II::MO_GOTOFF; + + if (isPICStyleStubPIC()) // Darwin/32 in PIC mode. + return X86II::MO_PIC_BASE_OFFSET; + + // Direct static reference to label. + return X86II::MO_NO_FLAG; +} + +/// Classify a global variable reference for the current subtarget according to +/// how we should reference it in a non-pcrel context. 
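+//
+// Illustrative examples (editor's sketch, not part of the original source),
+// derived from the cases handled below: an externally visible,
+// default-visibility global referenced from x86-64 RIP-relative PIC ELF code
+// (non-large code model) is classified as MO_GOTPCREL, i.e. its address is
+// loaded from the GOT, while a global with a strong definition in the current
+// translation unit under Darwin/32 PIC is classified as MO_PIC_BASE_OFFSET
+// and addressed directly relative to the PIC base.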
+unsigned char X86Subtarget:: +ClassifyGlobalReference(const GlobalValue *GV, const TargetMachine &TM) const { + // DLLImport only exists on windows, it is implemented as a load from a + // DLLIMPORT stub. + if (GV->hasDLLImportStorageClass()) + return X86II::MO_DLLIMPORT; + + bool isDef = GV->isStrongDefinitionForLinker(); + + // X86-64 in PIC mode. + if (isPICStyleRIPRel()) { + // Large model never uses stubs. + if (TM.getCodeModel() == CodeModel::Large) + return X86II::MO_NO_FLAG; + + if (isTargetDarwin()) { + // If symbol visibility is hidden, the extra load is not needed if + // target is x86-64 or the symbol is definitely defined in the current + // translation unit. + if (GV->hasDefaultVisibility() && !isDef) + return X86II::MO_GOTPCREL; + } else if (!isTargetWin64()) { + assert(isTargetELF() && "Unknown rip-relative target"); + + // Extra load is needed for all externally visible. + if (!GV->hasLocalLinkage() && GV->hasDefaultVisibility()) + return X86II::MO_GOTPCREL; + } + + return X86II::MO_NO_FLAG; + } + + if (isPICStyleGOT()) { // 32-bit ELF targets. + // Extra load is needed for all externally visible. + if (GV->hasLocalLinkage() || GV->hasHiddenVisibility()) + return X86II::MO_GOTOFF; + return X86II::MO_GOT; + } + + if (isPICStyleStubPIC()) { // Darwin/32 in PIC mode. + // Determine whether we have a stub reference and/or whether the reference + // is relative to the PIC base or not. + + // If this is a strong reference to a definition, it is definitely not + // through a stub. + if (isDef) + return X86II::MO_PIC_BASE_OFFSET; + + // Unless we have a symbol with hidden visibility, we have to go through a + // normal $non_lazy_ptr stub because this symbol might be resolved late. + if (!GV->hasHiddenVisibility()) // Non-hidden $non_lazy_ptr reference. + return X86II::MO_DARWIN_NONLAZY_PIC_BASE; + + // If symbol visibility is hidden, we have a stub for common symbol + // references and external declarations. + if (GV->isDeclarationForLinker() || GV->hasCommonLinkage()) { + // Hidden $non_lazy_ptr reference. + return X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE; + } + + // Otherwise, no stub. + return X86II::MO_PIC_BASE_OFFSET; + } + + if (isPICStyleStubNoDynamic()) { // Darwin/32 in -mdynamic-no-pic mode. + // Determine whether we have a stub reference. + + // If this is a strong reference to a definition, it is definitely not + // through a stub. + if (isDef) + return X86II::MO_NO_FLAG; + + // Unless we have a symbol with hidden visibility, we have to go through a + // normal $non_lazy_ptr stub because this symbol might be resolved late. + if (!GV->hasHiddenVisibility()) // Non-hidden $non_lazy_ptr reference. + return X86II::MO_DARWIN_NONLAZY; + + // Otherwise, no stub. + return X86II::MO_NO_FLAG; + } + + // Direct static reference to global. + return X86II::MO_NO_FLAG; +} + + +/// This function returns the name of a function which has an interface like +/// the non-standard bzero function, if such a function exists on the +/// current subtarget and it is considered preferable over memset with zero +/// passed as the second argument. Otherwise it returns null. +const char *X86Subtarget::getBZeroEntry() const { + // Darwin 10 has a __bzero entry point for this purpose. 
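+  // Editor's note (illustrative, not in the original source): per the
+  // documentation above, callers can use the returned name to rewrite a
+  // zeroing memset, e.g. memset(p, 0, n) may be emitted as __bzero(p, n)
+  // when this returns "__bzero" (macOS 10.6 or later per the check below).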
+ if (getTargetTriple().isMacOSX() && + !getTargetTriple().isMacOSXVersionLT(10, 6)) + return "__bzero"; + + return nullptr; +} + +bool X86Subtarget::hasSinCos() const { + return getTargetTriple().isMacOSX() && + !getTargetTriple().isMacOSXVersionLT(10, 9) && + is64Bit(); +} + +/// Return true if the subtarget allows calls to immediate address. +bool X86Subtarget::IsLegalToCallImmediateAddr(const TargetMachine &TM) const { + // FIXME: I386 PE/COFF supports PC relative calls using IMAGE_REL_I386_REL32 + // but WinCOFFObjectWriter::RecordRelocation cannot emit them. Once it does, + // the following check for Win32 should be removed. + if (In64BitMode || isTargetWin32()) + return false; + return isTargetELF() || TM.getRelocationModel() == Reloc::Static; +} + +void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { + std::string CPUName = CPU; + if (CPUName.empty()) + CPUName = "generic"; + + // Make sure 64-bit features are available in 64-bit mode. (But make sure + // SSE2 can be turned off explicitly.) + std::string FullFS = FS; + if (In64BitMode) { + if (!FullFS.empty()) + FullFS = "+64bit,+sse2," + FullFS; + else + FullFS = "+64bit,+sse2"; + } + + // LAHF/SAHF are always supported in non-64-bit mode. + if (!In64BitMode) { + if (!FullFS.empty()) + FullFS = "+sahf," + FullFS; + else + FullFS = "+sahf"; + } + + + // Parse features string and set the CPU. + ParseSubtargetFeatures(CPUName, FullFS); + + // All CPUs that implement SSE4.2 or SSE4A support unaligned accesses of + // 16-bytes and under that are reasonably fast. These features were + // introduced with Intel's Nehalem/Silvermont and AMD's Family10h + // micro-architectures respectively. + if (hasSSE42() || hasSSE4A()) + IsUAMem16Slow = false; + + InstrItins = getInstrItineraryForCPU(CPUName); + + // It's important to keep the MCSubtargetInfo feature bits in sync with + // target data structure which is shared with MC code emitter, etc. + if (In64BitMode) + ToggleFeature(X86::Mode64Bit); + else if (In32BitMode) + ToggleFeature(X86::Mode32Bit); + else if (In16BitMode) + ToggleFeature(X86::Mode16Bit); + else + llvm_unreachable("Not 16-bit, 32-bit or 64-bit mode!"); + + DEBUG(dbgs() << "Subtarget features: SSELevel " << X86SSELevel + << ", 3DNowLevel " << X863DNowLevel + << ", 64bit " << HasX86_64 << "\n"); + assert((!In64BitMode || HasX86_64) && + "64-bit code requested on a subtarget that doesn't support it!"); + + // Stack alignment is 16 bytes on Darwin, Linux and Solaris (both + // 32 and 64 bit) and for all 64-bit targets. 
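+  // For example (editor's sketch, not in the original source): with no
+  // stack-alignment override, i386-unknown-linux-gnu and any 64-bit target
+  // end up with stackAlignment == 16 via the branch below, while a plain
+  // 32-bit Windows target keeps the default of 4 set in
+  // initializeEnvironment().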
+ if (StackAlignOverride) + stackAlignment = StackAlignOverride; + else if (isTargetDarwin() || isTargetLinux() || isTargetSolaris() || + In64BitMode) + stackAlignment = 16; +} + +void X86Subtarget::initializeEnvironment() { + X86SSELevel = NoSSE; + X863DNowLevel = NoThreeDNow; + HasCMov = false; + HasX86_64 = false; + HasPOPCNT = false; + HasSSE4A = false; + HasAES = false; + HasFXSR = false; + HasXSAVE = false; + HasXSAVEOPT = false; + HasXSAVEC = false; + HasXSAVES = false; + HasPCLMUL = false; + HasFMA = false; + HasFMA4 = false; + HasXOP = false; + HasTBM = false; + HasMOVBE = false; + HasRDRAND = false; + HasF16C = false; + HasFSGSBase = false; + HasLZCNT = false; + HasBMI = false; + HasBMI2 = false; + HasRTM = false; + HasHLE = false; + HasERI = false; + HasCDI = false; + HasPFI = false; + HasDQI = false; + HasBWI = false; + HasVLX = false; + HasADX = false; + HasPKU = false; + HasSHA = false; + HasPRFCHW = false; + HasRDSEED = false; + HasLAHFSAHF = false; + HasMPX = false; + IsBTMemSlow = false; + IsSHLDSlow = false; + IsUAMem16Slow = false; + IsUAMem32Slow = false; + HasSSEUnalignedMem = false; + HasCmpxchg16b = false; + UseLeaForSP = false; + HasSlowDivide32 = false; + HasSlowDivide64 = false; + PadShortFunctions = false; + CallRegIndirect = false; + LEAUsesAG = false; + SlowLEA = false; + SlowIncDec = false; + stackAlignment = 4; + // FIXME: this is a known good value for Yonah. How about others? + MaxInlineSizeThreshold = 128; + UseSoftFloat = false; +} + +X86Subtarget &X86Subtarget::initializeSubtargetDependencies(StringRef CPU, + StringRef FS) { + initializeEnvironment(); + initSubtargetFeatures(CPU, FS); + return *this; +} + +X86Subtarget::X86Subtarget(const Triple &TT, const std::string &CPU, + const std::string &FS, const X86TargetMachine &TM, + unsigned StackAlignOverride) + : X86GenSubtargetInfo(TT, CPU, FS), X86ProcFamily(Others), + PICStyle(PICStyles::None), TargetTriple(TT), + StackAlignOverride(StackAlignOverride), + In64BitMode(TargetTriple.getArch() == Triple::x86_64), + In32BitMode(TargetTriple.getArch() == Triple::x86 && + TargetTriple.getEnvironment() != Triple::CODE16), + In16BitMode(TargetTriple.getArch() == Triple::x86 && + TargetTriple.getEnvironment() == Triple::CODE16), + TSInfo(), InstrInfo(initializeSubtargetDependencies(CPU, FS)), + TLInfo(TM, *this), FrameLowering(*this, getStackAlignment()) { + // Determine the PICStyle based on the target selected. + if (TM.getRelocationModel() == Reloc::Static) { + // Unless we're in PIC or DynamicNoPIC mode, set the PIC style to None. + setPICStyle(PICStyles::None); + } else if (is64Bit()) { + // PIC in 64 bit mode is always rip-rel. 
+ setPICStyle(PICStyles::RIPRel); + } else if (isTargetCOFF()) { + setPICStyle(PICStyles::None); + } else if (isTargetDarwin()) { + if (TM.getRelocationModel() == Reloc::PIC_) + setPICStyle(PICStyles::StubPIC); + else { + assert(TM.getRelocationModel() == Reloc::DynamicNoPIC); + setPICStyle(PICStyles::StubDynamicNoPIC); + } + } else if (isTargetELF()) { + setPICStyle(PICStyles::GOT); + } +} + +bool X86Subtarget::enableEarlyIfConversion() const { + return hasCMov() && X86EarlyIfConv; +} + diff --git a/contrib/llvm/lib/Target/X86/X86Subtarget.h b/contrib/llvm/lib/Target/X86/X86Subtarget.h new file mode 100644 index 0000000..13d1026 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86Subtarget.h @@ -0,0 +1,546 @@ +//===-- X86Subtarget.h - Define Subtarget for the X86 ----------*- C++ -*--===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file declares the X86 specific subclass of TargetSubtargetInfo. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_X86_X86SUBTARGET_H +#define LLVM_LIB_TARGET_X86_X86SUBTARGET_H + +#include "X86FrameLowering.h" +#include "X86ISelLowering.h" +#include "X86InstrInfo.h" +#include "X86SelectionDAGInfo.h" +#include "llvm/ADT/Triple.h" +#include "llvm/IR/CallingConv.h" +#include "llvm/Target/TargetSubtargetInfo.h" +#include <string> + +#define GET_SUBTARGETINFO_HEADER +#include "X86GenSubtargetInfo.inc" + +namespace llvm { +class GlobalValue; +class StringRef; +class TargetMachine; + +/// The X86 backend supports a number of different styles of PIC. +/// +namespace PICStyles { +enum Style { + StubPIC, // Used on i386-darwin in -fPIC mode. + StubDynamicNoPIC, // Used on i386-darwin in -mdynamic-no-pic mode. + GOT, // Used on many 32-bit unices in -fPIC mode. + RIPRel, // Used on X86-64 when not in -static mode. + None // Set when in -static mode (not PIC or DynamicNoPIC mode). +}; +} + +class X86Subtarget final : public X86GenSubtargetInfo { + +protected: + enum X86SSEEnum { + NoSSE, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, AVX, AVX2, AVX512F + }; + + enum X863DNowEnum { + NoThreeDNow, MMX, ThreeDNow, ThreeDNowA + }; + + enum X86ProcFamilyEnum { + Others, IntelAtom, IntelSLM + }; + + /// X86 processor family: Intel Atom, and others + X86ProcFamilyEnum X86ProcFamily; + + /// Which PIC style to use + PICStyles::Style PICStyle; + + /// SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, or none supported. + X86SSEEnum X86SSELevel; + + /// MMX, 3DNow, 3DNow Athlon, or none supported. + X863DNowEnum X863DNowLevel; + + /// True if this processor has conditional move instructions + /// (generally pentium pro+). + bool HasCMov; + + /// True if the processor supports X86-64 instructions. + bool HasX86_64; + + /// True if the processor supports POPCNT. + bool HasPOPCNT; + + /// True if the processor supports SSE4A instructions. 
+ bool HasSSE4A; + + /// Target has AES instructions + bool HasAES; + + /// Target has FXSAVE/FXRESTOR instructions + bool HasFXSR; + + /// Target has XSAVE instructions + bool HasXSAVE; + /// Target has XSAVEOPT instructions + bool HasXSAVEOPT; + /// Target has XSAVEC instructions + bool HasXSAVEC; + /// Target has XSAVES instructions + bool HasXSAVES; + + /// Target has carry-less multiplication + bool HasPCLMUL; + + /// Target has 3-operand fused multiply-add + bool HasFMA; + + /// Target has 4-operand fused multiply-add + bool HasFMA4; + + /// Target has XOP instructions + bool HasXOP; + + /// Target has TBM instructions. + bool HasTBM; + + /// True if the processor has the MOVBE instruction. + bool HasMOVBE; + + /// True if the processor has the RDRAND instruction. + bool HasRDRAND; + + /// Processor has 16-bit floating point conversion instructions. + bool HasF16C; + + /// Processor has FS/GS base insturctions. + bool HasFSGSBase; + + /// Processor has LZCNT instruction. + bool HasLZCNT; + + /// Processor has BMI1 instructions. + bool HasBMI; + + /// Processor has BMI2 instructions. + bool HasBMI2; + + /// Processor has RTM instructions. + bool HasRTM; + + /// Processor has HLE. + bool HasHLE; + + /// Processor has ADX instructions. + bool HasADX; + + /// Processor has SHA instructions. + bool HasSHA; + + /// Processor has PRFCHW instructions. + bool HasPRFCHW; + + /// Processor has RDSEED instructions. + bool HasRDSEED; + + /// Processor has LAHF/SAHF instructions. + bool HasLAHFSAHF; + + /// True if BT (bit test) of memory instructions are slow. + bool IsBTMemSlow; + + /// True if SHLD instructions are slow. + bool IsSHLDSlow; + + /// True if unaligned memory accesses of 16-bytes are slow. + bool IsUAMem16Slow; + + /// True if unaligned memory accesses of 32-bytes are slow. + bool IsUAMem32Slow; + + /// True if SSE operations can have unaligned memory operands. + /// This may require setting a configuration bit in the processor. + bool HasSSEUnalignedMem; + + /// True if this processor has the CMPXCHG16B instruction; + /// this is true for most x86-64 chips, but not the first AMD chips. + bool HasCmpxchg16b; + + /// True if the LEA instruction should be used for adjusting + /// the stack pointer. This is an optimization for Intel Atom processors. + bool UseLeaForSP; + + /// True if 8-bit divisions are significantly faster than + /// 32-bit divisions and should be used when possible. + bool HasSlowDivide32; + + /// True if 16-bit divides are significantly faster than + /// 64-bit divisions and should be used when possible. + bool HasSlowDivide64; + + /// True if the short functions should be padded to prevent + /// a stall when returning too early. + bool PadShortFunctions; + + /// True if the Calls with memory reference should be converted + /// to a register-based indirect call. + bool CallRegIndirect; + + /// True if the LEA instruction inputs have to be ready at address generation + /// (AG) time. 
+ bool LEAUsesAG; + + /// True if the LEA instruction with certain arguments is slow + bool SlowLEA; + + /// True if INC and DEC instructions are slow when writing to flags + bool SlowIncDec; + + /// Processor has AVX-512 PreFetch Instructions + bool HasPFI; + + /// Processor has AVX-512 Exponential and Reciprocal Instructions + bool HasERI; + + /// Processor has AVX-512 Conflict Detection Instructions + bool HasCDI; + + /// Processor has AVX-512 Doubleword and Quadword instructions + bool HasDQI; + + /// Processor has AVX-512 Byte and Word instructions + bool HasBWI; + + /// Processor has AVX-512 Vector Length eXtenstions + bool HasVLX; + + /// Processor has PKU extenstions + bool HasPKU; + + /// Processot supports MPX - Memory Protection Extensions + bool HasMPX; + + /// Use software floating point for code generation. + bool UseSoftFloat; + + /// The minimum alignment known to hold of the stack frame on + /// entry to the function and which must be maintained by every function. + unsigned stackAlignment; + + /// Max. memset / memcpy size that is turned into rep/movs, rep/stos ops. + /// + unsigned MaxInlineSizeThreshold; + + /// What processor and OS we're targeting. + Triple TargetTriple; + + /// Instruction itineraries for scheduling + InstrItineraryData InstrItins; + +private: + + /// Override the stack alignment. + unsigned StackAlignOverride; + + /// True if compiling for 64-bit, false for 16-bit or 32-bit. + bool In64BitMode; + + /// True if compiling for 32-bit, false for 16-bit or 64-bit. + bool In32BitMode; + + /// True if compiling for 16-bit, false for 32-bit or 64-bit. + bool In16BitMode; + + X86SelectionDAGInfo TSInfo; + // Ordering here is important. X86InstrInfo initializes X86RegisterInfo which + // X86TargetLowering needs. + X86InstrInfo InstrInfo; + X86TargetLowering TLInfo; + X86FrameLowering FrameLowering; + +public: + /// This constructor initializes the data members to match that + /// of the specified triple. + /// + X86Subtarget(const Triple &TT, const std::string &CPU, const std::string &FS, + const X86TargetMachine &TM, unsigned StackAlignOverride); + + const X86TargetLowering *getTargetLowering() const override { + return &TLInfo; + } + const X86InstrInfo *getInstrInfo() const override { return &InstrInfo; } + const X86FrameLowering *getFrameLowering() const override { + return &FrameLowering; + } + const X86SelectionDAGInfo *getSelectionDAGInfo() const override { + return &TSInfo; + } + const X86RegisterInfo *getRegisterInfo() const override { + return &getInstrInfo()->getRegisterInfo(); + } + + /// Returns the minimum alignment known to hold of the + /// stack frame on entry to the function and which must be maintained by every + /// function for this subtarget. + unsigned getStackAlignment() const { return stackAlignment; } + + /// Returns the maximum memset / memcpy size + /// that still makes it profitable to inline the call. + unsigned getMaxInlineSizeThreshold() const { return MaxInlineSizeThreshold; } + + /// ParseSubtargetFeatures - Parses features string setting specified + /// subtarget options. Definition of function is auto generated by tblgen. + void ParseSubtargetFeatures(StringRef CPU, StringRef FS); + +private: + /// Initialize the full set of dependencies so we can use an initializer + /// list for X86Subtarget. + X86Subtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS); + void initializeEnvironment(); + void initSubtargetFeatures(StringRef CPU, StringRef FS); +public: + /// Is this x86_64? 
(disregarding specific ABI / programming model) + bool is64Bit() const { + return In64BitMode; + } + + bool is32Bit() const { + return In32BitMode; + } + + bool is16Bit() const { + return In16BitMode; + } + + /// Is this x86_64 with the ILP32 programming model (x32 ABI)? + bool isTarget64BitILP32() const { + return In64BitMode && (TargetTriple.getEnvironment() == Triple::GNUX32 || + TargetTriple.isOSNaCl()); + } + + /// Is this x86_64 with the LP64 programming model (standard AMD64, no x32)? + bool isTarget64BitLP64() const { + return In64BitMode && (TargetTriple.getEnvironment() != Triple::GNUX32 && + !TargetTriple.isOSNaCl()); + } + + PICStyles::Style getPICStyle() const { return PICStyle; } + void setPICStyle(PICStyles::Style Style) { PICStyle = Style; } + + bool hasCMov() const { return HasCMov; } + bool hasSSE1() const { return X86SSELevel >= SSE1; } + bool hasSSE2() const { return X86SSELevel >= SSE2; } + bool hasSSE3() const { return X86SSELevel >= SSE3; } + bool hasSSSE3() const { return X86SSELevel >= SSSE3; } + bool hasSSE41() const { return X86SSELevel >= SSE41; } + bool hasSSE42() const { return X86SSELevel >= SSE42; } + bool hasAVX() const { return X86SSELevel >= AVX; } + bool hasAVX2() const { return X86SSELevel >= AVX2; } + bool hasAVX512() const { return X86SSELevel >= AVX512F; } + bool hasFp256() const { return hasAVX(); } + bool hasInt256() const { return hasAVX2(); } + bool hasSSE4A() const { return HasSSE4A; } + bool hasMMX() const { return X863DNowLevel >= MMX; } + bool has3DNow() const { return X863DNowLevel >= ThreeDNow; } + bool has3DNowA() const { return X863DNowLevel >= ThreeDNowA; } + bool hasPOPCNT() const { return HasPOPCNT; } + bool hasAES() const { return HasAES; } + bool hasFXSR() const { return HasFXSR; } + bool hasXSAVE() const { return HasXSAVE; } + bool hasXSAVEOPT() const { return HasXSAVEOPT; } + bool hasXSAVEC() const { return HasXSAVEC; } + bool hasXSAVES() const { return HasXSAVES; } + bool hasPCLMUL() const { return HasPCLMUL; } + // Prefer FMA4 to FMA - its better for commutation/memory folding and + // has equal or better performance on all supported targets. 
+ bool hasFMA() const { return HasFMA && !HasFMA4; } + bool hasFMA4() const { return HasFMA4; } + bool hasAnyFMA() const { return hasFMA() || hasFMA4() || hasAVX512(); } + bool hasXOP() const { return HasXOP; } + bool hasTBM() const { return HasTBM; } + bool hasMOVBE() const { return HasMOVBE; } + bool hasRDRAND() const { return HasRDRAND; } + bool hasF16C() const { return HasF16C; } + bool hasFSGSBase() const { return HasFSGSBase; } + bool hasLZCNT() const { return HasLZCNT; } + bool hasBMI() const { return HasBMI; } + bool hasBMI2() const { return HasBMI2; } + bool hasRTM() const { return HasRTM; } + bool hasHLE() const { return HasHLE; } + bool hasADX() const { return HasADX; } + bool hasSHA() const { return HasSHA; } + bool hasPRFCHW() const { return HasPRFCHW; } + bool hasRDSEED() const { return HasRDSEED; } + bool hasLAHFSAHF() const { return HasLAHFSAHF; } + bool isBTMemSlow() const { return IsBTMemSlow; } + bool isSHLDSlow() const { return IsSHLDSlow; } + bool isUnalignedMem16Slow() const { return IsUAMem16Slow; } + bool isUnalignedMem32Slow() const { return IsUAMem32Slow; } + bool hasSSEUnalignedMem() const { return HasSSEUnalignedMem; } + bool hasCmpxchg16b() const { return HasCmpxchg16b; } + bool useLeaForSP() const { return UseLeaForSP; } + bool hasSlowDivide32() const { return HasSlowDivide32; } + bool hasSlowDivide64() const { return HasSlowDivide64; } + bool padShortFunctions() const { return PadShortFunctions; } + bool callRegIndirect() const { return CallRegIndirect; } + bool LEAusesAG() const { return LEAUsesAG; } + bool slowLEA() const { return SlowLEA; } + bool slowIncDec() const { return SlowIncDec; } + bool hasCDI() const { return HasCDI; } + bool hasPFI() const { return HasPFI; } + bool hasERI() const { return HasERI; } + bool hasDQI() const { return HasDQI; } + bool hasBWI() const { return HasBWI; } + bool hasVLX() const { return HasVLX; } + bool hasPKU() const { return HasPKU; } + bool hasMPX() const { return HasMPX; } + + bool isAtom() const { return X86ProcFamily == IntelAtom; } + bool isSLM() const { return X86ProcFamily == IntelSLM; } + bool useSoftFloat() const { return UseSoftFloat; } + + const Triple &getTargetTriple() const { return TargetTriple; } + + bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); } + bool isTargetFreeBSD() const { return TargetTriple.isOSFreeBSD(); } + bool isTargetDragonFly() const { return TargetTriple.isOSDragonFly(); } + bool isTargetSolaris() const { return TargetTriple.isOSSolaris(); } + bool isTargetPS4() const { return TargetTriple.isPS4(); } + + bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); } + bool isTargetCOFF() const { return TargetTriple.isOSBinFormatCOFF(); } + bool isTargetMachO() const { return TargetTriple.isOSBinFormatMachO(); } + + bool isTargetLinux() const { return TargetTriple.isOSLinux(); } + bool isTargetAndroid() const { return TargetTriple.isAndroid(); } + bool isTargetNaCl() const { return TargetTriple.isOSNaCl(); } + bool isTargetNaCl32() const { return isTargetNaCl() && !is64Bit(); } + bool isTargetNaCl64() const { return isTargetNaCl() && is64Bit(); } + bool isTargetMCU() const { return TargetTriple.isOSIAMCU(); } + + bool isTargetWindowsMSVC() const { + return TargetTriple.isWindowsMSVCEnvironment(); + } + + bool isTargetKnownWindowsMSVC() const { + return TargetTriple.isKnownWindowsMSVCEnvironment(); + } + + bool isTargetWindowsCoreCLR() const { + return TargetTriple.isWindowsCoreCLREnvironment(); + } + + bool isTargetWindowsCygwin() const { + return 
TargetTriple.isWindowsCygwinEnvironment(); + } + + bool isTargetWindowsGNU() const { + return TargetTriple.isWindowsGNUEnvironment(); + } + + bool isTargetWindowsItanium() const { + return TargetTriple.isWindowsItaniumEnvironment(); + } + + bool isTargetCygMing() const { return TargetTriple.isOSCygMing(); } + + bool isOSWindows() const { return TargetTriple.isOSWindows(); } + + bool isTargetWin64() const { + return In64BitMode && TargetTriple.isOSWindows(); + } + + bool isTargetWin32() const { + return !In64BitMode && (isTargetCygMing() || isTargetKnownWindowsMSVC()); + } + + bool isPICStyleSet() const { return PICStyle != PICStyles::None; } + bool isPICStyleGOT() const { return PICStyle == PICStyles::GOT; } + bool isPICStyleRIPRel() const { return PICStyle == PICStyles::RIPRel; } + + bool isPICStyleStubPIC() const { + return PICStyle == PICStyles::StubPIC; + } + + bool isPICStyleStubNoDynamic() const { + return PICStyle == PICStyles::StubDynamicNoPIC; + } + bool isPICStyleStubAny() const { + return PICStyle == PICStyles::StubDynamicNoPIC || + PICStyle == PICStyles::StubPIC; + } + + bool isCallingConvWin64(CallingConv::ID CC) const { + switch (CC) { + // On Win64, all these conventions just use the default convention. + case CallingConv::C: + case CallingConv::Fast: + case CallingConv::X86_FastCall: + case CallingConv::X86_StdCall: + case CallingConv::X86_ThisCall: + case CallingConv::X86_VectorCall: + case CallingConv::Intel_OCL_BI: + return isTargetWin64(); + // This convention allows using the Win64 convention on other targets. + case CallingConv::X86_64_Win64: + return true; + // This convention allows using the SysV convention on Windows targets. + case CallingConv::X86_64_SysV: + return false; + // Otherwise, who knows what this is. + default: + return false; + } + } + + /// ClassifyGlobalReference - Classify a global variable reference for the + /// current subtarget according to how we should reference it in a non-pcrel + /// context. + unsigned char ClassifyGlobalReference(const GlobalValue *GV, + const TargetMachine &TM)const; + + /// Classify a blockaddress reference for the current subtarget according to + /// how we should reference it in a non-pcrel context. + unsigned char ClassifyBlockAddressReference() const; + + /// Return true if the subtarget allows calls to immediate address. + bool IsLegalToCallImmediateAddr(const TargetMachine &TM) const; + + /// This function returns the name of a function which has an interface + /// like the non-standard bzero function, if such a function exists on + /// the current subtarget and it is considered prefereable over + /// memset with zero passed as the second argument. Otherwise it + /// returns null. + const char *getBZeroEntry() const; + + /// This function returns true if the target has sincos() routine in its + /// compiler runtime or math libraries. + bool hasSinCos() const; + + /// Enable the MachineScheduler pass for all X86 subtargets. + bool enableMachineScheduler() const override { return true; } + + bool enableEarlyIfConversion() const override; + + /// Return the instruction itineraries based on the subtarget selection. 
+ const InstrItineraryData *getInstrItineraryData() const override { + return &InstrItins; + } + + AntiDepBreakMode getAntiDepBreakMode() const override { + return TargetSubtargetInfo::ANTIDEP_CRITICAL; + } +}; + +} // End llvm namespace + +#endif diff --git a/contrib/llvm/lib/Target/X86/X86TargetMachine.cpp b/contrib/llvm/lib/Target/X86/X86TargetMachine.cpp new file mode 100644 index 0000000..0e7e4c0 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86TargetMachine.cpp @@ -0,0 +1,280 @@ +//===-- X86TargetMachine.cpp - Define TargetMachine for the X86 -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the X86 specific subclass of TargetMachine. +// +//===----------------------------------------------------------------------===// + +#include "X86TargetMachine.h" +#include "X86.h" +#include "X86TargetObjectFile.h" +#include "X86TargetTransformInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/LegacyPassManager.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/FormattedStream.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Target/TargetOptions.h" +using namespace llvm; + +static cl::opt<bool> EnableMachineCombinerPass("x86-machine-combiner", + cl::desc("Enable the machine combiner pass"), + cl::init(true), cl::Hidden); + +namespace llvm { +void initializeWinEHStatePassPass(PassRegistry &); +} + +extern "C" void LLVMInitializeX86Target() { + // Register the target. + RegisterTargetMachine<X86TargetMachine> X(TheX86_32Target); + RegisterTargetMachine<X86TargetMachine> Y(TheX86_64Target); + + PassRegistry &PR = *PassRegistry::getPassRegistry(); + initializeWinEHStatePassPass(PR); +} + +static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { + if (TT.isOSBinFormatMachO()) { + if (TT.getArch() == Triple::x86_64) + return make_unique<X86_64MachoTargetObjectFile>(); + return make_unique<TargetLoweringObjectFileMachO>(); + } + + if (TT.isOSLinux() || TT.isOSNaCl()) + return make_unique<X86LinuxNaClTargetObjectFile>(); + if (TT.isOSBinFormatELF()) + return make_unique<X86ELFTargetObjectFile>(); + if (TT.isKnownWindowsMSVCEnvironment() || TT.isWindowsCoreCLREnvironment()) + return make_unique<X86WindowsTargetObjectFile>(); + if (TT.isOSBinFormatCOFF()) + return make_unique<TargetLoweringObjectFileCOFF>(); + llvm_unreachable("unknown subtarget type"); +} + +static std::string computeDataLayout(const Triple &TT) { + // X86 is little endian + std::string Ret = "e"; + + Ret += DataLayout::getManglingComponent(TT); + // X86 and x32 have 32 bit pointers. + if ((TT.isArch64Bit() && + (TT.getEnvironment() == Triple::GNUX32 || TT.isOSNaCl())) || + !TT.isArch64Bit()) + Ret += "-p:32:32"; + + // Some ABIs align 64 bit integers and doubles to 64 bits, others to 32. + if (TT.isArch64Bit() || TT.isOSWindows() || TT.isOSNaCl()) + Ret += "-i64:64"; + else + Ret += "-f64:32:64"; + + // Some ABIs align long double to 128 bits, others to 32. + if (TT.isOSNaCl()) + ; // No f80 + else if (TT.isArch64Bit() || TT.isOSDarwin()) + Ret += "-f80:128"; + else + Ret += "-f80:32"; + + // The registers can hold 8, 16, 32 or, in x86-64, 64 bits. + if (TT.isArch64Bit()) + Ret += "-n8:16:32:64"; + else + Ret += "-n8:16:32"; + + // The stack is aligned to 32 bits on some ABIs and 128 bits on others. 
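+  // Putting the pieces together (editor's sketch, not in the original
+  // source): for x86_64-unknown-linux-gnu the string assembled by this
+  // function comes out as something like
+  //   "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+  // i.e. little-endian, ELF mangling, 64-bit-aligned i64, 128-bit-aligned
+  // f80, native 8/16/32/64-bit integers, and a 128-bit-aligned stack.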
+ if (!TT.isArch64Bit() && TT.isOSWindows()) + Ret += "-a:0:32-S32"; + else + Ret += "-S128"; + + return Ret; +} + +/// X86TargetMachine ctor - Create an X86 target. +/// +X86TargetMachine::X86TargetMachine(const Target &T, const Triple &TT, + StringRef CPU, StringRef FS, + const TargetOptions &Options, + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL) + : LLVMTargetMachine(T, computeDataLayout(TT), TT, CPU, FS, Options, RM, CM, + OL), + TLOF(createTLOF(getTargetTriple())), + Subtarget(TT, CPU, FS, *this, Options.StackAlignmentOverride) { + // Windows stack unwinder gets confused when execution flow "falls through" + // after a call to 'noreturn' function. + // To prevent that, we emit a trap for 'unreachable' IR instructions. + // (which on X86, happens to be the 'ud2' instruction) + if (Subtarget.isTargetWin64()) + this->Options.TrapUnreachable = true; + + // By default (and when -ffast-math is on), enable estimate codegen for + // everything except scalar division. By default, use 1 refinement step for + // all operations. Defaults may be overridden by using command-line options. + // Scalar division estimates are disabled because they break too much + // real-world code. These defaults match GCC behavior. + this->Options.Reciprocals.setDefaults("sqrtf", true, 1); + this->Options.Reciprocals.setDefaults("divf", false, 1); + this->Options.Reciprocals.setDefaults("vec-sqrtf", true, 1); + this->Options.Reciprocals.setDefaults("vec-divf", true, 1); + + initAsmInfo(); +} + +X86TargetMachine::~X86TargetMachine() {} + +const X86Subtarget * +X86TargetMachine::getSubtargetImpl(const Function &F) const { + Attribute CPUAttr = F.getFnAttribute("target-cpu"); + Attribute FSAttr = F.getFnAttribute("target-features"); + + std::string CPU = !CPUAttr.hasAttribute(Attribute::None) + ? CPUAttr.getValueAsString().str() + : TargetCPU; + std::string FS = !FSAttr.hasAttribute(Attribute::None) + ? FSAttr.getValueAsString().str() + : TargetFS; + + // FIXME: This is related to the code below to reset the target options, + // we need to know whether or not the soft float flag is set on the + // function before we can generate a subtarget. We also need to use + // it as a key for the subtarget since that can be the only difference + // between two functions. + bool SoftFloat = + F.hasFnAttribute("use-soft-float") && + F.getFnAttribute("use-soft-float").getValueAsString() == "true"; + // If the soft float attribute is set on the function turn on the soft float + // subtarget feature. + if (SoftFloat) + FS += FS.empty() ? "+soft-float" : ",+soft-float"; + + auto &I = SubtargetMap[CPU + FS]; + if (!I) { + // This needs to be done before we create a new subtarget since any + // creation will depend on the TM and the code generation flags on the + // function that reside in TargetOptions. + resetTargetOptions(F); + I = llvm::make_unique<X86Subtarget>(TargetTriple, CPU, FS, *this, + Options.StackAlignmentOverride); + } + return I.get(); +} + +//===----------------------------------------------------------------------===// +// Command line options for x86 +//===----------------------------------------------------------------------===// +static cl::opt<bool> +UseVZeroUpper("x86-use-vzeroupper", cl::Hidden, + cl::desc("Minimize AVX to SSE transition penalty"), + cl::init(true)); + +//===----------------------------------------------------------------------===// +// X86 TTI query. 
+//===----------------------------------------------------------------------===// + +TargetIRAnalysis X86TargetMachine::getTargetIRAnalysis() { + return TargetIRAnalysis([this](const Function &F) { + return TargetTransformInfo(X86TTIImpl(this, F)); + }); +} + + +//===----------------------------------------------------------------------===// +// Pass Pipeline Configuration +//===----------------------------------------------------------------------===// + +namespace { +/// X86 Code Generator Pass Configuration Options. +class X86PassConfig : public TargetPassConfig { +public: + X86PassConfig(X86TargetMachine *TM, PassManagerBase &PM) + : TargetPassConfig(TM, PM) {} + + X86TargetMachine &getX86TargetMachine() const { + return getTM<X86TargetMachine>(); + } + + void addIRPasses() override; + bool addInstSelector() override; + bool addILPOpts() override; + bool addPreISel() override; + void addPreRegAlloc() override; + void addPostRegAlloc() override; + void addPreEmitPass() override; + void addPreSched2() override; +}; +} // namespace + +TargetPassConfig *X86TargetMachine::createPassConfig(PassManagerBase &PM) { + return new X86PassConfig(this, PM); +} + +void X86PassConfig::addIRPasses() { + addPass(createAtomicExpandPass(&getX86TargetMachine())); + + TargetPassConfig::addIRPasses(); +} + +bool X86PassConfig::addInstSelector() { + // Install an instruction selector. + addPass(createX86ISelDag(getX86TargetMachine(), getOptLevel())); + + // For ELF, cleanup any local-dynamic TLS accesses. + if (TM->getTargetTriple().isOSBinFormatELF() && + getOptLevel() != CodeGenOpt::None) + addPass(createCleanupLocalDynamicTLSPass()); + + addPass(createX86GlobalBaseRegPass()); + + return false; +} + +bool X86PassConfig::addILPOpts() { + addPass(&EarlyIfConverterID); + if (EnableMachineCombinerPass) + addPass(&MachineCombinerID); + return true; +} + +bool X86PassConfig::addPreISel() { + // Only add this pass for 32-bit x86 Windows. + const Triple &TT = TM->getTargetTriple(); + if (TT.isOSWindows() && TT.getArch() == Triple::x86) + addPass(createX86WinEHStatePass()); + return true; +} + +void X86PassConfig::addPreRegAlloc() { + if (getOptLevel() != CodeGenOpt::None) + addPass(createX86OptimizeLEAs()); + + addPass(createX86CallFrameOptimization()); +} + +void X86PassConfig::addPostRegAlloc() { + addPass(createX86FloatingPointStackifierPass()); +} + +void X86PassConfig::addPreSched2() { addPass(createX86ExpandPseudoPass()); } + +void X86PassConfig::addPreEmitPass() { + if (getOptLevel() != CodeGenOpt::None) + addPass(createExecutionDependencyFixPass(&X86::VR128RegClass)); + + if (UseVZeroUpper) + addPass(createX86IssueVZeroUpperPass()); + + if (getOptLevel() != CodeGenOpt::None) { + addPass(createX86PadShortFunctions()); + addPass(createX86FixupLEAs()); + } +} diff --git a/contrib/llvm/lib/Target/X86/X86TargetMachine.h b/contrib/llvm/lib/Target/X86/X86TargetMachine.h new file mode 100644 index 0000000..2629556 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86TargetMachine.h @@ -0,0 +1,49 @@ +//===-- X86TargetMachine.h - Define TargetMachine for the X86 ---*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file declares the X86 specific subclass of TargetMachine. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_X86_X86TARGETMACHINE_H +#define LLVM_LIB_TARGET_X86_X86TARGETMACHINE_H +#include "X86InstrInfo.h" +#include "X86Subtarget.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/Target/TargetMachine.h" + +namespace llvm { + +class StringRef; + +class X86TargetMachine final : public LLVMTargetMachine { + std::unique_ptr<TargetLoweringObjectFile> TLOF; + X86Subtarget Subtarget; + + mutable StringMap<std::unique_ptr<X86Subtarget>> SubtargetMap; + +public: + X86TargetMachine(const Target &T, const Triple &TT, StringRef CPU, + StringRef FS, const TargetOptions &Options, Reloc::Model RM, + CodeModel::Model CM, CodeGenOpt::Level OL); + ~X86TargetMachine() override; + const X86Subtarget *getSubtargetImpl(const Function &F) const override; + + TargetIRAnalysis getTargetIRAnalysis() override; + + // Set up the pass pipeline. + TargetPassConfig *createPassConfig(PassManagerBase &PM) override; + TargetLoweringObjectFile *getObjFileLowering() const override { + return TLOF.get(); + } +}; + +} // End llvm namespace + +#endif diff --git a/contrib/llvm/lib/Target/X86/X86TargetObjectFile.cpp b/contrib/llvm/lib/Target/X86/X86TargetObjectFile.cpp new file mode 100644 index 0000000..782768d --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86TargetObjectFile.cpp @@ -0,0 +1,175 @@ +//===-- X86TargetObjectFile.cpp - X86 Object Info -------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "X86TargetObjectFile.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/IR/Mangler.h" +#include "llvm/IR/Operator.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCSectionCOFF.h" +#include "llvm/MC/MCSectionELF.h" +#include "llvm/MC/MCValue.h" +#include "llvm/Support/COFF.h" +#include "llvm/Support/Dwarf.h" +#include "llvm/Target/TargetLowering.h" + +using namespace llvm; +using namespace dwarf; + +const MCExpr *X86_64MachoTargetObjectFile::getTTypeGlobalReference( + const GlobalValue *GV, unsigned Encoding, Mangler &Mang, + const TargetMachine &TM, MachineModuleInfo *MMI, + MCStreamer &Streamer) const { + + // On Darwin/X86-64, we can reference dwarf symbols with foo@GOTPCREL+4, which + // is an indirect pc-relative reference. + if ((Encoding & DW_EH_PE_indirect) && (Encoding & DW_EH_PE_pcrel)) { + const MCSymbol *Sym = TM.getSymbol(GV, Mang); + const MCExpr *Res = + MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_GOTPCREL, getContext()); + const MCExpr *Four = MCConstantExpr::create(4, getContext()); + return MCBinaryExpr::createAdd(Res, Four, getContext()); + } + + return TargetLoweringObjectFileMachO::getTTypeGlobalReference( + GV, Encoding, Mang, TM, MMI, Streamer); +} + +MCSymbol *X86_64MachoTargetObjectFile::getCFIPersonalitySymbol( + const GlobalValue *GV, Mangler &Mang, const TargetMachine &TM, + MachineModuleInfo *MMI) const { + return TM.getSymbol(GV, Mang); +} + +const MCExpr *X86_64MachoTargetObjectFile::getIndirectSymViaGOTPCRel( + const MCSymbol *Sym, const MCValue &MV, int64_t Offset, + MachineModuleInfo *MMI, MCStreamer &Streamer) const { + // On Darwin/X86-64, we need to use foo@GOTPCREL+4 to access the got entry + // from a data section. In case there's an additional offset, then use + // foo@GOTPCREL+4+<offset>. 
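+  // Worked example (editor's sketch, not in the original source): with
+  // Offset == 8 and no extra constant in the MCValue, FinalOff below becomes
+  // 8 + 0 + 4 = 12 and the emitted expression is foo@GOTPCREL+12.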
+ unsigned FinalOff = Offset+MV.getConstant()+4; + const MCExpr *Res = + MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_GOTPCREL, getContext()); + const MCExpr *Off = MCConstantExpr::create(FinalOff, getContext()); + return MCBinaryExpr::createAdd(Res, Off, getContext()); +} + +const MCExpr *X86ELFTargetObjectFile::getDebugThreadLocalSymbol( + const MCSymbol *Sym) const { + return MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_DTPOFF, getContext()); +} + +void +X86LinuxNaClTargetObjectFile::Initialize(MCContext &Ctx, + const TargetMachine &TM) { + TargetLoweringObjectFileELF::Initialize(Ctx, TM); + InitializeELF(TM.Options.UseInitArray); +} + +const MCExpr *X86WindowsTargetObjectFile::getExecutableRelativeSymbol( + const ConstantExpr *CE, Mangler &Mang, const TargetMachine &TM) const { + // We are looking for the difference of two symbols, need a subtraction + // operation. + const SubOperator *Sub = dyn_cast<SubOperator>(CE); + if (!Sub) + return nullptr; + + // Symbols must first be numbers before we can subtract them, we need to see a + // ptrtoint on both subtraction operands. + const PtrToIntOperator *SubLHS = + dyn_cast<PtrToIntOperator>(Sub->getOperand(0)); + const PtrToIntOperator *SubRHS = + dyn_cast<PtrToIntOperator>(Sub->getOperand(1)); + if (!SubLHS || !SubRHS) + return nullptr; + + // Our symbols should exist in address space zero, cowardly no-op if + // otherwise. + if (SubLHS->getPointerAddressSpace() != 0 || + SubRHS->getPointerAddressSpace() != 0) + return nullptr; + + // Both ptrtoint instructions must wrap global objects: + // - Only global variables are eligible for image relative relocations. + // - The subtrahend refers to the special symbol __ImageBase, a GlobalVariable. + const auto *GOLHS = dyn_cast<GlobalObject>(SubLHS->getPointerOperand()); + const auto *GVRHS = dyn_cast<GlobalVariable>(SubRHS->getPointerOperand()); + if (!GOLHS || !GVRHS) + return nullptr; + + // We expect __ImageBase to be a global variable without a section, externally + // defined. + // + // It should look something like this: @__ImageBase = external constant i8 + if (GVRHS->isThreadLocal() || GVRHS->getName() != "__ImageBase" || + !GVRHS->hasExternalLinkage() || GVRHS->hasInitializer() || + GVRHS->hasSection()) + return nullptr; + + // An image-relative, thread-local, symbol makes no sense. 
+ if (GOLHS->isThreadLocal()) + return nullptr; + + return MCSymbolRefExpr::create(TM.getSymbol(GOLHS, Mang), + MCSymbolRefExpr::VK_COFF_IMGREL32, + getContext()); +} + +static std::string APIntToHexString(const APInt &AI) { + unsigned Width = (AI.getBitWidth() / 8) * 2; + std::string HexString = utohexstr(AI.getLimitedValue(), /*LowerCase=*/true); + unsigned Size = HexString.size(); + assert(Width >= Size && "hex string is too large!"); + HexString.insert(HexString.begin(), Width - Size, '0'); + + return HexString; +} + +static std::string scalarConstantToHexString(const Constant *C) { + Type *Ty = C->getType(); + if (isa<UndefValue>(C)) { + return APIntToHexString(APInt::getNullValue(Ty->getPrimitiveSizeInBits())); + } else if (const auto *CFP = dyn_cast<ConstantFP>(C)) { + return APIntToHexString(CFP->getValueAPF().bitcastToAPInt()); + } else if (const auto *CI = dyn_cast<ConstantInt>(C)) { + return APIntToHexString(CI->getValue()); + } else { + unsigned NumElements; + if (isa<VectorType>(Ty)) + NumElements = Ty->getVectorNumElements(); + else + NumElements = Ty->getArrayNumElements(); + std::string HexString; + for (int I = NumElements - 1, E = -1; I != E; --I) + HexString += scalarConstantToHexString(C->getAggregateElement(I)); + return HexString; + } +} + +MCSection *X86WindowsTargetObjectFile::getSectionForConstant( + const DataLayout &DL, SectionKind Kind, const Constant *C) const { + if (Kind.isMergeableConst() && C) { + const unsigned Characteristics = COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | + COFF::IMAGE_SCN_MEM_READ | + COFF::IMAGE_SCN_LNK_COMDAT; + std::string COMDATSymName; + if (Kind.isMergeableConst4() || Kind.isMergeableConst8()) + COMDATSymName = "__real@" + scalarConstantToHexString(C); + else if (Kind.isMergeableConst16()) + COMDATSymName = "__xmm@" + scalarConstantToHexString(C); + + if (!COMDATSymName.empty()) + return getContext().getCOFFSection(".rdata", Characteristics, Kind, + COMDATSymName, + COFF::IMAGE_COMDAT_SELECT_ANY); + } + + return TargetLoweringObjectFile::getSectionForConstant(DL, Kind, C); +} diff --git a/contrib/llvm/lib/Target/X86/X86TargetObjectFile.h b/contrib/llvm/lib/Target/X86/X86TargetObjectFile.h new file mode 100644 index 0000000..6b2448c --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86TargetObjectFile.h @@ -0,0 +1,67 @@ +//===-- X86TargetObjectFile.h - X86 Object Info -----------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_X86_X86TARGETOBJECTFILE_H +#define LLVM_LIB_TARGET_X86_X86TARGETOBJECTFILE_H + +#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" +#include "llvm/Target/TargetLoweringObjectFile.h" + +namespace llvm { + + /// X86_64MachoTargetObjectFile - This TLOF implementation is used for Darwin + /// x86-64. + class X86_64MachoTargetObjectFile : public TargetLoweringObjectFileMachO { + public: + const MCExpr * + getTTypeGlobalReference(const GlobalValue *GV, unsigned Encoding, + Mangler &Mang, const TargetMachine &TM, + MachineModuleInfo *MMI, + MCStreamer &Streamer) const override; + + // getCFIPersonalitySymbol - The symbol that gets passed to + // .cfi_personality. 
+ MCSymbol *getCFIPersonalitySymbol(const GlobalValue *GV, Mangler &Mang, + const TargetMachine &TM, + MachineModuleInfo *MMI) const override; + + const MCExpr *getIndirectSymViaGOTPCRel(const MCSymbol *Sym, + const MCValue &MV, int64_t Offset, + MachineModuleInfo *MMI, + MCStreamer &Streamer) const override; + }; + + /// \brief This implemenatation is used for X86 ELF targets that don't + /// have a further specialization. + class X86ELFTargetObjectFile : public TargetLoweringObjectFileELF { + /// \brief Describe a TLS variable address within debug info. + const MCExpr *getDebugThreadLocalSymbol(const MCSymbol *Sym) const override; + }; + + /// X86LinuxNaClTargetObjectFile - This implementation is used for linux and + /// Native Client on x86 and x86-64. + class X86LinuxNaClTargetObjectFile : public X86ELFTargetObjectFile { + void Initialize(MCContext &Ctx, const TargetMachine &TM) override; + }; + + /// \brief This implementation is used for Windows targets on x86 and x86-64. + class X86WindowsTargetObjectFile : public TargetLoweringObjectFileCOFF { + const MCExpr * + getExecutableRelativeSymbol(const ConstantExpr *CE, Mangler &Mang, + const TargetMachine &TM) const override; + + /// \brief Given a mergeable constant with the specified size and relocation + /// information, return a section that it should be placed in. + MCSection *getSectionForConstant(const DataLayout &DL, SectionKind Kind, + const Constant *C) const override; + }; + +} // end namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp new file mode 100644 index 0000000..2e7bbb2 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -0,0 +1,1487 @@ +//===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// This file implements a TargetTransformInfo analysis pass specific to the +/// X86 target machine. It uses the target's detailed information to provide +/// more precise answers to certain TTI queries, while letting the target +/// independent and default TTI implementations handle the rest. +/// +//===----------------------------------------------------------------------===// + +#include "X86TargetTransformInfo.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/CodeGen/BasicTTIImpl.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/Support/Debug.h" +#include "llvm/Target/CostTable.h" +#include "llvm/Target/TargetLowering.h" + +using namespace llvm; + +#define DEBUG_TYPE "x86tti" + +//===----------------------------------------------------------------------===// +// +// X86 cost model. +// +//===----------------------------------------------------------------------===// + +TargetTransformInfo::PopcntSupportKind +X86TTIImpl::getPopcntSupport(unsigned TyWidth) { + assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2"); + // TODO: Currently the __builtin_popcount() implementation using SSE3 + // instructions is inefficient. Once the problem is fixed, we should + // call ST->hasSSE3() instead of ST->hasPOPCNT(). + return ST->hasPOPCNT() ? 
TTI::PSK_FastHardware : TTI::PSK_Software; +} + +unsigned X86TTIImpl::getNumberOfRegisters(bool Vector) { + if (Vector && !ST->hasSSE1()) + return 0; + + if (ST->is64Bit()) { + if (Vector && ST->hasAVX512()) + return 32; + return 16; + } + return 8; +} + +unsigned X86TTIImpl::getRegisterBitWidth(bool Vector) { + if (Vector) { + if (ST->hasAVX512()) return 512; + if (ST->hasAVX()) return 256; + if (ST->hasSSE1()) return 128; + return 0; + } + + if (ST->is64Bit()) + return 64; + + return 32; +} + +unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) { + // If the loop will not be vectorized, don't interleave the loop. + // Let regular unroll to unroll the loop, which saves the overflow + // check and memory check cost. + if (VF == 1) + return 1; + + if (ST->isAtom()) + return 1; + + // Sandybridge and Haswell have multiple execution ports and pipelined + // vector units. + if (ST->hasAVX()) + return 4; + + return 2; +} + +int X86TTIImpl::getArithmeticInstrCost( + unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info, + TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo, + TTI::OperandValueProperties Opd2PropInfo) { + // Legalize the type. + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); + + int ISD = TLI->InstructionOpcodeToISD(Opcode); + assert(ISD && "Invalid opcode"); + + if (ISD == ISD::SDIV && + Op2Info == TargetTransformInfo::OK_UniformConstantValue && + Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) { + // On X86, vector signed division by constants power-of-two are + // normally expanded to the sequence SRA + SRL + ADD + SRA. + // The OperandValue properties many not be same as that of previous + // operation;conservatively assume OP_None. + int Cost = 2 * getArithmeticInstrCost(Instruction::AShr, Ty, Op1Info, + Op2Info, TargetTransformInfo::OP_None, + TargetTransformInfo::OP_None); + Cost += getArithmeticInstrCost(Instruction::LShr, Ty, Op1Info, Op2Info, + TargetTransformInfo::OP_None, + TargetTransformInfo::OP_None); + Cost += getArithmeticInstrCost(Instruction::Add, Ty, Op1Info, Op2Info, + TargetTransformInfo::OP_None, + TargetTransformInfo::OP_None); + + return Cost; + } + + static const CostTblEntry AVX2UniformConstCostTable[] = { + { ISD::SRA, MVT::v4i64, 4 }, // 2 x psrad + shuffle. + + { ISD::SDIV, MVT::v16i16, 6 }, // vpmulhw sequence + { ISD::UDIV, MVT::v16i16, 6 }, // vpmulhuw sequence + { ISD::SDIV, MVT::v8i32, 15 }, // vpmuldq sequence + { ISD::UDIV, MVT::v8i32, 15 }, // vpmuludq sequence + }; + + if (Op2Info == TargetTransformInfo::OK_UniformConstantValue && + ST->hasAVX2()) { + if (const auto *Entry = CostTableLookup(AVX2UniformConstCostTable, ISD, + LT.second)) + return LT.first * Entry->Cost; + } + + static const CostTblEntry AVX512CostTable[] = { + { ISD::SHL, MVT::v16i32, 1 }, + { ISD::SRL, MVT::v16i32, 1 }, + { ISD::SRA, MVT::v16i32, 1 }, + { ISD::SHL, MVT::v8i64, 1 }, + { ISD::SRL, MVT::v8i64, 1 }, + { ISD::SRA, MVT::v8i64, 1 }, + }; + + if (ST->hasAVX512()) { + if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second)) + return LT.first * Entry->Cost; + } + + static const CostTblEntry AVX2CostTable[] = { + // Shifts on v4i64/v8i32 on AVX2 is legal even though we declare to + // customize them to detect the cases where shift amount is a scalar one. 
+ { ISD::SHL, MVT::v4i32, 1 }, + { ISD::SRL, MVT::v4i32, 1 }, + { ISD::SRA, MVT::v4i32, 1 }, + { ISD::SHL, MVT::v8i32, 1 }, + { ISD::SRL, MVT::v8i32, 1 }, + { ISD::SRA, MVT::v8i32, 1 }, + { ISD::SHL, MVT::v2i64, 1 }, + { ISD::SRL, MVT::v2i64, 1 }, + { ISD::SHL, MVT::v4i64, 1 }, + { ISD::SRL, MVT::v4i64, 1 }, + }; + + // Look for AVX2 lowering tricks. + if (ST->hasAVX2()) { + if (ISD == ISD::SHL && LT.second == MVT::v16i16 && + (Op2Info == TargetTransformInfo::OK_UniformConstantValue || + Op2Info == TargetTransformInfo::OK_NonUniformConstantValue)) + // On AVX2, a packed v16i16 shift left by a constant build_vector + // is lowered into a vector multiply (vpmullw). + return LT.first; + + if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second)) + return LT.first * Entry->Cost; + } + + static const CostTblEntry XOPCostTable[] = { + // 128bit shifts take 1cy, but right shifts require negation beforehand. + { ISD::SHL, MVT::v16i8, 1 }, + { ISD::SRL, MVT::v16i8, 2 }, + { ISD::SRA, MVT::v16i8, 2 }, + { ISD::SHL, MVT::v8i16, 1 }, + { ISD::SRL, MVT::v8i16, 2 }, + { ISD::SRA, MVT::v8i16, 2 }, + { ISD::SHL, MVT::v4i32, 1 }, + { ISD::SRL, MVT::v4i32, 2 }, + { ISD::SRA, MVT::v4i32, 2 }, + { ISD::SHL, MVT::v2i64, 1 }, + { ISD::SRL, MVT::v2i64, 2 }, + { ISD::SRA, MVT::v2i64, 2 }, + // 256bit shifts require splitting if AVX2 didn't catch them above. + { ISD::SHL, MVT::v32i8, 2 }, + { ISD::SRL, MVT::v32i8, 4 }, + { ISD::SRA, MVT::v32i8, 4 }, + { ISD::SHL, MVT::v16i16, 2 }, + { ISD::SRL, MVT::v16i16, 4 }, + { ISD::SRA, MVT::v16i16, 4 }, + { ISD::SHL, MVT::v8i32, 2 }, + { ISD::SRL, MVT::v8i32, 4 }, + { ISD::SRA, MVT::v8i32, 4 }, + { ISD::SHL, MVT::v4i64, 2 }, + { ISD::SRL, MVT::v4i64, 4 }, + { ISD::SRA, MVT::v4i64, 4 }, + }; + + // Look for XOP lowering tricks. + if (ST->hasXOP()) { + if (const auto *Entry = CostTableLookup(XOPCostTable, ISD, LT.second)) + return LT.first * Entry->Cost; + } + + static const CostTblEntry AVX2CustomCostTable[] = { + { ISD::SHL, MVT::v32i8, 11 }, // vpblendvb sequence. + { ISD::SHL, MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence. + + { ISD::SRL, MVT::v32i8, 11 }, // vpblendvb sequence. + { ISD::SRL, MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence. + + { ISD::SRA, MVT::v32i8, 24 }, // vpblendvb sequence. + { ISD::SRA, MVT::v16i16, 10 }, // extend/vpsravd/pack sequence. + { ISD::SRA, MVT::v2i64, 4 }, // srl/xor/sub sequence. + { ISD::SRA, MVT::v4i64, 4 }, // srl/xor/sub sequence. + + // Vectorizing division is a bad idea. See the SSE2 table for more comments. + { ISD::SDIV, MVT::v32i8, 32*20 }, + { ISD::SDIV, MVT::v16i16, 16*20 }, + { ISD::SDIV, MVT::v8i32, 8*20 }, + { ISD::SDIV, MVT::v4i64, 4*20 }, + { ISD::UDIV, MVT::v32i8, 32*20 }, + { ISD::UDIV, MVT::v16i16, 16*20 }, + { ISD::UDIV, MVT::v8i32, 8*20 }, + { ISD::UDIV, MVT::v4i64, 4*20 }, + }; + + // Look for AVX2 lowering tricks for custom cases. + if (ST->hasAVX2()) { + if (const auto *Entry = CostTableLookup(AVX2CustomCostTable, ISD, + LT.second)) + return LT.first * Entry->Cost; + } + + static const CostTblEntry + SSE2UniformConstCostTable[] = { + // We don't correctly identify costs of casts because they are marked as + // custom. + // Constant splats are cheaper for the following instructions. + { ISD::SHL, MVT::v16i8, 1 }, // psllw. + { ISD::SHL, MVT::v32i8, 2 }, // psllw. + { ISD::SHL, MVT::v8i16, 1 }, // psllw. + { ISD::SHL, MVT::v16i16, 2 }, // psllw. + { ISD::SHL, MVT::v4i32, 1 }, // pslld + { ISD::SHL, MVT::v8i32, 2 }, // pslld + { ISD::SHL, MVT::v2i64, 1 }, // psllq. 
+ { ISD::SHL, MVT::v4i64, 2 }, // psllq. + + { ISD::SRL, MVT::v16i8, 1 }, // psrlw. + { ISD::SRL, MVT::v32i8, 2 }, // psrlw. + { ISD::SRL, MVT::v8i16, 1 }, // psrlw. + { ISD::SRL, MVT::v16i16, 2 }, // psrlw. + { ISD::SRL, MVT::v4i32, 1 }, // psrld. + { ISD::SRL, MVT::v8i32, 2 }, // psrld. + { ISD::SRL, MVT::v2i64, 1 }, // psrlq. + { ISD::SRL, MVT::v4i64, 2 }, // psrlq. + + { ISD::SRA, MVT::v16i8, 4 }, // psrlw, pand, pxor, psubb. + { ISD::SRA, MVT::v32i8, 8 }, // psrlw, pand, pxor, psubb. + { ISD::SRA, MVT::v8i16, 1 }, // psraw. + { ISD::SRA, MVT::v16i16, 2 }, // psraw. + { ISD::SRA, MVT::v4i32, 1 }, // psrad. + { ISD::SRA, MVT::v8i32, 2 }, // psrad. + { ISD::SRA, MVT::v2i64, 4 }, // 2 x psrad + shuffle. + { ISD::SRA, MVT::v4i64, 8 }, // 2 x psrad + shuffle. + + { ISD::SDIV, MVT::v8i16, 6 }, // pmulhw sequence + { ISD::UDIV, MVT::v8i16, 6 }, // pmulhuw sequence + { ISD::SDIV, MVT::v4i32, 19 }, // pmuludq sequence + { ISD::UDIV, MVT::v4i32, 15 }, // pmuludq sequence + }; + + if (Op2Info == TargetTransformInfo::OK_UniformConstantValue && + ST->hasSSE2()) { + // pmuldq sequence. + if (ISD == ISD::SDIV && LT.second == MVT::v4i32 && ST->hasSSE41()) + return LT.first * 15; + + if (const auto *Entry = CostTableLookup(SSE2UniformConstCostTable, ISD, + LT.second)) + return LT.first * Entry->Cost; + } + + if (ISD == ISD::SHL && + Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) { + MVT VT = LT.second; + // Vector shift left by non uniform constant can be lowered + // into vector multiply (pmullw/pmulld). + if ((VT == MVT::v8i16 && ST->hasSSE2()) || + (VT == MVT::v4i32 && ST->hasSSE41())) + return LT.first; + + // v16i16 and v8i32 shifts by non-uniform constants are lowered into a + // sequence of extract + two vector multiply + insert. + if ((VT == MVT::v8i32 || VT == MVT::v16i16) && + (ST->hasAVX() && !ST->hasAVX2())) + ISD = ISD::MUL; + + // A vector shift left by non uniform constant is converted + // into a vector multiply; the new multiply is eventually + // lowered into a sequence of shuffles and 2 x pmuludq. + if (VT == MVT::v4i32 && ST->hasSSE2()) + ISD = ISD::MUL; + } + + static const CostTblEntry SSE2CostTable[] = { + // We don't correctly identify costs of casts because they are marked as + // custom. + // For some cases, where the shift amount is a scalar we would be able + // to generate better code. Unfortunately, when this is the case the value + // (the splat) will get hoisted out of the loop, thereby making it invisible + // to ISel. The cost model must return worst case assumptions because it is + // used for vectorization and we don't want to make vectorized code worse + // than scalar code. + { ISD::SHL, MVT::v16i8, 26 }, // cmpgtb sequence. + { ISD::SHL, MVT::v32i8, 2*26 }, // cmpgtb sequence. + { ISD::SHL, MVT::v8i16, 32 }, // cmpgtb sequence. + { ISD::SHL, MVT::v16i16, 2*32 }, // cmpgtb sequence. + { ISD::SHL, MVT::v4i32, 2*5 }, // We optimized this using mul. + { ISD::SHL, MVT::v8i32, 2*2*5 }, // We optimized this using mul. + { ISD::SHL, MVT::v2i64, 4 }, // splat+shuffle sequence. + { ISD::SHL, MVT::v4i64, 2*4 }, // splat+shuffle sequence. + + { ISD::SRL, MVT::v16i8, 26 }, // cmpgtb sequence. + { ISD::SRL, MVT::v32i8, 2*26 }, // cmpgtb sequence. + { ISD::SRL, MVT::v8i16, 32 }, // cmpgtb sequence. + { ISD::SRL, MVT::v16i16, 2*32 }, // cmpgtb sequence. + { ISD::SRL, MVT::v4i32, 16 }, // Shift each lane + blend. + { ISD::SRL, MVT::v8i32, 2*16 }, // Shift each lane + blend. + { ISD::SRL, MVT::v2i64, 4 }, // splat+shuffle sequence. 
+ { ISD::SRL, MVT::v4i64, 2*4 }, // splat+shuffle sequence. + + { ISD::SRA, MVT::v16i8, 54 }, // unpacked cmpgtb sequence. + { ISD::SRA, MVT::v32i8, 2*54 }, // unpacked cmpgtb sequence. + { ISD::SRA, MVT::v8i16, 32 }, // cmpgtb sequence. + { ISD::SRA, MVT::v16i16, 2*32 }, // cmpgtb sequence. + { ISD::SRA, MVT::v4i32, 16 }, // Shift each lane + blend. + { ISD::SRA, MVT::v8i32, 2*16 }, // Shift each lane + blend. + { ISD::SRA, MVT::v2i64, 12 }, // srl/xor/sub sequence. + { ISD::SRA, MVT::v4i64, 2*12 }, // srl/xor/sub sequence. + + // It is not a good idea to vectorize division. We have to scalarize it and + // in the process we will often end up having to spilling regular + // registers. The overhead of division is going to dominate most kernels + // anyways so try hard to prevent vectorization of division - it is + // generally a bad idea. Assume somewhat arbitrarily that we have to be able + // to hide "20 cycles" for each lane. + { ISD::SDIV, MVT::v16i8, 16*20 }, + { ISD::SDIV, MVT::v8i16, 8*20 }, + { ISD::SDIV, MVT::v4i32, 4*20 }, + { ISD::SDIV, MVT::v2i64, 2*20 }, + { ISD::UDIV, MVT::v16i8, 16*20 }, + { ISD::UDIV, MVT::v8i16, 8*20 }, + { ISD::UDIV, MVT::v4i32, 4*20 }, + { ISD::UDIV, MVT::v2i64, 2*20 }, + }; + + if (ST->hasSSE2()) { + if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second)) + return LT.first * Entry->Cost; + } + + static const CostTblEntry AVX1CostTable[] = { + // We don't have to scalarize unsupported ops. We can issue two half-sized + // operations and we only need to extract the upper YMM half. + // Two ops + 1 extract + 1 insert = 4. + { ISD::MUL, MVT::v16i16, 4 }, + { ISD::MUL, MVT::v8i32, 4 }, + { ISD::SUB, MVT::v8i32, 4 }, + { ISD::ADD, MVT::v8i32, 4 }, + { ISD::SUB, MVT::v4i64, 4 }, + { ISD::ADD, MVT::v4i64, 4 }, + // A v4i64 multiply is custom lowered as two split v2i64 vectors that then + // are lowered as a series of long multiplies(3), shifts(4) and adds(2) + // Because we believe v4i64 to be a legal type, we must also include the + // split factor of two in the cost table. Therefore, the cost here is 18 + // instead of 9. + { ISD::MUL, MVT::v4i64, 18 }, + }; + + // Look for AVX1 lowering tricks. + if (ST->hasAVX() && !ST->hasAVX2()) { + MVT VT = LT.second; + + if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, VT)) + return LT.first * Entry->Cost; + } + + // Custom lowering of vectors. + static const CostTblEntry CustomLowered[] = { + // A v2i64/v4i64 and multiply is custom lowered as a series of long + // multiplies(3), shifts(4) and adds(2). + { ISD::MUL, MVT::v2i64, 9 }, + { ISD::MUL, MVT::v4i64, 9 }, + }; + if (const auto *Entry = CostTableLookup(CustomLowered, ISD, LT.second)) + return LT.first * Entry->Cost; + + // Special lowering of v4i32 mul on sse2, sse3: Lower v4i32 mul as 2x shuffle, + // 2x pmuludq, 2x shuffle. + if (ISD == ISD::MUL && LT.second == MVT::v4i32 && ST->hasSSE2() && + !ST->hasSSE41()) + return LT.first * 6; + + // Fallback to the default implementation. + return BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info); +} + +int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, + Type *SubTp) { + // We only estimate the cost of reverse and alternate shuffles. + if (Kind != TTI::SK_Reverse && Kind != TTI::SK_Alternate) + return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); + + if (Kind == TTI::SK_Reverse) { + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); + int Cost = 1; + if (LT.second.getSizeInBits() > 128) + Cost = 3; // Extract + insert + copy. 
+ + // Multiple by the number of parts. + return Cost * LT.first; + } + + if (Kind == TTI::SK_Alternate) { + // 64-bit packed float vectors (v2f32) are widened to type v4f32. + // 64-bit packed integer vectors (v2i32) are promoted to type v2i64. + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); + + // The backend knows how to generate a single VEX.256 version of + // instruction VPBLENDW if the target supports AVX2. + if (ST->hasAVX2() && LT.second == MVT::v16i16) + return LT.first; + + static const CostTblEntry AVXAltShuffleTbl[] = { + {ISD::VECTOR_SHUFFLE, MVT::v4i64, 1}, // vblendpd + {ISD::VECTOR_SHUFFLE, MVT::v4f64, 1}, // vblendpd + + {ISD::VECTOR_SHUFFLE, MVT::v8i32, 1}, // vblendps + {ISD::VECTOR_SHUFFLE, MVT::v8f32, 1}, // vblendps + + // This shuffle is custom lowered into a sequence of: + // 2x vextractf128 , 2x vpblendw , 1x vinsertf128 + {ISD::VECTOR_SHUFFLE, MVT::v16i16, 5}, + + // This shuffle is custom lowered into a long sequence of: + // 2x vextractf128 , 4x vpshufb , 2x vpor , 1x vinsertf128 + {ISD::VECTOR_SHUFFLE, MVT::v32i8, 9} + }; + + if (ST->hasAVX()) + if (const auto *Entry = CostTableLookup(AVXAltShuffleTbl, + ISD::VECTOR_SHUFFLE, LT.second)) + return LT.first * Entry->Cost; + + static const CostTblEntry SSE41AltShuffleTbl[] = { + // These are lowered into movsd. + {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1}, + + // packed float vectors with four elements are lowered into BLENDI dag + // nodes. A v4i32/v4f32 BLENDI generates a single 'blendps'/'blendpd'. + {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1}, + {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1}, + + // This shuffle generates a single pshufw. + {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1}, + + // There is no instruction that matches a v16i8 alternate shuffle. + // The backend will expand it into the sequence 'pshufb + pshufb + or'. + {ISD::VECTOR_SHUFFLE, MVT::v16i8, 3} + }; + + if (ST->hasSSE41()) + if (const auto *Entry = CostTableLookup(SSE41AltShuffleTbl, ISD::VECTOR_SHUFFLE, + LT.second)) + return LT.first * Entry->Cost; + + static const CostTblEntry SSSE3AltShuffleTbl[] = { + {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1}, // movsd + {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1}, // movsd + + // SSE3 doesn't have 'blendps'. The following shuffles are expanded into + // the sequence 'shufps + pshufd' + {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2}, + {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2}, + + {ISD::VECTOR_SHUFFLE, MVT::v8i16, 3}, // pshufb + pshufb + or + {ISD::VECTOR_SHUFFLE, MVT::v16i8, 3} // pshufb + pshufb + or + }; + + if (ST->hasSSSE3()) + if (const auto *Entry = CostTableLookup(SSSE3AltShuffleTbl, + ISD::VECTOR_SHUFFLE, LT.second)) + return LT.first * Entry->Cost; + + static const CostTblEntry SSEAltShuffleTbl[] = { + {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1}, // movsd + {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1}, // movsd + + {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2}, // shufps + pshufd + {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2}, // shufps + pshufd + + // This is expanded into a long sequence of four extract + four insert. + {ISD::VECTOR_SHUFFLE, MVT::v8i16, 8}, // 4 x pextrw + 4 pinsrw. + + // 8 x (pinsrw + pextrw + and + movb + movzb + or) + {ISD::VECTOR_SHUFFLE, MVT::v16i8, 48} + }; + + // Fall-back (SSE3 and SSE2). 
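All of the arithmetic and shuffle tables in this file, including the SSE fall-back lookup right after this comment, follow one pattern: legalize the IR type, key the table on (ISD opcode, legalized MVT), and scale the per-entry cost by the legalization split factor LT.first. The following is a small standalone sketch of that pattern, with plain C++ types standing in for CostTblEntry/CostTableLookup; it is an illustration, not part of the patch.

#include <cstddef>

struct CostEntrySketch { int ISD; int SimpleVT; int Cost; };

// Linear scan keyed on (opcode, legalized value type) -- the same contract as
// CostTableLookup. A null result means "fall through to the next table, or to
// the BaseT default implementation".
static const CostEntrySketch *
lookupCostSketch(const CostEntrySketch *Tbl, size_t N, int ISD, int VT) {
  for (size_t I = 0; I != N; ++I)
    if (Tbl[I].ISD == ISD && Tbl[I].SimpleVT == VT)
      return &Tbl[I];
  return nullptr;
}

// Usage: a v8i32 shift on an SSE2-only target legalizes to two v4i32 halves,
// so LT.first == 2 and the reported cost is 2 * (cost of the v4i32 row).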
+ if (const auto *Entry = CostTableLookup(SSEAltShuffleTbl, + ISD::VECTOR_SHUFFLE, LT.second)) + return LT.first * Entry->Cost; + return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); + } + + return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); +} + +int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { + int ISD = TLI->InstructionOpcodeToISD(Opcode); + assert(ISD && "Invalid opcode"); + + // FIXME: Need a better design of the cost table to handle non-simple types of + // potential massive combinations (elem_num x src_type x dst_type). + + static const TypeConversionCostTblEntry AVX512DQConversionTbl[] = { + { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 }, + { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 }, + { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 1 }, + { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 }, + { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 }, + { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 1 }, + + { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 }, + { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f64, 1 }, + { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f64, 1 }, + { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 1 }, + { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f32, 1 }, + { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f32, 1 }, + }; + + static const TypeConversionCostTblEntry AVX512FConversionTbl[] = { + { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, 1 }, + { ISD::FP_EXTEND, MVT::v8f64, MVT::v16f32, 3 }, + { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, 1 }, + + { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 1 }, + { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 1 }, + { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 1 }, + { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 1 }, + + // v16i1 -> v16i32 - load + broadcast + { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, 2 }, + + { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 1 }, + { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 1 }, + { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 1 }, + { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 1 }, + { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i32, 1 }, + { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i32, 1 }, + { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 1 }, + { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 1 }, + + { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 }, + { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 2 }, + { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, 2 }, + { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 }, + { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 }, + { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i8, 2 }, + { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 }, + { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 }, + + { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 }, + { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 2 }, + { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, 2 }, + { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 }, + { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 }, + { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }, + { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 }, + { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 }, + { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 }, + { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i8, 2 }, + { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 2 }, + { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 2 }, + { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i8, 2 }, + { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i16, 2 }, + { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 }, + { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 2 }, + { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 5 }, + { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 2 }, + { ISD::UINT_TO_FP, 
MVT::v2f64, MVT::v2i64, 5 }, + { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 12 }, + { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 26 }, + + { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 }, + { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 }, + { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 1 }, + { ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f32, 1 }, + }; + + static const TypeConversionCostTblEntry AVX2ConversionTbl[] = { + { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1 }, + { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1 }, + { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 3 }, + { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 3 }, + { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 }, + { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 }, + { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1 }, + { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1 }, + { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 3 }, + { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 3 }, + { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 3 }, + { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 3 }, + { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, + { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, + { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 1 }, + { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 1 }, + + { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 2 }, + { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2 }, + { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 2 }, + { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2 }, + { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 2 }, + { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 4 }, + + { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, 3 }, + { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, 3 }, + + { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 8 }, + }; + + static const TypeConversionCostTblEntry AVXConversionTbl[] = { + { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 4 }, + { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 4 }, + { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 7 }, + { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 4 }, + { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 7 }, + { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 4 }, + { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 4 }, + { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 4 }, + { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 6 }, + { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 4 }, + { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 6 }, + { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 4 }, + { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 6 }, + { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, + { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 4 }, + { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 4 }, + + { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 4 }, + { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 4 }, + { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 4 }, + { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 4 }, + { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 }, + { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 4 }, + { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 9 }, + + { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i1, 8 }, + { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 8 }, + { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 5 }, + { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 }, + { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 }, + { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 }, + { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 3 }, + { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }, + { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i1, 3 }, + { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i8, 3 }, + { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i16, 3 }, + { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 }, + + { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i1, 6 }, + { ISD::UINT_TO_FP, 
MVT::v8f32, MVT::v8i8, 5 }, + { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 5 }, + { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 9 }, + { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, 7 }, + { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 2 }, + { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 }, + { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 6 }, + { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i1, 7 }, + { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i8, 2 }, + { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i16, 2 }, + { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 6 }, + // The generic code to compute the scalar overhead is currently broken. + // Workaround this limitation by estimating the scalarization overhead + // here. We have roughly 10 instructions per scalar element. + // Multiply that by the vector width. + // FIXME: remove that when PR19268 is fixed. + { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 }, + { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 4*10 }, + + { ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f32, 7 }, + { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 1 }, + // This node is expanded into scalarized operations but BasicTTI is overly + // optimistic estimating its cost. It computes 3 per element (one + // vector-extract, one scalar conversion and one vector-insert). The + // problem is that the inserts form a read-modify-write chain so latency + // should be factored in too. Inflating the cost per element by 1. + { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 8*4 }, + { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 4*4 }, + }; + + static const TypeConversionCostTblEntry SSE41ConversionTbl[] = { + { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 4 }, + { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 4 }, + { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 }, + { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2 }, + { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 }, + { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 }, + { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 4 }, + { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 4 }, + { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 2 }, + { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 2 }, + { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 1 }, + { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 1 }, + { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 }, + { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 }, + { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 }, + { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 }, + { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i8, 1 }, + { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i8, 2 }, + + { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 6 }, + { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 3 }, + { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 }, + { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 30 }, + { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 }, + { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 1 }, + { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 }, + { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 1 }, + { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 2 }, + }; + + static const TypeConversionCostTblEntry SSE2ConversionTbl[] = { + // These are somewhat magic numbers justified by looking at the output of + // Intel's IACA, running some kernels and making sure when we take + // legalization into account the throughput will be overestimated. 
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 }, + { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, 4*10 }, + { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 }, + { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 }, + { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 }, + { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 4*10 }, + { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 }, + { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 }, + // There are faster sequences for float conversions. + { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 }, + { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 8 }, + { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 }, + { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 }, + { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 }, + { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 15 }, + { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 }, + { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 }, + + { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 6 }, + { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 8 }, + { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 3 }, + { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 4 }, + { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 }, + { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 2 }, + { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 9 }, + { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 12 }, + { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 6 }, + { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 6 }, + { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 }, + { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 3 }, + { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 3 }, + { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 4 }, + { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 }, + { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 2 }, + { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i8, 1 }, + { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i8, 6 }, + + { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 10 }, + { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 }, + { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 3 }, + { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 7 }, + { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 4 }, + { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 3 }, + { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 }, + { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 2 }, + { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 4 }, + }; + + std::pair<int, MVT> LTSrc = TLI->getTypeLegalizationCost(DL, Src); + std::pair<int, MVT> LTDest = TLI->getTypeLegalizationCost(DL, Dst); + + if (ST->hasSSE2() && !ST->hasAVX()) { + if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD, + LTDest.second, LTSrc.second)) + return LTSrc.first * Entry->Cost; + } + + EVT SrcTy = TLI->getValueType(DL, Src); + EVT DstTy = TLI->getValueType(DL, Dst); + + // The function getSimpleVT only handles simple value types. 
+ if (!SrcTy.isSimple() || !DstTy.isSimple()) + return BaseT::getCastInstrCost(Opcode, Dst, Src); + + if (ST->hasDQI()) + if (const auto *Entry = ConvertCostTableLookup(AVX512DQConversionTbl, ISD, + DstTy.getSimpleVT(), + SrcTy.getSimpleVT())) + return Entry->Cost; + + if (ST->hasAVX512()) + if (const auto *Entry = ConvertCostTableLookup(AVX512FConversionTbl, ISD, + DstTy.getSimpleVT(), + SrcTy.getSimpleVT())) + return Entry->Cost; + + if (ST->hasAVX2()) { + if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD, + DstTy.getSimpleVT(), + SrcTy.getSimpleVT())) + return Entry->Cost; + } + + if (ST->hasAVX()) { + if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD, + DstTy.getSimpleVT(), + SrcTy.getSimpleVT())) + return Entry->Cost; + } + + if (ST->hasSSE41()) { + if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD, + DstTy.getSimpleVT(), + SrcTy.getSimpleVT())) + return Entry->Cost; + } + + if (ST->hasSSE2()) { + if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD, + DstTy.getSimpleVT(), + SrcTy.getSimpleVT())) + return Entry->Cost; + } + + return BaseT::getCastInstrCost(Opcode, Dst, Src); +} + +int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) { + // Legalize the type. + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); + + MVT MTy = LT.second; + + int ISD = TLI->InstructionOpcodeToISD(Opcode); + assert(ISD && "Invalid opcode"); + + static const CostTblEntry SSE42CostTbl[] = { + { ISD::SETCC, MVT::v2f64, 1 }, + { ISD::SETCC, MVT::v4f32, 1 }, + { ISD::SETCC, MVT::v2i64, 1 }, + { ISD::SETCC, MVT::v4i32, 1 }, + { ISD::SETCC, MVT::v8i16, 1 }, + { ISD::SETCC, MVT::v16i8, 1 }, + }; + + static const CostTblEntry AVX1CostTbl[] = { + { ISD::SETCC, MVT::v4f64, 1 }, + { ISD::SETCC, MVT::v8f32, 1 }, + // AVX1 does not support 8-wide integer compare. + { ISD::SETCC, MVT::v4i64, 4 }, + { ISD::SETCC, MVT::v8i32, 4 }, + { ISD::SETCC, MVT::v16i16, 4 }, + { ISD::SETCC, MVT::v32i8, 4 }, + }; + + static const CostTblEntry AVX2CostTbl[] = { + { ISD::SETCC, MVT::v4i64, 1 }, + { ISD::SETCC, MVT::v8i32, 1 }, + { ISD::SETCC, MVT::v16i16, 1 }, + { ISD::SETCC, MVT::v32i8, 1 }, + }; + + static const CostTblEntry AVX512CostTbl[] = { + { ISD::SETCC, MVT::v8i64, 1 }, + { ISD::SETCC, MVT::v16i32, 1 }, + { ISD::SETCC, MVT::v8f64, 1 }, + { ISD::SETCC, MVT::v16f32, 1 }, + }; + + if (ST->hasAVX512()) + if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy)) + return LT.first * Entry->Cost; + + if (ST->hasAVX2()) + if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy)) + return LT.first * Entry->Cost; + + if (ST->hasAVX()) + if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy)) + return LT.first * Entry->Cost; + + if (ST->hasSSE42()) + if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy)) + return LT.first * Entry->Cost; + + return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy); +} + +int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) { + assert(Val->isVectorTy() && "This must be a vector type"); + + if (Index != -1U) { + // Legalize the type. + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Val); + + // This type is legalized to a scalar type. + if (!LT.second.isVector()) + return 0; + + // The type may be split. Normalize the index to the new type. + unsigned Width = LT.second.getVectorNumElements(); + Index = Index % Width; + + // Floating point scalars are already located in index #0. 
+ if (Val->getScalarType()->isFloatingPointTy() && Index == 0) + return 0; + } + + return BaseT::getVectorInstrCost(Opcode, Val, Index); +} + +int X86TTIImpl::getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) { + assert (Ty->isVectorTy() && "Can only scalarize vectors"); + int Cost = 0; + + for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) { + if (Insert) + Cost += getVectorInstrCost(Instruction::InsertElement, Ty, i); + if (Extract) + Cost += getVectorInstrCost(Instruction::ExtractElement, Ty, i); + } + + return Cost; +} + +int X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, + unsigned AddressSpace) { + // Handle non-power-of-two vectors such as <3 x float> + if (VectorType *VTy = dyn_cast<VectorType>(Src)) { + unsigned NumElem = VTy->getVectorNumElements(); + + // Handle a few common cases: + // <3 x float> + if (NumElem == 3 && VTy->getScalarSizeInBits() == 32) + // Cost = 64 bit store + extract + 32 bit store. + return 3; + + // <3 x double> + if (NumElem == 3 && VTy->getScalarSizeInBits() == 64) + // Cost = 128 bit store + unpack + 64 bit store. + return 3; + + // Assume that all other non-power-of-two numbers are scalarized. + if (!isPowerOf2_32(NumElem)) { + int Cost = BaseT::getMemoryOpCost(Opcode, VTy->getScalarType(), Alignment, + AddressSpace); + int SplitCost = getScalarizationOverhead(Src, Opcode == Instruction::Load, + Opcode == Instruction::Store); + return NumElem * Cost + SplitCost; + } + } + + // Legalize the type. + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src); + assert((Opcode == Instruction::Load || Opcode == Instruction::Store) && + "Invalid Opcode"); + + // Each load/store unit costs 1. + int Cost = LT.first * 1; + + // On Sandybridge 256bit load/stores are double pumped + // (but not on Haswell). + if (LT.second.getSizeInBits() > 128 && !ST->hasAVX2()) + Cost*=2; + + return Cost; +} + +int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, + unsigned Alignment, + unsigned AddressSpace) { + VectorType *SrcVTy = dyn_cast<VectorType>(SrcTy); + if (!SrcVTy) + // To calculate scalar take the regular cost, without mask + return getMemoryOpCost(Opcode, SrcTy, Alignment, AddressSpace); + + unsigned NumElem = SrcVTy->getVectorNumElements(); + VectorType *MaskTy = + VectorType::get(Type::getInt8Ty(getGlobalContext()), NumElem); + if ((Opcode == Instruction::Load && !isLegalMaskedLoad(SrcVTy)) || + (Opcode == Instruction::Store && !isLegalMaskedStore(SrcVTy)) || + !isPowerOf2_32(NumElem)) { + // Scalarization + int MaskSplitCost = getScalarizationOverhead(MaskTy, false, true); + int ScalarCompareCost = getCmpSelInstrCost( + Instruction::ICmp, Type::getInt8Ty(getGlobalContext()), nullptr); + int BranchCost = getCFInstrCost(Instruction::Br); + int MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost); + + int ValueSplitCost = getScalarizationOverhead( + SrcVTy, Opcode == Instruction::Load, Opcode == Instruction::Store); + int MemopCost = + NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(), + Alignment, AddressSpace); + return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost; + } + + // Legalize the type. + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, SrcVTy); + auto VT = TLI->getValueType(DL, SrcVTy); + int Cost = 0; + if (VT.isSimple() && LT.second != VT.getSimpleVT() && + LT.second.getVectorNumElements() == NumElem) + // Promotion requires expand/truncate for data and a shuffle for mask. 
+ Cost += getShuffleCost(TTI::SK_Alternate, SrcVTy, 0, nullptr) + + getShuffleCost(TTI::SK_Alternate, MaskTy, 0, nullptr); + + else if (LT.second.getVectorNumElements() > NumElem) { + VectorType *NewMaskTy = VectorType::get(MaskTy->getVectorElementType(), + LT.second.getVectorNumElements()); + // Expanding requires fill mask with zeroes + Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, 0, MaskTy); + } + if (!ST->hasAVX512()) + return Cost + LT.first*4; // Each maskmov costs 4 + + // AVX-512 masked load/store is cheapper + return Cost+LT.first; +} + +int X86TTIImpl::getAddressComputationCost(Type *Ty, bool IsComplex) { + // Address computations in vectorized code with non-consecutive addresses will + // likely result in more instructions compared to scalar code where the + // computation can more often be merged into the index mode. The resulting + // extra micro-ops can significantly decrease throughput. + unsigned NumVectorInstToHideOverhead = 10; + + if (Ty->isVectorTy() && IsComplex) + return NumVectorInstToHideOverhead; + + return BaseT::getAddressComputationCost(Ty, IsComplex); +} + +int X86TTIImpl::getReductionCost(unsigned Opcode, Type *ValTy, + bool IsPairwise) { + + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); + + MVT MTy = LT.second; + + int ISD = TLI->InstructionOpcodeToISD(Opcode); + assert(ISD && "Invalid opcode"); + + // We use the Intel Architecture Code Analyzer(IACA) to measure the throughput + // and make it as the cost. + + static const CostTblEntry SSE42CostTblPairWise[] = { + { ISD::FADD, MVT::v2f64, 2 }, + { ISD::FADD, MVT::v4f32, 4 }, + { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6". + { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.5". + { ISD::ADD, MVT::v8i16, 5 }, + }; + + static const CostTblEntry AVX1CostTblPairWise[] = { + { ISD::FADD, MVT::v4f32, 4 }, + { ISD::FADD, MVT::v4f64, 5 }, + { ISD::FADD, MVT::v8f32, 7 }, + { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5". + { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.5". + { ISD::ADD, MVT::v4i64, 5 }, // The data reported by the IACA tool is "4.8". + { ISD::ADD, MVT::v8i16, 5 }, + { ISD::ADD, MVT::v8i32, 5 }, + }; + + static const CostTblEntry SSE42CostTblNoPairWise[] = { + { ISD::FADD, MVT::v2f64, 2 }, + { ISD::FADD, MVT::v4f32, 4 }, + { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6". + { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.3". + { ISD::ADD, MVT::v8i16, 4 }, // The data reported by the IACA tool is "4.3". + }; + + static const CostTblEntry AVX1CostTblNoPairWise[] = { + { ISD::FADD, MVT::v4f32, 3 }, + { ISD::FADD, MVT::v4f64, 3 }, + { ISD::FADD, MVT::v8f32, 4 }, + { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5". + { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "2.8". 
+ { ISD::ADD, MVT::v4i64, 3 }, + { ISD::ADD, MVT::v8i16, 4 }, + { ISD::ADD, MVT::v8i32, 5 }, + }; + + if (IsPairwise) { + if (ST->hasAVX()) + if (const auto *Entry = CostTableLookup(AVX1CostTblPairWise, ISD, MTy)) + return LT.first * Entry->Cost; + + if (ST->hasSSE42()) + if (const auto *Entry = CostTableLookup(SSE42CostTblPairWise, ISD, MTy)) + return LT.first * Entry->Cost; + } else { + if (ST->hasAVX()) + if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy)) + return LT.first * Entry->Cost; + + if (ST->hasSSE42()) + if (const auto *Entry = CostTableLookup(SSE42CostTblNoPairWise, ISD, MTy)) + return LT.first * Entry->Cost; + } + + return BaseT::getReductionCost(Opcode, ValTy, IsPairwise); +} + +/// \brief Calculate the cost of materializing a 64-bit value. This helper +/// method might only calculate a fraction of a larger immediate. Therefore it +/// is valid to return a cost of ZERO. +int X86TTIImpl::getIntImmCost(int64_t Val) { + if (Val == 0) + return TTI::TCC_Free; + + if (isInt<32>(Val)) + return TTI::TCC_Basic; + + return 2 * TTI::TCC_Basic; +} + +int X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) { + assert(Ty->isIntegerTy()); + + unsigned BitSize = Ty->getPrimitiveSizeInBits(); + if (BitSize == 0) + return ~0U; + + // Never hoist constants larger than 128bit, because this might lead to + // incorrect code generation or assertions in codegen. + // Fixme: Create a cost model for types larger than i128 once the codegen + // issues have been fixed. + if (BitSize > 128) + return TTI::TCC_Free; + + if (Imm == 0) + return TTI::TCC_Free; + + // Sign-extend all constants to a multiple of 64-bit. + APInt ImmVal = Imm; + if (BitSize & 0x3f) + ImmVal = Imm.sext((BitSize + 63) & ~0x3fU); + + // Split the constant into 64-bit chunks and calculate the cost for each + // chunk. + int Cost = 0; + for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) { + APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64); + int64_t Val = Tmp.getSExtValue(); + Cost += getIntImmCost(Val); + } + // We need at least one instruction to materialze the constant. + return std::max(1, Cost); +} + +int X86TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, + Type *Ty) { + assert(Ty->isIntegerTy()); + + unsigned BitSize = Ty->getPrimitiveSizeInBits(); + // There is no cost model for constants with a bit size of 0. Return TCC_Free + // here, so that constant hoisting will ignore this constant. + if (BitSize == 0) + return TTI::TCC_Free; + + unsigned ImmIdx = ~0U; + switch (Opcode) { + default: + return TTI::TCC_Free; + case Instruction::GetElementPtr: + // Always hoist the base address of a GetElementPtr. This prevents the + // creation of new constants for every base constant that gets constant + // folded with the offset. + if (Idx == 0) + return 2 * TTI::TCC_Basic; + return TTI::TCC_Free; + case Instruction::Store: + ImmIdx = 0; + break; + case Instruction::ICmp: + // This is an imperfect hack to prevent constant hoisting of + // compares that might be trying to check if a 64-bit value fits in + // 32-bits. The backend can optimize these cases using a right shift by 32. + // Ideally we would check the compare predicate here. There also other + // similar immediates the backend can use shifts for. 
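The operand-position rules in this switch decide when an immediate is effectively free and therefore should not be touched by constant hoisting. A condensed sketch of three of those rules (shift amounts, 32-bit AND masks, and the 64-bit compare special case handled just below) follows; the enum and helper names are hypothetical and used only for illustration.

#include <cstdint>

enum OpKindSketch { ShiftOp, AndOp, ICmpOp };

static bool immIsFreeSketch(OpKindSketch Op, unsigned Idx, uint64_t Imm,
                            unsigned BitWidth) {
  switch (Op) {
  case ShiftOp:
    return Idx == 1;                         // shift amounts always encode inline
  case AndOp:
    return Idx == 1 && BitWidth == 64 &&     // AND with a zero-extended 32-bit
           Imm <= 0xffffffffULL;             // mask lowers to a 32-bit operation
  case ICmpOp:
    return Idx == 1 && BitWidth == 64 &&     // "fits in 32 bits?" compares can be
           (Imm == 0x100000000ULL ||         // answered with a shift instead of
            Imm == 0xffffffffULL);           // materializing the constant
  }
  return false;
}

// immIsFreeSketch(ICmpOp, 1, 0x100000000ULL, 64) == true, matching the
// TCC_Free result of the Instruction::ICmp case below.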
+ if (Idx == 1 && Imm.getBitWidth() == 64) { + uint64_t ImmVal = Imm.getZExtValue(); + if (ImmVal == 0x100000000ULL || ImmVal == 0xffffffff) + return TTI::TCC_Free; + } + ImmIdx = 1; + break; + case Instruction::And: + // We support 64-bit ANDs with immediates with 32-bits of leading zeroes + // by using a 32-bit operation with implicit zero extension. Detect such + // immediates here as the normal path expects bit 31 to be sign extended. + if (Idx == 1 && Imm.getBitWidth() == 64 && isUInt<32>(Imm.getZExtValue())) + return TTI::TCC_Free; + // Fallthrough + case Instruction::Add: + case Instruction::Sub: + case Instruction::Mul: + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::URem: + case Instruction::SRem: + case Instruction::Or: + case Instruction::Xor: + ImmIdx = 1; + break; + // Always return TCC_Free for the shift value of a shift instruction. + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + if (Idx == 1) + return TTI::TCC_Free; + break; + case Instruction::Trunc: + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::IntToPtr: + case Instruction::PtrToInt: + case Instruction::BitCast: + case Instruction::PHI: + case Instruction::Call: + case Instruction::Select: + case Instruction::Ret: + case Instruction::Load: + break; + } + + if (Idx == ImmIdx) { + int NumConstants = (BitSize + 63) / 64; + int Cost = X86TTIImpl::getIntImmCost(Imm, Ty); + return (Cost <= NumConstants * TTI::TCC_Basic) + ? static_cast<int>(TTI::TCC_Free) + : Cost; + } + + return X86TTIImpl::getIntImmCost(Imm, Ty); +} + +int X86TTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, + Type *Ty) { + assert(Ty->isIntegerTy()); + + unsigned BitSize = Ty->getPrimitiveSizeInBits(); + // There is no cost model for constants with a bit size of 0. Return TCC_Free + // here, so that constant hoisting will ignore this constant. + if (BitSize == 0) + return TTI::TCC_Free; + + switch (IID) { + default: + return TTI::TCC_Free; + case Intrinsic::sadd_with_overflow: + case Intrinsic::uadd_with_overflow: + case Intrinsic::ssub_with_overflow: + case Intrinsic::usub_with_overflow: + case Intrinsic::smul_with_overflow: + case Intrinsic::umul_with_overflow: + if ((Idx == 1) && Imm.getBitWidth() <= 64 && isInt<32>(Imm.getSExtValue())) + return TTI::TCC_Free; + break; + case Intrinsic::experimental_stackmap: + if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue()))) + return TTI::TCC_Free; + break; + case Intrinsic::experimental_patchpoint_void: + case Intrinsic::experimental_patchpoint_i64: + if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue()))) + return TTI::TCC_Free; + break; + } + return X86TTIImpl::getIntImmCost(Imm, Ty); +} + +// Return an average cost of Gather / Scatter instruction, maybe improved later +int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, Value *Ptr, + unsigned Alignment, unsigned AddressSpace) { + + assert(isa<VectorType>(SrcVTy) && "Unexpected type in getGSVectorCost"); + unsigned VF = SrcVTy->getVectorNumElements(); + + // Try to reduce index size from 64 bit (default for GEP) + // to 32. It is essential for VF 16. If the index can't be reduced to 32, the + // operation will use 16 x 64 indices which do not fit in a zmm and needs + // to split. Also check that the base pointer is the same for all lanes, + // and that there's at most one variable index. 
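The lambda that follows implements the index-width decision described in the comment above: with VF >= 16, 64-bit indices would not fit in a single zmm register, so the GEP is inspected to prove that 32-bit indices suffice. A reduced sketch of the decision is shown below; the helper name and the boolean summarizing the GEP analysis are hypothetical.

static unsigned chooseIndexWidthSketch(unsigned VF, bool IndicesFitIn32Bits) {
  // 16 x i64 indices need two 512-bit registers and force a split gather;
  // 16 x i32 indices fit in one.
  if (VF >= 16 && IndicesFitIn32Bits)
    return 32;
  return 64;   // default: index width equals the pointer width on x86-64
}

// chooseIndexWidthSketch(16, true) == 32; any other combination keeps the
// 64-bit default.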
+ auto getIndexSizeInBits = [](Value *Ptr, const DataLayout& DL) { + unsigned IndexSize = DL.getPointerSizeInBits(); + GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr); + if (IndexSize < 64 || !GEP) + return IndexSize; + + unsigned NumOfVarIndices = 0; + Value *Ptrs = GEP->getPointerOperand(); + if (Ptrs->getType()->isVectorTy() && !getSplatValue(Ptrs)) + return IndexSize; + for (unsigned i = 1; i < GEP->getNumOperands(); ++i) { + if (isa<Constant>(GEP->getOperand(i))) + continue; + Type *IndxTy = GEP->getOperand(i)->getType(); + if (IndxTy->isVectorTy()) + IndxTy = IndxTy->getVectorElementType(); + if ((IndxTy->getPrimitiveSizeInBits() == 64 && + !isa<SExtInst>(GEP->getOperand(i))) || + ++NumOfVarIndices > 1) + return IndexSize; // 64 + } + return (unsigned)32; + }; + + + // Trying to reduce IndexSize to 32 bits for vector 16. + // By default the IndexSize is equal to pointer size. + unsigned IndexSize = (VF >= 16) ? getIndexSizeInBits(Ptr, DL) : + DL.getPointerSizeInBits(); + + Type *IndexVTy = VectorType::get(IntegerType::get(getGlobalContext(), + IndexSize), VF); + std::pair<int, MVT> IdxsLT = TLI->getTypeLegalizationCost(DL, IndexVTy); + std::pair<int, MVT> SrcLT = TLI->getTypeLegalizationCost(DL, SrcVTy); + int SplitFactor = std::max(IdxsLT.first, SrcLT.first); + if (SplitFactor > 1) { + // Handle splitting of vector of pointers + Type *SplitSrcTy = VectorType::get(SrcVTy->getScalarType(), VF / SplitFactor); + return SplitFactor * getGSVectorCost(Opcode, SplitSrcTy, Ptr, Alignment, + AddressSpace); + } + + // The gather / scatter cost is given by Intel architects. It is a rough + // number since we are looking at one instruction in a time. + const int GSOverhead = 2; + return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(), + Alignment, AddressSpace); +} + +/// Return the cost of full scalarization of gather / scatter operation. +/// +/// Opcode - Load or Store instruction. +/// SrcVTy - The type of the data vector that should be gathered or scattered. +/// VariableMask - The mask is non-constant at compile time. +/// Alignment - Alignment for one element. +/// AddressSpace - pointer[s] address space. +/// +int X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy, + bool VariableMask, unsigned Alignment, + unsigned AddressSpace) { + unsigned VF = SrcVTy->getVectorNumElements(); + + int MaskUnpackCost = 0; + if (VariableMask) { + VectorType *MaskTy = + VectorType::get(Type::getInt1Ty(getGlobalContext()), VF); + MaskUnpackCost = getScalarizationOverhead(MaskTy, false, true); + int ScalarCompareCost = + getCmpSelInstrCost(Instruction::ICmp, Type::getInt1Ty(getGlobalContext()), + nullptr); + int BranchCost = getCFInstrCost(Instruction::Br); + MaskUnpackCost += VF * (BranchCost + ScalarCompareCost); + } + + // The cost of the scalar loads/stores. 
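To make the formula assembled by getGSScalarCost concrete, here is the same computation with hypothetical unit costs plugged in for VF = 4; every named constant below is an assumption for illustration, not a value taken from the cost tables.

static int scalarGatherCostSketch() {
  const int VF = 4;
  const int ScalarLoadCost    = 1;  // getMemoryOpCost on the element type (assumed)
  const int MaskExtractCost   = 4;  // getScalarizationOverhead(MaskTy, false, true) (assumed)
  const int ScalarCompareCost = 1;  // icmp on each extracted mask bit (assumed)
  const int BranchCost        = 1;  // getCFInstrCost(Instruction::Br) (assumed)
  const int InsertCost        = 1;  // insert each loaded scalar into the result (assumed)

  int MaskUnpackCost    = MaskExtractCost + VF * (BranchCost + ScalarCompareCost); // 12
  int MemoryOpCost      = VF * ScalarLoadCost;                                     //  4
  int InsertExtractCost = VF * InsertCost;                                         //  4
  return MemoryOpCost + MaskUnpackCost + InsertExtractCost;                        // 20
}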
+ int MemoryOpCost = VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(), + Alignment, AddressSpace); + + int InsertExtractCost = 0; + if (Opcode == Instruction::Load) + for (unsigned i = 0; i < VF; ++i) + // Add the cost of inserting each scalar load into the vector + InsertExtractCost += + getVectorInstrCost(Instruction::InsertElement, SrcVTy, i); + else + for (unsigned i = 0; i < VF; ++i) + // Add the cost of extracting each element out of the data vector + InsertExtractCost += + getVectorInstrCost(Instruction::ExtractElement, SrcVTy, i); + + return MemoryOpCost + MaskUnpackCost + InsertExtractCost; +} + +/// Calculate the cost of Gather / Scatter operation +int X86TTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *SrcVTy, + Value *Ptr, bool VariableMask, + unsigned Alignment) { + assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter"); + unsigned VF = SrcVTy->getVectorNumElements(); + PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType()); + if (!PtrTy && Ptr->getType()->isVectorTy()) + PtrTy = dyn_cast<PointerType>(Ptr->getType()->getVectorElementType()); + assert(PtrTy && "Unexpected type for Ptr argument"); + unsigned AddressSpace = PtrTy->getAddressSpace(); + + bool Scalarize = false; + if ((Opcode == Instruction::Load && !isLegalMaskedGather(SrcVTy)) || + (Opcode == Instruction::Store && !isLegalMaskedScatter(SrcVTy))) + Scalarize = true; + // Gather / Scatter for vector 2 is not profitable on KNL / SKX + // Vector-4 of gather/scatter instruction does not exist on KNL. + // We can extend it to 8 elements, but zeroing upper bits of + // the mask vector will add more instructions. Right now we give the scalar + // cost of vector-4 for KNL. TODO: Check, maybe the gather/scatter instruction is + // better in the VariableMask case. + if (VF == 2 || (VF == 4 && !ST->hasVLX())) + Scalarize = true; + + if (Scalarize) + return getGSScalarCost(Opcode, SrcVTy, VariableMask, Alignment, AddressSpace); + + return getGSVectorCost(Opcode, SrcVTy, Ptr, Alignment, AddressSpace); +} + +bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy) { + Type *ScalarTy = DataTy->getScalarType(); + int DataWidth = isa<PointerType>(ScalarTy) ? + DL.getPointerSizeInBits() : ScalarTy->getPrimitiveSizeInBits(); + + return (DataWidth >= 32 && ST->hasAVX2()); +} + +bool X86TTIImpl::isLegalMaskedStore(Type *DataType) { + return isLegalMaskedLoad(DataType); +} + +bool X86TTIImpl::isLegalMaskedGather(Type *DataTy) { + // This function is called now in two cases: from the Loop Vectorizer + // and from the Scalarizer. + // When the Loop Vectorizer asks about legality of the feature, + // the vectorization factor is not calculated yet. The Loop Vectorizer + // sends a scalar type and the decision is based on the width of the + // scalar element. + // Later on, the cost model will estimate usage this intrinsic based on + // the vector type. + // The Scalarizer asks again about legality. It sends a vector type. + // In this case we can reject non-power-of-2 vectors. + if (isa<VectorType>(DataTy) && !isPowerOf2_32(DataTy->getVectorNumElements())) + return false; + Type *ScalarTy = DataTy->getScalarType(); + int DataWidth = isa<PointerType>(ScalarTy) ? 
+ DL.getPointerSizeInBits() : ScalarTy->getPrimitiveSizeInBits(); + + // AVX-512 allows gather and scatter + return DataWidth >= 32 && ST->hasAVX512(); +} + +bool X86TTIImpl::isLegalMaskedScatter(Type *DataType) { + return isLegalMaskedGather(DataType); +} + +bool X86TTIImpl::areInlineCompatible(const Function *Caller, + const Function *Callee) const { + const TargetMachine &TM = getTLI()->getTargetMachine(); + + // Work this as a subsetting of subtarget features. + const FeatureBitset &CallerBits = + TM.getSubtargetImpl(*Caller)->getFeatureBits(); + const FeatureBitset &CalleeBits = + TM.getSubtargetImpl(*Callee)->getFeatureBits(); + + // FIXME: This is likely too limiting as it will include subtarget features + // that we might not care about for inlining, but it is conservatively + // correct. + return (CallerBits & CalleeBits) == CalleeBits; +} diff --git a/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.h b/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.h new file mode 100644 index 0000000..adb745e --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.h @@ -0,0 +1,109 @@ +//===-- X86TargetTransformInfo.h - X86 specific TTI -------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// This file a TargetTransformInfo::Concept conforming object specific to the +/// X86 target machine. It uses the target's detailed information to +/// provide more precise answers to certain TTI queries, while letting the +/// target independent and default TTI implementations handle the rest. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_X86_X86TARGETTRANSFORMINFO_H +#define LLVM_LIB_TARGET_X86_X86TARGETTRANSFORMINFO_H + +#include "X86.h" +#include "X86TargetMachine.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/CodeGen/BasicTTIImpl.h" +#include "llvm/Target/TargetLowering.h" + +namespace llvm { + +class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> { + typedef BasicTTIImplBase<X86TTIImpl> BaseT; + typedef TargetTransformInfo TTI; + friend BaseT; + + const X86Subtarget *ST; + const X86TargetLowering *TLI; + + int getScalarizationOverhead(Type *Ty, bool Insert, bool Extract); + + const X86Subtarget *getST() const { return ST; } + const X86TargetLowering *getTLI() const { return TLI; } + +public: + explicit X86TTIImpl(const X86TargetMachine *TM, const Function &F) + : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)), + TLI(ST->getTargetLowering()) {} + + // Provide value semantics. MSVC requires that we spell all of these out. 
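areInlineCompatible above reduces inline-compatibility to a subset test on subtarget feature bits: the callee may be inlined only if every feature it was compiled for is also available in the caller. A tiny sketch of that test, with std::bitset standing in for FeatureBitset (illustration only, not part of the patch):

#include <bitset>

static bool featuresAreSubsetSketch(const std::bitset<64> &CallerBits,
                                    const std::bitset<64> &CalleeBits) {
  // E.g. an AVX2 callee must not be inlined into an SSE2-only caller, or the
  // inlined body could execute instructions the caller's baseline lacks.
  return (CallerBits & CalleeBits) == CalleeBits;
}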
+ X86TTIImpl(const X86TTIImpl &Arg) + : BaseT(static_cast<const BaseT &>(Arg)), ST(Arg.ST), TLI(Arg.TLI) {} + X86TTIImpl(X86TTIImpl &&Arg) + : BaseT(std::move(static_cast<BaseT &>(Arg))), ST(std::move(Arg.ST)), + TLI(std::move(Arg.TLI)) {} + + /// \name Scalar TTI Implementations + /// @{ + TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth); + + /// @} + + /// \name Vector TTI Implementations + /// @{ + + unsigned getNumberOfRegisters(bool Vector); + unsigned getRegisterBitWidth(bool Vector); + unsigned getMaxInterleaveFactor(unsigned VF); + int getArithmeticInstrCost( + unsigned Opcode, Type *Ty, + TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, + TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, + TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, + TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None); + int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp); + int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src); + int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy); + int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index); + int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, + unsigned AddressSpace); + int getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, + unsigned AddressSpace); + int getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Value *Ptr, + bool VariableMask, unsigned Alignment); + int getAddressComputationCost(Type *PtrTy, bool IsComplex); + + int getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm); + + int getIntImmCost(int64_t); + + int getIntImmCost(const APInt &Imm, Type *Ty); + + int getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty); + int getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, + Type *Ty); + bool isLegalMaskedLoad(Type *DataType); + bool isLegalMaskedStore(Type *DataType); + bool isLegalMaskedGather(Type *DataType); + bool isLegalMaskedScatter(Type *DataType); + bool areInlineCompatible(const Function *Caller, + const Function *Callee) const; +private: + int getGSScalarCost(unsigned Opcode, Type *DataTy, bool VariableMask, + unsigned Alignment, unsigned AddressSpace); + int getGSVectorCost(unsigned Opcode, Type *DataTy, Value *Ptr, + unsigned Alignment, unsigned AddressSpace); + + /// @} +}; + +} // end namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/X86/X86VZeroUpper.cpp b/contrib/llvm/lib/Target/X86/X86VZeroUpper.cpp new file mode 100644 index 0000000..6925b27 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86VZeroUpper.cpp @@ -0,0 +1,320 @@ +//===-- X86VZeroUpper.cpp - AVX vzeroupper instruction inserter -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the pass which inserts x86 AVX vzeroupper instructions +// before calls to SSE encoded functions. This avoids transition latency +// penalty when transferring control between AVX encoded instructions and old +// SSE encoding mode. 
+// +//===----------------------------------------------------------------------===// + +#include "X86.h" +#include "X86InstrInfo.h" +#include "X86Subtarget.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetInstrInfo.h" +using namespace llvm; + +#define DEBUG_TYPE "x86-vzeroupper" + +STATISTIC(NumVZU, "Number of vzeroupper instructions inserted"); + +namespace { + + class VZeroUpperInserter : public MachineFunctionPass { + public: + + VZeroUpperInserter() : MachineFunctionPass(ID) {} + bool runOnMachineFunction(MachineFunction &MF) override; + const char *getPassName() const override {return "X86 vzeroupper inserter";} + + private: + + void processBasicBlock(MachineBasicBlock &MBB); + void insertVZeroUpper(MachineBasicBlock::iterator I, + MachineBasicBlock &MBB); + void addDirtySuccessor(MachineBasicBlock &MBB); + + typedef enum { PASS_THROUGH, EXITS_CLEAN, EXITS_DIRTY } BlockExitState; + static const char* getBlockExitStateName(BlockExitState ST); + + // Core algorithm state: + // BlockState - Each block is either: + // - PASS_THROUGH: There are neither YMM dirtying instructions nor + // vzeroupper instructions in this block. + // - EXITS_CLEAN: There is (or will be) a vzeroupper instruction in this + // block that will ensure that YMM is clean on exit. + // - EXITS_DIRTY: An instruction in the block dirties YMM and no + // subsequent vzeroupper in the block clears it. + // + // AddedToDirtySuccessors - This flag is raised when a block is added to the + // DirtySuccessors list to ensure that it's not + // added multiple times. + // + // FirstUnguardedCall - Records the location of the first unguarded call in + // each basic block that may need to be guarded by a + // vzeroupper. We won't know whether it actually needs + // to be guarded until we discover a predecessor that + // is DIRTY_OUT. 
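// A small self-contained sketch (not LLVM code) of the propagation described
// above: successors of EXITS_DIRTY blocks go on a worklist; each visited block
// gets a vzeroupper before its first unguarded call (if it recorded one), and
// only PASS_THROUGH blocks keep forwarding dirtiness to their own successors.
// The three-block CFG and the instruction index below are invented.
#include <cstdio>
#include <vector>

enum ExitState { PASS_THROUGH, EXITS_CLEAN, EXITS_DIRTY };

struct Block {
  ExitState Exit = PASS_THROUGH;
  int FirstUnguardedCall = -1; // -1 plays the role of MBB.end()
  bool Added = false;          // AddedToDirtySuccessors
  std::vector<int> Succs;
};

int main() {
  // Block 0 dirties YMM, block 1 is an empty pass-through block, and block 2
  // ends with a single, so far unguarded, call.
  std::vector<Block> B(3);
  B[0].Exit = EXITS_DIRTY;
  B[0].Succs = {1};
  B[1].Exit = PASS_THROUGH;
  B[1].Succs = {2};
  B[2].Exit = EXITS_CLEAN;
  B[2].FirstUnguardedCall = 0;

  std::vector<int> Work;
  auto AddDirtySuccessor = [&](int Id) {
    if (!B[Id].Added) {
      B[Id].Added = true;
      Work.push_back(Id);
    }
  };

  // Seed the worklist with successors of dirty blocks.
  for (unsigned Id = 0; Id < B.size(); ++Id)
    if (B[Id].Exit == EXITS_DIRTY)
      for (int S : B[Id].Succs)
        AddDirtySuccessor(S);

  while (!Work.empty()) {
    int Id = Work.back();
    Work.pop_back();
    if (B[Id].FirstUnguardedCall >= 0)
      std::printf("guard call %d in block %d with vzeroupper\n",
                  B[Id].FirstUnguardedCall, Id);
    if (B[Id].Exit == PASS_THROUGH) // dirtiness flows through empty blocks
      for (int S : B[Id].Succs)
        AddDirtySuccessor(S);
  }
  return 0;
}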
+ struct BlockState { + BlockState() : ExitState(PASS_THROUGH), AddedToDirtySuccessors(false) {} + BlockExitState ExitState; + bool AddedToDirtySuccessors; + MachineBasicBlock::iterator FirstUnguardedCall; + }; + typedef SmallVector<BlockState, 8> BlockStateMap; + typedef SmallVector<MachineBasicBlock*, 8> DirtySuccessorsWorkList; + + BlockStateMap BlockStates; + DirtySuccessorsWorkList DirtySuccessors; + bool EverMadeChange; + const TargetInstrInfo *TII; + + static char ID; + }; + + char VZeroUpperInserter::ID = 0; +} + +FunctionPass *llvm::createX86IssueVZeroUpperPass() { + return new VZeroUpperInserter(); +} + +const char* VZeroUpperInserter::getBlockExitStateName(BlockExitState ST) { + switch (ST) { + case PASS_THROUGH: return "Pass-through"; + case EXITS_DIRTY: return "Exits-dirty"; + case EXITS_CLEAN: return "Exits-clean"; + } + llvm_unreachable("Invalid block exit state."); +} + +static bool isYmmReg(unsigned Reg) { + return (Reg >= X86::YMM0 && Reg <= X86::YMM15); +} + +static bool checkFnHasLiveInYmm(MachineRegisterInfo &MRI) { + for (MachineRegisterInfo::livein_iterator I = MRI.livein_begin(), + E = MRI.livein_end(); I != E; ++I) + if (isYmmReg(I->first)) + return true; + + return false; +} + +static bool clobbersAllYmmRegs(const MachineOperand &MO) { + for (unsigned reg = X86::YMM0; reg <= X86::YMM15; ++reg) { + if (!MO.clobbersPhysReg(reg)) + return false; + } + return true; +} + +static bool hasYmmReg(MachineInstr *MI) { + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + const MachineOperand &MO = MI->getOperand(i); + if (MI->isCall() && MO.isRegMask() && !clobbersAllYmmRegs(MO)) + return true; + if (!MO.isReg()) + continue; + if (MO.isDebug()) + continue; + if (isYmmReg(MO.getReg())) + return true; + } + return false; +} + +/// clobbersAnyYmmReg() - Check if any YMM register will be clobbered by this +/// instruction. +static bool callClobbersAnyYmmReg(MachineInstr *MI) { + assert(MI->isCall() && "Can only be called on call instructions."); + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + const MachineOperand &MO = MI->getOperand(i); + if (!MO.isRegMask()) + continue; + for (unsigned reg = X86::YMM0; reg <= X86::YMM15; ++reg) { + if (MO.clobbersPhysReg(reg)) + return true; + } + } + return false; +} + +// Insert a vzeroupper instruction before I. +void VZeroUpperInserter::insertVZeroUpper(MachineBasicBlock::iterator I, + MachineBasicBlock &MBB) { + DebugLoc dl = I->getDebugLoc(); + BuildMI(MBB, I, dl, TII->get(X86::VZEROUPPER)); + ++NumVZU; + EverMadeChange = true; +} + +// Add MBB to the DirtySuccessors list if it hasn't already been added. +void VZeroUpperInserter::addDirtySuccessor(MachineBasicBlock &MBB) { + if (!BlockStates[MBB.getNumber()].AddedToDirtySuccessors) { + DirtySuccessors.push_back(&MBB); + BlockStates[MBB.getNumber()].AddedToDirtySuccessors = true; + } +} + +/// processBasicBlock - Loop over all of the instructions in the basic block, +/// inserting vzeroupper instructions before function calls. +void VZeroUpperInserter::processBasicBlock(MachineBasicBlock &MBB) { + + // Start by assuming that the block PASS_THROUGH, which implies no unguarded + // calls. + BlockExitState CurState = PASS_THROUGH; + BlockStates[MBB.getNumber()].FirstUnguardedCall = MBB.end(); + + for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ++I) { + MachineInstr *MI = I; + bool isControlFlow = MI->isCall() || MI->isReturn(); + + // Shortcut: don't need to check regular instructions in dirty state. 
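+    // That is, once CurState is EXITS_DIRTY another YMM use cannot change it,
+    // so only calls and returns (the points where a vzeroupper might have to
+    // be inserted) still need to be inspected.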
+ if (!isControlFlow && CurState == EXITS_DIRTY) + continue; + + if (hasYmmReg(MI)) { + // We found a ymm-using instruction; this could be an AVX instruction, + // or it could be control flow. + CurState = EXITS_DIRTY; + continue; + } + + // Check for control-flow out of the current function (which might + // indirectly execute SSE instructions). + if (!isControlFlow) + continue; + + // If the call won't clobber any YMM register, skip it as well. It usually + // happens on helper function calls (such as '_chkstk', '_ftol2') where + // standard calling convention is not used (RegMask is not used to mark + // register clobbered and register usage (def/imp-def/use) is well-defined + // and explicitly specified. + if (MI->isCall() && !callClobbersAnyYmmReg(MI)) + continue; + + // The VZEROUPPER instruction resets the upper 128 bits of all Intel AVX + // registers. This instruction has zero latency. In addition, the processor + // changes back to Clean state, after which execution of Intel SSE + // instructions or Intel AVX instructions has no transition penalty. Add + // the VZEROUPPER instruction before any function call/return that might + // execute SSE code. + // FIXME: In some cases, we may want to move the VZEROUPPER into a + // predecessor block. + if (CurState == EXITS_DIRTY) { + // After the inserted VZEROUPPER the state becomes clean again, but + // other YMM may appear before other subsequent calls or even before + // the end of the BB. + insertVZeroUpper(I, MBB); + CurState = EXITS_CLEAN; + } else if (CurState == PASS_THROUGH) { + // If this block is currently in pass-through state and we encounter a + // call then whether we need a vzeroupper or not depends on whether this + // block has successors that exit dirty. Record the location of the call, + // and set the state to EXITS_CLEAN, but do not insert the vzeroupper yet. + // It will be inserted later if necessary. + BlockStates[MBB.getNumber()].FirstUnguardedCall = I; + CurState = EXITS_CLEAN; + } + } + + DEBUG(dbgs() << "MBB #" << MBB.getNumber() << " exit state: " + << getBlockExitStateName(CurState) << '\n'); + + if (CurState == EXITS_DIRTY) + for (MachineBasicBlock::succ_iterator SI = MBB.succ_begin(), + SE = MBB.succ_end(); + SI != SE; ++SI) + addDirtySuccessor(**SI); + + BlockStates[MBB.getNumber()].ExitState = CurState; +} + +/// runOnMachineFunction - Loop over all of the basic blocks, inserting +/// vzeroupper instructions before function calls. +bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) { + const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>(); + if (!ST.hasAVX() || ST.hasAVX512()) + return false; + TII = ST.getInstrInfo(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + EverMadeChange = false; + + bool FnHasLiveInYmm = checkFnHasLiveInYmm(MRI); + + // Fast check: if the function doesn't use any ymm registers, we don't need + // to insert any VZEROUPPER instructions. This is constant-time, so it is + // cheap in the common case of no ymm use. + bool YMMUsed = FnHasLiveInYmm; + if (!YMMUsed) { + const TargetRegisterClass *RC = &X86::VR256RegClass; + for (TargetRegisterClass::iterator i = RC->begin(), e = RC->end(); i != e; + i++) { + if (!MRI.reg_nodbg_empty(*i)) { + YMMUsed = true; + break; + } + } + } + if (!YMMUsed) { + return false; + } + + assert(BlockStates.empty() && DirtySuccessors.empty() && + "X86VZeroUpper state should be clear"); + BlockStates.resize(MF.getNumBlockIDs()); + + // Process all blocks. 
This will compute block exit states, record the first
+  // unguarded call in each block, and add successors of dirty blocks to the
+  // DirtySuccessors list.
+  for (MachineBasicBlock &MBB : MF)
+    processBasicBlock(MBB);
+
+  // If any YMM regs are live in to this function, add the entry block to the
+  // DirtySuccessors list.
+  if (FnHasLiveInYmm)
+    addDirtySuccessor(MF.front());
+
+  // Re-visit all blocks that are successors of EXITS_DIRTY blocks. Add
+  // vzeroupper instructions to unguarded calls, and propagate EXITS_DIRTY
+  // through PASS_THROUGH blocks.
+  while (!DirtySuccessors.empty()) {
+    MachineBasicBlock &MBB = *DirtySuccessors.back();
+    DirtySuccessors.pop_back();
+    BlockState &BBState = BlockStates[MBB.getNumber()];
+
+    // MBB is a successor of a dirty block, so its first call needs to be
+    // guarded.
+    if (BBState.FirstUnguardedCall != MBB.end())
+      insertVZeroUpper(BBState.FirstUnguardedCall, MBB);
+
+    // If this successor was a pass-through block then it is now dirty, and its
+    // successors need to be added to the worklist (if they haven't been
+    // already).
+    if (BBState.ExitState == PASS_THROUGH) {
+      DEBUG(dbgs() << "MBB #" << MBB.getNumber()
+                   << " was Pass-through, is now Dirty-out.\n");
+      for (MachineBasicBlock::succ_iterator SI = MBB.succ_begin(),
+                                            SE = MBB.succ_end();
+           SI != SE; ++SI)
+        addDirtySuccessor(**SI);
+    }
+  }
+
+  BlockStates.clear();
+  return EverMadeChange;
+} diff --git a/contrib/llvm/lib/Target/X86/X86WinEHState.cpp b/contrib/llvm/lib/Target/X86/X86WinEHState.cpp new file mode 100644 index 0000000..dce94a9 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86WinEHState.cpp @@ -0,0 +1,456 @@ +//===-- X86WinEHState - Insert EH state updates for win32 exceptions ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// All functions using an MSVC EH personality use an explicitly updated state
+// number stored in an exception registration stack object. The registration
+// object is linked into a thread-local chain of registrations stored at fs:00.
+// This pass adds the registration object and EH state updates.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/EHPersonalities.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/WinEHFuncInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/Local.h"
+
+using namespace llvm;
+using namespace llvm::PatternMatch;
+
+#define DEBUG_TYPE "winehstate"
+
+namespace llvm { void initializeWinEHStatePassPass(PassRegistry &); }
+
+namespace {
+class WinEHStatePass : public FunctionPass {
+public:
+  static char ID; // Pass identification, replacement for typeid.
+ + WinEHStatePass() : FunctionPass(ID) { + initializeWinEHStatePassPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &Fn) override; + + bool doInitialization(Module &M) override; + + bool doFinalization(Module &M) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override; + + const char *getPassName() const override { + return "Windows 32-bit x86 EH state insertion"; + } + +private: + void emitExceptionRegistrationRecord(Function *F); + + void linkExceptionRegistration(IRBuilder<> &Builder, Function *Handler); + void unlinkExceptionRegistration(IRBuilder<> &Builder); + void addStateStores(Function &F, WinEHFuncInfo &FuncInfo); + void insertStateNumberStore(Value *ParentRegNode, Instruction *IP, int State); + + Value *emitEHLSDA(IRBuilder<> &Builder, Function *F); + + Function *generateLSDAInEAXThunk(Function *ParentFunc); + + // Module-level type getters. + Type *getEHLinkRegistrationType(); + Type *getSEHRegistrationType(); + Type *getCXXEHRegistrationType(); + + // Per-module data. + Module *TheModule = nullptr; + StructType *EHLinkRegistrationTy = nullptr; + StructType *CXXEHRegistrationTy = nullptr; + StructType *SEHRegistrationTy = nullptr; + Function *FrameRecover = nullptr; + Function *FrameAddress = nullptr; + Function *FrameEscape = nullptr; + + // Per-function state + EHPersonality Personality = EHPersonality::Unknown; + Function *PersonalityFn = nullptr; + + /// The stack allocation containing all EH data, including the link in the + /// fs:00 chain and the current state. + AllocaInst *RegNode = nullptr; + + /// Struct type of RegNode. Used for GEPing. + Type *RegNodeTy = nullptr; + + /// The index of the state field of RegNode. + int StateFieldIndex = ~0U; + + /// The linked list node subobject inside of RegNode. + Value *Link = nullptr; +}; +} + +FunctionPass *llvm::createX86WinEHStatePass() { return new WinEHStatePass(); } + +char WinEHStatePass::ID = 0; + +INITIALIZE_PASS(WinEHStatePass, "x86-winehstate", + "Insert stores for EH state numbers", false, false) + +bool WinEHStatePass::doInitialization(Module &M) { + TheModule = &M; + FrameEscape = Intrinsic::getDeclaration(TheModule, Intrinsic::localescape); + FrameRecover = Intrinsic::getDeclaration(TheModule, Intrinsic::localrecover); + FrameAddress = Intrinsic::getDeclaration(TheModule, Intrinsic::frameaddress); + return false; +} + +bool WinEHStatePass::doFinalization(Module &M) { + assert(TheModule == &M); + TheModule = nullptr; + EHLinkRegistrationTy = nullptr; + CXXEHRegistrationTy = nullptr; + SEHRegistrationTy = nullptr; + FrameEscape = nullptr; + FrameRecover = nullptr; + FrameAddress = nullptr; + return false; +} + +void WinEHStatePass::getAnalysisUsage(AnalysisUsage &AU) const { + // This pass should only insert a stack allocation, memory accesses, and + // localrecovers. + AU.setPreservesCFG(); +} + +bool WinEHStatePass::runOnFunction(Function &F) { + // Check the personality. Do nothing if this personality doesn't use funclets. + if (!F.hasPersonalityFn()) + return false; + PersonalityFn = + dyn_cast<Function>(F.getPersonalityFn()->stripPointerCasts()); + if (!PersonalityFn) + return false; + Personality = classifyEHPersonality(PersonalityFn); + if (!isFuncletEHPersonality(Personality)) + return false; + + // Skip this function if there are no EH pads and we aren't using IR-level + // outlining. 
+ bool HasPads = false; + for (BasicBlock &BB : F) { + if (BB.isEHPad()) { + HasPads = true; + break; + } + } + if (!HasPads) + return false; + + // Disable frame pointer elimination in this function. + // FIXME: Do the nested handlers need to keep the parent ebp in ebp, or can we + // use an arbitrary register? + F.addFnAttr("no-frame-pointer-elim", "true"); + + emitExceptionRegistrationRecord(&F); + + // The state numbers calculated here in IR must agree with what we calculate + // later on for the MachineFunction. In particular, if an IR pass deletes an + // unreachable EH pad after this point before machine CFG construction, we + // will be in trouble. If this assumption is ever broken, we should turn the + // numbers into an immutable analysis pass. + WinEHFuncInfo FuncInfo; + addStateStores(F, FuncInfo); + + // Reset per-function state. + PersonalityFn = nullptr; + Personality = EHPersonality::Unknown; + return true; +} + +/// Get the common EH registration subobject: +/// typedef _EXCEPTION_DISPOSITION (*PEXCEPTION_ROUTINE)( +/// _EXCEPTION_RECORD *, void *, _CONTEXT *, void *); +/// struct EHRegistrationNode { +/// EHRegistrationNode *Next; +/// PEXCEPTION_ROUTINE Handler; +/// }; +Type *WinEHStatePass::getEHLinkRegistrationType() { + if (EHLinkRegistrationTy) + return EHLinkRegistrationTy; + LLVMContext &Context = TheModule->getContext(); + EHLinkRegistrationTy = StructType::create(Context, "EHRegistrationNode"); + Type *FieldTys[] = { + EHLinkRegistrationTy->getPointerTo(0), // EHRegistrationNode *Next + Type::getInt8PtrTy(Context) // EXCEPTION_DISPOSITION (*Handler)(...) + }; + EHLinkRegistrationTy->setBody(FieldTys, false); + return EHLinkRegistrationTy; +} + +/// The __CxxFrameHandler3 registration node: +/// struct CXXExceptionRegistration { +/// void *SavedESP; +/// EHRegistrationNode SubRecord; +/// int32_t TryLevel; +/// }; +Type *WinEHStatePass::getCXXEHRegistrationType() { + if (CXXEHRegistrationTy) + return CXXEHRegistrationTy; + LLVMContext &Context = TheModule->getContext(); + Type *FieldTys[] = { + Type::getInt8PtrTy(Context), // void *SavedESP + getEHLinkRegistrationType(), // EHRegistrationNode SubRecord + Type::getInt32Ty(Context) // int32_t TryLevel + }; + CXXEHRegistrationTy = + StructType::create(FieldTys, "CXXExceptionRegistration"); + return CXXEHRegistrationTy; +} + +/// The _except_handler3/4 registration node: +/// struct EH4ExceptionRegistration { +/// void *SavedESP; +/// _EXCEPTION_POINTERS *ExceptionPointers; +/// EHRegistrationNode SubRecord; +/// int32_t EncodedScopeTable; +/// int32_t TryLevel; +/// }; +Type *WinEHStatePass::getSEHRegistrationType() { + if (SEHRegistrationTy) + return SEHRegistrationTy; + LLVMContext &Context = TheModule->getContext(); + Type *FieldTys[] = { + Type::getInt8PtrTy(Context), // void *SavedESP + Type::getInt8PtrTy(Context), // void *ExceptionPointers + getEHLinkRegistrationType(), // EHRegistrationNode SubRecord + Type::getInt32Ty(Context), // int32_t EncodedScopeTable + Type::getInt32Ty(Context) // int32_t TryLevel + }; + SEHRegistrationTy = StructType::create(FieldTys, "SEHExceptionRegistration"); + return SEHRegistrationTy; +} + +// Emit an exception registration record. These are stack allocations with the +// common subobject of two pointers: the previous registration record (the old +// fs:00) and the personality function for the current frame. The data before +// and after that is personality function specific. 
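// A rough source-level picture (a hand-written sketch, not pass output) of
// what emitExceptionRegistrationRecord() below arranges for the MSVC_CXX
// case. __readfsdword/__writefsdword are the MSVC x86 intrinsics for
// fs-relative access, and ehhandler_thunk is an invented stand-in for the
// __ehhandler$F thunk produced by generateLSDAInEAXThunk(). 32-bit MSVC only.
#include <intrin.h>

struct EHRegistrationNode {
  EHRegistrationNode *Next; // the previous record, i.e. the old [fs:00]
  void *Handler;            // PEXCEPTION_ROUTINE for this frame
};

struct CXXExceptionRegistration {
  void *SavedESP;
  EHRegistrationNode SubRecord; // the piece linked into the fs:00 chain
  int TryLevel;                 // the EH state number this pass keeps updated
};

extern "C" int ehhandler_thunk(); // placeholder for __ehhandler$F

void frame_with_eh_sketch() {
  CXXExceptionRegistration RegNode;            // the stack allocation
  RegNode.SavedESP = nullptr;                  // filled from llvm.stacksave()
  RegNode.TryLevel = -1;                       // no handler active yet
  RegNode.SubRecord.Handler = (void *)&ehhandler_thunk;
  RegNode.SubRecord.Next =
      (EHRegistrationNode *)__readfsdword(0);  // old head of the chain
  __writefsdword(0, (unsigned long)&RegNode.SubRecord); // link this frame in

  // ... function body: TryLevel is stored before potentially-throwing calls ...

  __writefsdword(0, (unsigned long)RegNode.SubRecord.Next); // unlink on return
}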
+void WinEHStatePass::emitExceptionRegistrationRecord(Function *F) { + assert(Personality == EHPersonality::MSVC_CXX || + Personality == EHPersonality::MSVC_X86SEH); + + StringRef PersonalityName = PersonalityFn->getName(); + IRBuilder<> Builder(&F->getEntryBlock(), F->getEntryBlock().begin()); + Type *Int8PtrType = Builder.getInt8PtrTy(); + if (Personality == EHPersonality::MSVC_CXX) { + RegNodeTy = getCXXEHRegistrationType(); + RegNode = Builder.CreateAlloca(RegNodeTy); + // SavedESP = llvm.stacksave() + Value *SP = Builder.CreateCall( + Intrinsic::getDeclaration(TheModule, Intrinsic::stacksave), {}); + Builder.CreateStore(SP, Builder.CreateStructGEP(RegNodeTy, RegNode, 0)); + // TryLevel = -1 + StateFieldIndex = 2; + insertStateNumberStore(RegNode, &*Builder.GetInsertPoint(), -1); + // Handler = __ehhandler$F + Function *Trampoline = generateLSDAInEAXThunk(F); + Link = Builder.CreateStructGEP(RegNodeTy, RegNode, 1); + linkExceptionRegistration(Builder, Trampoline); + } else if (Personality == EHPersonality::MSVC_X86SEH) { + // If _except_handler4 is in use, some additional guard checks and prologue + // stuff is required. + bool UseStackGuard = (PersonalityName == "_except_handler4"); + RegNodeTy = getSEHRegistrationType(); + RegNode = Builder.CreateAlloca(RegNodeTy); + // SavedESP = llvm.stacksave() + Value *SP = Builder.CreateCall( + Intrinsic::getDeclaration(TheModule, Intrinsic::stacksave), {}); + Builder.CreateStore(SP, Builder.CreateStructGEP(RegNodeTy, RegNode, 0)); + // TryLevel = -2 / -1 + StateFieldIndex = 4; + insertStateNumberStore(RegNode, &*Builder.GetInsertPoint(), + UseStackGuard ? -2 : -1); + // ScopeTable = llvm.x86.seh.lsda(F) + Value *FI8 = Builder.CreateBitCast(F, Int8PtrType); + Value *LSDA = Builder.CreateCall( + Intrinsic::getDeclaration(TheModule, Intrinsic::x86_seh_lsda), FI8); + Type *Int32Ty = Type::getInt32Ty(TheModule->getContext()); + LSDA = Builder.CreatePtrToInt(LSDA, Int32Ty); + // If using _except_handler4, xor the address of the table with + // __security_cookie. + if (UseStackGuard) { + Value *Cookie = + TheModule->getOrInsertGlobal("__security_cookie", Int32Ty); + Value *Val = Builder.CreateLoad(Int32Ty, Cookie); + LSDA = Builder.CreateXor(LSDA, Val); + } + Builder.CreateStore(LSDA, Builder.CreateStructGEP(RegNodeTy, RegNode, 3)); + Link = Builder.CreateStructGEP(RegNodeTy, RegNode, 2); + linkExceptionRegistration(Builder, PersonalityFn); + } else { + llvm_unreachable("unexpected personality function"); + } + + // Insert an unlink before all returns. 
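+  // Conceptually this restores the previous head of the chain before each
+  // ret: [fs:00] = RegNode.SubRecord.Next (see unlinkExceptionRegistration).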
+ for (BasicBlock &BB : *F) { + TerminatorInst *T = BB.getTerminator(); + if (!isa<ReturnInst>(T)) + continue; + Builder.SetInsertPoint(T); + unlinkExceptionRegistration(Builder); + } +} + +Value *WinEHStatePass::emitEHLSDA(IRBuilder<> &Builder, Function *F) { + Value *FI8 = Builder.CreateBitCast(F, Type::getInt8PtrTy(F->getContext())); + return Builder.CreateCall( + Intrinsic::getDeclaration(TheModule, Intrinsic::x86_seh_lsda), FI8); +} + +/// Generate a thunk that puts the LSDA of ParentFunc in EAX and then calls +/// PersonalityFn, forwarding the parameters passed to PEXCEPTION_ROUTINE: +/// typedef _EXCEPTION_DISPOSITION (*PEXCEPTION_ROUTINE)( +/// _EXCEPTION_RECORD *, void *, _CONTEXT *, void *); +/// We essentially want this code: +/// movl $lsda, %eax +/// jmpl ___CxxFrameHandler3 +Function *WinEHStatePass::generateLSDAInEAXThunk(Function *ParentFunc) { + LLVMContext &Context = ParentFunc->getContext(); + Type *Int32Ty = Type::getInt32Ty(Context); + Type *Int8PtrType = Type::getInt8PtrTy(Context); + Type *ArgTys[5] = {Int8PtrType, Int8PtrType, Int8PtrType, Int8PtrType, + Int8PtrType}; + FunctionType *TrampolineTy = + FunctionType::get(Int32Ty, makeArrayRef(&ArgTys[0], 4), + /*isVarArg=*/false); + FunctionType *TargetFuncTy = + FunctionType::get(Int32Ty, makeArrayRef(&ArgTys[0], 5), + /*isVarArg=*/false); + Function *Trampoline = + Function::Create(TrampolineTy, GlobalValue::InternalLinkage, + Twine("__ehhandler$") + GlobalValue::getRealLinkageName( + ParentFunc->getName()), + TheModule); + BasicBlock *EntryBB = BasicBlock::Create(Context, "entry", Trampoline); + IRBuilder<> Builder(EntryBB); + Value *LSDA = emitEHLSDA(Builder, ParentFunc); + Value *CastPersonality = + Builder.CreateBitCast(PersonalityFn, TargetFuncTy->getPointerTo()); + auto AI = Trampoline->arg_begin(); + Value *Args[5] = {LSDA, &*AI++, &*AI++, &*AI++, &*AI++}; + CallInst *Call = Builder.CreateCall(CastPersonality, Args); + // Can't use musttail due to prototype mismatch, but we can use tail. + Call->setTailCall(true); + // Set inreg so we pass it in EAX. + Call->addAttribute(1, Attribute::InReg); + Builder.CreateRet(Call); + return Trampoline; +} + +void WinEHStatePass::linkExceptionRegistration(IRBuilder<> &Builder, + Function *Handler) { + // Emit the .safeseh directive for this function. + Handler->addFnAttr("safeseh"); + + Type *LinkTy = getEHLinkRegistrationType(); + // Handler = Handler + Value *HandlerI8 = Builder.CreateBitCast(Handler, Builder.getInt8PtrTy()); + Builder.CreateStore(HandlerI8, Builder.CreateStructGEP(LinkTy, Link, 1)); + // Next = [fs:00] + Constant *FSZero = + Constant::getNullValue(LinkTy->getPointerTo()->getPointerTo(257)); + Value *Next = Builder.CreateLoad(FSZero); + Builder.CreateStore(Next, Builder.CreateStructGEP(LinkTy, Link, 0)); + // [fs:00] = Link + Builder.CreateStore(Link, FSZero); +} + +void WinEHStatePass::unlinkExceptionRegistration(IRBuilder<> &Builder) { + // Clone Link into the current BB for better address mode folding. + if (auto *GEP = dyn_cast<GetElementPtrInst>(Link)) { + GEP = cast<GetElementPtrInst>(GEP->clone()); + Builder.Insert(GEP); + Link = GEP; + } + Type *LinkTy = getEHLinkRegistrationType(); + // [fs:00] = Link->Next + Value *Next = + Builder.CreateLoad(Builder.CreateStructGEP(LinkTy, Link, 0)); + Constant *FSZero = + Constant::getNullValue(LinkTy->getPointerTo()->getPointerTo(257)); + Builder.CreateStore(Next, FSZero); +} + +void WinEHStatePass::addStateStores(Function &F, WinEHFuncInfo &FuncInfo) { + // Mark the registration node. 
The backend needs to know which alloca it is so + // that it can recover the original frame pointer. + IRBuilder<> Builder(RegNode->getParent(), std::next(RegNode->getIterator())); + Value *RegNodeI8 = Builder.CreateBitCast(RegNode, Builder.getInt8PtrTy()); + Builder.CreateCall( + Intrinsic::getDeclaration(TheModule, Intrinsic::x86_seh_ehregnode), + {RegNodeI8}); + + // Calculate state numbers. + if (isAsynchronousEHPersonality(Personality)) + calculateSEHStateNumbers(&F, FuncInfo); + else + calculateWinCXXEHStateNumbers(&F, FuncInfo); + + // Iterate all the instructions and emit state number stores. + DenseMap<BasicBlock *, ColorVector> BlockColors = colorEHFunclets(F); + for (BasicBlock &BB : F) { + // Figure out what state we should assign calls in this block. + int BaseState = -1; + auto &BBColors = BlockColors[&BB]; + + assert(BBColors.size() == 1 && + "multi-color BB not removed by preparation"); + BasicBlock *FuncletEntryBB = BBColors.front(); + if (auto *FuncletPad = + dyn_cast<FuncletPadInst>(FuncletEntryBB->getFirstNonPHI())) { + auto BaseStateI = FuncInfo.FuncletBaseStateMap.find(FuncletPad); + if (BaseStateI != FuncInfo.FuncletBaseStateMap.end()) + BaseState = BaseStateI->second; + } + + for (Instruction &I : BB) { + if (auto *CI = dyn_cast<CallInst>(&I)) { + // Possibly throwing call instructions have no actions to take after + // an unwind. Ensure they are in the -1 state. + if (CI->doesNotThrow()) + continue; + insertStateNumberStore(RegNode, CI, BaseState); + } else if (auto *II = dyn_cast<InvokeInst>(&I)) { + // Look up the state number of the landingpad this unwinds to. + assert(FuncInfo.InvokeStateMap.count(II) && "invoke has no state!"); + int State = FuncInfo.InvokeStateMap[II]; + insertStateNumberStore(RegNode, II, State); + } + } + } +} + +void WinEHStatePass::insertStateNumberStore(Value *ParentRegNode, + Instruction *IP, int State) { + IRBuilder<> Builder(IP); + Value *StateField = + Builder.CreateStructGEP(RegNodeTy, ParentRegNode, StateFieldIndex); + Builder.CreateStore(Builder.getInt32(State), StateField); +} |
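// A hand-written sketch of the net effect of addStateStores() for a simple
// __CxxFrameHandler3 function; the state numbers (-1 outside any try, 0 for
// the single try body) are what this shape would typically get, and the
// MayThrow* callees are invented. At the IR level the calls inside the try
// are invokes, whose state comes from FuncInfo.InvokeStateMap.
extern void MayThrowA();
extern void MayThrowB();

void example() {
  // RegNode.TryLevel = -1;   // set at function entry, no handler active
  try {
    // RegNode.TryLevel = 0;  // stored before each call covered by the try
    MayThrowA();
    // RegNode.TryLevel = 0;
    MayThrowB();
  } catch (...) {
  }
  // RegNode.TryLevel = -1;   // calls outside any try go back to state -1
  MayThrowA();
}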